diff --git a/src/lib/agents/agentState.ts b/src/lib/agents/agentState.ts index c8f89ed..2df28b8 100644 --- a/src/lib/agents/agentState.ts +++ b/src/lib/agents/agentState.ts @@ -66,4 +66,12 @@ export const AgentState = Annotation.Root({ reducer: (x, y) => y ?? x, default: () => 'webSearch', }), + urlsToSummarize: Annotation({ + reducer: (x, y) => y ?? x, + default: () => [], + }), + summarizationIntent: Annotation({ + reducer: (x, y) => y ?? x, + default: () => '', + }), }); diff --git a/src/lib/agents/analyzerAgent.ts b/src/lib/agents/analyzerAgent.ts index bcbf84d..7a5d522 100644 --- a/src/lib/agents/analyzerAgent.ts +++ b/src/lib/agents/analyzerAgent.ts @@ -80,6 +80,38 @@ export class AnalyzerAgent { state.originalQuery = state.query; } + // Check for URLs first - if found and not yet processed, route to URL summarization + if (!state.urlsToSummarize || state.urlsToSummarize.length === 0) { + const urlRegex = /https?:\/\/[^\s]+/gi; + const urls = [...new Set(state.query.match(urlRegex) || [])]; + + if (urls.length > 0) { + console.log('URLs detected in initial query, routing to URL summarization'); + console.log(`URLs found: ${urls.join(', ')}`); + + // Emit URL detection event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URLS_DETECTED_ROUTING', + message: `Detected ${urls.length} URL(s) in query - processing content first`, + details: { + query: state.query, + urls: urls, + }, + }, + }); + + return new Command({ + goto: 'url_summarization', + update: { + urlsToSummarize: urls, + summarizationIntent: `Process the content from the provided URLs to help answer: ${state.query}`, + }, + }); + } + } + // Skip full analysis if this is the first run. //if (state.fullAnalysisAttempts > 0) { // Emit initial analysis event @@ -108,7 +140,7 @@ export class AnalyzerAgent { context: state.relevantDocuments .map( (doc, index) => - `${doc?.metadata?.title ? `${doc?.metadata?.title}` : ''}${doc.pageContent}`, + `${doc?.metadata?.title ? `${doc?.metadata?.title}` : ''}${doc?.metadata.url ? `${doc?.metadata?.url}` : ''}${doc.pageContent}`, ) .join('\n\n'), date: formatDateForLLM(new Date()), diff --git a/src/lib/agents/index.ts b/src/lib/agents/index.ts index 5bc5cdb..b6e5aff 100644 --- a/src/lib/agents/index.ts +++ b/src/lib/agents/index.ts @@ -5,3 +5,4 @@ export { SynthesizerAgent } from './synthesizerAgent'; export { TaskManagerAgent } from './taskManagerAgent'; export { FileSearchAgent } from './fileSearchAgent'; export { ContentRouterAgent } from './contentRouterAgent'; +export { URLSummarizationAgent } from './urlSummarizationAgent'; diff --git a/src/lib/agents/urlSummarizationAgent.ts b/src/lib/agents/urlSummarizationAgent.ts new file mode 100644 index 0000000..3a0ecf4 --- /dev/null +++ b/src/lib/agents/urlSummarizationAgent.ts @@ -0,0 +1,288 @@ +import { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import { AIMessage } from '@langchain/core/messages'; +import { Command, END } from '@langchain/langgraph'; +import { EventEmitter } from 'events'; +import { Document } from 'langchain/document'; +import { AgentState } from './agentState'; +import { getWebContent } from '../utils/documents'; +import { removeThinkingBlocks } from '../utils/contentUtils'; +import { setTemperature } from '../utils/modelUtils'; + +export class URLSummarizationAgent { + private llm: BaseChatModel; + private emitter: EventEmitter; + private systemInstructions: string; + private signal: AbortSignal; + + constructor( + llm: BaseChatModel, + emitter: EventEmitter, + systemInstructions: string, + signal: AbortSignal, + ) { + this.llm = llm; + this.emitter = emitter; + this.systemInstructions = systemInstructions; + this.signal = signal; + } + + /** + * URL processing agent node + */ + async execute(state: typeof AgentState.State): Promise { + try { + setTemperature(this.llm, 0); // Set temperature to 0 for deterministic output + + // Use pre-analyzed URLs from ContentRouterAgent + const urlsToProcess = state.urlsToSummarize || []; + const summarizationIntent = state.summarizationIntent || 'process content to help answer the user query'; + + if (urlsToProcess.length === 0) { + console.log('No URLs found for processing, routing back to content router'); + return new Command({ + goto: 'content_router', + update: { + messages: [new AIMessage('No URLs found for processing, routing to content router')], + }, + }); + } + + console.log(`URL processing detected. URLs: ${urlsToProcess.join(', ')}`); + console.log(`Processing intent: ${summarizationIntent}`); + + // Emit URL detection event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_PROCESSING_DETECTED', + message: `Processing ${urlsToProcess.length} URL(s) to extract content for analysis`, + details: { + query: state.query, + urls: urlsToProcess, + intent: summarizationIntent, + }, + }, + }); + + const documents: Document[] = []; + + // Process each URL + for (const url of urlsToProcess) { + if (this.signal.aborted) { + console.warn('URL summarization operation aborted by signal'); + break; + } + + try { + // Emit URL processing event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'PROCESSING_URL', + message: `Retrieving and processing content from: ${url}`, + details: { + query: state.query, + sourceUrl: url, + intent: summarizationIntent, + }, + }, + }); + + // Fetch full content using the enhanced web content retrieval + const webContent = await getWebContent(url, true); + + if (!webContent || !webContent.pageContent) { + console.warn(`No content retrieved from URL: ${url}`); + + // Emit URL processing failure event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_PROCESSING_FAILED', + message: `Failed to retrieve content from: ${url}`, + details: { + query: state.query, + sourceUrl: url, + reason: 'No content retrieved', + }, + }, + }); + continue; + } + + const contentLength = webContent.pageContent.length; + let finalContent: string; + let processingType: string; + + // If content is short (< 4000 chars), use it directly; otherwise summarize + if (contentLength < 4000) { + finalContent = webContent.pageContent; + processingType = 'url-direct-content'; + + console.log(`Content is short (${contentLength} chars), using directly without summarization`); + + // Emit direct content usage event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_DIRECT_CONTENT', + message: `Content is short (${contentLength} chars), using directly from: ${url}`, + details: { + query: state.query, + sourceUrl: url, + sourceTitle: webContent.metadata.title || 'Web Page', + contentLength: contentLength, + intent: summarizationIntent, + }, + }, + }); + } else { + // Content is long, summarize using LLM + console.log(`Content is long (${contentLength} chars), generating summary`); + + const systemPrompt = this.systemInstructions + ? `${this.systemInstructions}\n\n` + : ''; + + const summarizationPrompt = `${systemPrompt}You are a web content processor. Extract and summarize ONLY the information from the provided web page content that is relevant to the user's query. + +# Critical Instructions +- Output ONLY a summary of the web page content provided below +- Focus on information that relates to or helps answer the user's query +- Do NOT add pleasantries, greetings, or conversational elements +- Do NOT mention missing URLs, other pages, or content not provided +- Do NOT ask follow-up questions or suggest additional actions +- Do NOT add commentary about the user's request or query +- Present the information in a clear, well-structured format with key facts and details +- Include all relevant details that could help answer the user's question + +# User's Query: ${state.query} + +# Content Title: ${webContent.metadata.title || 'Web Page'} +# Content URL: ${url} + +# Web Page Content to Summarize: +${webContent.pageContent} + +Provide a comprehensive summary of the above web page content, focusing on information relevant to the user's query:`; + + const result = await this.llm.invoke(summarizationPrompt, { + signal: this.signal, + }); + + finalContent = removeThinkingBlocks(result.content as string); + processingType = 'url-content-extraction'; + } + + if (finalContent && finalContent.trim().length > 0) { + const document = new Document({ + pageContent: finalContent, + metadata: { + title: webContent.metadata.title || 'URL Content', + url: url, + source: url, + processingType: processingType, + processingIntent: summarizationIntent, + originalContentLength: contentLength, + }, + }); + + documents.push(document); + + // Emit successful URL processing event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_CONTENT_EXTRACTED', + message: `Successfully processed content from: ${url}`, + details: { + query: state.query, + sourceUrl: url, + sourceTitle: webContent.metadata.title || 'Web Page', + contentLength: finalContent.length, + originalContentLength: contentLength, + processingType: processingType, + intent: summarizationIntent, + }, + }, + }); + + console.log( + `Successfully processed content from ${url} (${finalContent.length} characters, ${processingType})`, + ); + } else { + console.warn(`No valid content generated for URL: ${url}`); + } + } catch (error) { + console.error(`Error processing URL ${url}:`, error); + + // Emit URL processing error event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_PROCESSING_ERROR', + message: `Error processing URL: ${url}`, + details: { + query: state.query, + sourceUrl: url, + error: error instanceof Error ? error.message : 'Unknown error', + }, + }, + }); + } + } + + if (documents.length === 0) { + const errorMessage = `No content could be retrieved or summarized from the provided URL(s): ${urlsToProcess.join(', ')}`; + console.error(errorMessage); + + return new Command({ + goto: 'analyzer', + update: { + messages: [new AIMessage(errorMessage)], + }, + }); + } + + // Emit completion event + this.emitter.emit('agent_action', { + type: 'agent_action', + data: { + action: 'URL_PROCESSING_COMPLETED', + message: `Successfully processed ${documents.length} URL(s) and extracted content`, + details: { + query: state.query, + processedUrls: urlsToProcess.length, + successfulExtractions: documents.length, + intent: summarizationIntent, + }, + }, + }); + + const responseMessage = `URL processing completed. Successfully processed ${documents.length} out of ${urlsToProcess.length} URLs.`; + console.log(responseMessage); + + return new Command({ + goto: 'analyzer', // Route to analyzer to continue with normal workflow after URL processing + update: { + messages: [new AIMessage(responseMessage)], + relevantDocuments: documents, + }, + }); + } catch (error) { + console.error('URL summarization error:', error); + const errorMessage = new AIMessage( + `URL summarization failed: ${error instanceof Error ? error.message : 'Unknown error'}`, + ); + + return new Command({ + goto: END, + update: { + messages: [errorMessage], + }, + }); + } finally { + setTemperature(this.llm, undefined); // Reset temperature to default + } + } +} diff --git a/src/lib/prompts/analyzer.ts b/src/lib/prompts/analyzer.ts index 9ccb67d..5f0c3f3 100644 --- a/src/lib/prompts/analyzer.ts +++ b/src/lib/prompts/analyzer.ts @@ -28,6 +28,7 @@ Consider the following when evaluating sufficiency: - If the user is requesting to avoid web searches → respond with \`good_content\` - If the user is asking you to be creative, such as writing a story, poem, or creative content → respond with \`good_content\` unless the context is clearly insufficient - If file documents contain complete information for file-specific queries → respond with \`good_content\` +- If the user is requesting specific web content and there is a source that corresponds to that request in the context, it can be considered sufficient even if the content is not exhaustive or looks like errors → respond with \`good_content\` ## Step 2: If content is insufficient, determine the type of missing information diff --git a/src/lib/search/agentSearch.ts b/src/lib/search/agentSearch.ts index 0d5b727..f9ed9d3 100644 --- a/src/lib/search/agentSearch.ts +++ b/src/lib/search/agentSearch.ts @@ -21,6 +21,7 @@ import { TaskManagerAgent, FileSearchAgent, ContentRouterAgent, + URLSummarizationAgent, } from '../agents'; /** @@ -37,6 +38,7 @@ export class AgentSearch { private synthesizerAgent: SynthesizerAgent; private fileSearchAgent: FileSearchAgent; private contentRouterAgent: ContentRouterAgent; + private urlSummarizationAgent: URLSummarizationAgent; private emitter: EventEmitter; private focusMode: string; @@ -95,6 +97,12 @@ export class AgentSearch { systemInstructions, signal, ); + this.urlSummarizationAgent = new URLSummarizationAgent( + llm, + emitter, + systemInstructions, + signal, + ); } /** @@ -102,6 +110,13 @@ export class AgentSearch { */ private createWorkflow() { const workflow = new StateGraph(AgentState) + .addNode( + 'url_summarization', + this.urlSummarizationAgent.execute.bind(this.urlSummarizationAgent), + { + ends: ['task_manager', 'analyzer'], + }, + ) .addNode( 'task_manager', this.taskManagerAgent.execute.bind(this.taskManagerAgent), @@ -134,7 +149,7 @@ export class AgentSearch { 'analyzer', this.analyzerAgent.execute.bind(this.analyzerAgent), { - ends: ['task_manager', 'synthesizer'], + ends: ['url_summarization', 'task_manager', 'synthesizer'], }, ) .addNode(