feat(agent): Implement URL summarization agent

2025-06-29 13:59:52 -06:00 · 2025-06-29 13:59:52 -06:00 · f88c650704
commit f88c650704
parent d66300e78e
6 changed files with 347 additions and 2 deletions
--- a/src/lib/agents/agentState.ts
+++ b/src/lib/agents/agentState.ts
@ -66,4 +66,12 @@ export const AgentState = Annotation.Root({
    reducer: (x, y) => y ?? x,
    default: () => 'webSearch',
  }),
+  urlsToSummarize: Annotation<string[]>({
+    reducer: (x, y) => y ?? x,
+    default: () => [],
+  }),
+  summarizationIntent: Annotation<string>({
+    reducer: (x, y) => y ?? x,
+    default: () => '',
+  }),
 });
--- a/src/lib/agents/analyzerAgent.ts
+++ b/src/lib/agents/analyzerAgent.ts
@ -80,6 +80,38 @@ export class AnalyzerAgent {
        state.originalQuery = state.query;
      }

+      // Check for URLs first - if found and not yet processed, route to URL summarization
+      if (!state.urlsToSummarize || state.urlsToSummarize.length === 0) {
+        const urlRegex = /https?:\/\/[^\s]+/gi;
+        const urls = [...new Set(state.query.match(urlRegex) || [])];
+        
+        if (urls.length > 0) {
+          console.log('URLs detected in initial query, routing to URL summarization');
+          console.log(`URLs found: ${urls.join(', ')}`);
+          
+          // Emit URL detection event
+          this.emitter.emit('agent_action', {
+            type: 'agent_action',
+            data: {
+              action: 'URLS_DETECTED_ROUTING',
+              message: `Detected ${urls.length} URL(s) in query - processing content first`,
+              details: {
+                query: state.query,
+                urls: urls,
+              },
+            },
+          });
+
+          return new Command({
+            goto: 'url_summarization',
+            update: {
+              urlsToSummarize: urls,
+              summarizationIntent: `Process the content from the provided URLs to help answer: ${state.query}`,
+            },
+          });
+        }
+      }
+
      // Skip full analysis if this is the first run.
      //if (state.fullAnalysisAttempts > 0) {
      // Emit initial analysis event
@ -108,7 +140,7 @@ export class AnalyzerAgent {
        context: state.relevantDocuments
          .map(
            (doc, index) =>
-              `<source${index + 1}>${doc?.metadata?.title ? `<title>${doc?.metadata?.title}</title>` : ''}<content>${doc.pageContent}</content></source${index + 1}>`,
+              `<source${index + 1}>${doc?.metadata?.title ? `<title>${doc?.metadata?.title}</title>` : ''}${doc?.metadata.url ? `<url>${doc?.metadata?.url}</url>` : ''}<content>${doc.pageContent}</content></source${index + 1}>`,
          )
          .join('\n\n'),
        date: formatDateForLLM(new Date()),
--- a/src/lib/agents/index.ts
+++ b/src/lib/agents/index.ts
@ -5,3 +5,4 @@ export { SynthesizerAgent } from './synthesizerAgent';
 export { TaskManagerAgent } from './taskManagerAgent';
 export { FileSearchAgent } from './fileSearchAgent';
 export { ContentRouterAgent } from './contentRouterAgent';
+export { URLSummarizationAgent } from './urlSummarizationAgent';
--- a/src/lib/agents/urlSummarizationAgent.ts
+++ b/src/lib/agents/urlSummarizationAgent.ts
@ -0,0 +1,288 @@
+import { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import { AIMessage } from '@langchain/core/messages';
+import { Command, END } from '@langchain/langgraph';
+import { EventEmitter } from 'events';
+import { Document } from 'langchain/document';
+import { AgentState } from './agentState';
+import { getWebContent } from '../utils/documents';
+import { removeThinkingBlocks } from '../utils/contentUtils';
+import { setTemperature } from '../utils/modelUtils';
+
+export class URLSummarizationAgent {
+  private llm: BaseChatModel;
+  private emitter: EventEmitter;
+  private systemInstructions: string;
+  private signal: AbortSignal;
+
+  constructor(
+    llm: BaseChatModel,
+    emitter: EventEmitter,
+    systemInstructions: string,
+    signal: AbortSignal,
+  ) {
+    this.llm = llm;
+    this.emitter = emitter;
+    this.systemInstructions = systemInstructions;
+    this.signal = signal;
+  }
+
+  /**
+   * URL processing agent node
+   */
+  async execute(state: typeof AgentState.State): Promise<Command> {
+    try {
+      setTemperature(this.llm, 0); // Set temperature to 0 for deterministic output
+
+      // Use pre-analyzed URLs from ContentRouterAgent
+      const urlsToProcess = state.urlsToSummarize || [];
+      const summarizationIntent = state.summarizationIntent || 'process content to help answer the user query';
+
+      if (urlsToProcess.length === 0) {
+        console.log('No URLs found for processing, routing back to content router');
+        return new Command({
+          goto: 'content_router',
+          update: {
+            messages: [new AIMessage('No URLs found for processing, routing to content router')],
+          },
+        });
+      }
+
+      console.log(`URL processing detected. URLs: ${urlsToProcess.join(', ')}`);
+      console.log(`Processing intent: ${summarizationIntent}`);
+
+      // Emit URL detection event
+      this.emitter.emit('agent_action', {
+        type: 'agent_action',
+        data: {
+          action: 'URL_PROCESSING_DETECTED',
+          message: `Processing ${urlsToProcess.length} URL(s) to extract content for analysis`,
+          details: {
+            query: state.query,
+            urls: urlsToProcess,
+            intent: summarizationIntent,
+          },
+        },
+      });
+
+      const documents: Document[] = [];
+
+      // Process each URL
+      for (const url of urlsToProcess) {
+        if (this.signal.aborted) {
+          console.warn('URL summarization operation aborted by signal');
+          break;
+        }
+
+        try {
+          // Emit URL processing event
+          this.emitter.emit('agent_action', {
+            type: 'agent_action',
+            data: {
+              action: 'PROCESSING_URL',
+              message: `Retrieving and processing content from: ${url}`,
+              details: {
+                query: state.query,
+                sourceUrl: url,
+                intent: summarizationIntent,
+              },
+            },
+          });
+
+          // Fetch full content using the enhanced web content retrieval
+          const webContent = await getWebContent(url, true);
+
+          if (!webContent || !webContent.pageContent) {
+            console.warn(`No content retrieved from URL: ${url}`);
+            
+            // Emit URL processing failure event
+            this.emitter.emit('agent_action', {
+              type: 'agent_action',
+              data: {
+                action: 'URL_PROCESSING_FAILED',
+                message: `Failed to retrieve content from: ${url}`,
+                details: {
+                  query: state.query,
+                  sourceUrl: url,
+                  reason: 'No content retrieved',
+                },
+              },
+            });
+            continue;
+          }
+
+          const contentLength = webContent.pageContent.length;
+          let finalContent: string;
+          let processingType: string;
+
+          // If content is short (< 4000 chars), use it directly; otherwise summarize
+          if (contentLength < 4000) {
+            finalContent = webContent.pageContent;
+            processingType = 'url-direct-content';
+            
+            console.log(`Content is short (${contentLength} chars), using directly without summarization`);
+            
+            // Emit direct content usage event
+            this.emitter.emit('agent_action', {
+              type: 'agent_action',
+              data: {
+                action: 'URL_DIRECT_CONTENT',
+                message: `Content is short (${contentLength} chars), using directly from: ${url}`,
+                details: {
+                  query: state.query,
+                  sourceUrl: url,
+                  sourceTitle: webContent.metadata.title || 'Web Page',
+                  contentLength: contentLength,
+                  intent: summarizationIntent,
+                },
+              },
+            });
+          } else {
+            // Content is long, summarize using LLM
+            console.log(`Content is long (${contentLength} chars), generating summary`);
+            
+            const systemPrompt = this.systemInstructions
+              ? `${this.systemInstructions}\n\n`
+              : '';
+
+            const summarizationPrompt = `${systemPrompt}You are a web content processor. Extract and summarize ONLY the information from the provided web page content that is relevant to the user's query.
+
+# Critical Instructions
+- Output ONLY a summary of the web page content provided below
+- Focus on information that relates to or helps answer the user's query
+- Do NOT add pleasantries, greetings, or conversational elements
+- Do NOT mention missing URLs, other pages, or content not provided
+- Do NOT ask follow-up questions or suggest additional actions
+- Do NOT add commentary about the user's request or query
+- Present the information in a clear, well-structured format with key facts and details
+- Include all relevant details that could help answer the user's question
+
+# User's Query: ${state.query}
+
+# Content Title: ${webContent.metadata.title || 'Web Page'}
+# Content URL: ${url}
+
+# Web Page Content to Summarize:
+${webContent.pageContent}
+
+Provide a comprehensive summary of the above web page content, focusing on information relevant to the user's query:`;
+
+            const result = await this.llm.invoke(summarizationPrompt, {
+              signal: this.signal,
+            });
+
+            finalContent = removeThinkingBlocks(result.content as string);
+            processingType = 'url-content-extraction';
+          }
+
+          if (finalContent && finalContent.trim().length > 0) {
+            const document = new Document({
+              pageContent: finalContent,
+              metadata: {
+                title: webContent.metadata.title || 'URL Content',
+                url: url,
+                source: url,
+                processingType: processingType,
+                processingIntent: summarizationIntent,
+                originalContentLength: contentLength,
+              },
+            });
+
+            documents.push(document);
+
+            // Emit successful URL processing event
+            this.emitter.emit('agent_action', {
+              type: 'agent_action',
+              data: {
+                action: 'URL_CONTENT_EXTRACTED',
+                message: `Successfully processed content from: ${url}`,
+                details: {
+                  query: state.query,
+                  sourceUrl: url,
+                  sourceTitle: webContent.metadata.title || 'Web Page',
+                  contentLength: finalContent.length,
+                  originalContentLength: contentLength,
+                  processingType: processingType,
+                  intent: summarizationIntent,
+                },
+              },
+            });
+
+            console.log(
+              `Successfully processed content from ${url} (${finalContent.length} characters, ${processingType})`,
+            );
+          } else {
+            console.warn(`No valid content generated for URL: ${url}`);
+          }
+        } catch (error) {
+          console.error(`Error processing URL ${url}:`, error);
+          
+          // Emit URL processing error event
+          this.emitter.emit('agent_action', {
+            type: 'agent_action',
+            data: {
+              action: 'URL_PROCESSING_ERROR',
+              message: `Error processing URL: ${url}`,
+              details: {
+                query: state.query,
+                sourceUrl: url,
+                error: error instanceof Error ? error.message : 'Unknown error',
+              },
+            },
+          });
+        }
+      }
+
+      if (documents.length === 0) {
+        const errorMessage = `No content could be retrieved or summarized from the provided URL(s): ${urlsToProcess.join(', ')}`;
+        console.error(errorMessage);
+
+        return new Command({
+          goto: 'analyzer',
+          update: {
+            messages: [new AIMessage(errorMessage)],
+          },
+        });
+      }
+
+      // Emit completion event
+      this.emitter.emit('agent_action', {
+        type: 'agent_action',
+        data: {
+          action: 'URL_PROCESSING_COMPLETED',
+          message: `Successfully processed ${documents.length} URL(s) and extracted content`,
+          details: {
+            query: state.query,
+            processedUrls: urlsToProcess.length,
+            successfulExtractions: documents.length,
+            intent: summarizationIntent,
+          },
+        },
+      });
+
+      const responseMessage = `URL processing completed. Successfully processed ${documents.length} out of ${urlsToProcess.length} URLs.`;
+      console.log(responseMessage);
+
+      return new Command({
+        goto: 'analyzer', // Route to analyzer to continue with normal workflow after URL processing
+        update: {
+          messages: [new AIMessage(responseMessage)],
+          relevantDocuments: documents,
+        },
+      });
+    } catch (error) {
+      console.error('URL summarization error:', error);
+      const errorMessage = new AIMessage(
+        `URL summarization failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
+      );
+
+      return new Command({
+        goto: END,
+        update: {
+          messages: [errorMessage],
+        },
+      });
+    } finally {
+      setTemperature(this.llm, undefined); // Reset temperature to default
+    }
+  }
+}
--- a/src/lib/prompts/analyzer.ts
+++ b/src/lib/prompts/analyzer.ts
@ -28,6 +28,7 @@ Consider the following when evaluating sufficiency:
 - If the user is requesting to avoid web searches → respond with \`good_content\`
 - If the user is asking you to be creative, such as writing a story, poem, or creative content → respond with \`good_content\` unless the context is clearly insufficient
 - If file documents contain complete information for file-specific queries → respond with \`good_content\`
+- If the user is requesting specific web content and there is a source that corresponds to that request in the context, it can be considered sufficient even if the content is not exhaustive or looks like errors → respond with \`good_content\`

 ## Step 2: If content is insufficient, determine the type of missing information

--- a/src/lib/search/agentSearch.ts
+++ b/src/lib/search/agentSearch.ts
@ -21,6 +21,7 @@ import {
  TaskManagerAgent,
  FileSearchAgent,
  ContentRouterAgent,
+  URLSummarizationAgent,
 } from '../agents';

 /**
@ -37,6 +38,7 @@ export class AgentSearch {
  private synthesizerAgent: SynthesizerAgent;
  private fileSearchAgent: FileSearchAgent;
  private contentRouterAgent: ContentRouterAgent;
+  private urlSummarizationAgent: URLSummarizationAgent;
  private emitter: EventEmitter;
  private focusMode: string;

@ -95,6 +97,12 @@ export class AgentSearch {
      systemInstructions,
      signal,
    );
+    this.urlSummarizationAgent = new URLSummarizationAgent(
+      llm,
+      emitter,
+      systemInstructions,
+      signal,
+    );
  }

  /**
@ -102,6 +110,13 @@ export class AgentSearch {
   */
  private createWorkflow() {
    const workflow = new StateGraph(AgentState)
+      .addNode(
+        'url_summarization',
+        this.urlSummarizationAgent.execute.bind(this.urlSummarizationAgent),
+        {
+          ends: ['task_manager', 'analyzer'],
+        },
+      )
      .addNode(
        'task_manager',
        this.taskManagerAgent.execute.bind(this.taskManagerAgent),
@ -134,7 +149,7 @@ export class AgentSearch {
        'analyzer',
        this.analyzerAgent.execute.bind(this.analyzerAgent),
        {
-          ends: ['task_manager', 'synthesizer'],
+          ends: ['url_summarization', 'task_manager', 'synthesizer'],
        },
      )
      .addNode(