diff --git a/src/components/MessageInputActions/Optimization.tsx b/src/components/MessageInputActions/Optimization.tsx index 0d6e10b..f2f1edc 100644 --- a/src/components/MessageInputActions/Optimization.tsx +++ b/src/components/MessageInputActions/Optimization.tsx @@ -11,19 +11,22 @@ const OptimizationModes = [ { key: 'speed', title: 'Speed', - description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.', + description: + 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.', icon: , }, { key: 'balanced', title: 'Balanced', - description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.', + description: + 'Find the right balance between speed and accuracy. Medium effort retrieving web content.', icon: , }, { key: 'quality', title: 'Quality', - description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.', + description: + 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.', icon: ( \` and \`\` tags in your thinking. Those tags should only be used in the final output. +You are an AI suggestion generator for an AI powered search engine. -Provide these suggestions separated by newlines between the XML tags and . For example: +# Instructions +- You will be given a conversation below +- Generate 5 total suggestions based on the conversation + - Three of the suggestions should be relevant to the conversation so it can be used by the user to ask the chat model for more information + - Two of the suggestions should still be relevant to the conversation but could optionally steer the conversation in a different direction + - The suggestions should be in the form of questions + - The suggestions should not be something that is already in the conversation +- The conversation history is provided in the conversation section below + # Output Format +- If you are a thinking or reasoning AI, you should avoid using \`\` and \`\` tags in your thinking. Those tags should only be used in the final output. +- Provide these suggestions separated by newlines between the XML tags and . For example: +- Make sure each suggestion is a single line and does not contain any newlines or any formatting +- Example output is provided in the example section below + + Tell me more about SpaceX and their recent projects What is the latest news on SpaceX? Who is the CEO of SpaceX? 
+ -Conversation: + {chat_history} + `; type SuggestionGeneratorInput = { diff --git a/src/lib/search/metaSearchAgent.ts b/src/lib/search/metaSearchAgent.ts index 8798c54..aabb2ac 100644 --- a/src/lib/search/metaSearchAgent.ts +++ b/src/lib/search/metaSearchAgent.ts @@ -22,7 +22,11 @@ import LineOutputParser from '../outputParsers/lineOutputParser'; import LineListOutputParser from '../outputParsers/listLineOutputParser'; import { searchSearxng } from '../searxng'; import computeSimilarity from '../utils/computeSimilarity'; -import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents'; +import { + getDocumentsFromLinks, + getWebContent, + getWebContentLite, +} from '../utils/documents'; import formatChatHistoryAsString from '../utils/formatHistory'; import { getModelName } from '../utils/modelUtils'; @@ -99,70 +103,71 @@ class MetaSearchAgent implements MetaSearchAgentType { llm, this.strParser, RunnableLambda.from(async (input: string) => { - //console.log(`LLM response for initial web search:"${input}"`); - const linksOutputParser = new LineListOutputParser({ - key: 'links', - }); - - const questionOutputParser = new LineOutputParser({ - key: 'answer', - }); - - const links = await linksOutputParser.parse(input); - let question = await questionOutputParser.parse(input); - - //console.log('question', question); - - if (question === 'not_needed') { - return { query: '', docs: [] }; - } - - if (links.length > 0) { - if (question.length === 0) { - question = 'summarize'; - } - - let docs: Document[] = []; - - const linkDocs = await getDocumentsFromLinks({ links }); - - const docGroups: Document[] = []; - - linkDocs.map((doc) => { - const URLDocExists = docGroups.find( - (d) => - d.metadata.url === doc.metadata.url && - d.metadata.totalDocs < 10, - ); - - if (!URLDocExists) { - docGroups.push({ - ...doc, - metadata: { - ...doc.metadata, - totalDocs: 1, - }, - }); - } - - const docIndex = docGroups.findIndex( - (d) => - d.metadata.url === doc.metadata.url && - d.metadata.totalDocs < 10, - ); - - if (docIndex !== -1) { - docGroups[docIndex].pageContent = - docGroups[docIndex].pageContent + `\n\n` + doc.pageContent; - docGroups[docIndex].metadata.totalDocs += 1; - } + try { + //console.log(`LLM response for initial web search:"${input}"`); + const linksOutputParser = new LineListOutputParser({ + key: 'links', }); - this.emitProgress(emitter, 20, `Summarizing content`); + const questionOutputParser = new LineOutputParser({ + key: 'answer', + }); - await Promise.all( - docGroups.map(async (doc) => { - const res = await llm.invoke(` + const links = await linksOutputParser.parse(input); + let question = await questionOutputParser.parse(input); + + //console.log('question', question); + + if (question === 'not_needed') { + return { query: '', docs: [] }; + } + + if (links.length > 0) { + if (question.length === 0) { + question = 'summarize'; + } + + let docs: Document[] = []; + + const linkDocs = await getDocumentsFromLinks({ links }); + + const docGroups: Document[] = []; + + linkDocs.map((doc) => { + const URLDocExists = docGroups.find( + (d) => + d.metadata.url === doc.metadata.url && + d.metadata.totalDocs < 10, + ); + + if (!URLDocExists) { + docGroups.push({ + ...doc, + metadata: { + ...doc.metadata, + totalDocs: 1, + }, + }); + } + + const docIndex = docGroups.findIndex( + (d) => + d.metadata.url === doc.metadata.url && + d.metadata.totalDocs < 10, + ); + + if (docIndex !== -1) { + docGroups[docIndex].pageContent = + docGroups[docIndex].pageContent + `\n\n` + 
doc.pageContent; + docGroups[docIndex].metadata.totalDocs += 1; + } + }); + + this.emitProgress(emitter, 20, `Summarizing content`); + + await Promise.all( + docGroups.map(async (doc) => { + const res = await llm.invoke(` You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query. If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary. @@ -223,50 +228,55 @@ class MetaSearchAgent implements MetaSearchAgentType { Make sure to answer the query in the summary. `); - const document = new Document({ - pageContent: res.content as string, - metadata: { - title: doc.metadata.title, - url: doc.metadata.url, - }, - }); + const document = new Document({ + pageContent: res.content as string, + metadata: { + title: doc.metadata.title, + url: doc.metadata.url, + }, + }); - docs.push(document); - }), - ); - - return { query: question, docs: docs }; - } else { - this.emitProgress(emitter, 20, `Searching the web`); - if (this.config.additionalSearchCriteria) { - question = `${question} ${this.config.additionalSearchCriteria}`; - } - - const searxngResult = await searchSearxng(question, { - language: 'en', - engines: this.config.activeEngines, - }); - - // Store the SearXNG URL for later use in emitting to the client - this.searxngUrl = searxngResult.searchUrl; - - const documents = searxngResult.results.map( - (result) => - new Document({ - pageContent: - result.content || - (this.config.activeEngines.includes('youtube') - ? result.title - : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */, - metadata: { - title: result.title, - url: result.url, - ...(result.img_src && { img_src: result.img_src }), - }, + docs.push(document); }), - ); + ); - return { query: question, docs: documents, searchQuery: question }; + return { query: question, docs: docs }; + } else { + if (this.config.additionalSearchCriteria) { + question = `${question} ${this.config.additionalSearchCriteria}`; + } + this.emitProgress(emitter, 20, `Searching the web: "${question}"`); + + const searxngResult = await searchSearxng(question, { + language: 'en', + engines: this.config.activeEngines, + }); + + // Store the SearXNG URL for later use in emitting to the client + this.searxngUrl = searxngResult.searchUrl; + + const documents = searxngResult.results.map( + (result) => + new Document({ + pageContent: + result.content || + (this.config.activeEngines.includes('youtube') + ? 
result.title + : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */, + metadata: { + title: result.title, + url: result.url, + ...(result.img_src && { img_src: result.img_src }), + }, + }), + ); + + return { query: question, docs: documents, searchQuery: question }; + } + } catch (error) { + console.error('Error in search retriever chain:', error); + emitter.emit('error', JSON.stringify({ data: error })); + throw error; } }), ]); @@ -360,6 +370,103 @@ class MetaSearchAgent implements MetaSearchAgentType { }); } + private async checkIfEnoughInformation( + docs: Document[], + query: string, + llm: BaseChatModel, + emitter: eventEmitter, + ): Promise { + const formattedDocs = this.processDocs(docs); + + const response = + await llm.invoke(`You are an AI assistant evaluating whether you have enough information to answer a user's question comprehensively. + +Based on the following sources, determine if you have sufficient information to provide a detailed, accurate answer to the query: "${query}" + +Sources: +${formattedDocs} + +Look for: +1. Key facts and details directly relevant to the query +2. Multiple perspectives or sources if the topic is complex +3. Up-to-date information if the query requires current data +4. Sufficient context to understand the topic fully + +Output ONLY \`yes\` if you have enough information to answer comprehensively, or \`no\` if more information would significantly improve the answer.`); + + const answerParser = new LineOutputParser({ + key: 'answer', + }); + const responseText = await answerParser.parse( + (response.content as string).trim().toLowerCase(), + ); + if (responseText !== 'yes') { + console.log( + `LLM response for checking if we have enough information: "${response.content}"`, + ); + } else { + console.log( + 'LLM response indicates we have enough information to answer the query.', + ); + } + return responseText === 'yes'; + } + + private async processSource( + doc: Document, + query: string, + llm: BaseChatModel, + summaryParser: LineOutputParser, + ): Promise { + try { + const url = doc.metadata.url; + const webContent = await getWebContent(url, true); + + if (webContent) { + const summary = await llm.invoke(` +You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage +Your summary should: +- Be thorough and comprehensive, capturing all key points +- Format the content using markdown, including headings, lists, and tables +- Include specific details, numbers, and quotes when relevant +- Be concise and to the point, avoiding unnecessary fluff +- Answer the user's query, which is: ${query} +- Output your answer in an XML format, with the summary inside the \`summary\` XML tag +- If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed + - E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X" + - Make sure the reason the source is not needed is very specific and detailed +- Include useful links to external resources, if applicable + +Here is the content to summarize: +${webContent.metadata.html ? 
webContent.metadata.html : webContent.pageContent} + `); + + const summarizedContent = await summaryParser.parse( + summary.content as string, + ); + + if (summarizedContent.toLocaleLowerCase().startsWith('not_needed')) { + console.log( + `LLM response for URL "${url}" indicates it's not needed:`, + summarizedContent, + ); + return null; + } + + return new Document({ + pageContent: summarizedContent, + metadata: { + ...webContent.metadata, + url: url, + }, + }); + } + } catch (error) { + console.error(`Error processing URL ${doc.metadata.url}:`, error); + } + return null; + } + private async rerankDocs( query: string, docs: Document[], @@ -477,7 +584,7 @@ class MetaSearchAgent implements MetaSearchAgentType { ...sortedDocs, ...docsWithContent.slice(0, 15 - sortedDocs.length), ]; - + this.emitProgress(emitter, 60, `Enriching sources`); sortedDocs = await Promise.all( sortedDocs.map(async (doc) => { @@ -510,84 +617,63 @@ class MetaSearchAgent implements MetaSearchAgentType { return sortedDocs; } else if (optimizationMode === 'quality') { - this.emitProgress(emitter, 30, 'Ranking sources...'); - const summaryParser = new LineOutputParser({ key: 'summary', }); - // Get full content and generate detailed summaries for top results sequentially const enhancedDocs: Document[] = []; const maxEnhancedDocs = 5; - for (let i = 0; i < docsWithContent.length; i++) { + + // Process sources one by one until we have enough information or hit the max + for ( + let i = 0; + i < docsWithContent.length && enhancedDocs.length < maxEnhancedDocs; + i++ + ) { if (signal.aborted) { return []; } - if (enhancedDocs.length >= maxEnhancedDocs) { - break; // Limit to 5 documents - } - const result = docsWithContent[i]; + + const currentProgress = enhancedDocs.length * 10 + 40; this.emitProgress( emitter, - enhancedDocs.length * 10 + 40, - `Deep analyzing sources: ${enhancedDocs.length + 1}/${maxEnhancedDocs}`, + currentProgress, + `Deep analyzing: ${enhancedDocs.length} relevant sources found so far`, ); - try { - const url = result.metadata.url; - const webContent = await getWebContent(url, true); + const result = docsWithContent[i]; + const processedDoc = await this.processSource( + result, + query, + llm, + summaryParser, + ); - if (webContent) { - // Generate a detailed summary using the LLM - const summary = await llm.invoke(` - You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage - Your summary should: - - Be thorough and comprehensive, capturing all key points - - Format the content using markdown, including headings, lists, and tables - - Include specific details, numbers, and quotes when relevant - - Be concise and to the point, avoiding unnecessary fluff - - Answer the user's query, which is: ${query} - - Output your answer in an XML format, with the summary inside the \`summary\` XML tag - - If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed - - E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X" - - Make sure the reason the source is not needed is very specific and detailed - - Include useful links to external resources, if applicable + if (processedDoc) { + enhancedDocs.push(processedDoc); - Here is the content to summarize: - ${webContent.metadata.html ? 
webContent.metadata.html : webContent.pageContent} - `); - - const summarizedContent = await summaryParser.parse( - summary.content as string, + // After getting initial 2 sources or adding a new one, check if we have enough info + if (enhancedDocs.length >= 2) { + this.emitProgress( + emitter, + currentProgress, + `Checking if we have enough information to answer the query`, ); - - if ( - summarizedContent.toLocaleLowerCase().startsWith('not_needed') - ) { - console.log( - `LLM response for URL "${url}" indicates it's not needed:`, - summarizedContent, - ); - continue; // Skip this document if not needed + const hasEnoughInfo = await this.checkIfEnoughInformation( + enhancedDocs, + query, + llm, + emitter, + ); + if (hasEnoughInfo) { + break; } - - //console.log(`LLM response for URL "${url}":`, summarizedContent); - enhancedDocs.push( - new Document({ - pageContent: summarizedContent, - metadata: { - ...webContent.metadata, - url: url, - }, - }), - ); } - } catch (error) { - console.error(`Error processing URL ${result.metadata.url}:`, error); } } + this.emitProgress(emitter, 95, `Ranking attached files`); // Add relevant file documents const fileDocs = await getRankedDocs(queryEmbedding, true, false, 8); diff --git a/src/lib/utils/documents.ts b/src/lib/utils/documents.ts index e85b438..2af06c2 100644 --- a/src/lib/utils/documents.ts +++ b/src/lib/utils/documents.ts @@ -121,10 +121,11 @@ export const getWebContent = async ( getHtml: boolean = false, ): Promise => { let crawledContent: CrawledContent | null = null; - const crawler = new PlaywrightCrawler({ + const crawler = new PlaywrightCrawler( + { async requestHandler({ page }) { // Wait for the content to load - await page.waitForLoadState('networkidle', {timeout: 10000}); + await page.waitForLoadState('networkidle', { timeout: 10000 }); // Allow some time for dynamic content to load await page.waitForTimeout(3000); @@ -138,24 +139,32 @@ export const getWebContent = async ( // Use Readability to parse the page content const content = await page.content(); const dom = new JSDOM(content, { url }); - const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse(); + const reader = new Readability(dom.window.document, { + charThreshold: 25, + }).parse(); const crawleeContent: CrawledContent = { text: reader?.textContent || '', title, - html: getHtml ? reader?.content || await page.content() : undefined, + html: getHtml + ? 
reader?.content || (await page.content()) + : undefined, }; crawledContent = crawleeContent; } catch (error) { - console.error(`Failed to parse content with Readability for URL: ${url}`, error); + console.error( + `Failed to parse content with Readability for URL: ${url}`, + error, + ); } - }, maxRequestsPerCrawl: 1, maxRequestRetries: 2, retryOnBlocked: true, maxSessionRotations: 3, - }, new Configuration({ persistStorage: false })); + }, + new Configuration({ persistStorage: false }), + ); try { await crawler.run([url]); @@ -168,11 +177,12 @@ export const getWebContent = async ( const content = crawledContent as CrawledContent; // Normalize the text content - const normalizedText = content?.text - ?.split('\n') - .map((line: string) => line.trim()) - .filter((line: string) => line.length > 0) - .join('\n') || ''; + const normalizedText = + content?.text + ?.split('\n') + .map((line: string) => line.trim()) + .filter((line: string) => line.length > 0) + .join('\n') || ''; // Create a Document with the parsed content const returnDoc = new Document({ @@ -184,10 +194,10 @@ export const getWebContent = async ( }, }); - - console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`); + console.log( + `Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`, + ); return returnDoc; - } catch (error) { console.error(`Error fetching/parsing URL ${url}:`, error); return null; @@ -209,7 +219,7 @@ export const getWebContentLite = async ( getHtml: boolean = false, ): Promise => { try { - const response = await fetch(url, {timeout: 5000}); + const response = await fetch(url, { timeout: 5000 }); const html = await response.text(); // Create a DOM from the fetched HTML @@ -247,4 +257,4 @@ export const getWebContentLite = async ( console.error(`Error fetching/parsing URL ${url}:`); //, error); return null; } -}; \ No newline at end of file +};