diff --git a/src/components/MessageInputActions/Optimization.tsx b/src/components/MessageInputActions/Optimization.tsx
index 0d6e10b..f2f1edc 100644
--- a/src/components/MessageInputActions/Optimization.tsx
+++ b/src/components/MessageInputActions/Optimization.tsx
@@ -11,19 +11,22 @@ const OptimizationModes = [
{
key: 'speed',
title: 'Speed',
- description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
+ description:
+ 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
icon: ,
},
{
key: 'balanced',
title: 'Balanced',
- description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
+ description:
+ 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
icon: ,
},
{
key: 'quality',
title: 'Quality',
- description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
+ description:
+ 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
icon: (
-If you are a thinking or reasoning AI, you should avoid using \`<suggestions>\` and \`</suggestions>\` tags in your thinking. Those tags should only be used in the final output.
+You are an AI suggestion generator for an AI powered search engine.
-Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
+# Instructions
+- You will be given a conversation below
+- Generate 5 total suggestions based on the conversation
+ - Three of the suggestions should be relevant to the conversation so they can be used by the user to ask the chat model for more information
+ - Two of the suggestions should still be relevant to the conversation but could optionally steer the conversation in a different direction
+ - The suggestions should be in the form of questions
+ - The suggestions should not be something that is already in the conversation
+- The conversation history is provided in the conversation section below
+# Output Format
+- If you are a thinking or reasoning AI, you should avoid using \`<suggestions>\` and \`</suggestions>\` tags in your thinking. Those tags should only be used in the final output.
+- Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
+- Make sure each suggestion is a single line and does not contain any newlines or any formatting
+- Example output is provided in the example section below
+
+<suggestions>
Tell me more about SpaceX and their recent projects
What is the latest news on SpaceX?
Who is the CEO of SpaceX?
+</suggestions>
-Conversation:
+<conversation>
{chat_history}
+</conversation>
`;
type SuggestionGeneratorInput = {
diff --git a/src/lib/search/metaSearchAgent.ts b/src/lib/search/metaSearchAgent.ts
index 8798c54..aabb2ac 100644
--- a/src/lib/search/metaSearchAgent.ts
+++ b/src/lib/search/metaSearchAgent.ts
@@ -22,7 +22,11 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
import LineListOutputParser from '../outputParsers/listLineOutputParser';
import { searchSearxng } from '../searxng';
import computeSimilarity from '../utils/computeSimilarity';
-import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
+import {
+ getDocumentsFromLinks,
+ getWebContent,
+ getWebContentLite,
+} from '../utils/documents';
import formatChatHistoryAsString from '../utils/formatHistory';
import { getModelName } from '../utils/modelUtils';
@@ -99,70 +103,71 @@ class MetaSearchAgent implements MetaSearchAgentType {
llm,
this.strParser,
RunnableLambda.from(async (input: string) => {
- //console.log(`LLM response for initial web search:"${input}"`);
- const linksOutputParser = new LineListOutputParser({
- key: 'links',
- });
-
- const questionOutputParser = new LineOutputParser({
- key: 'answer',
- });
-
- const links = await linksOutputParser.parse(input);
- let question = await questionOutputParser.parse(input);
-
- //console.log('question', question);
-
- if (question === 'not_needed') {
- return { query: '', docs: [] };
- }
-
- if (links.length > 0) {
- if (question.length === 0) {
- question = 'summarize';
- }
-
- let docs: Document[] = [];
-
- const linkDocs = await getDocumentsFromLinks({ links });
-
- const docGroups: Document[] = [];
-
- linkDocs.map((doc) => {
- const URLDocExists = docGroups.find(
- (d) =>
- d.metadata.url === doc.metadata.url &&
- d.metadata.totalDocs < 10,
- );
-
- if (!URLDocExists) {
- docGroups.push({
- ...doc,
- metadata: {
- ...doc.metadata,
- totalDocs: 1,
- },
- });
- }
-
- const docIndex = docGroups.findIndex(
- (d) =>
- d.metadata.url === doc.metadata.url &&
- d.metadata.totalDocs < 10,
- );
-
- if (docIndex !== -1) {
- docGroups[docIndex].pageContent =
- docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
- docGroups[docIndex].metadata.totalDocs += 1;
- }
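+ // Wrap the whole retrieval step so that failures while parsing links, fetching pages,
+ // or querying SearXNG are logged, surfaced to the client via the emitter, and rethrown.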
+ try {
+ //console.log(`LLM response for initial web search:"${input}"`);
+ const linksOutputParser = new LineListOutputParser({
+ key: 'links',
});
- this.emitProgress(emitter, 20, `Summarizing content`);
+ const questionOutputParser = new LineOutputParser({
+ key: 'answer',
+ });
- await Promise.all(
- docGroups.map(async (doc) => {
- const res = await llm.invoke(`
+ const links = await linksOutputParser.parse(input);
+ let question = await questionOutputParser.parse(input);
+
+ //console.log('question', question);
+
+ if (question === 'not_needed') {
+ return { query: '', docs: [] };
+ }
+
+ if (links.length > 0) {
+ if (question.length === 0) {
+ question = 'summarize';
+ }
+
+ let docs: Document[] = [];
+
+ const linkDocs = await getDocumentsFromLinks({ links });
+
+ const docGroups: Document[] = [];
+
+ linkDocs.map((doc) => {
+ const URLDocExists = docGroups.find(
+ (d) =>
+ d.metadata.url === doc.metadata.url &&
+ d.metadata.totalDocs < 10,
+ );
+
+ if (!URLDocExists) {
+ docGroups.push({
+ ...doc,
+ metadata: {
+ ...doc.metadata,
+ totalDocs: 1,
+ },
+ });
+ }
+
+ const docIndex = docGroups.findIndex(
+ (d) =>
+ d.metadata.url === doc.metadata.url &&
+ d.metadata.totalDocs < 10,
+ );
+
+ if (docIndex !== -1) {
+ docGroups[docIndex].pageContent =
+ docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
+ docGroups[docIndex].metadata.totalDocs += 1;
+ }
+ });
+
+ this.emitProgress(emitter, 20, `Summarizing content`);
+
+ await Promise.all(
+ docGroups.map(async (doc) => {
+ const res = await llm.invoke(`
You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
@@ -223,50 +228,55 @@ class MetaSearchAgent implements MetaSearchAgentType {
Make sure to answer the query in the summary.
`);
- const document = new Document({
- pageContent: res.content as string,
- metadata: {
- title: doc.metadata.title,
- url: doc.metadata.url,
- },
- });
+ const document = new Document({
+ pageContent: res.content as string,
+ metadata: {
+ title: doc.metadata.title,
+ url: doc.metadata.url,
+ },
+ });
- docs.push(document);
- }),
- );
-
- return { query: question, docs: docs };
- } else {
- this.emitProgress(emitter, 20, `Searching the web`);
- if (this.config.additionalSearchCriteria) {
- question = `${question} ${this.config.additionalSearchCriteria}`;
- }
-
- const searxngResult = await searchSearxng(question, {
- language: 'en',
- engines: this.config.activeEngines,
- });
-
- // Store the SearXNG URL for later use in emitting to the client
- this.searxngUrl = searxngResult.searchUrl;
-
- const documents = searxngResult.results.map(
- (result) =>
- new Document({
- pageContent:
- result.content ||
- (this.config.activeEngines.includes('youtube')
- ? result.title
- : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
- metadata: {
- title: result.title,
- url: result.url,
- ...(result.img_src && { img_src: result.img_src }),
- },
+ docs.push(document);
}),
- );
+ );
- return { query: question, docs: documents, searchQuery: question };
+ return { query: question, docs: docs };
+ } else {
+ if (this.config.additionalSearchCriteria) {
+ question = `${question} ${this.config.additionalSearchCriteria}`;
+ }
+ this.emitProgress(emitter, 20, `Searching the web: "${question}"`);
+
+ const searxngResult = await searchSearxng(question, {
+ language: 'en',
+ engines: this.config.activeEngines,
+ });
+
+ // Store the SearXNG URL for later use in emitting to the client
+ this.searxngUrl = searxngResult.searchUrl;
+
+ const documents = searxngResult.results.map(
+ (result) =>
+ new Document({
+ pageContent:
+ result.content ||
+ (this.config.activeEngines.includes('youtube')
+ ? result.title
+ : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
+ metadata: {
+ title: result.title,
+ url: result.url,
+ ...(result.img_src && { img_src: result.img_src }),
+ },
+ }),
+ );
+
+ return { query: question, docs: documents, searchQuery: question };
+ }
+ } catch (error) {
+ console.error('Error in search retriever chain:', error);
+ emitter.emit('error', JSON.stringify({ data: error }));
+ throw error;
}
}),
]);
@@ -360,6 +370,103 @@ class MetaSearchAgent implements MetaSearchAgentType {
});
}
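+ // Ask the LLM whether the summarized sources already cover the query well enough to
+ // answer it comprehensively; quality mode uses this to stop fetching further pages early.
+ // Returns true only when the model replies `yes`.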
+ private async checkIfEnoughInformation(
+ docs: Document[],
+ query: string,
+ llm: BaseChatModel,
+ emitter: eventEmitter,
+ ): Promise<boolean> {
+ const formattedDocs = this.processDocs(docs);
+
+ const response =
+ await llm.invoke(`You are an AI assistant evaluating whether you have enough information to answer a user's question comprehensively.
+
+Based on the following sources, determine if you have sufficient information to provide a detailed, accurate answer to the query: "${query}"
+
+Sources:
+${formattedDocs}
+
+Look for:
+1. Key facts and details directly relevant to the query
+2. Multiple perspectives or sources if the topic is complex
+3. Up-to-date information if the query requires current data
+4. Sufficient context to understand the topic fully
+
+Output ONLY \`yes\` if you have enough information to answer comprehensively, or \`no\` if more information would significantly improve the answer.`);
+
+ const answerParser = new LineOutputParser({
+ key: 'answer',
+ });
+ const responseText = await answerParser.parse(
+ (response.content as string).trim().toLowerCase(),
+ );
+ if (responseText !== 'yes') {
+ console.log(
+ `LLM response for checking if we have enough information: "${response.content}"`,
+ );
+ } else {
+ console.log(
+ 'LLM response indicates we have enough information to answer the query.',
+ );
+ }
+ return responseText === 'yes';
+ }
+
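+ // Fetch the full page behind a single search result, summarize it with the LLM, and
+ // return the summary as a Document. Returns null when the page cannot be fetched,
+ // the summarizer marks it as not_needed, or an error occurs.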
+ private async processSource(
+ doc: Document,
+ query: string,
+ llm: BaseChatModel,
+ summaryParser: LineOutputParser,
+ ): Promise<Document | null> {
+ try {
+ const url = doc.metadata.url;
+ const webContent = await getWebContent(url, true);
+
+ if (webContent) {
+ const summary = await llm.invoke(`
+You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage
+Your summary should:
+- Be thorough and comprehensive, capturing all key points
+- Format the content using markdown, including headings, lists, and tables
+- Include specific details, numbers, and quotes when relevant
+- Be concise and to the point, avoiding unnecessary fluff
+- Answer the user's query, which is: ${query}
+- Output your answer in an XML format, with the summary inside the \`summary\` XML tag
+- If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed
+ - E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X"
+ - Make sure the reason the source is not needed is very specific and detailed
+- Include useful links to external resources, if applicable
+
+Here is the content to summarize:
+${webContent.metadata.html ? webContent.metadata.html : webContent.pageContent}
+ `);
+
+ const summarizedContent = await summaryParser.parse(
+ summary.content as string,
+ );
+
+ if (summarizedContent.toLocaleLowerCase().startsWith('not_needed')) {
+ console.log(
+ `LLM response for URL "${url}" indicates it's not needed:`,
+ summarizedContent,
+ );
+ return null;
+ }
+
+ return new Document({
+ pageContent: summarizedContent,
+ metadata: {
+ ...webContent.metadata,
+ url: url,
+ },
+ });
+ }
+ } catch (error) {
+ console.error(`Error processing URL ${doc.metadata.url}:`, error);
+ }
+ return null;
+ }
+
private async rerankDocs(
query: string,
docs: Document[],
@@ -477,7 +584,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
...sortedDocs,
...docsWithContent.slice(0, 15 - sortedDocs.length),
];
-
+
this.emitProgress(emitter, 60, `Enriching sources`);
sortedDocs = await Promise.all(
sortedDocs.map(async (doc) => {
@@ -510,84 +617,63 @@ class MetaSearchAgent implements MetaSearchAgentType {
return sortedDocs;
} else if (optimizationMode === 'quality') {
- this.emitProgress(emitter, 30, 'Ranking sources...');
-
const summaryParser = new LineOutputParser({
key: 'summary',
});
- // Get full content and generate detailed summaries for top results sequentially
const enhancedDocs: Document[] = [];
const maxEnhancedDocs = 5;
- for (let i = 0; i < docsWithContent.length; i++) {
+
+ // Process sources one by one until we have enough information or hit the max
+ for (
+ let i = 0;
+ i < docsWithContent.length && enhancedDocs.length < maxEnhancedDocs;
+ i++
+ ) {
if (signal.aborted) {
return [];
}
- if (enhancedDocs.length >= maxEnhancedDocs) {
- break; // Limit to 5 documents
- }
- const result = docsWithContent[i];
+
+ const currentProgress = enhancedDocs.length * 10 + 40;
this.emitProgress(
emitter,
- enhancedDocs.length * 10 + 40,
- `Deep analyzing sources: ${enhancedDocs.length + 1}/${maxEnhancedDocs}`,
+ currentProgress,
+ `Deep analyzing: ${enhancedDocs.length} relevant sources found so far`,
);
- try {
- const url = result.metadata.url;
- const webContent = await getWebContent(url, true);
+ const result = docsWithContent[i];
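+ // processSource returns null when the page fails to load or the summarizer judges it irrelevant to the query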
+ const processedDoc = await this.processSource(
+ result,
+ query,
+ llm,
+ summaryParser,
+ );
- if (webContent) {
- // Generate a detailed summary using the LLM
- const summary = await llm.invoke(`
- You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage
- Your summary should:
- - Be thorough and comprehensive, capturing all key points
- - Format the content using markdown, including headings, lists, and tables
- - Include specific details, numbers, and quotes when relevant
- - Be concise and to the point, avoiding unnecessary fluff
- - Answer the user's query, which is: ${query}
- - Output your answer in an XML format, with the summary inside the \`summary\` XML tag
- - If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed
- - E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X"
- - Make sure the reason the source is not needed is very specific and detailed
- - Include useful links to external resources, if applicable
+ if (processedDoc) {
+ enhancedDocs.push(processedDoc);
- Here is the content to summarize:
- ${webContent.metadata.html ? webContent.metadata.html : webContent.pageContent}
- `);
-
- const summarizedContent = await summaryParser.parse(
- summary.content as string,
+ // Once we have at least 2 sources, check after each new addition whether we already have enough info
+ if (enhancedDocs.length >= 2) {
+ this.emitProgress(
+ emitter,
+ currentProgress,
+ `Checking if we have enough information to answer the query`,
);
-
- if (
- summarizedContent.toLocaleLowerCase().startsWith('not_needed')
- ) {
- console.log(
- `LLM response for URL "${url}" indicates it's not needed:`,
- summarizedContent,
- );
- continue; // Skip this document if not needed
+ const hasEnoughInfo = await this.checkIfEnoughInformation(
+ enhancedDocs,
+ query,
+ llm,
+ emitter,
+ );
+ if (hasEnoughInfo) {
+ break;
}
-
- //console.log(`LLM response for URL "${url}":`, summarizedContent);
- enhancedDocs.push(
- new Document({
- pageContent: summarizedContent,
- metadata: {
- ...webContent.metadata,
- url: url,
- },
- }),
- );
}
- } catch (error) {
- console.error(`Error processing URL ${result.metadata.url}:`, error);
}
}
+ this.emitProgress(emitter, 95, `Ranking attached files`);
// Add relevant file documents
const fileDocs = await getRankedDocs(queryEmbedding, true, false, 8);
diff --git a/src/lib/utils/documents.ts b/src/lib/utils/documents.ts
index e85b438..2af06c2 100644
--- a/src/lib/utils/documents.ts
+++ b/src/lib/utils/documents.ts
@@ -121,10 +121,11 @@ export const getWebContent = async (
getHtml: boolean = false,
): Promise<Document | null> => {
let crawledContent: CrawledContent | null = null;
- const crawler = new PlaywrightCrawler({
+ const crawler = new PlaywrightCrawler(
+ {
async requestHandler({ page }) {
// Wait for the content to load
- await page.waitForLoadState('networkidle', {timeout: 10000});
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
// Allow some time for dynamic content to load
await page.waitForTimeout(3000);
@@ -138,24 +139,32 @@ export const getWebContent = async (
// Use Readability to parse the page content
const content = await page.content();
const dom = new JSDOM(content, { url });
- const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
+ const reader = new Readability(dom.window.document, {
+ charThreshold: 25,
+ }).parse();
const crawleeContent: CrawledContent = {
text: reader?.textContent || '',
title,
- html: getHtml ? reader?.content || await page.content() : undefined,
+ html: getHtml
+ ? reader?.content || (await page.content())
+ : undefined,
};
crawledContent = crawleeContent;
} catch (error) {
- console.error(`Failed to parse content with Readability for URL: ${url}`, error);
+ console.error(
+ `Failed to parse content with Readability for URL: ${url}`,
+ error,
+ );
}
-
},
maxRequestsPerCrawl: 1,
maxRequestRetries: 2,
retryOnBlocked: true,
maxSessionRotations: 3,
- }, new Configuration({ persistStorage: false }));
+ },
+ new Configuration({ persistStorage: false }),
+ );
try {
await crawler.run([url]);
@@ -168,11 +177,12 @@ export const getWebContent = async (
const content = crawledContent as CrawledContent;
// Normalize the text content
- const normalizedText = content?.text
- ?.split('\n')
- .map((line: string) => line.trim())
- .filter((line: string) => line.length > 0)
- .join('\n') || '';
+ const normalizedText =
+ content?.text
+ ?.split('\n')
+ .map((line: string) => line.trim())
+ .filter((line: string) => line.length > 0)
+ .join('\n') || '';
// Create a Document with the parsed content
const returnDoc = new Document({
@@ -184,10 +194,10 @@ export const getWebContent = async (
},
});
-
- console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
+ console.log(
+ `Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`,
+ );
return returnDoc;
-
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`, error);
return null;
@@ -209,7 +219,7 @@ export const getWebContentLite = async (
getHtml: boolean = false,
): Promise<Document | null> => {
try {
- const response = await fetch(url, {timeout: 5000});
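+ // Note: the timeout option assumes a fetch implementation (e.g. node-fetch v2) that supports it;
+ // the standard WHATWG fetch ignores unknown options and would need an AbortSignal instead.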
+ const response = await fetch(url, { timeout: 5000 });
const html = await response.text();
// Create a DOM from the fetched HTML
@@ -247,4 +257,4 @@ export const getWebContentLite = async (
console.error(`Error fetching/parsing URL ${url}:`); //, error);
return null;
}
-};
\ No newline at end of file
+};