feat(web): Use Crawlee/Playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.

This commit is contained in:
Willie Zutz 2025-05-24 14:37:19 -06:00
parent 044f30a547
commit 87a7ffb445
10 changed files with 4580 additions and 549 deletions

View file

@ -22,7 +22,7 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
import LineListOutputParser from '../outputParsers/listLineOutputParser';
import { searchSearxng } from '../searxng';
import computeSimilarity from '../utils/computeSimilarity';
import { getDocumentsFromLinks, getWebContent } from '../utils/documents';
import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
import formatChatHistoryAsString from '../utils/formatHistory';
import { getModelName } from '../utils/modelUtils';
@ -483,7 +483,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
this.emitProgress(emitter, 60, `Enriching sources`);
sortedDocs = await Promise.all(
sortedDocs.map(async (doc) => {
const webContent = await getWebContent(doc.metadata.url);
const webContent = await getWebContentLite(doc.metadata.url);
const chunks =
webContent?.pageContent
.match(/.{1,500}/g)
@ -610,7 +610,7 @@ ${docs[index].metadata?.url.toLowerCase().includes('file') ? '' : '\n<url>' + do
</${index + 1}>\n`,
)
.join('\n');
// console.log('Processed docs:', fullDocs);
console.log('Processed docs:', fullDocs);
return fullDocs;
}