feat(web): Use crawlee/playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.

This commit is contained in:
Willie Zutz 2025-05-24 14:37:19 -06:00
parent 044f30a547
commit 87a7ffb445
10 changed files with 4580 additions and 549 deletions

app.dockerfile

@@ -1,9 +1,10 @@
-FROM node:20.18.0-slim AS builder
+FROM --platform=linux/amd64 node:20-slim AS builder
WORKDIR /home/perplexica
COPY package.json yarn.lock ./
RUN yarn install --frozen-lockfile --network-timeout 600000
+ENV NEXT_TELEMETRY_DISABLED=1
COPY tsconfig.json next.config.mjs next-env.d.ts postcss.config.js drizzle.config.ts tailwind.config.ts ./
COPY src ./src
@@ -12,7 +13,9 @@ COPY public ./public
RUN mkdir -p /home/perplexica/data
RUN yarn build
-FROM node:20.18.0-slim
+FROM --platform=linux/amd64 node:20-slim
+ENV NEXT_TELEMETRY_DISABLED=1
WORKDIR /home/perplexica
@@ -22,6 +25,11 @@ COPY --from=builder /home/perplexica/.next/static ./public/_next/static
COPY --from=builder /home/perplexica/.next/standalone ./
COPY --from=builder /home/perplexica/data ./data
-RUN mkdir /home/perplexica/uploads
+RUN mkdir /home/perplexica/uploads && \
+    npx -y playwright install chromium --with-deps && \
+    npm install playwright && \
+    apt-get update && \
+    apt-get install -y procps && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
CMD ["node", "server.js"]

docker-compose.yaml

@@ -14,6 +14,8 @@ services:
build:
context: .
dockerfile: app.dockerfile
+      platforms:
+        - linux/amd64
environment:
- SEARXNG_API_URL=http://searxng:8080
ports:

docs/API/SEARCH.md

@@ -59,8 +59,9 @@ The API accepts a JSON object in the request body, where you define the focus mo
- **`optimizationMode`** (string, optional): Specifies the optimization mode to control the balance between performance and quality. Available modes:
-  - `speed`: Prioritize speed and return the fastest answer.
-  - `balanced`: Provide a balanced answer with good speed and reasonable quality.
+  - `speed`: Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content; only uses SearXNG result previews.
+  - `balanced`: Find the right balance between speed and accuracy. Medium effort retrieving web content; uses web scraping to retrieve partial content from full web pages.
+  - `quality`: Get the most thorough and accurate answer. High effort retrieving web content; uses web scraping to retrieve and summarize full web content. Requires a capable AI model and may take a long time.
- **`query`** (string, required): The search query or question.
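For illustration, a quality-mode request against this API might look like the sketch below; the endpoint path, `focusMode` value, and query string are example assumptions, not part of this diff:

```ts
// Illustrative sketch: POST a search request with the new quality mode.
const res = await fetch('http://localhost:3000/api/search', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    focusMode: 'webSearch', // example focus mode
    optimizationMode: 'quality', // 'speed' | 'balanced' | 'quality'
    query: 'What is Perplexica?',
  }),
});
console.log(await res.json());
```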

next.config.mjs

@@ -8,7 +8,7 @@ const nextConfig = {
},
],
},
-  serverExternalPackages: ['pdf-parse'],
+  serverExternalPackages: ['pdf-parse', 'crawlee', 'playwright'],
};
export default nextConfig;

package-lock.json (generated, 3051 lines changed)

File diff suppressed because it is too large

package.json

@@ -31,6 +31,7 @@
"clsx": "^2.1.0",
"compute-cosine-similarity": "^1.1.0",
"compute-dot": "^1.1.0",
"crawlee": "^3.13.5",
"drizzle-orm": "^0.40.1",
"html-to-text": "^9.0.5",
"jsdom": "^26.1.0",
@@ -40,6 +41,7 @@
"next": "^15.2.2",
"next-themes": "^0.3.0",
"pdf-parse": "^1.1.1",
"playwright": "*",
"react": "^18",
"react-dom": "^18",
"react-syntax-highlighter": "^15.6.1",

src/components/MessageInputActions/Optimization.tsx

@@ -11,19 +11,19 @@ const OptimizationModes = [
{
key: 'speed',
title: 'Speed',
-    description: 'Prioritize speed and get the quickest possible answer.',
+    description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
icon: <Zap size={20} className="text-[#FF9800]" />,
},
{
key: 'balanced',
title: 'Balanced',
-    description: 'Find the right balance between speed and accuracy',
+    description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
icon: <Sliders size={20} className="text-[#4CAF50]" />,
},
{
key: 'quality',
title: 'Quality',
-    description: 'Get the most thorough and accurate answer',
+    description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
icon: (
<Star
size={16}

src/lib/search/metaSearchAgent.ts

@@ -22,7 +22,7 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
import LineListOutputParser from '../outputParsers/listLineOutputParser';
import { searchSearxng } from '../searxng';
import computeSimilarity from '../utils/computeSimilarity';
-import { getDocumentsFromLinks, getWebContent } from '../utils/documents';
+import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
import formatChatHistoryAsString from '../utils/formatHistory';
import { getModelName } from '../utils/modelUtils';
@@ -483,7 +483,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
this.emitProgress(emitter, 60, `Enriching sources`);
sortedDocs = await Promise.all(
sortedDocs.map(async (doc) => {
-          const webContent = await getWebContent(doc.metadata.url);
+          const webContent = await getWebContentLite(doc.metadata.url);
const chunks =
webContent?.pageContent
.match(/.{1,500}/g)
@@ -610,7 +610,7 @@ ${docs[index].metadata?.url.toLowerCase().includes('file') ? '' : '\n<url>' + do
</${index + 1}>\n`,
)
.join('\n');
-    // console.log('Processed docs:', fullDocs);
+    console.log('Processed docs:', fullDocs);
return fullDocs;
}
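The enrichment hunk above splits the fetched page content into chunks with `.match(/.{1,500}/g)`. A minimal sketch of that chunking, for clarity:

```ts
// Split a string into consecutive chunks of at most 500 characters.
// Note `.` does not match newlines, so each line is chunked separately,
// and the nullish fallback covers empty input.
const chunk = (text: string): string[] => text.match(/.{1,500}/g) ?? [];

console.log(chunk('a'.repeat(1200)).map((c) => c.length)); // [500, 500, 200]
```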

src/lib/utils/documents.ts

@@ -3,8 +3,9 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
-import { JSDOM } from 'jsdom';
+import { Configuration, Dataset, PlaywrightCrawler } from 'crawlee';
import { Readability } from '@mozilla/readability';
+import { JSDOM } from 'jsdom';
import fetch from 'node-fetch';
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
@@ -101,12 +102,114 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
return docs;
};
interface CrawledContent {
text: string;
title: string;
html?: string;
}
/**
* Fetches web content from a given URL using Crawlee and Playwright. Parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param url - The URL to fetch content from.
* @param getHtml - Whether to include the HTML content in the metadata.
* @returns A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContent = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
let crawledContent: CrawledContent | null = null;
const crawler = new PlaywrightCrawler({
async requestHandler({ page }) {
// Wait for the content to load
await page.waitForLoadState('networkidle', {timeout: 10000});
// Allow some time for dynamic content to load
await page.waitForTimeout(3000);
console.log(`Crawling URL: ${url}`);
// Get the page title
const title = await page.title();
try {
// Use Readability to parse the page content
const content = await page.content();
const dom = new JSDOM(content, { url });
const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
const crawleeContent: CrawledContent = {
text: reader?.textContent || '',
title,
html: getHtml ? reader?.content || await page.content() : undefined,
};
crawledContent = crawleeContent;
} catch (error) {
console.error(`Failed to parse content with Readability for URL: ${url}`, error);
}
},
maxRequestsPerCrawl: 1,
maxRequestRetries: 2,
retryOnBlocked: true,
maxSessionRotations: 3,
}, new Configuration({ persistStorage: false }));
try {
const response = await fetch(url, { timeout: 5000 });
await crawler.run([url]);
if (!crawledContent) {
console.warn(`Failed to parse article content for URL: ${url}`);
return null;
}
const content = crawledContent as CrawledContent;
// Normalize the text content
const normalizedText = content?.text
?.split('\n')
.map((line: string) => line.trim())
.filter((line: string) => line.length > 0)
.join('\n') || '';
// Create a Document with the parsed content
const returnDoc = new Document({
pageContent: normalizedText,
metadata: {
html: content?.html,
title: content?.title,
url: url,
},
});
console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
return returnDoc;
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`, error);
return null;
} finally {
await crawler.teardown();
}
};
/**
* Fetches web content from a given URL and parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param {string} url - The URL to fetch content from.
* @param {boolean} getHtml - Whether to include the HTML content in the metadata.
* @returns {Promise<Document | null>} A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContentLite = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
try {
const response = await fetch(url, {timeout: 5000});
const html = await response.text();
// Create a DOM from the fetched HTML
@@ -124,7 +227,6 @@ export const getWebContent = async (
return null;
}
// Normalize the text content by removing extra spaces and newlines. Iterate through the lines one by one and throw out the ones that are empty or contain only whitespace.
const normalizedText =
article?.textContent
?.split('\n')
@@ -139,16 +241,10 @@ export const getWebContent = async (
html: getHtml ? article.content : undefined,
title: article.title || originalTitle,
url: url,
-        excerpt: article.excerpt || undefined,
-        byline: article.byline || undefined,
-        siteName: article.siteName || undefined,
-        readingTime: article.length
-          ? Math.ceil(article.length / 1000)
-          : undefined,
},
});
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`); //, error);
return null;
}
};
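For orientation, a hedged usage sketch of the two helpers (URLs are placeholders): quality mode retrieves full pages through the Playwright-backed `getWebContent`, while the balanced-mode enrichment in `metaSearchAgent.ts` above calls the lighter fetch-based `getWebContentLite`:

```ts
// Sketch: crawler-backed retrieval (quality mode); `true` keeps
// Readability's cleaned HTML in doc.metadata.html.
const full = await getWebContent('https://example.com/article', true);
console.log(full?.metadata.title, full?.pageContent.length);

// Sketch: lightweight fetch + JSDOM + Readability retrieval (balanced mode).
const lite = await getWebContentLite('https://example.com/article');
console.log(lite?.metadata.title, lite?.pageContent.length);
```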

yarn.lock (1925 lines changed)

File diff suppressed because it is too large