feat(web): Use crawlee/playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.

This commit is contained in:
Willie Zutz 2025-05-24 14:37:19 -06:00
parent 044f30a547
commit 87a7ffb445
10 changed files with 4580 additions and 549 deletions

app.dockerfile

@@ -1,9 +1,10 @@
-FROM node:20.18.0-slim AS builder
+FROM --platform=linux/amd64 node:20-slim AS builder
WORKDIR /home/perplexica
COPY package.json yarn.lock ./
RUN yarn install --frozen-lockfile --network-timeout 600000
+ENV NEXT_TELEMETRY_DISABLED=1
COPY tsconfig.json next.config.mjs next-env.d.ts postcss.config.js drizzle.config.ts tailwind.config.ts ./
COPY src ./src
@@ -12,7 +13,9 @@ COPY public ./public
RUN mkdir -p /home/perplexica/data
RUN yarn build
-FROM node:20.18.0-slim
+FROM --platform=linux/amd64 node:20-slim
+ENV NEXT_TELEMETRY_DISABLED=1
WORKDIR /home/perplexica
@@ -22,6 +25,11 @@ COPY --from=builder /home/perplexica/.next/static ./public/_next/static
COPY --from=builder /home/perplexica/.next/standalone ./
COPY --from=builder /home/perplexica/data ./data
-RUN mkdir /home/perplexica/uploads
+RUN mkdir /home/perplexica/uploads && \
+    npx -y playwright install chromium --with-deps && \
+    npm install playwright && \
+    apt-get update && \
+    apt-get install -y procps && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
CMD ["node", "server.js"]

docker-compose.yaml

@@ -14,6 +14,8 @@ services:
build:
context: .
dockerfile: app.dockerfile
+      platforms:
+        - linux/amd64
environment:
- SEARXNG_API_URL=http://searxng:8080
ports:

docs/API/SEARCH.md

@@ -59,8 +59,9 @@ The API accepts a JSON object in the request body, where you define the focus mo
- **`optimizationMode`** (string, optional): Specifies the optimization mode to control the balance between performance and quality. Available modes:
-  - `speed`: Prioritize speed and return the fastest answer.
-  - `balanced`: Provide a balanced answer with good speed and reasonable quality.
+  - `speed`: Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content; only uses SearXNG result previews.
+  - `balanced`: Find the right balance between speed and accuracy. Medium effort retrieving web content; uses web scraping to retrieve partial content from full web pages.
+  - `quality`: Get the most thorough and accurate answer. High effort retrieving web content; uses web scraping to retrieve and summarize full web content. Requires a capable AI model and may take a long time.
- **`query`** (string, required): The search query or question.
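For illustration, a quality-mode request against this API might look like the sketch below; the endpoint path, `focusMode` value, and query string are example assumptions, not part of this diff:

```ts
// Illustrative sketch: POST a search request with the new quality mode.
const res = await fetch('http://localhost:3000/api/search', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    focusMode: 'webSearch', // example focus mode
    optimizationMode: 'quality', // 'speed' | 'balanced' | 'quality'
    query: 'What is Perplexica?',
  }),
});
console.log(await res.json());
```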

next.config.mjs

@@ -8,7 +8,7 @@ const nextConfig = {
},
],
},
-  serverExternalPackages: ['pdf-parse'],
+  serverExternalPackages: ['pdf-parse', 'crawlee', 'playwright'],
};
export default nextConfig;

package-lock.json (generated, 3051 lines changed)

File diff suppressed because it is too large

package.json

@@ -31,6 +31,7 @@
"clsx": "^2.1.0",
"compute-cosine-similarity": "^1.1.0",
"compute-dot": "^1.1.0",
"crawlee": "^3.13.5",
"drizzle-orm": "^0.40.1",
"html-to-text": "^9.0.5",
"jsdom": "^26.1.0",
@@ -40,6 +41,7 @@
"next": "^15.2.2",
"next-themes": "^0.3.0",
"pdf-parse": "^1.1.1",
"playwright": "*",
"react": "^18",
"react-dom": "^18",
"react-syntax-highlighter": "^15.6.1",

src/components/MessageInputActions/Optimization.tsx

@@ -11,19 +11,19 @@ const OptimizationModes = [
{
key: 'speed',
title: 'Speed',
-    description: 'Prioritize speed and get the quickest possible answer.',
+    description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
icon: <Zap size={20} className="text-[#FF9800]" />,
},
{
key: 'balanced',
title: 'Balanced',
-    description: 'Find the right balance between speed and accuracy',
+    description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
icon: <Sliders size={20} className="text-[#4CAF50]" />,
},
{
key: 'quality',
title: 'Quality',
-    description: 'Get the most thorough and accurate answer',
+    description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
icon: (
<Star
size={16}

src/lib/search/metaSearchAgent.ts

@@ -22,7 +22,7 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
import LineListOutputParser from '../outputParsers/listLineOutputParser';
import { searchSearxng } from '../searxng';
import computeSimilarity from '../utils/computeSimilarity';
-import { getDocumentsFromLinks, getWebContent } from '../utils/documents';
+import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
import formatChatHistoryAsString from '../utils/formatHistory';
import { getModelName } from '../utils/modelUtils';
@@ -483,7 +483,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
this.emitProgress(emitter, 60, `Enriching sources`);
sortedDocs = await Promise.all(
sortedDocs.map(async (doc) => {
-          const webContent = await getWebContent(doc.metadata.url);
+          const webContent = await getWebContentLite(doc.metadata.url);
const chunks =
webContent?.pageContent
.match(/.{1,500}/g)
@@ -610,7 +610,7 @@ ${docs[index].metadata?.url.toLowerCase().includes('file') ? '' : '\n<url>' + do
</${index + 1}>\n`,
)
.join('\n');
-    // console.log('Processed docs:', fullDocs);
+    console.log('Processed docs:', fullDocs);
return fullDocs;
}
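The enrichment hunk above splits the fetched page content into chunks with `.match(/.{1,500}/g)`. A minimal sketch of that chunking, for clarity:

```ts
// Split a string into consecutive chunks of at most 500 characters.
// Note `.` does not match newlines, so each line is chunked separately,
// and the nullish fallback covers empty input.
const chunk = (text: string): string[] => text.match(/.{1,500}/g) ?? [];

console.log(chunk('a'.repeat(1200)).map((c) => c.length)); // [500, 500, 200]
```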

src/lib/utils/documents.ts

@@ -3,8 +3,9 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
-import { JSDOM } from 'jsdom';
+import { Configuration, Dataset, PlaywrightCrawler } from 'crawlee';
import { Readability } from '@mozilla/readability';
+import { JSDOM } from 'jsdom';
import fetch from 'node-fetch';
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
@@ -101,12 +102,114 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
return docs;
};
interface CrawledContent {
text: string;
title: string;
html?: string;
}
/**
* Fetches web content from a given URL using Crawlee and Playwright. Parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param url - The URL to fetch content from.
* @param getHtml - Whether to include the HTML content in the metadata.
* @returns A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContent = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
let crawledContent: CrawledContent | null = null;
const crawler = new PlaywrightCrawler({
async requestHandler({ page }) {
// Wait for the content to load
await page.waitForLoadState('networkidle', {timeout: 10000});
// Allow some time for dynamic content to load
await page.waitForTimeout(3000);
console.log(`Crawling URL: ${url}`);
// Get the page title
const title = await page.title();
try {
// Use Readability to parse the page content
const content = await page.content();
const dom = new JSDOM(content, { url });
const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
const crawleeContent: CrawledContent = {
text: reader?.textContent || '',
title,
html: getHtml ? reader?.content || await page.content() : undefined,
};
crawledContent = crawleeContent;
} catch (error) {
console.error(`Failed to parse content with Readability for URL: ${url}`, error);
}
},
maxRequestsPerCrawl: 1,
maxRequestRetries: 2,
retryOnBlocked: true,
maxSessionRotations: 3,
}, new Configuration({ persistStorage: false }));
try {
const response = await fetch(url, { timeout: 5000 });
await crawler.run([url]);
if (!crawledContent) {
console.warn(`Failed to parse article content for URL: ${url}`);
return null;
}
const content = crawledContent as CrawledContent;
// Normalize the text content
const normalizedText = content?.text
?.split('\n')
.map((line: string) => line.trim())
.filter((line: string) => line.length > 0)
.join('\n') || '';
// Create a Document with the parsed content
const returnDoc = new Document({
pageContent: normalizedText,
metadata: {
html: content?.html,
title: content?.title,
url: url,
},
});
console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
return returnDoc;
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`, error);
return null;
} finally {
await crawler.teardown();
}
};
/**
* Fetches web content from a given URL and parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param {string} url - The URL to fetch content from.
* @param {boolean} getHtml - Whether to include the HTML content in the metadata.
* @returns {Promise<Document | null>} A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContentLite = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
try {
const response = await fetch(url, {timeout: 5000});
const html = await response.text();
// Create a DOM from the fetched HTML
@@ -124,7 +227,6 @@ export const getWebContent = async (
return null;
}
// Normalize the text content by removing extra spaces and newlines. Iterate through the lines one by one and throw out the ones that are empty or contain only whitespace.
const normalizedText =
article?.textContent
?.split('\n')
@@ -139,16 +241,10 @@ export const getWebContent = async (
html: getHtml ? article.content : undefined,
title: article.title || originalTitle,
url: url,
-        excerpt: article.excerpt || undefined,
-        byline: article.byline || undefined,
-        siteName: article.siteName || undefined,
-        readingTime: article.length
-          ? Math.ceil(article.length / 1000)
-          : undefined,
},
});
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`); //, error);
return null;
}
};
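For orientation, a hedged usage sketch of the two helpers (URLs are placeholders): quality mode retrieves full pages through the Playwright-backed `getWebContent`, while the balanced-mode enrichment in `metaSearchAgent.ts` above calls the lighter fetch-based `getWebContentLite`:

```ts
// Sketch: crawler-backed retrieval (quality mode); `true` keeps
// Readability's cleaned HTML in doc.metadata.html.
const full = await getWebContent('https://example.com/article', true);
console.log(full?.metadata.title, full?.pageContent.length);

// Sketch: lightweight fetch + JSDOM + Readability retrieval (balanced mode).
const lite = await getWebContentLite('https://example.com/article');
console.log(lite?.metadata.title, lite?.pageContent.length);
```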

yarn.lock (1925 lines changed)

File diff suppressed because it is too large