2025-06-09 23:00:25 -06:00
|
|
|
import { Document } from '@langchain/core/documents';
|
|
|
|
|
import { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
2025-06-22 23:59:29 -06:00
|
|
|
import { z } from 'zod';
|
2025-06-09 23:00:25 -06:00
|
|
|
import { formatDateForLLM } from '../utils';
|
|
|
|
|
import { getWebContent } from './documents';
|
|
|
|
|
|
2025-06-15 10:56:57 -06:00
|
|
|
/**
 * Outcome of attempting to summarize a single web page.
 *
 * `document` holds the generated summary when one was produced and is `null`
 * otherwise. `notRelevantReason` carries a human-readable explanation for a
 * `null` document (content not relevant, nothing usable at the URL, or a
 * processing error).
 */
export type SummarizeResult = {
  /** Summary document, or `null` when no summary could be produced. */
  document: Document | null;
  /** Why no summary document was produced; omitted or `undefined` on success. */
  notRelevantReason?: string;
};
|
|
|
|
|
|
2025-06-22 23:59:29 -06:00
|
|
|
/**
 * Zod schema describing the structured output requested from the LLM when
 * summarizing a page: a relevance flag, the summary text, and an optional
 * reason used when the content is judged not relevant.
 */
const SummarySchema = z.object({
  isRelevant: z
    .boolean()
    .describe('Whether the content is relevant to the user query'),
  summary: z
    .string()
    .describe(
      'Detailed summary of the content in markdown format, or explanation if not relevant',
    ),
  notRelevantReason: z
    .string()
    .optional()
    .describe(
      'Specific reason why content is not relevant (only if isRelevant is false)',
    ),
});
|
|
|
|
|
|
2025-06-09 23:00:25 -06:00
|
|
|
/**
 * Fetches the page at `url` and asks the LLM for a query-focused summary of it.
 *
 * The page is loaded with `getWebContent(url, true)`. The LLM is then invoked
 * with structured output (`SummarySchema`), first on the page's
 * `metadata.html` and, if that attempt throws, on its `pageContent`. A variant
 * that is missing or blank is skipped rather than sent to the model.
 *
 * This function never rejects: every failure is logged and reported through
 * `notRelevantReason` with `document: null`.
 *
 * @param url - Address of the page to summarize; also stored in the result metadata.
 * @param query - User query the summary should be relevant to.
 * @param llm - Chat model used to produce the structured summary.
 * @param systemInstructions - Optional extra instructions prepended to the prompt
 *   (ignored when empty).
 * @param signal - Abort signal forwarded to the LLM call; once aborted, no
 *   further attempts are made.
 * @returns The summary document, or `document: null` with a reason when the
 *   content is empty, not relevant, or could not be processed.
 */
export const summarizeWebContent = async (
  url: string,
  query: string,
  llm: BaseChatModel,
  systemInstructions: string,
  signal: AbortSignal,
): Promise<SummarizeResult> => {
  // True only for strings containing at least one non-whitespace character.
  const isNonBlank = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0;

  try {
    // Summarizes the fetched content and checks its relevance to the query.
    const summarizeContent = async (
      content: Document,
    ): Promise<SummarizeResult> => {
      const systemPrompt = systemInstructions
        ? `${systemInstructions}\n\n`
        : '';

      // Create structured LLM with Zod schema
      const structuredLLM = llm.withStructuredOutput(SummarySchema);

      // Builds the full prompt around one variant of the page content.
      const buildPrompt = (
        pageText: string,
      ): string => `${systemPrompt}You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage.

# Instructions
- Determine if the content is relevant to the user's query
- You do not need to provide a full answer to the query, partial answers are acceptable
- If relevant, create a thorough and comprehensive summary capturing all key points
- Include specific details, numbers, and quotes when relevant
- Be concise and to the point, avoiding unnecessary fluff
- Format the summary using markdown with headings and lists
- Include useful links to external resources, if applicable
- If the content is not relevant, set isRelevant to false and provide a specific reason

# Response Format
You must return a JSON object with:
- isRelevant: boolean indicating if content is relevant to the query
- summary: string with detailed markdown summary if relevant, or explanation if not relevant
- notRelevantReason: string explaining why content is not relevant (only if isRelevant is false)

Today's date is ${formatDateForLLM(new Date())}

Here is the query you need to answer: ${query}

Here is the content to summarize:
${pageText}`;

      // Content variants in order of preference: html first, text as fallback.
      const candidates = [
        { label: 'html', text: content.metadata?.html },
        { label: 'text', text: content.pageContent },
      ];

      let result: z.infer<typeof SummarySchema> | null = null;

      for (const { label, text } of candidates) {
        // Never send a missing/blank variant: interpolating it would hand the
        // model the literal text "undefined" (or nothing) and yield a bogus
        // "not relevant" answer that masks the other, usable variant.
        if (!isNonBlank(text)) {
          continue;
        }

        // Optional chaining keeps untyped callers that omit the signal working.
        if (signal?.aborted) {
          break;
        }

        try {
          console.log(`Summarizing content from URL: ${url} using ${label}`);
          result = await structuredLLM.invoke(buildPrompt(text), { signal });
          break;
        } catch (error) {
          console.error(
            `Error summarizing content from URL ${url} using ${label}:`,
            error,
          );

          // A cancelled request will not succeed on retry; stop here.
          if (signal?.aborted) {
            break;
          }
        }
      }

      if (!result) {
        console.error(`No summary result returned for URL: ${url}`);
        return {
          document: null,
          notRelevantReason: 'No summary content returned from LLM',
        };
      }

      // Check if content is relevant
      if (!result.isRelevant) {
        const reason = result.notRelevantReason || result.summary;
        console.log(
          `LLM response for URL "${url}" indicates it's not relevant:`,
          reason,
        );
        return {
          document: null,
          notRelevantReason: reason,
        };
      }

      // Content is relevant, but guard against an empty summary string.
      if (!result.summary || result.summary.trim().length === 0) {
        console.error(
          `No summary content in relevant response for URL: ${url}`,
        );
        return {
          document: null,
          notRelevantReason: 'Summary content was empty',
        };
      }

      return {
        document: new Document({
          pageContent: result.summary,
          metadata: {
            ...content.metadata,
            url: url,
            processingType: 'full-content',
          },
        }),
        notRelevantReason: undefined,
      };
    };

    const webContent = await getWebContent(url, true);

    // Bail out unless at least one of the two content variants is usable.
    if (
      !webContent ||
      (!isNonBlank(webContent.pageContent) &&
        !isNonBlank(webContent.metadata?.html))
    ) {
      console.log(`No valid content found for URL: ${url}`);
      return {
        document: null,
        notRelevantReason: 'No valid content found at the URL',
      };
    }

    console.log(`Using full content extraction for URL: ${url}`);
    return await summarizeContent(webContent);
  } catch (error) {
    console.error(`Error processing URL ${url}:`, error);
    return {
      document: null,
      notRelevantReason: `Error processing URL: ${error instanceof Error ? error.message : 'Unknown error'}`,
    };
  }
};
|