import axios from 'axios' ;
import { htmlToText } from 'html-to-text' ;
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter' ;
import { Document } from '@langchain/core/documents' ;
import pdfParse from 'pdf-parse' ;
import { Configuration , Dataset , PlaywrightCrawler } from 'crawlee' ;
import { Readability } from '@mozilla/readability' ;
import { JSDOM } from 'jsdom' ;
import fetch from 'node-fetch' ;
export const getDocumentsFromLinks = async ( { links } : { links : string [ ] } ) = > {
const splitter = new RecursiveCharacterTextSplitter ( ) ;
let docs : Document [ ] = [ ] ;
await Promise . all (
links . map ( async ( link ) = > {
link =
link . startsWith ( 'http://' ) || link . startsWith ( 'https://' )
? link
: ` https:// ${ link } ` ;
try {
const res = await axios . get ( link , {
responseType : 'arraybuffer' ,
} ) ;
const isPdf = res . headers [ 'content-type' ] === 'application/pdf' ;
if ( isPdf ) {
const pdfText = await pdfParse ( res . data ) ;
const parsedText = pdfText . text
. replace ( /(\r\n|\n|\r)/gm , ' ' )
. replace ( /\s+/g , ' ' )
. trim ( ) ;
const splittedText = await splitter . splitText ( parsedText ) ;
const title = 'PDF Document' ;
const linkDocs = splittedText . map ( ( text ) = > {
return new Document ( {
pageContent : text ,
metadata : {
title : title ,
url : link ,
} ,
} ) ;
} ) ;
docs . push ( . . . linkDocs ) ;
return ;
}
const parsedText = htmlToText ( res . data . toString ( 'utf8' ) , {
selectors : [
{
selector : 'a' ,
options : {
ignoreHref : true ,
} ,
} ,
] ,
} )
. replace ( /(\r\n|\n|\r)/gm , ' ' )
. replace ( /\s+/g , ' ' )
. trim ( ) ;
const splittedText = await splitter . splitText ( parsedText ) ;
const title = res . data
. toString ( 'utf8' )
2025-02-15 16:48:21 +08:00
. match ( /<title.*>(.*?)<\/title>/ ) ? . [ 1 ] ;
2025-03-18 10:24:16 +05:30
const linkDocs = splittedText . map ( ( text ) = > {
return new Document ( {
pageContent : text ,
metadata : {
title : title || link ,
url : link ,
} ,
} ) ;
} ) ;
docs . push ( . . . linkDocs ) ;
} catch ( err ) {
2025-03-18 11:29:04 +05:30
console . error (
'An error occurred while getting documents from links: ' ,
err ,
) ;
2025-03-18 10:24:16 +05:30
docs . push (
new Document ( {
pageContent : ` Failed to retrieve content from the link: ${ err } ` ,
metadata : {
title : 'Failed to retrieve content' ,
url : link ,
} ,
} ) ,
) ;
}
} ) ,
) ;
return docs ;
} ;
// Result captured by getWebContent's Playwright request handler and read
// back after crawler.run() completes.
interface CrawledContent {
// Plain text extracted by Readability ('' when extraction yields nothing).
text : string ;
// Page title as reported by Playwright's page.title().
title : string ;
// Extracted (or raw page) HTML; only populated when getHtml was requested.
html? : string ;
}
/ * *
* Fetches web content from a given URL using Crawlee and Playwright . Parses it using Readability .
* Returns a Document object containing the parsed text and metadata .
*
* @param url - The URL to fetch content from .
* @param getHtml - Whether to include the HTML content in the metadata .
* @returns A Promise that resolves to a Document object or null if parsing fails .
* /
2025-05-23 18:03:35 -06:00
export const getWebContent = async (
url : string ,
getHtml : boolean = false ,
) : Promise < Document | null > = > {
2025-05-24 14:37:19 -06:00
let crawledContent : CrawledContent | null = null ;
const crawler = new PlaywrightCrawler ( {
async requestHandler ( { page } ) {
// Wait for the content to load
await page . waitForLoadState ( 'networkidle' , { timeout : 10000 } ) ;
// Allow some time for dynamic content to load
await page . waitForTimeout ( 3000 ) ;
console . log ( ` Crawling URL: ${ url } ` ) ;
// Get the page title
const title = await page . title ( ) ;
try {
// Use Readability to parse the page content
const content = await page . content ( ) ;
const dom = new JSDOM ( content , { url } ) ;
const reader = new Readability ( dom . window . document , { charThreshold : 25 } ) . parse ( ) ;
const crawleeContent : CrawledContent = {
text : reader?.textContent || '' ,
title ,
html : getHtml ? reader ? . content || await page . content ( ) : undefined ,
} ;
crawledContent = crawleeContent ;
} catch ( error ) {
console . error ( ` Failed to parse content with Readability for URL: ${ url } ` , error ) ;
}
} ,
maxRequestsPerCrawl : 1 ,
maxRequestRetries : 2 ,
retryOnBlocked : true ,
maxSessionRotations : 3 ,
} , new Configuration ( { persistStorage : false } ) ) ;
2025-05-23 18:03:35 -06:00
try {
2025-05-24 14:37:19 -06:00
await crawler . run ( [ url ] ) ;
if ( ! crawledContent ) {
console . warn ( ` Failed to parse article content for URL: ${ url } ` ) ;
return null ;
}
const content = crawledContent as CrawledContent ;
// Normalize the text content
const normalizedText = content ? . text
? . split ( '\n' )
. map ( ( line : string ) = > line . trim ( ) )
. filter ( ( line : string ) = > line . length > 0 )
. join ( '\n' ) || '' ;
// Create a Document with the parsed content
const returnDoc = new Document ( {
pageContent : normalizedText ,
metadata : {
html : content?.html ,
title : content?.title ,
url : url ,
} ,
} ) ;
console . log ( ` Got content with Crawlee and Readability, URL: ${ url } , Text Length: ${ returnDoc . pageContent . length } , html Length: ${ returnDoc . metadata . html ? . length || 0 } ` ) ;
return returnDoc ;
} catch ( error ) {
console . error ( ` Error fetching/parsing URL ${ url } : ` , error ) ;
return null ;
} finally {
await crawler . teardown ( ) ;
}
} ;
/ * *
* Fetches web content from a given URL and parses it using Readability .
* Returns a Document object containing the parsed text and metadata .
*
* @param { string } url - The URL to fetch content from .
* @param { boolean } getHtml - Whether to include the HTML content in the metadata .
* @returns { Promise < Document | null > } A Promise that resolves to a Document object or null if parsing fails .
* /
export const getWebContentLite = async (
url : string ,
getHtml : boolean = false ,
) : Promise < Document | null > = > {
try {
const response = await fetch ( url , { timeout : 5000 } ) ;
2025-05-23 18:03:35 -06:00
const html = await response . text ( ) ;
// Create a DOM from the fetched HTML
const dom = new JSDOM ( html , { url } ) ;
// Get title before we modify the DOM
const originalTitle = dom . window . document . title ;
// Use Readability to parse the article content
const reader = new Readability ( dom . window . document , { charThreshold : 25 } ) ;
const article = reader . parse ( ) ;
if ( ! article ) {
console . warn ( ` Failed to parse article content for URL: ${ url } ` ) ;
return null ;
}
const normalizedText =
article ? . textContent
? . split ( '\n' )
. map ( ( line ) = > line . trim ( ) )
. filter ( ( line ) = > line . length > 0 )
. join ( '\n' ) || '' ;
// Create a Document with the parsed content
return new Document ( {
pageContent : normalizedText || '' ,
metadata : {
html : getHtml ? article.content : undefined ,
title : article.title || originalTitle ,
url : url ,
} ,
} ) ;
} catch ( error ) {
console . error ( ` Error fetching/parsing URL ${ url } : ` ) ; //, error);
return null ;
}
2025-05-24 14:37:19 -06:00
} ;