Add PDF search feature and update Docker configuration

This commit is contained in:
Khanhlq 2025-03-17 12:54:10 +07:00
parent 060c68a900
commit 4d438f06cd
7 changed files with 265 additions and 14 deletions

View file

@ -1,15 +1,25 @@
FROM node:20.18.0-alpine
FROM node:20.18.0-alpine as base
WORKDIR /home/perplexica
# Development stage
FROM base as development
ENV NODE_ENV=development
ARG NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
ARG NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
COPY ui/package.json ui/yarn.lock ./
RUN yarn install
EXPOSE 3000
CMD ["yarn", "dev"]
WORKDIR /home/perplexica
# Production stage
FROM base as production
ARG NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
ARG NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
COPY ui /home/perplexica/
RUN yarn install --frozen-lockfile
RUN yarn build
CMD ["yarn", "start"]

View file

@ -1,17 +1,22 @@
FROM node:18-slim
FROM node:18-slim as base
WORKDIR /home/perplexica
RUN mkdir -p /home/perplexica/data /home/perplexica/uploads
# Development stage
FROM base as development
ENV NODE_ENV=development
COPY package.json yarn.lock ./
RUN yarn install --frozen-lockfile --network-timeout 600000
EXPOSE 3001
CMD ["yarn", "dev"]
# Production stage
FROM base as production
COPY src /home/perplexica/src
COPY tsconfig.json /home/perplexica/
COPY drizzle.config.ts /home/perplexica/
COPY package.json /home/perplexica/
COPY yarn.lock /home/perplexica/
RUN mkdir /home/perplexica/data
RUN mkdir /home/perplexica/uploads
RUN yarn install --frozen-lockfile --network-timeout 600000
RUN yarn build
CMD ["yarn", "start"]

View file

@ -13,9 +13,11 @@ services:
build:
context: .
dockerfile: backend.dockerfile
target: development
image: itzcrazykns1337/perplexica-backend:main
environment:
- SEARXNG_API_URL=http://searxng:8080
- NODE_ENV=development
depends_on:
- searxng
ports:
@ -24,27 +26,41 @@ services:
- backend-dbstore:/home/perplexica/data
- uploads:/home/perplexica/uploads
- ./config.toml:/home/perplexica/config.toml
- ./src:/home/perplexica/src
- ./package.json:/home/perplexica/package.json
- ./yarn.lock:/home/perplexica/yarn.lock
- ./tsconfig.json:/home/perplexica/tsconfig.json
- ./drizzle.config.ts:/home/perplexica/drizzle.config.ts
extra_hosts:
- 'host.docker.internal:host-gateway'
networks:
- perplexica-network
restart: unless-stopped
command: yarn dev
perplexica-frontend:
build:
context: .
dockerfile: app.dockerfile
target: development
args:
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
image: itzcrazykns1337/perplexica-frontend:main
depends_on:
- perplexica-backend
environment:
- NODE_ENV=development
ports:
- 3000:3000
volumes:
- ./ui:/home/perplexica
- /home/perplexica/node_modules
- /home/perplexica/.next
networks:
- perplexica-network
restart: unless-stopped
command: yarn dev
networks:
perplexica-network:

View file

@ -40,6 +40,7 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
metadata: {
title: title,
url: link,
type: 'pdf', // Add this type indicator
},
});
});

View file

@ -19,6 +19,7 @@ import MessageSources from './MessageSources';
import SearchImages from './SearchImages';
import SearchVideos from './SearchVideos';
import { useSpeech } from 'react-text-to-speech';
import { SearchPDFs } from './SearchPDFs';
const MessageBox = ({
message,
@ -63,6 +64,7 @@ const MessageBox = ({
setParsedMessage(message.content);
}, [message.content, message.sources, message.role]);
const { speechStatus, start, stop } = useSpeech({ text: speechMessage });
return (
@ -119,6 +121,7 @@ const MessageBox = ({
>
{parsedMessage}
</Markdown>
{loading && isLast ? null : (
<div className="flex flex-row items-center justify-between w-full text-black dark:text-white py-4 -mx-2">
<div className="flex flex-row items-center space-x-1">
@ -187,6 +190,7 @@ const MessageBox = ({
</div>
</>
)}
{/* PDF Documents Section - Moved from sidebar to under the answer */}
</div>
</div>
<div className="lg:sticky lg:top-20 flex flex-col items-center space-y-3 w-full lg:w-3/12 z-30 h-full pb-4">
@ -198,6 +202,10 @@ const MessageBox = ({
chatHistory={history.slice(0, messageIndex - 1)}
query={history[messageIndex - 1].content}
/>
<SearchPDFs
query={history[messageIndex - 1]?.content}
chatHistory={history.slice(0, messageIndex - 1)}
/>
</div>
</div>
)}

View file

@ -69,11 +69,15 @@ const MessageSources = ({ sources }: { sources: Document[] }) => {
<div className="flex flex-row items-center space-x-1">
{sources.slice(3, 6).map((source, i) => {
return source.metadata.url === 'File' ? (
<div className="bg-dark-200 hover:bg-dark-100 transition duration-200 flex items-center justify-center w-6 h-6 rounded-full">
<div
key={`source-icon-${i}`}
className="bg-dark-200 hover:bg-dark-100 transition duration-200 flex items-center justify-center w-6 h-6 rounded-full"
>
<File size={12} className="text-white/70" />
</div>
) : (
<img
key={`source-icon-${i}`}
src={`https://s2.googleusercontent.com/s2/favicons?domain_url=${source.metadata.url}`}
width={16}
height={16}
@ -139,7 +143,7 @@ const MessageSources = ({ sources }: { sources: Document[] }) => {
</p>
</div>
<div className="flex flex-row items-center space-x-1 text-black/50 dark:text-white/50 text-xs">
<div className="bg-black/50 dark:bg-white/50 h-[4px] w-[4px] rounded-full" />
<div className="bg-black/50 dark:bg.white/50 h-[4px] w-[4px] rounded-full" />
<span>{i + 1}</span>
</div>
</div>

View file

@ -0,0 +1,207 @@
/* eslint-disable @next/next/no-img-element */
'use client';
import { File, ExternalLink, PlusIcon, Download } from 'lucide-react';
import { useState } from 'react';
import { Message } from './ChatWindow';
/** Coarse category of a PDF result; it selects the colour of the icon badge. */
type PDFCategory = 'academic' | 'document' | 'article';

/** A single PDF search result as held in this component's state. */
type PDF = {
  /** Human-readable title shown in the result row. */
  title: string;
  /** Location of the PDF; used for the open/download links. */
  url: string;
  /** Optional category; the renderer falls back to 'document' when absent. */
  type?: PDFCategory;
};
/** Category of a PDF result, derived from the `PDF` type so the two cannot drift apart. */
type PDFKind = NonNullable<PDF['type']>;

/** Tailwind classes for the icon badge of each category. */
const PDF_KIND_CLASSES: Record<PDFKind, string> = {
  academic: 'bg-blue-500/10 text-blue-500',
  article: 'bg-green-500/10 text-green-500',
  document: 'bg-red-500/10 text-red-500',
};

/** Guesses a category from keywords in the title; anything unrecognised is a 'document'. */
const getPDFType = (title: string): PDFKind => {
  const lowerTitle = title.toLowerCase();
  if (['paper', 'journal', 'research'].some((kw) => lowerTitle.includes(kw))) {
    return 'academic';
  }
  if (['article', 'blog'].some((kw) => lowerTitle.includes(kw))) {
    return 'article';
  }
  return 'document';
};

/** Returns the badge classes for a category. */
const getTypeColor = (type: PDFKind): string => PDF_KIND_CLASSES[type];

/** Converts chat messages into the `[role, content]` pairs sent in the search request body. */
const formatChatHistory = (history: Message[]) =>
  history.map((msg) => [msg.role === 'user' ? 'human' : 'ai', msg.content]);

/** Parses `url` as an absolute http(s) URL; returns null for anything else. */
const parseHttpUrl = (url: string): URL | null => {
  try {
    const parsed = new URL(url);
    return parsed.protocol === 'http:' || parsed.protocol === 'https:'
      ? parsed
      : null;
  } catch {
    // `new URL` throws a TypeError on relative or malformed input.
    return null;
  }
};

/** Hostname for display; falls back to the raw string so rendering can never throw. */
const getHostname = (url: string): string => parseHttpUrl(url)?.hostname ?? url;

/**
 * Extracts PDF-looking sources from a search response.
 *
 * The response is untrusted JSON, so every field is narrowed before use.
 * A source is kept when its URL is an absolute http(s) URL and either the URL
 * contains ".pdf" (case-insensitive) or the title contains "PDF". Results are
 * de-duplicated by URL, which also makes the URL usable as a React key.
 */
const extractPDFs = (data: unknown): PDF[] => {
  const sources = (data as { sources?: unknown } | null | undefined)?.sources;
  if (!Array.isArray(sources)) return [];

  const seen = new Set<string>();
  const results: PDF[] = [];

  for (const source of sources as unknown[]) {
    const metadata = (
      source as { metadata?: { url?: unknown; title?: unknown } } | null | undefined
    )?.metadata;
    const url = typeof metadata?.url === 'string' ? metadata.url : '';
    const title = typeof metadata?.title === 'string' ? metadata.title : '';

    const looksLikePDF =
      url.toLowerCase().includes('.pdf') || title.includes('PDF');
    // Skip sources without a linkable URL (missing, relative, or non-http scheme):
    // they cannot be opened or downloaded and would break hostname rendering.
    if (!looksLikePDF || parseHttpUrl(url) === null || seen.has(url)) continue;

    seen.add(url);
    results.push({
      title: title || 'PDF Document',
      url,
      type: getPDFType(title),
    });
  }

  return results;
};

/**
 * Sidebar widget that searches for PDF documents related to a chat query.
 *
 * Initially renders a "Search PDFs" button. On click it POSTs to
 * `${NEXT_PUBLIC_API_URL}/search` with the query prefixed by
 * "PDF documents about:", using the chat-model settings stored in
 * localStorage, then lists the PDF-looking sources from the response.
 * Any failure (network error, non-2xx status, invalid JSON) is logged and
 * shown as an empty result.
 *
 * @param query - The user message the answer belongs to.
 * @param chatHistory - Messages preceding that query, sent as conversation context.
 */
export const SearchPDFs = ({
  query,
  chatHistory,
}: {
  query: string;
  chatHistory: Message[];
}) => {
  // null = not searched yet; [] = searched, nothing found.
  const [pdfs, setPdfs] = useState<PDF[] | null>(null);
  const [loading, setLoading] = useState(false);

  const searchForPDFs = async () => {
    setLoading(true);

    const chatModelProvider = localStorage.getItem('chatModelProvider');
    const chatModel = localStorage.getItem('chatModel');
    const customOpenAIBaseURL = localStorage.getItem('openAIBaseURL');
    const customOpenAIKey = localStorage.getItem('openAIApiKey');

    try {
      const res = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/search`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          query: `PDF documents about: ${query}`,
          focusMode: 'webSearch',
          optimizationMode: 'balanced',
          history: formatChatHistory(chatHistory),
          chatModel: {
            provider: chatModelProvider,
            model: chatModel,
            // Custom endpoint credentials are only sent for the custom provider.
            ...(chatModelProvider === 'custom_openai' && {
              customOpenAIKey: customOpenAIKey,
              customOpenAIBaseURL: customOpenAIBaseURL,
            }),
          },
        }),
      });

      // fetch() resolves on HTTP errors too; surface them instead of parsing an error body.
      if (!res.ok) {
        throw new Error(`Search request failed with status ${res.status}`);
      }

      const data: unknown = await res.json();
      setPdfs(extractPDFs(data));
    } catch (error) {
      console.error('Error searching for PDFs:', error);
      setPdfs([]);
    } finally {
      setLoading(false);
    }
  };

  return (
    <>
      {!loading && pdfs === null && (
        <button
          id="search-pdfs"
          type="button"
          onClick={searchForPDFs}
          className="border border-dashed border-light-200 dark:border-dark-200 hover:bg-light-200 dark:hover:bg-dark-200 active:scale-95 duration-200 transition px-4 py-2 flex flex-row items-center justify-between rounded-lg dark:text-white text-sm w-full"
        >
          <div className="flex flex-row items-center space-x-2">
            <File size={17} />
            <p>Search PDFs</p>
          </div>
          <PlusIcon className="text-[#24A0ED]" size={17} />
        </button>
      )}
      {loading && (
        <div className="grid grid-cols-2 gap-2">
          {[...Array(4)].map((_, i) => (
            <div
              key={i}
              className="bg-light-secondary dark:bg-dark-secondary h-32 w-full rounded-lg animate-pulse aspect-video object-cover"
            />
          ))}
        </div>
      )}
      {pdfs !== null && pdfs.length > 0 && (
        <div className="bg-light-secondary dark:bg-dark-secondary w-full rounded-lg p-4 flex flex-col space-y-4">
          <div className="flex flex-row items-center justify-between">
            <div className="flex flex-row items-center space-x-2">
              <File />
              <h3 className="font-medium text-black dark:text-white">PDF Documents</h3>
              <span className="text-xs text-black/50 dark:text-white/50">({pdfs.length})</span>
            </div>
          </div>
          <div className="flex flex-col space-y-2">
            {pdfs.map((pdf) => (
              <div
                key={pdf.url}
                className="bg-light-100 dark:bg-dark-100 rounded-lg p-3 hover:bg-light-200 dark:hover:bg-dark-200 transition duration-200"
              >
                <div className="flex flex-row items-center justify-between">
                  <div className="flex flex-row items-center space-x-3 flex-grow min-w-0">
                    <div className={`p-2 rounded-md ${getTypeColor(pdf.type ?? 'document')}`}>
                      <File size={16} />
                    </div>
                    <div className="flex flex-col space-y-1 flex-grow min-w-0">
                      <p className="text-sm font-medium text-black dark:text-white truncate" title={pdf.title}>
                        {pdf.title}
                      </p>
                      <p className="text-xs text-black/50 dark:text-white/50 truncate">
                        {getHostname(pdf.url)}
                      </p>
                    </div>
                  </div>
                  <div className="flex flex-row items-center space-x-2 flex-shrink-0 ml-2">
                    <a
                      href={pdf.url}
                      target="_blank"
                      rel="noreferrer"
                      aria-label="Open PDF in a new tab"
                      className="p-2 text-black/70 dark:text-white/70 rounded-xl hover:bg-light-secondary dark:hover:bg-dark-secondary transition duration-200 hover:text-black dark:hover:text-white"
                    >
                      <ExternalLink size={16} />
                    </a>
                    <a
                      href={pdf.url}
                      download
                      aria-label="Download PDF"
                      className="p-2 text-black/70 dark:text-white/70 rounded-xl hover:bg-light-secondary dark:hover:bg-dark-secondary transition duration-200 hover:text-black dark:hover:text-white"
                    >
                      <Download size={16} />
                    </a>
                  </div>
                </div>
              </div>
            ))}
          </div>
        </div>
      )}
      {pdfs !== null && pdfs.length === 0 && (
        <div className="bg-light-secondary dark:bg-dark-secondary w-full rounded-lg p-4 flex flex-col space-y-2">
          <div className="flex flex-row items-center space-x-2">
            <File />
            <h3 className="font-medium text-black dark:text-white">PDF Documents</h3>
          </div>
          <p className="text-sm text-black/60 dark:text-white/60">No PDF documents found related to your query.</p>
        </div>
      )}
    </>
  );
};