# Quick-start Auto MCP
# by teddynote-lab
# Verified
from typing import List, Optional, Any
import os
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from rag.base import RetrievalChain
class PDFRetrievalChain(RetrievalChain):
    """
    RetrievalChain variant whose source documents are PDF files.

    Supplies the PDF-specific steps: reading files with PDFPlumberLoader,
    chunking them with a RecursiveCharacterTextSplitter, and building or
    reopening a Chroma vector store.
    """

    def __init__(
        self,
        source_uri: List[str],
        persist_directory: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Set up the chain by forwarding every argument to the base class.

        Args:
            source_uri: Paths of the PDF files to index
            persist_directory: Directory used to persist the vector store
            **kwargs: Extra keyword arguments passed through to RetrievalChain
        """
        super().__init__(
            source_uri=source_uri,
            persist_directory=persist_directory,
            **kwargs,
        )

    def load_documents(self, source_uris: List[str]) -> List[Document]:
        """
        Read every existing PDF in ``source_uris`` into documents.

        Paths that do not exist are reported on stdout and skipped, so the
        result may be shorter than expected or even empty.

        Args:
            source_uris: Paths of the PDF files to read

        Returns:
            Documents from all readable files, in input order
        """
        collected: List[Document] = []
        for source_uri in source_uris:
            if os.path.exists(source_uri):
                print(f"Loading PDF: {source_uri}")
                collected.extend(PDFPlumberLoader(source_uri).load())
            else:
                print(f"File not found: {source_uri}")
        return collected

    def create_text_splitter(self) -> RecursiveCharacterTextSplitter:
        """
        Build the splitter used to chunk loaded PDF text.

        Returns:
            A RecursiveCharacterTextSplitter with chunk_size=600 and
            chunk_overlap=50
        """
        splitter_options = {"chunk_size": 600, "chunk_overlap": 50}
        return RecursiveCharacterTextSplitter(**splitter_options)

    def create_vectorstore(self, split_docs: List[Document]) -> Any:
        """
        Return a Chroma vector store for the given chunks.

        When ``self.persist_directory`` is set and already contains entries,
        the store found there is opened as-is and ``split_docs`` are NOT
        added to it. Otherwise a new store is built from ``split_docs``.

        Args:
            split_docs: Document chunks to index

        Returns:
            A Chroma vector store instance

        Raises:
            ValueError: If ``split_docs`` is empty (checked before any
                existing store is considered)
        """
        if not split_docs:
            raise ValueError("No split documents available.")

        if self.persist_directory:
            # Make sure the directory exists, then see whether anything is
            # already stored in it.
            os.makedirs(self.persist_directory, exist_ok=True)
            has_saved_store = os.path.exists(self.persist_directory) and any(
                os.listdir(self.persist_directory)
            )
            if has_saved_store:
                print(f"Loading existing vector store: {self.persist_directory}")
                return Chroma(
                    persist_directory=self.persist_directory,
                    embedding_function=self.create_embedding(),
                )

        # Either no persist directory is configured or it is still empty.
        print("Creating new vector store...")
        return Chroma.from_documents(
            documents=split_docs,
            embedding=self.create_embedding(),
            persist_directory=self.persist_directory,
        )