import os
from typing import Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter

from .config import Config
class TextChunker:
    """Split raw text or parsed documents into metadata-tagged chunks.

    Every chunk is a plain dict with three keys:

    * ``"text"`` - the chunk content returned by the splitter,
    * ``"chunk_index"`` - position of the chunk within the text it was
      split from (it restarts at 0 for every page of a paged document),
    * ``"metadata"`` - a per-chunk copy of the metadata supplied for it.
    """

    def __init__(self):
        """Create the underlying splitter from the ``Config`` settings."""
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            # Listed from coarsest (paragraph break) to finest (single
            # character) boundary.
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """Split ``text`` into chunks and attach ``metadata`` to each one.

        Args:
            text: The text to split. Empty or ``None`` text yields no chunks.
            metadata: Optional mapping stored under each chunk's
                ``"metadata"`` key. ``None`` is treated as an empty mapping.

        Returns:
            A list of chunk dicts in splitter order, each holding ``"text"``,
            ``"chunk_index"`` and ``"metadata"``.
        """
        if not text:
            # Nothing to split; also avoids handing None to the splitter.
            return []

        base = metadata or {}
        # Each chunk gets its own (shallow) copy so that editing one chunk's
        # metadata cannot leak into sibling chunks or the caller's dict.
        return [
            {"text": piece, "chunk_index": index, "metadata": dict(base)}
            for index, piece in enumerate(self.splitter.split_text(text))
        ]

    def chunk_document(self, doc_data: Dict, source_path: str) -> List[Dict]:
        """Chunk a parsed document, tracking page numbers when available.

        Args:
            doc_data: Parsed document. When it has a ``"pages"`` key, that
                value is iterated and each entry must provide ``"text"`` and
                ``"page"``; otherwise ``doc_data["text"]`` is chunked as a
                whole. An optional ``"metadata"`` mapping is merged into
                every chunk's metadata.
            source_path: Path the document was loaded from. It is stored as
                ``"source_path"`` and its extension (without the dot) as
                ``"doc_type"``.

        Returns:
            The list of chunk dicts produced by :meth:`chunk_text`. For paged
            documents each chunk's metadata also carries a ``"page"`` entry.

        Raises:
            KeyError: If the required ``"text"`` / ``"page"`` keys are absent.
        """
        # splitext only looks at the final path component, so a dotted
        # directory name or an extension-less file gives "" rather than a
        # fragment of the path.
        extension = os.path.splitext(source_path)[1]
        base_metadata = {
            **(doc_data.get("metadata") or {}),
            "source_path": source_path,
            "doc_type": extension[1:],
        }

        if "pages" not in doc_data:
            return self.chunk_text(doc_data["text"], base_metadata)

        # Paged document: chunk page by page so each chunk records its page.
        chunks = []
        for page_info in doc_data["pages"]:
            page_metadata = {**base_metadata, "page": page_info["page"]}
            chunks.extend(self.chunk_text(page_info["text"], page_metadata))
        return chunks