"""Document chunking with overlap and page tracking."""
from .models import PageText, Chunk
class Chunker:
    """Split documents into overlapping chunks.

    Sizes are given in tokens and converted to characters using the rough
    estimate of 4 characters per token.
    """

    # Sentence terminators tried, in priority order, when looking for a
    # natural break point near the end of a chunk window.
    _SENTENCE_ENDS = ('. ', '.\n', '? ', '?\n', '! ', '!\n')

    def __init__(self, chunk_size: int = 400, overlap: int = 100):
        """
        Args:
            chunk_size: Target chunk size in tokens (estimated as chars/4)
            overlap: Overlap between chunks in tokens

        Raises:
            ValueError: If chunk_size is not positive or overlap is negative.
        """
        # A non-positive window can never advance through the text, and a
        # negative overlap would silently skip text between chunks.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if overlap < 0:
            raise ValueError(f"overlap must be non-negative, got {overlap}")
        self.chunk_chars = chunk_size * 4
        self.overlap_chars = overlap * 4

    def chunk(self, pages: list[PageText]) -> list[Chunk]:
        """
        Split pages into overlapping chunks.

        Page texts are joined with newlines and scanned with a sliding
        window of ``chunk_chars`` characters. Attempts to break at a
        sentence boundary in the last 20% of each window when possible.
        Each chunk is tagged with the page whose ``char_start`` is the
        closest one at or before the chunk's start offset.

        Args:
            pages: Pages to split; each must expose ``text``,
                ``char_start`` and ``page_num``.

        Returns:
            Chunks in document order. ``char_start``/``char_end`` are the
            window offsets into the joined text (before whitespace is
            stripped from the chunk text). Whitespace-only windows produce
            no chunk. Returns an empty list when ``pages`` is empty.
        """
        if not pages:
            return []

        full_text = "\n".join(p.text for p in pages)
        text_len = len(full_text)
        # NOTE(review): assumes each page's char_start is its offset into the
        # newline-joined text and that pages arrive in ascending order --
        # confirm against whatever builds PageText.
        page_boundaries = [(p.char_start, p.page_num) for p in pages]

        chunks: list[Chunk] = []
        start = 0
        while start < text_len:
            end = min(start + self.chunk_chars, text_len)
            if end < text_len:
                end = self._sentence_break(full_text, start, end)

            chunk_text = full_text[start:end].strip()
            if chunk_text:
                chunks.append(Chunk(
                    text=chunk_text,
                    chunk_index=len(chunks),
                    page_num=self._page_for_offset(page_boundaries, start),
                    char_start=start,
                    char_end=end,
                ))

            # Once a window reaches the end of the text we are done; stepping
            # back by the overlap here would only emit a chunk that is a
            # suffix of the one just produced.
            if end >= text_len:
                break

            # Step back by the overlap, but always move strictly forward
            # relative to the current window so the loop is guaranteed to
            # terminate (even when the overlap exceeds the window actually
            # consumed, or when no chunk has been emitted yet).
            next_start = end - self.overlap_chars
            start = next_start if next_start > start else end

        return chunks

    def _sentence_break(self, text: str, start: int, end: int) -> int:
        """Return a sentence-aligned end offset for the window, or ``end``."""
        # Only the final 20% of a full-size window is searched.
        search_start = start + int(self.chunk_chars * 0.8)
        for punct in self._SENTENCE_ENDS:
            pos = text.rfind(punct, search_start, end)
            if pos != -1:
                # Keep the terminator (and its trailing whitespace) with
                # the chunk that ends here.
                return pos + len(punct)
        return end

    @staticmethod
    def _page_for_offset(page_boundaries: list[tuple[int, int]], offset: int) -> int:
        """Return the page number of the last boundary at or before ``offset``."""
        page_num = 1  # fallback when no boundary precedes the offset
        for boundary, pnum in page_boundaries:
            if boundary > offset:
                break
            page_num = pnum
        return page_num