# chunking.py
"""Text chunking utilities with token-aware splitting."""
from typing import List
import tiktoken
class TextChunker:
    """Token-aware text chunker using tiktoken.

    Splits text into windows of at most ``chunk_size`` tokens, with
    consecutive windows sharing ``chunk_overlap`` tokens of context.
    """

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        encoding_name: str = "cl100k_base"  # GPT-4 tokenizer
    ):
        """Initialize the chunker.

        Args:
            chunk_size: Maximum tokens per chunk (must be positive).
            chunk_overlap: Tokens shared between consecutive chunks
                (must be non-negative and smaller than ``chunk_size``).
            encoding_name: tiktoken encoding to use.

        Raises:
            ValueError: If the size/overlap combination is invalid.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        # Overlap >= size would make the window step (size - overlap)
        # non-positive and the chunking loop would never terminate.
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= overlap < chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.get_encoding(encoding_name)

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks based on token count.

        Args:
            text: The text to chunk.

        Returns:
            List of text chunks (empty list for empty input). Each chunk
            decodes at most ``chunk_size`` tokens.
        """
        tokens = self.encoding.encode(text)
        chunks: List[str] = []
        start = 0
        while start < len(tokens):
            end = start + self.chunk_size
            # Decode the token window back to text.
            chunks.append(self.encoding.decode(tokens[start:end]))
            # Once the window reaches the end of the token stream, stop:
            # advancing again would emit a chunk that is entirely
            # contained in the tail of the one just appended.
            if end >= len(tokens):
                break
            # Advance with overlap; step is positive (validated in __init__).
            start += self.chunk_size - self.chunk_overlap
        return chunks

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.encoding.encode(text))

    def needs_chunking(self, text: str) -> bool:
        """Check if text exceeds one chunk's worth of tokens."""
        return self.count_tokens(text) > self.chunk_size
class SimpleChunker:
"""Simple character-based chunker (fallback if tiktoken unavailable)."""
def __init__(
self,
chunk_size: int = 2000, # characters
chunk_overlap: int = 200
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def chunk_text(self, text: str) -> List[str]:
"""Split text into overlapping chunks by character count."""
chunks = []
start = 0
while start < len(text):
end = start + self.chunk_size
chunk = text[start:end]
chunks.append(chunk)
start += self.chunk_size - self.chunk_overlap
return chunks
def needs_chunking(self, text: str) -> bool:
"""Check if text needs to be chunked."""
return len(text) > self.chunk_size