Skip to main content
Glama
chunking.py (2.35 kB)
"""Text chunking utilities with token-aware splitting.

Two chunkers share the same sliding-window behaviour:

* ``TextChunker`` measures windows in tokens (requires ``tiktoken``).
* ``SimpleChunker`` measures windows in characters and has no third-party
  dependency, so it can be used when ``tiktoken`` is not installed.
"""

from typing import Iterator, List, Tuple

try:
    import tiktoken
except ImportError:  # Optional dependency: SimpleChunker must stay importable.
    tiktoken = None


def _stride(chunk_size: int, chunk_overlap: int) -> int:
    """Validate the window parameters and return the step between windows.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap`` is
            negative or not strictly smaller than ``chunk_size``. A
            non-positive step would make the chunking loop never advance.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )
    return chunk_size - chunk_overlap


def _windows(length: int, chunk_size: int, stride: int) -> Iterator[Tuple[int, int]]:
    """Yield ``(start, end)`` slice bounds covering ``length`` items.

    Iteration stops as soon as a window reaches the end of the sequence, so no
    trailing window consisting solely of already-emitted overlap is produced.
    ``end`` may exceed ``length``; slicing clamps it.
    """
    start = 0
    while start < length:
        end = start + chunk_size
        yield start, end
        if end >= length:
            break
        start += stride


class TextChunker:
    """Token-aware text chunker using tiktoken."""

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        encoding_name: str = "cl100k_base",  # GPT-4 tokenizer
    ):
        """Create a chunker.

        Args:
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Number of tokens shared by consecutive chunks.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ImportError: If ``tiktoken`` is not installed.
            ValueError: If the size/overlap combination is invalid.
        """
        if tiktoken is None:
            raise ImportError(
                "tiktoken is required for TextChunker; "
                "install it or use SimpleChunker instead"
            )
        _stride(chunk_size, chunk_overlap)  # Fail fast on bad parameters.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.get_encoding(encoding_name)

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks based on token count.

        Args:
            text: The text to chunk.

        Returns:
            List of text chunks; empty if ``text`` encodes to no tokens.

        Raises:
            ValueError: If ``chunk_size``/``chunk_overlap`` were changed to an
                invalid combination after construction.
        """
        # Re-validate here because the attributes are public and mutable.
        stride = _stride(self.chunk_size, self.chunk_overlap)
        tokens = self.encoding.encode(text)
        return [
            self.encoding.decode(tokens[start:end])
            for start, end in _windows(len(tokens), self.chunk_size, stride)
        ]

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.encoding.encode(text))

    def needs_chunking(self, text: str) -> bool:
        """Check if text has more tokens than a single chunk can hold."""
        return self.count_tokens(text) > self.chunk_size


class SimpleChunker:
    """Simple character-based chunker (fallback if tiktoken unavailable)."""

    def __init__(
        self,
        chunk_size: int = 2000,  # characters
        chunk_overlap: int = 200,
    ):
        """Create a chunker.

        Args:
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Number of characters shared by consecutive chunks.

        Raises:
            ValueError: If the size/overlap combination is invalid.
        """
        _stride(chunk_size, chunk_overlap)  # Fail fast on bad parameters.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks by character count.

        Args:
            text: The text to chunk.

        Returns:
            List of text chunks; empty if ``text`` is empty.

        Raises:
            ValueError: If ``chunk_size``/``chunk_overlap`` were changed to an
                invalid combination after construction.
        """
        # Re-validate here because the attributes are public and mutable.
        stride = _stride(self.chunk_size, self.chunk_overlap)
        return [
            text[start:end]
            for start, end in _windows(len(text), self.chunk_size, stride)
        ]

    def needs_chunking(self, text: str) -> bool:
        """Check if text is longer than a single chunk can hold."""
        return len(text) > self.chunk_size

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/timerickson/personal-rag-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.