MCP Enhanced Data Retrieval System

by kalpalathika
chunking.py (6.95 kB)
""" Token-aware content chunking for MCP responses. PURPOSE: -------- Implements 1500-token context chunking as required by Milestone 1 (slide 6, 9). Splits large GitHub content (README, code files) into manageable chunks. WHY CHUNKING: ------------- From research (slide 6): "Large context assemblies (3000+ tokens) create substantial TTFT delays affecting user experience" Solution: "Chunked context delivery at 1500-token boundaries" """ import tiktoken import structlog from typing import List logger = structlog.get_logger() # ============================================================================ # SECTION 1: TOKEN COUNTER # ============================================================================ class TokenCounter: """ Counts tokens in text using tiktoken (same tokenizer as Claude/GPT). WHY TIKTOKEN: - Accurate token counting (matches what LLMs actually see) - Handles special characters, emojis, code properly """ def __init__(self): """Initialize the token counter with cl100k_base encoding (used by GPT-4, Claude-3).""" try: self.encoding = tiktoken.get_encoding("cl100k_base") logger.info("Token counter initialized", encoding="cl100k_base") except Exception as e: logger.warning("Could not load cl100k_base, using gpt2", error=str(e)) self.encoding = tiktoken.get_encoding("gpt2") def count(self, text: str) -> int: """ Count tokens in text. EXAMPLE: "Hello world!" → 3 tokens ["Hello", " world", "!"] Args: text: Text to count Returns: Number of tokens """ return len(self.encoding.encode(text)) # ============================================================================ # SECTION 2: CONTENT CHUNKER # ============================================================================ class ContentChunker: """ Splits large content into fixed-size chunks of 1500 tokens. STRATEGY: Simple paragraph-based splitting - Split by paragraphs (\\n\\n) - Keep adding paragraphs until we hit 1500 token limit - Start new chunk when limit reached """ def __init__(self, max_tokens: int = 1500): """ Initialize chunker. Args: max_tokens: Maximum tokens per chunk (default 1500 from milestone) """ self.max_tokens = max_tokens self.counter = TokenCounter() logger.info("ContentChunker initialized", max_tokens=max_tokens) def chunk(self, text: str) -> List[str]: """ Split text into chunks of max_tokens size. ALGORITHM: 1. Check if text fits in one chunk → return as-is 2. Otherwise, split by paragraphs 3. Build chunks by adding paragraphs until limit reached Args: text: Text to chunk Returns: List of text chunks, each under max_tokens """ # Quick check: does it fit in one chunk? total_tokens = self.counter.count(text) if total_tokens <= self.max_tokens: logger.info("Content fits in single chunk", tokens=total_tokens) return [text] logger.info("Chunking content", total_tokens=total_tokens, max_tokens=self.max_tokens, estimated_chunks=(total_tokens // self.max_tokens) + 1) # Split into chunks chunks = [] current_chunk = "" current_tokens = 0 # Split by paragraphs first paragraphs = text.split('\n\n') for para in paragraphs: para_with_spacing = para + '\n\n' para_tokens = self.counter.count(para_with_spacing) # Will adding this paragraph exceed the limit? 
if current_tokens + para_tokens > self.max_tokens: # Save current chunk if it has content if current_chunk: chunks.append(current_chunk.strip()) # If paragraph itself is larger than max_tokens, split by lines if para_tokens > self.max_tokens: line_chunks = self._chunk_by_lines(para, self.max_tokens) chunks.extend(line_chunks[:-1]) # Add all but last current_chunk = line_chunks[-1] + '\n\n' # Last becomes start of next current_tokens = self.counter.count(current_chunk) else: # Start new chunk with this paragraph current_chunk = para_with_spacing current_tokens = para_tokens else: # Add paragraph to current chunk current_chunk += para_with_spacing current_tokens += para_tokens # Don't forget the last chunk if current_chunk.strip(): chunks.append(current_chunk.strip()) logger.info("Chunking complete", num_chunks=len(chunks)) return chunks def _chunk_by_lines(self, text: str, max_tokens: int) -> List[str]: """ Split text by lines when paragraphs are too large. Args: text: Text to split max_tokens: Token limit Returns: List of chunks """ chunks = [] current_chunk = "" current_tokens = 0 lines = text.split('\n') for line in lines: line_with_newline = line + '\n' line_tokens = self.counter.count(line_with_newline) if current_tokens + line_tokens > max_tokens: if current_chunk: chunks.append(current_chunk.strip()) current_chunk = line_with_newline current_tokens = line_tokens else: current_chunk += line_with_newline current_tokens += line_tokens if current_chunk.strip(): chunks.append(current_chunk.strip()) return chunks # ============================================================================ # SECTION 3: CONVENIENCE API # ============================================================================ # Global singleton instance _chunker = None def chunk_content(text: str, max_tokens: int = 1500) -> List[str]: """ Chunk text into pieces of max_tokens size. USAGE EXAMPLE: -------------- from src.utils.chunking import chunk_content # Get large README readme = "..." # 5000 tokens # Chunk it chunks = chunk_content(readme) # Returns: [chunk1 (1500 tokens), chunk2 (1500 tokens), chunk3 (1500 tokens), chunk4 (500 tokens)] Args: text: Text to chunk max_tokens: Maximum tokens per chunk (default 1500) Returns: List of text chunks """ global _chunker if _chunker is None: _chunker = ContentChunker(max_tokens) return _chunker.chunk(text)
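
As a quick end-to-end check, here is a minimal driver sketch (not part of the file above) that feeds chunk_content a synthetic document with a deliberately small token limit, so both the paragraph-packing path and the _chunk_by_lines fallback are exercised. It assumes the module is importable as src.utils.chunking, as the module's own docstring suggests.

from src.utils.chunking import chunk_content, TokenCounter

# Synthetic document: many short paragraphs plus one oversized paragraph,
# so the line-splitting fallback is triggered as well.
short_paras = [f"Paragraph {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(15)]
big_para = "\n".join("a long line of filler text, repeated for bulk " * 4 for _ in range(60))
doc = "\n\n".join(short_paras + [big_para])

counter = TokenCounter()
print("total tokens:", counter.count(doc))

# Chunk with a small limit to make the behaviour easy to inspect.
chunks = chunk_content(doc, max_tokens=200)
for i, chunk in enumerate(chunks):
    print(f"chunk {i}: {counter.count(chunk)} tokens")

Because the chunker accumulates per-paragraph token counts rather than re-measuring the joined text, the printed counts can differ slightly from the nominal 200-token cap, but should stay close to it.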

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/kalpalathika/MCP-Enhanced-Data-Retrieval-System'
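
The same endpoint can be queried from code. Here is a minimal Python sketch using the requests library; the response schema is not documented here, so the body is simply printed:

import requests

# Fetch this server's directory entry from the Glama MCP API.
url = "https://glama.ai/api/mcp/v1/servers/kalpalathika/MCP-Enhanced-Data-Retrieval-System"
response = requests.get(url, timeout=10)
response.raise_for_status()
print(response.json())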

If you have feedback or need assistance with the MCP directory API, please join our Discord server.