"""
Token-aware content chunking for MCP responses.
PURPOSE:
--------
Implements 1500-token context chunking as required by Milestone 1 (slide 6, 9).
Splits large GitHub content (README, code files) into manageable chunks.
WHY CHUNKING:
-------------
From research (slide 6): "Large context assemblies (3000+ tokens) create
substantial TTFT delays affecting user experience"
Solution: "Chunked context delivery at 1500-token boundaries"
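
QUICK START (illustrative):
    from src.utils.chunking import chunk_content
    chunks = chunk_content(big_text)  # list of strings, each <= 1500 tokens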
"""
import tiktoken
import structlog
from typing import List
logger = structlog.get_logger()
# ============================================================================
# SECTION 1: TOKEN COUNTER
# ============================================================================
class TokenCounter:
    """
    Counts tokens in text using tiktoken (OpenAI's BPE tokenizer library).

    WHY TIKTOKEN:
    - Exact counts for OpenAI models; a close approximation for other LLMs
      (Claude uses its own tokenizer, but counts are in the same ballpark)
    - Handles special characters, emojis, and code properly
    """
    def __init__(self):
        """Initialize the counter with cl100k_base encoding (used by GPT-4 and GPT-3.5-turbo)."""
try:
self.encoding = tiktoken.get_encoding("cl100k_base")
logger.info("Token counter initialized", encoding="cl100k_base")
except Exception as e:
logger.warning("Could not load cl100k_base, using gpt2", error=str(e))
self.encoding = tiktoken.get_encoding("gpt2")
def count(self, text: str) -> int:
"""
Count tokens in text.
EXAMPLE:
"Hello world!" → 3 tokens ["Hello", " world", "!"]
Args:
text: Text to count
Returns:
Number of tokens
"""
return len(self.encoding.encode(text))
# ============================================================================
# SECTION 2: CONTENT CHUNKER
# ============================================================================
class ContentChunker:
    """
    Splits large content into chunks of at most max_tokens (default 1500) tokens.

    STRATEGY: Simple paragraph-based splitting
    - Split by paragraphs (\\n\\n)
    - Keep adding paragraphs until the token limit would be exceeded
    - Start a new chunk when the limit is reached
    """
def __init__(self, max_tokens: int = 1500):
"""
Initialize chunker.
Args:
max_tokens: Maximum tokens per chunk (default 1500 from milestone)
"""
self.max_tokens = max_tokens
self.counter = TokenCounter()
logger.info("ContentChunker initialized", max_tokens=max_tokens)
    def chunk(self, text: str) -> List[str]:
        """
        Split text into chunks of at most max_tokens tokens.

        ALGORITHM:
        1. Check if the text fits in one chunk → return it as-is
        2. Otherwise, split by paragraphs
        3. Build chunks by adding paragraphs until the limit is reached
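
        EXAMPLE (illustrative token counts, max_tokens=1500):
            paragraphs of 800, 600, and 700 tokens
            → chunk 1: paragraphs 1+2 (1400 tokens; adding 700 would exceed the limit)
            → chunk 2: paragraph 3 (700 tokens)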
Args:
text: Text to chunk
        Returns:
            List of text chunks, each at most max_tokens tokens (a single
            line longer than the limit is the only exception)
"""
# Quick check: does it fit in one chunk?
total_tokens = self.counter.count(text)
if total_tokens <= self.max_tokens:
logger.info("Content fits in single chunk", tokens=total_tokens)
return [text]
logger.info("Chunking content",
total_tokens=total_tokens,
max_tokens=self.max_tokens,
estimated_chunks=(total_tokens // self.max_tokens) + 1)
# Split into chunks
chunks = []
current_chunk = ""
current_tokens = 0
# Split by paragraphs first
paragraphs = text.split('\n\n')
for para in paragraphs:
para_with_spacing = para + '\n\n'
para_tokens = self.counter.count(para_with_spacing)
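            # NOTE: BPE merges can cross concatenation boundaries, so summing
            # per-paragraph counts closely approximates, but does not exactly
            # equal, the token count of the joined chunk.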
# Will adding this paragraph exceed the limit?
if current_tokens + para_tokens > self.max_tokens:
# Save current chunk if it has content
if current_chunk:
chunks.append(current_chunk.strip())
# If paragraph itself is larger than max_tokens, split by lines
if para_tokens > self.max_tokens:
line_chunks = self._chunk_by_lines(para, self.max_tokens)
chunks.extend(line_chunks[:-1]) # Add all but last
current_chunk = line_chunks[-1] + '\n\n' # Last becomes start of next
current_tokens = self.counter.count(current_chunk)
else:
# Start new chunk with this paragraph
current_chunk = para_with_spacing
current_tokens = para_tokens
else:
# Add paragraph to current chunk
current_chunk += para_with_spacing
current_tokens += para_tokens
# Don't forget the last chunk
if current_chunk.strip():
chunks.append(current_chunk.strip())
logger.info("Chunking complete", num_chunks=len(chunks))
return chunks
    def _chunk_by_lines(self, text: str, max_tokens: int) -> List[str]:
        """
        Split text line by line when a single paragraph is too large.

        CAVEAT: a single line that alone exceeds max_tokens is still emitted
        as one oversized chunk; we never split inside a line.

        Args:
            text: Text to split
            max_tokens: Token limit
        Returns:
            List of chunks
        """
chunks = []
current_chunk = ""
current_tokens = 0
lines = text.split('\n')
for line in lines:
line_with_newline = line + '\n'
line_tokens = self.counter.count(line_with_newline)
if current_tokens + line_tokens > max_tokens:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = line_with_newline
current_tokens = line_tokens
else:
current_chunk += line_with_newline
current_tokens += line_tokens
if current_chunk.strip():
chunks.append(current_chunk.strip())
return chunks
# ============================================================================
# SECTION 3: CONVENIENCE API
# ============================================================================
# Global singleton instance
_chunker = None
def chunk_content(text: str, max_tokens: int = 1500) -> List[str]:
    """
    Chunk text into pieces of at most max_tokens tokens.

    USAGE EXAMPLE:
    --------------
    from src.utils.chunking import chunk_content
    # Get a large README (~5000 tokens)
    readme = "..."
    # Chunk it
    chunks = chunk_content(readme)
    # Returns ~4 chunks, each at most 1500 tokens (the last one shorter)
Args:
text: Text to chunk
max_tokens: Maximum tokens per chunk (default 1500)
Returns:
List of text chunks
"""
    global _chunker
    if _chunker is None or _chunker.max_tokens != max_tokens:
        # Rebuild when a different max_tokens is requested than the
        # cached chunker uses.
        _chunker = ContentChunker(max_tokens)
    return _chunker.chunk(text)
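# ============================================================================
# SECTION 4: SELF-TEST (illustrative)
# ============================================================================
# A minimal smoke-test sketch, not part of the public API. The synthetic
# 200-paragraph document and the repeated filler sentence are arbitrary
# choices for demonstration; it simply prints the size of each chunk.
if __name__ == "__main__":
    sample = "\n\n".join(
        f"Paragraph {i}: " + "token counting exercises the chunker. " * 20
        for i in range(200)
    )
    pieces = chunk_content(sample)
    counter = TokenCounter()
    for n, piece in enumerate(pieces):
        print(f"chunk {n}: {counter.count(piece)} tokens")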