"""
Token-aware content chunking for MCP responses.
PURPOSE:
--------
Implements 1500-token context chunking as required by Milestone 1 (slide 6, 9).
Splits large GitHub content (README, code files) into manageable chunks.
WHY CHUNKING:
-------------
From research (slide 6): "Large context assemblies (3000+ tokens) create
substantial TTFT delays affecting user experience"
Solution: "Chunked context delivery at 1500-token boundaries"
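
QUICK START (illustrative):
    from src.utils.chunking import chunk_content
    chunks = chunk_content(big_text)  # list of strings, each <= 1500 tokens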
"""
import tiktoken
import structlog
from typing import List
logger = structlog.get_logger()
# ============================================================================
# SECTION 1: TOKEN COUNTER
# ============================================================================
class TokenCounter:
    """
    Counts tokens in text using tiktoken (OpenAI's BPE tokenizer library).

    WHY TIKTOKEN:
    - Exact counts for OpenAI models; a close approximation for other LLMs
      (Claude uses its own tokenizer, but counts are in the same ballpark)
    - Handles special characters, emojis, and code properly
    """
    def __init__(self):
        """Initialize the counter with cl100k_base encoding (used by GPT-4 and GPT-3.5-turbo)."""
try:
self.encoding = tiktoken.get_encoding("cl100k_base")
logger.info("Token counter initialized", encoding="cl100k_base")
except Exception as e:
logger.warning("Could not load cl100k_base, using gpt2", error=str(e))
self.encoding = tiktoken.get_encoding("gpt2")
def count(self, text: str) -> int:
"""
Count tokens in text.
EXAMPLE:
"Hello world!" → 3 tokens ["Hello", " world", "!"]
Args:
text: Text to count
Returns:
Number of tokens
"""
return len(self.encoding.encode(text))
# ============================================================================
# SECTION 2: CONTENT CHUNKER
# ============================================================================
class ContentChunker:
    """
    Splits large content into chunks of at most max_tokens (default 1500) tokens.

    STRATEGY: Simple paragraph-based splitting
    - Split by paragraphs (\\n\\n)
    - Keep adding paragraphs until the token limit would be exceeded
    - Start a new chunk when the limit is reached
    """
def __init__(self, max_tokens: int = 1500):
"""
Initialize chunker.
Args:
max_tokens: Maximum tokens per chunk (default 1500 from milestone)
"""
self.max_tokens = max_tokens
self.counter = TokenCounter()
logger.info("ContentChunker initialized", max_tokens=max_tokens)
    def chunk(self, text: str) -> List[str]:
        """
        Split text into chunks of at most max_tokens tokens.

        ALGORITHM:
        1. Check if the text fits in one chunk → return it as-is
        2. Otherwise, split by paragraphs
        3. Build chunks by adding paragraphs until the limit is reached
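
        EXAMPLE (illustrative token counts, max_tokens=1500):
            paragraphs of 800, 600, and 700 tokens
            → chunk 1: paragraphs 1+2 (1400 tokens; adding 700 would exceed the limit)
            → chunk 2: paragraph 3 (700 tokens)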
Args:
text: Text to chunk
        Returns:
            List of text chunks, each at most max_tokens tokens (a single
            line longer than the limit is the only exception)
"""
# Quick check: does it fit in one chunk?
total_tokens = self.counter.count(text)
if total_tokens <= self.max_tokens:
logger.info("Content fits in single chunk", tokens=total_tokens)
return [text]
logger.info("Chunking content",
total_tokens=total_tokens,
max_tokens=self.max_tokens,
estimated_chunks=(total_tokens // self.max_tokens) + 1)
# Split into chunks
chunks = []
current_chunk = ""
current_tokens = 0
# Split by paragraphs first
paragraphs = text.split('\n\n')
for para in paragraphs:
para_with_spacing = para + '\n\n'
para_tokens = self.counter.count(para_with_spacing)
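            # NOTE: BPE merges can cross concatenation boundaries, so summing
            # per-paragraph counts closely approximates, but does not exactly
            # equal, the token count of the joined chunk.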
# Will adding this paragraph exceed the limit?
if current_tokens + para_tokens > self.max_tokens:
# Save current chunk if it has content
if current_chunk:
chunks.append(current_chunk.strip())
# If paragraph itself is larger than max_tokens, split by lines
if para_tokens > self.max_tokens:
line_chunks = self._chunk_by_lines(para, self.max_tokens)
chunks.extend(line_chunks[:-1]) # Add all but last
current_chunk = line_chunks[-1] + '\n\n' # Last becomes start of next
current_tokens = self.counter.count(current_chunk)
else:
# Start new chunk with this paragraph
current_chunk = para_with_spacing
current_tokens = para_tokens
else:
# Add paragraph to current chunk
current_chunk += para_with_spacing
current_tokens += para_tokens
# Don't forget the last chunk
if current_chunk.strip():
chunks.append(current_chunk.strip())
logger.info("Chunking complete", num_chunks=len(chunks))
return chunks
    def _chunk_by_lines(self, text: str, max_tokens: int) -> List[str]:
        """
        Split text line by line when a single paragraph is too large.

        CAVEAT: a single line that alone exceeds max_tokens is still emitted
        as one oversized chunk; we never split inside a line.

        Args:
            text: Text to split
            max_tokens: Token limit
        Returns:
            List of chunks
        """
chunks = []
current_chunk = ""
current_tokens = 0
lines = text.split('\n')
for line in lines:
line_with_newline = line + '\n'
line_tokens = self.counter.count(line_with_newline)
if current_tokens + line_tokens > max_tokens:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = line_with_newline
current_tokens = line_tokens
else:
current_chunk += line_with_newline
current_tokens += line_tokens
if current_chunk.strip():
chunks.append(current_chunk.strip())
return chunks
# ============================================================================
# SECTION 3: CONVENIENCE API
# ============================================================================
# Global singleton instance
_chunker = None
def chunk_content(text: str, max_tokens: int = 1500) -> List[str]:
    """
    Chunk text into pieces of at most max_tokens tokens.

    USAGE EXAMPLE:
    --------------
    from src.utils.chunking import chunk_content
    # Get a large README (~5000 tokens)
    readme = "..."
    # Chunk it
    chunks = chunk_content(readme)
    # Returns ~4 chunks, each at most 1500 tokens (the last one shorter)
Args:
text: Text to chunk
max_tokens: Maximum tokens per chunk (default 1500)
Returns:
List of text chunks
"""
    global _chunker
    if _chunker is None or _chunker.max_tokens != max_tokens:
        # Rebuild when a different max_tokens is requested than the
        # cached chunker uses.
        _chunker = ContentChunker(max_tokens)
    return _chunker.chunk(text)
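# ============================================================================
# SECTION 4: SELF-TEST (illustrative)
# ============================================================================
# A minimal smoke-test sketch, not part of the public API. The synthetic
# 200-paragraph document and the repeated filler sentence are arbitrary
# choices for demonstration; it simply prints the size of each chunk.
if __name__ == "__main__":
    sample = "\n\n".join(
        f"Paragraph {i}: " + "token counting exercises the chunker. " * 20
        for i in range(200)
    )
    pieces = chunk_content(sample)
    counter = TokenCounter()
    for n, piece in enumerate(pieces):
        print(f"chunk {n}: {counter.count(piece)} tokens")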