Skip to main content
Glama

MCP Indexer

by gkatechis
chunker.py10.9 kB
""" Code chunking strategy for semantic search Respects code boundaries (functions, classes) while maintaining target chunk size of ~200-300 tokens. Adds context metadata. """ from dataclasses import dataclass from typing import List, Optional from mcpindexer.parser import ParsedFile, CodeSymbol @dataclass class CodeChunk: """Represents a chunk of code for embedding""" chunk_id: str # Unique identifier file_path: str repo_name: str language: str chunk_type: str # function, class, module, file code_text: str start_line: int end_line: int # Context metadata symbol_name: Optional[str] # Name of function/class if applicable parent_class: Optional[str] # Parent class name if method imports: List[str] # Relevant imports for this chunk # For retrieval context_text: str # Enhanced text with file path and structure token_count: int # Approximate token count class CodeChunker: """ Chunks code at logical boundaries while respecting token limits """ TARGET_MIN_TOKENS = 100 # Minimum chunk size TARGET_MAX_TOKENS = 300 # Maximum chunk size CHARS_PER_TOKEN = 4 # Rough estimate: 1 token ≈ 4 chars def __init__(self, repo_name: str): """ Initialize chunker for a repository Args: repo_name: Name of the repository being chunked """ self.repo_name = repo_name self.chunk_counter = 0 def chunk_file(self, parsed_file: ParsedFile) -> List[CodeChunk]: """ Chunk a parsed file into semantic units Strategy: 1. Each class becomes a chunk (if under max size) 2. Each top-level function becomes a chunk 3. Large classes are split by methods 4. 
Very small chunks are merged Args: parsed_file: ParsedFile object from parser Returns: List of CodeChunk objects """ chunks = [] # Extract imports for context import_statements = [imp.module for imp in parsed_file.imports] # Chunk classes for cls in parsed_file.classes: cls_chunks = self._chunk_class( cls, parsed_file, import_statements ) chunks.extend(cls_chunks) # Chunk standalone functions (not part of classes) for func in parsed_file.functions: # Skip if function is part of a class (already chunked) if self._is_function_in_class(func, parsed_file.classes): continue func_chunk = self._chunk_function( func, parsed_file, import_statements, parent_class=None ) if func_chunk: chunks.append(func_chunk) # If file has no functions/classes, create a file-level chunk if not chunks: chunks.append(self._chunk_entire_file(parsed_file, import_statements)) return chunks def _chunk_class( self, cls: CodeSymbol, parsed_file: ParsedFile, imports: List[str] ) -> List[CodeChunk]: """Chunk a class definition""" chunks = [] token_count = self._estimate_tokens(cls.text) # If class is small enough, keep it as one chunk if token_count <= self.TARGET_MAX_TOKENS: chunks.append(CodeChunk( chunk_id=self._generate_chunk_id(), file_path=parsed_file.file_path, repo_name=self.repo_name, language=parsed_file.language.value, chunk_type="class", code_text=cls.text, start_line=cls.start_line, end_line=cls.end_line, symbol_name=cls.name, parent_class=None, imports=imports, context_text=self._build_context( parsed_file.file_path, cls.name, None, cls.text ), token_count=token_count )) else: # Split large class by methods # But first, create a chunk for the class header/definition class_header = self._extract_class_header(cls, parsed_file) if class_header: chunks.append(CodeChunk( chunk_id=self._generate_chunk_id(), file_path=parsed_file.file_path, repo_name=self.repo_name, language=parsed_file.language.value, chunk_type="class_definition", code_text=class_header, start_line=cls.start_line, 
end_line=min(cls.start_line + 50, cls.end_line), # First ~50 lines symbol_name=cls.name, parent_class=None, imports=imports, context_text=self._build_context( parsed_file.file_path, cls.name, None, class_header ), token_count=self._estimate_tokens(class_header) )) # Then chunk the methods methods = self._extract_methods_from_class(cls, parsed_file) for method in methods: method_chunk = self._chunk_function( method, parsed_file, imports, parent_class=cls.name ) if method_chunk: chunks.append(method_chunk) return chunks def _chunk_function( self, func: CodeSymbol, parsed_file: ParsedFile, imports: List[str], parent_class: Optional[str] ) -> Optional[CodeChunk]: """Chunk a function definition""" token_count = self._estimate_tokens(func.text) # Skip very small functions (less than min threshold) if token_count < self.TARGET_MIN_TOKENS: # Could merge with neighbors, but for now just skip # TODO: Implement merging strategy pass return CodeChunk( chunk_id=self._generate_chunk_id(), file_path=parsed_file.file_path, repo_name=self.repo_name, language=parsed_file.language.value, chunk_type="function", code_text=func.text, start_line=func.start_line, end_line=func.end_line, symbol_name=func.name, parent_class=parent_class, imports=imports, context_text=self._build_context( parsed_file.file_path, func.name, parent_class, func.text ), token_count=token_count ) def _chunk_entire_file( self, parsed_file: ParsedFile, imports: List[str] ) -> CodeChunk: """Create a chunk from entire file (when no functions/classes found)""" token_count = self._estimate_tokens(parsed_file.raw_code) # Truncate if too large code_text = parsed_file.raw_code if token_count > self.TARGET_MAX_TOKENS: target_chars = self.TARGET_MAX_TOKENS * self.CHARS_PER_TOKEN code_text = code_text[:target_chars] + "\n... 
(truncated)" token_count = self.TARGET_MAX_TOKENS return CodeChunk( chunk_id=self._generate_chunk_id(), file_path=parsed_file.file_path, repo_name=self.repo_name, language=parsed_file.language.value, chunk_type="file", code_text=code_text, start_line=0, end_line=len(parsed_file.raw_code.splitlines()), symbol_name=None, parent_class=None, imports=imports, context_text=self._build_context( parsed_file.file_path, None, None, code_text ), token_count=token_count ) def _extract_methods_from_class( self, cls: CodeSymbol, parsed_file: ParsedFile ) -> List[CodeSymbol]: """Extract methods that belong to a class""" methods = [] for func in parsed_file.functions: # Check if function is within class boundaries if cls.start_byte <= func.start_byte < cls.end_byte: methods.append(func) return methods def _is_function_in_class( self, func: CodeSymbol, classes: List[CodeSymbol] ) -> bool: """Check if a function is inside a class""" for cls in classes: if cls.start_byte <= func.start_byte < cls.end_byte: return True return False def _estimate_tokens(self, text: str) -> int: """Estimate token count from text""" return len(text) // self.CHARS_PER_TOKEN def _generate_chunk_id(self) -> str: """Generate unique chunk ID""" self.chunk_counter += 1 return f"{self.repo_name}:chunk:{self.chunk_counter}" def _extract_class_header( self, cls: CodeSymbol, parsed_file: ParsedFile ) -> Optional[str]: """ Extract class header/definition (first ~50 lines or up to first method) This captures the class declaration, includes, inheritance, and attributes but excludes method definitions for large classes. 
""" lines = cls.text.splitlines() # Take first 50 lines or up to TARGET_MAX_TOKENS, whichever is smaller max_lines = 50 header_lines = [] for i, line in enumerate(lines[:max_lines]): # Stop if we estimate we've hit the token limit current_text = "\n".join(header_lines + [line]) if self._estimate_tokens(current_text) > self.TARGET_MAX_TOKENS: break header_lines.append(line) if header_lines: return "\n".join(header_lines) return None def _build_context( self, file_path: str, symbol_name: Optional[str], parent_class: Optional[str], code_text: str ) -> str: """ Build enhanced context text for better retrieval Format: "File: {path} | Symbol: {name} | Class: {class}\n{code}" """ parts = [f"File: {file_path}"] if parent_class: parts.append(f"Class: {parent_class}") if symbol_name: if parent_class: parts.append(f"Method: {symbol_name}") else: parts.append(f"Function: {symbol_name}") context_header = " | ".join(parts) return f"{context_header}\n\n{code_text}"

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gkatechis/mcpIndexer'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.