Local DeepWiki MCP Server

Overview Schema Related Servers Score Discussions

chunker.py•34 KiB

"""AST-based code chunking for semantic extraction.""" import hashlib from pathlib import Path from typing import Any, Iterator from tree_sitter import Node from local_deepwiki.config import ChunkingConfig, get_config from local_deepwiki.core.parser import ( CodeParser, find_nodes_by_type, get_docstring, get_node_name, get_node_text, ) from local_deepwiki.logging import get_logger from local_deepwiki.models import ChunkType, CodeChunk, Language from local_deepwiki.plugins.registry import get_plugin_registry logger = get_logger(__name__) # Node types that represent extractable code units per language FUNCTION_NODE_TYPES: dict[Language, set[str]] = { Language.PYTHON: {"function_definition", "async_function_definition"}, Language.JAVASCRIPT: { "function_declaration", "arrow_function", "function_expression", "method_definition", }, Language.TYPESCRIPT: { "function_declaration", "arrow_function", "function_expression", "method_definition", }, Language.GO: {"function_declaration", "method_declaration"}, Language.RUST: {"function_item"}, Language.JAVA: {"method_declaration", "constructor_declaration"}, Language.C: {"function_definition"}, Language.CPP: {"function_definition"}, Language.SWIFT: {"function_declaration", "init_declaration"}, Language.RUBY: {"method", "singleton_method"}, Language.PHP: {"function_definition", "method_declaration"}, Language.KOTLIN: {"function_declaration"}, Language.CSHARP: {"method_declaration", "constructor_declaration"}, } CLASS_NODE_TYPES: dict[Language, set[str]] = { Language.PYTHON: {"class_definition"}, Language.JAVASCRIPT: {"class_declaration"}, Language.TYPESCRIPT: {"class_declaration", "interface_declaration", "type_alias_declaration"}, Language.GO: {"type_declaration"}, Language.RUST: {"struct_item", "impl_item", "trait_item", "enum_item"}, Language.JAVA: {"class_declaration", "interface_declaration", "enum_declaration"}, Language.C: {"struct_specifier"}, Language.CPP: {"class_specifier", "struct_specifier"}, Language.SWIFT: { 
"class_declaration", "struct_declaration", "protocol_declaration", "enum_declaration", "extension_declaration", }, Language.RUBY: {"class", "module"}, Language.PHP: {"class_declaration", "interface_declaration", "trait_declaration"}, Language.KOTLIN: {"class_declaration", "object_declaration"}, Language.CSHARP: { "class_declaration", "struct_declaration", "interface_declaration", "enum_declaration", }, } IMPORT_NODE_TYPES: dict[Language, set[str]] = { Language.PYTHON: {"import_statement", "import_from_statement"}, Language.JAVASCRIPT: {"import_statement", "import_declaration"}, Language.TYPESCRIPT: {"import_statement", "import_declaration"}, Language.GO: {"import_declaration"}, Language.RUST: {"use_declaration"}, Language.JAVA: {"import_declaration"}, Language.C: {"preproc_include"}, Language.CPP: {"preproc_include"}, Language.SWIFT: {"import_declaration"}, Language.RUBY: {"call"}, # require/require_relative are method calls in Ruby AST Language.PHP: {"namespace_use_declaration"}, # use statements in PHP Language.KOTLIN: {"import_header"}, Language.CSHARP: {"using_directive"}, } def get_parent_classes(class_node: Node, source: bytes, language: Language) -> list[str]: """Extract parent class names from a class definition. Args: class_node: The class AST node. source: Source bytes. language: Programming language. Returns: List of parent class names. 
""" parents = [] if language == Language.PYTHON: # Python: class Child(Parent, Mixin): → argument_list > identifier for child in class_node.children: if child.type == "argument_list": for arg in child.children: if arg.type == "identifier": parents.append(get_node_text(arg, source)) elif language in (Language.TYPESCRIPT, Language.JAVASCRIPT): # TS/JS: class Child extends Parent implements Interface for child in class_node.children: if child.type == "class_heritage": for clause in child.children: if clause.type in ("extends_clause", "implements_clause"): for item in clause.children: if item.type in ("identifier", "type_identifier"): parents.append(get_node_text(item, source)) elif language == Language.JAVA: # Java: class Child extends Parent implements Interface for child in class_node.children: if child.type == "superclass": for item in child.children: if item.type == "type_identifier": parents.append(get_node_text(item, source)) elif child.type == "super_interfaces": for item in find_nodes_by_type(child, {"type_identifier"}): parents.append(get_node_text(item, source)) elif language == Language.SWIFT: # Swift: class Child: Parent, Protocol for child in class_node.children: if child.type == "type_inheritance_clause": for item in child.children: if item.type in ("user_type", "type_identifier"): # Get the identifier from user_type text = get_node_text(item, source) if text and text not in (":", ","): parents.append(text) elif language == Language.CPP: # C++: class Child : public Parent for child in class_node.children: if child.type == "base_class_clause": for item in find_nodes_by_type(child, {"type_identifier"}): parents.append(get_node_text(item, source)) elif language == Language.RUBY: # Ruby: class Child < Parent for child in class_node.children: if child.type == "superclass": for sc in child.children: if sc.type == "constant" or sc.type == "scope_resolution": parents.append(get_node_text(sc, source)) elif language == Language.PHP: # PHP: class Child extends 
Parent implements Interface1, Interface2 for child in class_node.children: if child.type == "base_clause": # extends clause for item in find_nodes_by_type(child, {"name", "qualified_name"}): parents.append(get_node_text(item, source)) elif child.type == "class_interface_clause": # implements clause for item in find_nodes_by_type(child, {"name", "qualified_name"}): parents.append(get_node_text(item, source)) elif language == Language.KOTLIN: # Kotlin: class Child : Parent(), Interface1, Interface2 for child in class_node.children: if child.type == "delegation_specifiers": for spec in child.children: if spec.type == "delegation_specifier": for item in find_nodes_by_type(spec, {"user_type", "simple_identifier"}): text = get_node_text(item, source) if text and text not in (":", ","): parents.append(text) break # Only get the type name, not nested parts elif language == Language.CSHARP: # C#: class Child : Parent, IInterface1, IInterface2 for child in class_node.children: if child.type == "base_list": for item in find_nodes_by_type( child, {"identifier", "generic_name", "qualified_name"} ): text = get_node_text(item, source) if text: parents.append(text) return parents def extract_python_parameter_types(func_node: Node, source: bytes) -> dict[str, str | None]: """Extract parameter types from a Python function. Args: func_node: The function_definition AST node. source: Source code bytes. Returns: Dictionary mapping parameter names to their type hints. 
""" param_types: dict[str, str | None] = {} params_node = func_node.child_by_field_name("parameters") if not params_node: return param_types for child in params_node.children: if child.type == "identifier": # Simple parameter without type hint name = get_node_text(child, source) if name not in ("self", "cls"): param_types[name] = None elif child.type == "typed_parameter": # Parameter with type hint: name: type # Or typed *args: *args: type, typed **kwargs: **kwargs: type name_node = None type_node = None splat_pattern = None for c in child.children: if c.type == "identifier": name_node = c elif c.type == "type": type_node = c elif c.type == "list_splat_pattern": splat_pattern = c for sc in c.children: if sc.type == "identifier": name_node = sc break elif c.type == "dictionary_splat_pattern": splat_pattern = c for sc in c.children: if sc.type == "identifier": name_node = sc break if name_node: name = get_node_text(name_node, source) if name not in ("self", "cls"): type_hint = get_node_text(type_node, source) if type_node else None # Add prefix for splat patterns if splat_pattern: prefix = "*" if splat_pattern.type == "list_splat_pattern" else "**" name = f"{prefix}{name}" param_types[name] = type_hint elif child.type == "default_parameter": # Parameter with default: name = value name_node = child.child_by_field_name("name") if name_node: name = get_node_text(name_node, source) if name not in ("self", "cls"): param_types[name] = None elif child.type == "typed_default_parameter": # Parameter with type and default: name: type = value name_node = child.child_by_field_name("name") type_node = child.child_by_field_name("type") if name_node: name = get_node_text(name_node, source) if name not in ("self", "cls"): type_hint = get_node_text(type_node, source) if type_node else None param_types[name] = type_hint elif child.type in ("list_splat_pattern", "dictionary_splat_pattern"): # *args or **kwargs for c in child.children: if c.type == "identifier": name = get_node_text(c, 
source) prefix = "*" if child.type == "list_splat_pattern" else "**" param_types[f"{prefix}{name}"] = None break elif c.type == "typed_parameter": # *args: type or **kwargs: type inner_name = None inner_type = None for tc in c.children: if tc.type == "identifier": inner_name = tc elif tc.type == "type": inner_type = tc if inner_name: name = get_node_text(inner_name, source) prefix = "*" if child.type == "list_splat_pattern" else "**" type_hint = get_node_text(inner_type, source) if inner_type else None param_types[f"{prefix}{name}"] = type_hint break return param_types def extract_python_parameter_defaults(func_node: Node, source: bytes) -> dict[str, str]: """Extract parameter default values from a Python function. Args: func_node: The function_definition AST node. source: Source code bytes. Returns: Dictionary mapping parameter names to their default values. """ defaults: dict[str, str] = {} params_node = func_node.child_by_field_name("parameters") if not params_node: return defaults for child in params_node.children: if child.type == "default_parameter": name_node = child.child_by_field_name("name") value_node = child.child_by_field_name("value") if name_node and value_node: name = get_node_text(name_node, source) if name not in ("self", "cls"): defaults[name] = get_node_text(value_node, source) elif child.type == "typed_default_parameter": name_node = child.child_by_field_name("name") value_node = child.child_by_field_name("value") if name_node and value_node: name = get_node_text(name_node, source) if name not in ("self", "cls"): defaults[name] = get_node_text(value_node, source) return defaults def extract_python_return_type(func_node: Node, source: bytes) -> str | None: """Extract return type annotation from a Python function. Args: func_node: The function_definition AST node. source: Source code bytes. Returns: Return type string or None. 
""" return_type_node = func_node.child_by_field_name("return_type") if return_type_node: return get_node_text(return_type_node, source) return None def extract_python_decorators(func_node: Node, source: bytes) -> list[str]: """Extract decorators from a Python function. Args: func_node: The function_definition AST node. source: Source code bytes. Returns: List of decorator strings. """ decorators: list[str] = [] if func_node.parent: prev_sibling = func_node.prev_sibling while prev_sibling: if prev_sibling.type == "decorator": dec_text = get_node_text(prev_sibling, source) decorators.insert(0, dec_text) elif prev_sibling.type not in ("comment", "decorator"): break prev_sibling = prev_sibling.prev_sibling return decorators def is_async_function(func_node: Node) -> bool: """Check if a function is async. Args: func_node: The function AST node. Returns: True if the function is async. """ return func_node.type == "async_function_definition" or any( c.type == "async" for c in func_node.children ) def extract_python_raised_exceptions(func_node: Node, source: bytes) -> list[str]: """Extract exception types raised by a Python function. Finds all `raise` statements within the function and extracts the exception type being raised. Args: func_node: The function_definition AST node. source: Source code bytes. Returns: List of unique exception type names raised by the function. 
""" exceptions: set[str] = set() def find_raise_statements(node: Node) -> None: """Recursively find raise statements in the AST.""" if node.type == "raise_statement": # Extract the exception type for child in node.children: if child.type == "identifier": # Direct raise like: raise ValueError exc_name = get_node_text(child, source) if exc_name and exc_name != "raise": exceptions.add(exc_name) break elif child.type == "call": # Raise with call like: raise ValueError("msg") for call_child in child.children: if call_child.type == "identifier": exc_name = get_node_text(call_child, source) if exc_name: exceptions.add(exc_name) break elif call_child.type == "attribute": # Handle module.Exception like: raise errors.CustomError exc_name = get_node_text(call_child, source) if exc_name: exceptions.add(exc_name) break break # Recurse into child nodes (but not into nested functions) for child in node.children: if child.type not in ("function_definition", "async_function_definition"): find_raise_statements(child) # Start searching from the function body for child in func_node.children: if child.type == "block": find_raise_statements(child) return sorted(exceptions) def extract_function_type_metadata( func_node: Node, source: bytes, language: Language ) -> dict[str, Any]: """Extract type annotation metadata from a function node. Args: func_node: The function AST node. source: Source code bytes. language: Programming language. Returns: Metadata dictionary with type information. 
""" metadata: dict[str, Any] = {} if language == Language.PYTHON: # Extract parameter types param_types = extract_python_parameter_types(func_node, source) # Only include parameters that have type hints typed_params = {k: v for k, v in param_types.items() if v is not None} if typed_params: metadata["parameter_types"] = typed_params # Extract parameter defaults param_defaults = extract_python_parameter_defaults(func_node, source) if param_defaults: metadata["parameter_defaults"] = param_defaults # Extract return type return_type = extract_python_return_type(func_node, source) if return_type: metadata["return_type"] = return_type # Extract decorators decorators = extract_python_decorators(func_node, source) if decorators: metadata["decorators"] = decorators # Check if async if is_async_function(func_node): metadata["is_async"] = True # Extract raised exceptions raised_exceptions = extract_python_raised_exceptions(func_node, source) if raised_exceptions: metadata["raises"] = raised_exceptions # TODO: Add support for other languages (TypeScript, Java, etc.) return metadata class CodeChunker: """Extract semantic code chunks from source files using AST analysis.""" def __init__(self, config: ChunkingConfig | None = None): """Initialize the chunker. Args: config: Optional chunking configuration. """ base_config = config or get_config().chunking # Store a defensive copy to prevent external mutation self.config = base_config.model_copy(deep=True) self.parser = CodeParser() def chunk_file(self, file_path: Path, repo_root: Path) -> Iterator[CodeChunk]: """Extract code chunks from a source file. Checks for registered language parser plugins first. If a plugin handles the file extension, uses the plugin's parse_file method. Otherwise falls back to the built-in tree-sitter parser. Args: file_path: Path to the source file. repo_root: Root directory of the repository. Yields: CodeChunk objects for each semantic unit found. 
""" # Check for plugin parser first registry = get_plugin_registry() plugin_parser = registry.get_parser_for_extension(file_path.suffix) if plugin_parser is not None: # Use plugin parser - it returns CodeChunk objects directly logger.debug(f"Using plugin parser '{plugin_parser.language_name}' for {file_path.name}") try: source = file_path.read_bytes() chunks = plugin_parser.parse_file(file_path, source) yield from chunks return except Exception as e: logger.warning(f"Plugin parser failed for {file_path}: {e}, falling back to built-in") # Fall back to built-in tree-sitter parser result = self.parser.parse_file(file_path) if result is None: logger.debug(f"Skipping unsupported file: {file_path}") return root, language, source = result rel_path = str(file_path.relative_to(repo_root)) logger.debug(f"Chunking {rel_path} ({language.value})") # Extract module-level chunk (file overview) yield self._create_module_chunk(root, source, language, rel_path) # Extract imports import_types = IMPORT_NODE_TYPES.get(language, set()) import_nodes = find_nodes_by_type(root, import_types) if import_nodes: yield self._create_imports_chunk(import_nodes, source, language, rel_path) # Extract classes and their methods class_types = CLASS_NODE_TYPES.get(language, set()) for class_node in find_nodes_by_type(root, class_types): yield from self._extract_class_chunks(class_node, source, language, rel_path) # Extract top-level functions (not inside classes) function_types = FUNCTION_NODE_TYPES.get(language, set()) for func_node in find_nodes_by_type(root, function_types): # Skip if inside a class (already processed) if not self._is_inside_class(func_node, class_types): yield self._create_function_chunk(func_node, source, language, rel_path) def _create_module_chunk( self, root: Node, source: bytes, language: Language, file_path: str, ) -> CodeChunk: """Create a chunk for the module/file overview. Args: root: AST root node. source: Source bytes. language: Programming language. 
file_path: Relative file path. Returns: A CodeChunk for the module. """ # Get module docstring if present docstring = None if language == Language.PYTHON: # Python module docstring is first expression if root.children and root.children[0].type == "expression_statement": expr = root.children[0] if expr.children and expr.children[0].type == "string": docstring = get_node_text(expr.children[0], source) if docstring.startswith('"""') or docstring.startswith("'''"): docstring = docstring[3:-3].strip() # Create a summary of the file structure content = self._create_file_summary(root, source, language) chunk_id = self._generate_id(file_path, "module", 0) return CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.MODULE, name=Path(file_path).stem, content=content, start_line=1, end_line=source.count(b"\n") + 1, docstring=docstring, metadata={"is_overview": True}, ) def _create_file_summary(self, root: Node, source: bytes, language: Language) -> str: """Create a summary of file structure for the module chunk. Args: root: AST root node. source: Source bytes. language: Programming language. Returns: A summary string of file contents. """ parts = [] # List imports import_types = IMPORT_NODE_TYPES.get(language, set()) imports = find_nodes_by_type(root, import_types) if imports: import_text = "\n".join(get_node_text(n, source) for n in imports[:10]) if len(imports) > 10: import_text += f"\n# ... 
and {len(imports) - 10} more imports" parts.append(f"# Imports:\n{import_text}") # List classes class_types = CLASS_NODE_TYPES.get(language, set()) classes = find_nodes_by_type(root, class_types) if classes: class_names = [get_node_name(c, source, language) or "anonymous" for c in classes] parts.append(f"# Classes: {', '.join(class_names)}") # List functions function_types = FUNCTION_NODE_TYPES.get(language, set()) functions = [ f for f in find_nodes_by_type(root, function_types) if not self._is_inside_class(f, class_types) ] if functions: func_names = [get_node_name(f, source, language) or "anonymous" for f in functions] parts.append(f"# Functions: {', '.join(func_names)}") return "\n\n".join(parts) if parts else "# Empty file" def _create_imports_chunk( self, import_nodes: list[Node], source: bytes, language: Language, file_path: str, ) -> CodeChunk: """Create a chunk for import statements. Args: import_nodes: List of import nodes. source: Source bytes. language: Programming language. file_path: Relative file path. Returns: A CodeChunk for imports. """ content = "\n".join(get_node_text(n, source) for n in import_nodes) start_line = min(n.start_point[0] + 1 for n in import_nodes) end_line = max(n.end_point[0] + 1 for n in import_nodes) chunk_id = self._generate_id(file_path, "imports", start_line) return CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.IMPORT, name="imports", content=content, start_line=start_line, end_line=end_line, metadata={"import_count": len(import_nodes)}, ) def _extract_class_chunks( self, class_node: Node, source: bytes, language: Language, file_path: str, ) -> Iterator[CodeChunk]: """Extract chunks from a class definition. Args: class_node: The class AST node. source: Source bytes. language: Programming language. file_path: Relative file path. Yields: CodeChunks for the class and its methods. 
""" class_name = get_node_name(class_node, source, language) or "anonymous" docstring = get_docstring(class_node, source, language) content = get_node_text(class_node, source) # Extract parent classes for inheritance parent_classes = get_parent_classes(class_node, source, language) # Check if class is too large and needs to be split lines = content.count("\n") + 1 if lines > self.config.class_split_threshold: # For large classes, create a summary chunk and method chunks yield self._create_class_summary_chunk( class_node, source, language, file_path, class_name, docstring, parent_classes ) # Extract methods separately function_types = FUNCTION_NODE_TYPES.get(language, set()) for method_node in find_nodes_by_type(class_node, function_types): yield self._create_method_chunk( method_node, source, language, file_path, class_name ) else: # Small class - include everything in one chunk chunk_id = self._generate_id( file_path, f"class_{class_name}", class_node.start_point[0] ) metadata: dict[str, int | list[str]] = {"line_count": lines} if parent_classes: metadata["parent_classes"] = parent_classes yield CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.CLASS, name=class_name, content=content, start_line=class_node.start_point[0] + 1, end_line=class_node.end_point[0] + 1, docstring=docstring, metadata=metadata, ) def _create_class_summary_chunk( self, class_node: Node, source: bytes, language: Language, file_path: str, class_name: str, docstring: str | None, parent_classes: list[str] | None = None, ) -> CodeChunk: """Create a summary chunk for a large class. Args: class_node: The class AST node. source: Source bytes. language: Programming language. file_path: Relative file path. class_name: Name of the class. docstring: Class docstring if any. parent_classes: List of parent class names. Returns: A summary CodeChunk for the class. 
""" # Get class signature and method list function_types = FUNCTION_NODE_TYPES.get(language, set()) methods = find_nodes_by_type(class_node, function_types) method_names = [get_node_name(m, source, language) or "anonymous" for m in methods] # Build summary content signature_end = class_node.start_byte for child in class_node.children: if child.type in ("block", "class_body", "declaration_list"): signature_end = child.start_byte break signature = ( source[class_node.start_byte : signature_end].decode("utf-8", errors="replace").strip() ) content = f"{signature}\n # Methods: {', '.join(method_names)}" chunk_id = self._generate_id(file_path, f"class_{class_name}", class_node.start_point[0]) metadata: dict[str, bool | int | list[str]] = { "is_summary": True, "method_count": len(methods), } if parent_classes: metadata["parent_classes"] = parent_classes return CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.CLASS, name=class_name, content=content, start_line=class_node.start_point[0] + 1, end_line=class_node.end_point[0] + 1, docstring=docstring, metadata=metadata, ) def _create_method_chunk( self, method_node: Node, source: bytes, language: Language, file_path: str, class_name: str, ) -> CodeChunk: """Create a chunk for a class method. Args: method_node: The method AST node. source: Source bytes. language: Programming language. file_path: Relative file path. class_name: Name of the parent class. Returns: A CodeChunk for the method. 
""" method_name = get_node_name(method_node, source, language) or "anonymous" content = get_node_text(method_node, source) docstring = get_docstring(method_node, source, language) # Extract type annotation metadata metadata = extract_function_type_metadata(method_node, source, language) chunk_id = self._generate_id( file_path, f"{class_name}.{method_name}", method_node.start_point[0] ) return CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.METHOD, name=method_name, content=content, start_line=method_node.start_point[0] + 1, end_line=method_node.end_point[0] + 1, docstring=docstring, parent_name=class_name, metadata=metadata, ) def _create_function_chunk( self, func_node: Node, source: bytes, language: Language, file_path: str, ) -> CodeChunk: """Create a chunk for a top-level function. Args: func_node: The function AST node. source: Source bytes. language: Programming language. file_path: Relative file path. Returns: A CodeChunk for the function. """ func_name = get_node_name(func_node, source, language) or "anonymous" content = get_node_text(func_node, source) docstring = get_docstring(func_node, source, language) # Extract type annotation metadata metadata = extract_function_type_metadata(func_node, source, language) chunk_id = self._generate_id(file_path, f"func_{func_name}", func_node.start_point[0]) return CodeChunk( id=chunk_id, file_path=file_path, language=language, chunk_type=ChunkType.FUNCTION, name=func_name, content=content, start_line=func_node.start_point[0] + 1, end_line=func_node.end_point[0] + 1, docstring=docstring, metadata=metadata, ) def _is_inside_class(self, node: Node, class_types: set[str]) -> bool: """Check if a node is inside a class definition. Args: node: The node to check. class_types: Set of class node type names. Returns: True if the node is inside a class. 
""" parent = node.parent while parent: if parent.type in class_types: return True parent = parent.parent return False def _generate_id(self, file_path: str, name: str, line: int) -> str: """Generate a unique chunk ID. Args: file_path: File path. name: Chunk name. line: Line number. Returns: A unique ID string. """ key = f"{file_path}:{name}:{line}" return hashlib.sha256(key.encode()).hexdigest()[:16]

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/UrbanDiver/local-deepwiki-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

chunker.py•34 KiB