Skip to main content
Glama
johannhartmann

MCP Code Analysis Server

test_chunking.py (17.6 kB)
"""Tests for text chunking strategies.""" from collections.abc import Generator from typing import Any from unittest.mock import MagicMock, patch import pytest from src.indexer.chunking import MAX_MODULE_DOCSTRING_SEARCH_LINES, CodeChunker @pytest.fixture def mock_settings() -> Generator[MagicMock, None, None]: """Mock settings for testing.""" with patch("src.indexer.chunking.settings") as mock: mock.parser.chunk_size = 100 mock.embeddings.max_tokens = 8000 yield mock @pytest.fixture def code_chunker(mock_settings: MagicMock) -> CodeChunker: """Create CodeChunker instance with mocked settings.""" return CodeChunker() def test_code_chunker_initialization(mock_settings: MagicMock) -> None: """Test CodeChunker initialization.""" chunker = CodeChunker() assert chunker.chunk_size == 100 assert chunker.max_tokens == 8000 def test_chunk_by_entity_functions(code_chunker: CodeChunker) -> None: """Test chunking by function entities.""" entities: dict[str, list[dict[str, Any]]] = { "functions": [ { "name": "test_func", "start_line": 1, "end_line": 5, "parameters": [{"name": "x", "type": "int"}], "return_type": "int", "docstring": "Test function", "is_async": False, "is_generator": False, }, { "name": "async_func", "start_line": 7, "end_line": 10, "parameters": [], "is_async": True, }, ], "classes": [], "imports": [], } file_content = """def test_func(x: int) -> int: '''Test function''' result = x * 2 return result async def async_func(): '''Async function''' await asyncio.sleep(1) return True """ chunks = code_chunker.chunk_by_entity(entities, file_content) assert len(chunks) == 2 # Check first function chunk chunk1 = chunks[0] assert chunk1["type"] == "function" assert "def test_func" in chunk1["content"] assert chunk1["metadata"]["entity_name"] == "test_func" assert chunk1["metadata"]["parameters"] == [{"name": "x", "type": "int"}] assert chunk1["metadata"]["return_type"] == "int" assert chunk1["metadata"]["is_async"] is False # Check second function chunk chunk2 = 
chunks[1] assert chunk2["type"] == "function" assert "async def async_func" in chunk2["content"] assert chunk2["metadata"]["is_async"] is True def test_chunk_by_entity_classes(code_chunker: CodeChunker) -> None: """Test chunking by class entities.""" entities: dict[str, list[dict[str, Any]]] = { "classes": [ { "name": "TestClass", "start_line": 1, "end_line": 10, "base_classes": ["BaseClass"], "docstring": "Test class", "methods": [ {"name": "method1", "start_line": 5, "end_line": 7}, {"name": "method2", "start_line": 8, "end_line": 10}, ], "is_abstract": False, }, ], "functions": [], } file_content = """class TestClass(BaseClass): '''Test class''' def method1(self): '''Method 1''' pass def method2(self): '''Method 2''' return 42 """ chunks = code_chunker.chunk_by_entity(entities, file_content) assert len(chunks) == 1 chunk = chunks[0] assert chunk["type"] == "class" assert "class TestClass" in chunk["content"] assert chunk["metadata"]["entity_name"] == "TestClass" assert chunk["metadata"]["base_classes"] == ["BaseClass"] assert chunk["metadata"]["method_count"] == 2 assert chunk["metadata"]["is_abstract"] is False def test_chunk_by_entity_large_class(code_chunker: CodeChunker) -> None: """Test chunking large class creates separate method chunks.""" # Create a large class methods: list[dict[str, Any]] = [] method_lines = [] for i in range(20): methods.append( { "name": f"method{i}", "start_line": i * 6 + 3, "end_line": i * 6 + 8, } ) method_lines.extend( [ f" def method{i}(self):", f" '''Method {i}'''", " # Some implementation", " result = self.process()", " return result", "", ] ) entities = { "classes": [ { "name": "LargeClass", "start_line": 1, "end_line": 130, # Large class "base_classes": [], "methods": methods, }, ], } file_content = "class LargeClass:\n '''Large class'''\n" + "\n".join( method_lines ) chunks = code_chunker.chunk_by_entity(entities, file_content) # Should have 1 class chunk + individual method chunks assert len(chunks) > 1 # First chunk 
should be the entire class assert chunks[0]["type"] == "class" assert chunks[0]["metadata"]["entity_name"] == "LargeClass" # Subsequent chunks should be methods method_chunks = [c for c in chunks[1:] if c["type"] == "method"] assert len(method_chunks) == len(methods) # Check first method chunk if method_chunks: assert method_chunks[0]["type"] == "method" assert method_chunks[0]["metadata"]["parent_class"] == "LargeClass" def test_chunk_by_entity_module(code_chunker: CodeChunker) -> None: """Test chunking module-level code.""" entities: dict[str, list[dict[str, Any]]] = { "imports": [ {"module": "os"}, {"module": "sys"}, ], "classes": [{"name": "MyClass"}], "functions": [{"name": "my_func"}], } file_content = '''""" Module docstring explaining the purpose. """ import os import sys from typing import List # Module constants DEFAULT_VALUE = 42 def my_func(): pass class MyClass: pass ''' chunks = code_chunker.chunk_by_entity(entities, file_content) # Should have a module chunk module_chunks = [c for c in chunks if c["type"] == "module"] assert len(module_chunks) == 1 module_chunk = module_chunks[0] assert '"""' in module_chunk["content"] assert "import os" in module_chunk["content"] assert module_chunk["metadata"]["import_count"] == 2 assert module_chunk["metadata"]["class_count"] == 1 assert module_chunk["metadata"]["function_count"] == 1 def test_chunk_by_entity_no_module_docstring(code_chunker: CodeChunker) -> None: """Test module chunk when no clear module section.""" entities: dict[str, list[dict[str, Any]]] = { "imports": [], "classes": [], "functions": [], } file_content = """def first_function(): pass class FirstClass: pass """ chunks = code_chunker.chunk_by_entity(entities, file_content) # Should not create module chunk if no clear module section module_chunks = [c for c in chunks if c["type"] == "module"] assert len(module_chunks) == 0 def test_chunk_by_entity_with_context(code_chunker: CodeChunker) -> None: """Test that context lines are included.""" 
entities = { "functions": [ { "name": "process_data", "start_line": 5, "end_line": 7, }, ], } file_content = """# File header import numpy as np # Important function def process_data(data): '''Process the data''' return np.array(data) # Another section def other_func(): pass """ chunks = code_chunker.chunk_by_entity(entities, file_content) chunk = chunks[0] # Should include context before (3 lines) and after (1 line) assert "import numpy" in chunk["content"] assert "# Important function" in chunk["content"] assert "# Another section" in chunk["content"] def test_chunk_by_lines(code_chunker: CodeChunker) -> None: """Test chunking by fixed line count.""" file_content = "\n".join([f"Line {i}" for i in range(200)]) chunks = code_chunker.chunk_by_lines(file_content, overlap=20) # Check chunks were created assert len(chunks) > 1 # Check first chunk first_chunk = chunks[0] assert first_chunk["type"] == "lines" assert first_chunk["start_line"] == 1 assert first_chunk["metadata"]["has_overlap"] is False # Check second chunk has overlap if len(chunks) > 1: second_chunk = chunks[1] assert second_chunk["type"] == "lines" assert second_chunk["metadata"]["has_overlap"] is True # Verify overlap exists first_lines = first_chunk["content"].split("\n") second_lines = second_chunk["content"].split("\n") # Last 20 lines of first chunk should match first 20 of second assert len(set(first_lines[-20:]) & set(second_lines[:20])) > 0 def test_chunk_by_lines_small_file(code_chunker: CodeChunker) -> None: """Test chunking small file that fits in one chunk.""" file_content = "\n".join([f"Line {i}" for i in range(50)]) chunks = code_chunker.chunk_by_lines(file_content) assert len(chunks) == 1 assert chunks[0]["start_line"] == 1 assert chunks[0]["end_line"] == 50 def test_merge_small_chunks(code_chunker: CodeChunker) -> None: """Test merging small chunks.""" chunks = [ { "type": "function", "content": "def f1(): pass", "start_line": 1, "end_line": 1, "metadata": {"entity_name": "f1"}, }, { 
"type": "function", "content": "def f2(): return 1", "start_line": 3, "end_line": 3, "metadata": {"entity_name": "f2"}, }, { "type": "class", "content": "class LargeClass:\n" + "\n".join([" def method(): pass"] * 20), "start_line": 5, "end_line": 30, "metadata": {"entity_name": "LargeClass"}, }, { "type": "method", "content": "def small(): pass", "start_line": 32, "end_line": 32, "metadata": {"entity_name": "small"}, }, ] merged = code_chunker.merge_small_chunks(chunks, min_size=10) # Small functions should be merged assert len(merged) < len(chunks) # Large class should remain separate class_chunks = [ c for c in merged if c["metadata"].get("entity_name") == "LargeClass" ] assert len(class_chunks) == 1 # Check merged chunk merged_chunks = [c for c in merged if c["type"] == "merged"] if merged_chunks: merged_chunk = merged_chunks[0] assert "f1" in str(merged_chunk["metadata"]["merged_entities"]) assert "f2" in str(merged_chunk["metadata"]["merged_entities"]) def test_merge_small_chunks_all_large(code_chunker: CodeChunker) -> None: """Test merge when all chunks are large.""" chunks = [ { "type": "function", "content": "def large_func():\n" + "\n".join([" # code"] * 20), "start_line": 1, "end_line": 25, "metadata": {"entity_name": "large_func"}, }, { "type": "class", "content": "class LargeClass:\n" + "\n".join([" pass"] * 20), "start_line": 30, "end_line": 55, "metadata": {"entity_name": "LargeClass"}, }, ] merged = code_chunker.merge_small_chunks(chunks, min_size=10) # No merging should occur assert len(merged) == len(chunks) assert all(c["type"] != "merged" for c in merged) def test_merge_chunks_function(code_chunker: CodeChunker) -> None: """Test the merge operation between two chunks.""" chunk1 = { "type": "function", "content": "def func1():\n return 1", "start_line": 1, "end_line": 2, "metadata": {"entity_name": "func1"}, } chunk2 = { "type": "function", "content": "def func2():\n return 2", "start_line": 4, "end_line": 5, "metadata": {"entity_name": "func2"}, 
} merged = code_chunker._merge_chunks(chunk1, chunk2) assert merged["type"] == "merged" assert merged["start_line"] == 1 assert merged["end_line"] == 5 assert "func1" in merged["content"] assert "func2" in merged["content"] assert merged["metadata"]["merged_types"] == ["function", "function"] assert merged["metadata"]["merged_entities"] == ["func1", "func2"] def test_create_module_chunk_edge_cases(code_chunker: CodeChunker) -> None: """Test module chunk creation edge cases.""" # Test with imports but no docstring entities: dict[str, list[dict[str, Any]]] = {"imports": [{"module": "os"}]} file_content = """import os import sys def main(): pass """ chunks = code_chunker.chunk_by_entity(entities, file_content) module_chunks = [c for c in chunks if c["type"] == "module"] if module_chunks: assert "import os" in module_chunks[0]["content"] # Test with very long module section long_imports = "\n".join([f"import module{i}" for i in range(60)]) file_content = f'"""\nLong module\n"""\n{long_imports}\n\ndef func():\n pass' chunks = code_chunker.chunk_by_entity(entities, file_content) module_chunks = [c for c in chunks if c["type"] == "module"] if module_chunks: # Should stop at MAX_MODULE_DOCSTRING_SEARCH_LINES assert module_chunks[0]["end_line"] <= MAX_MODULE_DOCSTRING_SEARCH_LINES def test_entity_chunk_metadata_completeness(code_chunker: CodeChunker) -> None: """Test that entity chunks have complete metadata.""" entities: dict[str, list[dict[str, Any]]] = { "functions": [ { "name": "test_func", "start_line": 1, "end_line": 3, "parameters": [{"name": "x", "type": "int"}], "return_type": "str", "docstring": "Test", "is_async": True, "is_generator": True, }, ], "classes": [ { "name": "TestClass", "start_line": 5, "end_line": 10, "base_classes": ["A", "B"], "docstring": "Class", "methods": [{"name": "m1"}, {"name": "m2"}], "is_abstract": True, }, ], } file_content = """async def test_func(x: int) -> str: '''Test''' yield str(x) class TestClass(A, B): '''Class''' def m1(self): 
pass def m2(self): pass """ chunks = code_chunker.chunk_by_entity(entities, file_content) # Check function metadata func_chunk = next(c for c in chunks if c["type"] == "function") meta = func_chunk["metadata"] assert meta["entity_name"] == "test_func" assert meta["has_docstring"] is True assert meta["parameters"] == [{"name": "x", "type": "int"}] assert meta["return_type"] == "str" assert meta["is_async"] is True assert meta["is_generator"] is True # Check class metadata class_chunk = next(c for c in chunks if c["type"] == "class") meta = class_chunk["metadata"] assert meta["entity_name"] == "TestClass" assert meta["has_docstring"] is True assert meta["base_classes"] == ["A", "B"] assert meta["method_count"] == 2 assert meta["is_abstract"] is True def test_chunk_by_entity_empty_file(code_chunker: CodeChunker) -> None: """Test chunking empty file.""" entities: dict[str, list[dict[str, Any]]] = { "functions": [], "classes": [], "imports": [], } file_content = "" chunks = code_chunker.chunk_by_entity(entities, file_content) assert len(chunks) == 0 def test_chunk_by_entity_no_entities(code_chunker: CodeChunker) -> None: """Test chunking file with no recognized entities.""" entities: dict[str, list[dict[str, Any]]] = { "functions": [], "classes": [], "imports": [], } file_content = """# Just comments and constants CONSTANT = 42 # More comments DATA = [1, 2, 3] """ chunks = code_chunker.chunk_by_entity(entities, file_content) # Might create module chunk or no chunks assert len(chunks) <= 1 def test_performance_large_file(code_chunker: CodeChunker) -> None: """Test performance with large file.""" # Create a large file with many entities entities: dict[str, list[dict[str, Any]]] = { "functions": [ { "name": f"func{i}", "start_line": i * 5 + 1, "end_line": i * 5 + 4, } for i in range(100) ], "classes": [], } lines = [] for i in range(100): lines.extend( [ f"def func{i}():", " '''Docstring'''", " result = process()", " return result", "", ] ) file_content = "\n".join(lines) 
import time start = time.time() chunks = code_chunker.chunk_by_entity(entities, file_content) duration = time.time() - start assert len(chunks) == 100 assert duration < 1.0 # Should complete quickly

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/johannhartmann/mcpcodeanalysis'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.