"""
Comprehensive tests for cAST (Code AST) algorithm implementation.
These tests exercise the real functionality of the cAST chunking system
without mocks, validating algorithm correctness, edge cases, and performance.
"""
import re
import time
from pathlib import Path
from typing import List
import pytest
from chunkhound.core.types.common import ChunkType, Language, FileId
from chunkhound.parsers.parser_factory import get_parser_factory, create_parser_for_language
from chunkhound.interfaces.language_parser import ParseResult
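

# The tests below repeatedly enforce the cAST dual constraint: at most 1200
# non-whitespace characters and roughly 6000 estimated tokens per chunk. The
# helpers here are a convenience sketch that mirrors that arithmetic; the
# limits and the 1.75 chars-per-token factor are taken from the assertions in
# this file, not from the chunker's public API.
MAX_NON_WS_CHARS = 1200
MAX_ESTIMATED_TOKENS = 6000
CHARS_PER_TOKEN_FACTOR = 1.75


def non_ws_length(text: str) -> int:
    """Count non-whitespace characters, the unit the size limit is defined in."""
    return len(re.sub(r'\s', '', text))


def estimated_token_count(text: str) -> float:
    """Rough token estimate used by these tests (non-whitespace chars * 1.75)."""
    return non_ws_length(text) * CHARS_PER_TOKEN_FACTOR
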
class TestCASTAlgorithmCore:
"""Test core cAST algorithm split-then-merge behavior."""
def test_cast_split_then_merge_large_function(self):
"""Test that oversized functions get split recursively then merged optimally."""
# Create a Python function that exceeds max_chunk_size (1200 chars)
large_function = '''
def process_large_data(items):
"""Process a large dataset with multiple operations."""
results = []
for item in items:
if item.is_valid():
processed = item.transform()
if processed.meets_criteria():
enhanced = processed.enhance_with_metadata()
validated = enhanced.validate_business_rules()
results.append(validated)
''' + " # comment line\n" * 100 # Force size over limit
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(large_function)
# Verify splitting occurred
assert len(chunks) > 1, "Large function should be split"
# Verify no chunk exceeds limits
for chunk in chunks:
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
assert non_ws_chars <= 1200, f"Character limit exceeded: {non_ws_chars}"
# Verify estimated tokens under limit
estimated_tokens = non_ws_chars * 1.75
assert estimated_tokens <= 6000, f"Token limit exceeded: {estimated_tokens}"
def test_cast_greedy_merge_small_adjacent(self):
"""Test that small adjacent chunks get merged greedily."""
code = '''
def small_func_1():
return 1
def small_func_2():
return 2
def small_func_3():
return 3
'''
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(code)
# Should merge small adjacent functions
        assert len(chunks) <= 3, "Small adjacent functions should merge into at most 3 chunks"
# Verify content is preserved
all_code = ' '.join(chunk.code for chunk in chunks)
assert "small_func_1" in all_code
assert "small_func_2" in all_code
assert "small_func_3" in all_code
def test_dual_constraint_character_and_token_limits(self):
"""Test both character (1200) and token (6000) limits are enforced."""
        # Create identifier-dense content intended to stress the token limit
        # using lots of long, unique identifiers (high token density)
high_token_density = '''
def function_with_many_unique_identifiers():
variable_name_that_is_extremely_long_and_descriptive = 1
another_incredibly_verbose_variable_name_here = 2
''' + '\n'.join([
f' unique_variable_name_number_{i} = {i}'
for i in range(500) # Each line ~35 chars but ~6 tokens
])
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(high_token_density)
for chunk in chunks:
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
estimated_tokens = non_ws_chars * 1.75
assert estimated_tokens <= 6000, f"Token limit exceeded: {estimated_tokens}"
assert non_ws_chars <= 1200, f"Character limit exceeded: {non_ws_chars}"
def test_emergency_splitting_real_minified_jquery(self):
"""Test emergency splitting using real jQuery minified code."""
# Use real jQuery minified file
fixtures_dir = Path(__file__).parent / "fixtures" / "real_world_files"
jquery_path = fixtures_dir / "jquery-3.7.1.min.js"
if not jquery_path.exists():
pytest.skip("Real jQuery file not available for testing")
# Read first 8000 characters (exceeds limits, single line)
jquery_content = jquery_path.read_text(encoding='utf-8')[:8000]
parser = create_parser_for_language(Language.JAVASCRIPT)
chunks = parser.parse_content(jquery_content)
assert len(chunks) >= 1, "Should handle jQuery minified code"
# Verify all chunks respect size limits (this is the real test)
for i, chunk in enumerate(chunks):
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
assert non_ws_chars <= 1200, f"jQuery chunk {i} exceeds limit: {non_ws_chars}"
        # Verify chunking extracted a meaningful amount of content
total_chunk_content = sum(len(chunk.code) for chunk in chunks)
original_non_ws = len(re.sub(r'\s', '', jquery_content))
chunks_non_ws = sum(len(re.sub(r'\s', '', chunk.code)) for chunk in chunks)
print(f"jQuery test: {len(jquery_content)} chars → {len(chunks)} chunks → {total_chunk_content} chars")
print(f"Non-whitespace: {original_non_ws} → {chunks_non_ws}")
# Should have meaningful chunk extraction (not just empty chunks)
assert total_chunk_content > 1000, f"Too little chunk content: {total_chunk_content}"
assert chunks_non_ws > 500, f"Too little meaningful content: {chunks_non_ws}"
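
    def test_cast_merge_preserves_all_function_names(self):
        """Hedged sketch: merging small siblings should not drop content.

        Re-uses the identifier-preservation pattern from the tests above with
        more functions; the function names are arbitrary fixtures and no
        particular chunk count is assumed.
        """
        code = '\n\n'.join(
            f'def ordered_func_{i}():\n    return {i}' for i in range(10)
        )
        parser = create_parser_for_language(Language.PYTHON)
        chunks = parser.parse_content(code)
        assert len(chunks) > 0, "Should produce chunks"
        all_code = ' '.join(chunk.code for chunk in chunks)
        for i in range(10):
            assert f'ordered_func_{i}' in all_code, f"Lost ordered_func_{i} during merge"
        # Merged output must still respect the non-whitespace size limit
        for chunk in chunks:
            assert len(re.sub(r'\s', '', chunk.code)) <= 1200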
class TestLanguageSpecificEdgeCases:
"""Test cross-language AST structure handling."""
def test_python_class_with_nested_methods(self):
"""Test Python class chunking preserves method boundaries."""
python_class = '''
class DataProcessor:
"""Main data processing class."""
def __init__(self, config):
self.config = config
def process_batch(self, items):
"""Process a batch of items."""
results = []
for item in items:
result = self._process_single(item)
if result:
results.append(result)
return results
def _process_single(self, item):
"""Process single item with validation."""
if not item.is_valid():
return None
return item.transform()
'''
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(python_class)
# Should extract class and methods as separate concepts
assert len(chunks) > 0, "Should produce chunks"
# Verify different chunk types exist
chunk_types = {chunk.chunk_type for chunk in chunks}
# Should have function/method chunks at minimum
code_types = {ChunkType.FUNCTION, ChunkType.METHOD, ChunkType.CLASS}
assert any(ct in code_types for ct in chunk_types), f"Expected code chunks, got: {chunk_types}"
def test_javascript_function_expressions_vs_declarations(self):
"""Test JavaScript handles both function declarations and expressions."""
js_code = '''
// Function declaration
function regularFunction() {
return "declaration";
}
// Function expression
const expressionFunc = function() {
return "expression";
};
// Arrow function
const arrowFunc = () => {
return "arrow";
};
// Method in object
const obj = {
method() {
return "method";
}
};
'''
parser = create_parser_for_language(Language.JAVASCRIPT)
chunks = parser.parse_content(js_code)
# Should capture function definitions
assert len(chunks) >= 1, "Should capture functions"
# Verify content is preserved
all_code = ' '.join(chunk.code for chunk in chunks)
assert "regularFunction" in all_code
assert "expressionFunc" in all_code or "arrowFunc" in all_code
def test_deeply_nested_code_structure(self):
"""Test handling of deeply nested code that might break parsers."""
nested_code = '''
def outer_function():
def level_1():
def level_2():
def level_3():
def level_4():
def level_5():
return "deep"
return level_5()
return level_4()
return level_3()
return level_2()
return level_1()
class OuterClass:
class MiddleClass:
class InnerClass:
def deep_method(self):
if True:
if True:
if True:
return "nested_conditions"
'''
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(nested_code)
# Should handle nesting without stack overflow or infinite recursion
assert len(chunks) > 0, "Should handle nested code"
# Verify chunks are still within size limits despite nesting
for chunk in chunks:
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
assert non_ws_chars <= 1200, f"Size limit exceeded in nested code: {non_ws_chars}"
def test_mixed_content_types_integration(self):
"""Test files with mixed comments, imports, code."""
mixed_content = '''
#!/usr/bin/env python3
"""
Module docstring explaining the purpose.
This is a multi-line docstring that should be captured.
"""
import os
import sys
from typing import List, Dict
# Global constants
MAX_ITEMS = 1000
DEFAULT_CONFIG = {"timeout": 30}
def main():
"""Main entry point."""
print("Starting application")
# Process configuration
config = load_config()
# Main processing loop
while True:
items = get_next_batch()
if not items:
break
process_batch(items)
# Helper functions
def load_config():
"""Load configuration from environment."""
return {"debug": os.getenv("DEBUG", False)}
def get_next_batch():
"""Get next batch of items to process."""
# Implementation details...
pass
if __name__ == "__main__":
main()
'''
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(mixed_content)
# Should capture different content types appropriately
assert len(chunks) > 0, "Should produce chunks from mixed content"
# Verify function content is captured
all_code = ' '.join(chunk.code for chunk in chunks)
assert "def main" in all_code or "main()" in all_code
class TestSizeLimitEdgeCases:
"""Test boundary conditions around size limits."""
def test_chunk_exactly_at_size_limits(self):
"""Test chunks that are exactly at the size boundaries."""
# Create function that is exactly 1200 non-whitespace characters
base_func = 'def test_func():\n '
base_non_ws = len(re.sub(r'\s', '', base_func))
remaining_chars = 1200 - base_non_ws
if remaining_chars > 10:
padding = 'x = 1 # ' + 'a' * max(0, remaining_chars - 10) + '\n'
exact_size_func = base_func + padding
# Verify it's at or near the limit
actual_size = len(re.sub(r'\s', '', exact_size_func))
assert 1150 <= actual_size <= 1250, f"Test setup error: size is {actual_size}"
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(exact_size_func)
# Should produce at least one chunk
assert len(chunks) >= 1, "Should produce chunks"
# Verify size constraints
for chunk in chunks:
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
assert non_ws_chars <= 1200, f"Chunk exceeds limit: {non_ws_chars}"
def test_real_world_oversized_content_d3js(self):
"""Test handling of oversized content using real D3.js library."""
# Use real D3.js minified file (very large)
fixtures_dir = Path(__file__).parent / "fixtures" / "real_world_files"
d3_path = fixtures_dir / "d3.min.js"
if not d3_path.exists():
pytest.skip("Real D3.js file not available for testing")
# D3.js is large - test a substantial portion
d3_content = d3_path.read_text(encoding='utf-8')
parser = create_parser_for_language(Language.JAVASCRIPT)
chunks = parser.parse_content(d3_content)
# Critical test: NO chunk should exceed size constraints
oversized_chunks = []
for i, chunk in enumerate(chunks):
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
if non_ws_chars > 1200:
oversized_chunks.append((i, non_ws_chars))
assert len(oversized_chunks) == 0, f"Found {len(oversized_chunks)} oversized chunks: {oversized_chunks}"
# Should produce reasonable number of chunks for large file
assert len(chunks) > 10, f"Too few chunks for large D3.js file: {len(chunks)}"
        # Expansion (memory efficiency) check
original_size = len(d3_content)
total_chunk_size = sum(len(chunk.code) for chunk in chunks)
expansion_ratio = total_chunk_size / original_size
        # D3 chunking should not expand content excessively
assert expansion_ratio <= 2.0, f"Excessive expansion on real D3.js: {expansion_ratio:.2f}x"
def test_empty_and_minimal_files(self):
"""Test edge cases with empty or minimal content."""
test_cases = [
"", # Empty file
"# Just a comment", # Only comment
"import os", # Only import
"x = 1", # Single statement
"def f(): pass", # Minimal function
]
parser = create_parser_for_language(Language.PYTHON)
for content in test_cases:
chunks = parser.parse_content(content)
# Should handle gracefully - no crashes
assert isinstance(chunks, list), f"Should return list for: {repr(content)}"
if chunks:
assert len(chunks[0].code.strip()) > 0, f"Chunk should not be empty for: {repr(content)}"
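
    def test_chunk_just_over_size_limit(self):
        """Hedged sketch: content slightly over the limit still obeys the cap.

        Builds a single function whose non-whitespace size is several hundred
        characters above 1200 and only re-asserts the invariant used throughout
        this file (every chunk stays within the limit); it does not assume a
        specific number of resulting chunks.
        """
        padding = 'x' * 20
        lines = ['def slightly_oversized():']
        lines += [f'    value_{i} = "{padding}"' for i in range(60)]
        oversized_func = '\n'.join(lines)
        assert len(re.sub(r'\s', '', oversized_func)) > 1200, "Test setup: content should exceed the limit"
        parser = create_parser_for_language(Language.PYTHON)
        chunks = parser.parse_content(oversized_func)
        assert len(chunks) >= 1, "Should produce chunks"
        for chunk in chunks:
            non_ws_chars = len(re.sub(r'\s', '', chunk.code))
            assert non_ws_chars <= 1200, f"Chunk exceeds limit: {non_ws_chars}"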
class TestPerformanceAndScalability:
"""Test performance and memory efficiency."""
def test_real_world_minified_libraries_comprehensive(self):
"""Comprehensive test of real-world minified JavaScript libraries.
This tests the cAST algorithm against actual production JavaScript files
that represent real pathological cases for chunking algorithms.
"""
fixtures_dir = Path(__file__).parent / "fixtures" / "real_world_files"
# Test different real-world minified files with expected characteristics
test_files = [
("jquery-3.7.1.min.js", "jQuery 3.7.1", {
"min_chunks": 5,
"max_expansion": 2.0,
"description": "Classic single-line minified library"
}),
("react.min.js", "React 18.2.0", {
"min_chunks": 2,
"max_expansion": 2.0,
"description": "Production React build with some line breaks"
}),
("vue.min.js", "Vue 3.4.27", {
"min_chunks": 10,
"max_expansion": 1.5,
"description": "Vue production build - mixed minification"
}),
("bootstrap.min.js", "Bootstrap 5.3.2", {
"min_chunks": 5,
"max_expansion": 2.0,
"description": "Bootstrap bundle with moderate minification"
}),
("d3.min.js", "D3.js 7.8.5", {
"min_chunks": 15,
"max_expansion": 1.0,
"description": "Large visualization library - excellent compression expected"
}),
]
parser = create_parser_for_language(Language.JAVASCRIPT)
results = []
for filename, lib_name, expectations in test_files:
file_path = fixtures_dir / filename
if not file_path.exists():
continue # Skip if file not available
print(f"\n=== Testing {lib_name} ===")
print(f"Description: {expectations['description']}")
file_content = file_path.read_text(encoding='utf-8')
original_size = len(file_content)
file_lines = len(file_content.splitlines())
start_time = time.time()
chunks = parser.parse_content(file_content)
processing_time = time.time() - start_time
# Calculate metrics
total_chunk_size = sum(len(chunk.code) for chunk in chunks)
expansion_ratio = total_chunk_size / original_size if original_size > 0 else 1.0
avg_chunk_size = total_chunk_size / len(chunks) if chunks else 0
print(f"File size: {original_size:,} bytes ({file_lines} lines)")
print(f"Processing time: {processing_time:.3f}s")
print(f"Chunks created: {len(chunks)}")
print(f"Total chunk content: {total_chunk_size:,} bytes")
print(f"Expansion ratio: {expansion_ratio:.3f}x")
print(f"Average chunk size: {avg_chunk_size:.0f} bytes")
# Performance assertions
assert processing_time < 5.0, f"{filename}: Processing too slow: {processing_time}s"
assert len(chunks) > 0, f"{filename}: Should produce chunks"
assert len(chunks) >= expectations["min_chunks"], f"{filename}: Too few chunks: {len(chunks)}"
# Memory efficiency assertions
assert expansion_ratio <= expectations["max_expansion"], \
f"{filename}: Expansion {expansion_ratio:.3f}x > {expectations['max_expansion']}x"
# Size constraint assertions (CRITICAL)
oversized_chunks = []
for i, chunk in enumerate(chunks):
non_ws_chars = len(re.sub(r'\s', '', chunk.code))
if non_ws_chars > 1200:
oversized_chunks.append((i, non_ws_chars))
assert len(oversized_chunks) == 0, \
f"{filename}: Found {len(oversized_chunks)} oversized chunks: {oversized_chunks[:3]}"
# Token limit check
for i, chunk in enumerate(chunks):
estimated_tokens = len(re.sub(r'\s', '', chunk.code)) * 1.75
assert estimated_tokens <= 6000, \
f"{filename} chunk {i}: Estimated {estimated_tokens:.0f} tokens > 6000 limit"
results.append({
"library": lib_name,
"file_size": original_size,
"chunks": len(chunks),
"expansion": expansion_ratio,
"processing_time": processing_time
})
# Summary validation
if results:
print(f"\n=== Summary of {len(results)} Libraries ===")
for result in results:
print(f"{result['library']}: {result['file_size']:,}B → {result['chunks']} chunks "
f"({result['expansion']:.2f}x) in {result['processing_time']:.3f}s")
# Overall performance check
total_processing_time = sum(r['processing_time'] for r in results)
total_file_size = sum(r['file_size'] for r in results)
avg_expansion = sum(r['expansion'] for r in results) / len(results)
print(f"Total processing time: {total_processing_time:.3f}s for {total_file_size:,} bytes")
print(f"Average expansion ratio: {avg_expansion:.3f}x")
assert total_processing_time < 10.0, f"Total processing time too slow: {total_processing_time:.3f}s"
assert avg_expansion <= 2.0, f"Average expansion too high: {avg_expansion:.3f}x"
def test_synthetic_repetitive_python_file(self):
"""Test processing of synthetically repetitive Python code.
NOTE: This is a pathological case with maximally repetitive code patterns
that would never occur in real-world development. It tests the worst-case
scenario for content duplication.
"""
        # Generate a large, repetitive Python file via the helper below
large_file_content = self.generate_realistic_python_file(
num_classes=3, # Reduced to be less pathological
methods_per_class=5,
lines_per_method=10,
include_docstrings=True,
include_type_hints=True
)
parser = create_parser_for_language(Language.PYTHON)
start_time = time.time()
chunks = parser.parse_content(large_file_content)
processing_time = time.time() - start_time
# Performance verification
assert processing_time < 5.0, f"Processing too slow: {processing_time}s"
assert len(chunks) > 0, "Should produce chunks"
# Memory efficiency check - more lenient for synthetic repetitive code
total_chunk_size = sum(len(chunk.code) for chunk in chunks)
original_size = len(large_file_content)
expansion_ratio = total_chunk_size / original_size
print(f"Synthetic repetitive code: {original_size} bytes → {total_chunk_size} bytes ({expansion_ratio:.2f}x)")
# Allow higher expansion for pathologically repetitive synthetic code
# Real code would never be this repetitive
assert expansion_ratio <= 3.0, f"Excessive expansion even for synthetic code: {expansion_ratio:.2f}x"
def test_different_file_encodings(self):
"""Test handling of files with different encodings."""
# Test UTF-8 with special characters
utf8_content = '''
def función_con_caracteres_especiales():
"""Función que maneja caracteres especiales: áéíóú ñ."""
mensaje = "¡Hola, mundo! 你好世界 🌍"
return mensaje
class DeutscheKlasse:
"""Klasse mit deutschen Umlauten: äöüß."""
def methode_mit_umlauten(self):
return "Größe: 10cm"
'''
parser = create_parser_for_language(Language.PYTHON)
chunks = parser.parse_content(utf8_content)
assert len(chunks) > 0, "Should handle UTF-8 content"
# Verify special characters are preserved
chunk_text = ' '.join(chunk.code for chunk in chunks)
assert 'función_con_caracteres_especiales' in chunk_text
# These might not always be preserved depending on parsing
# assert '你好世界' in chunk_text
# assert 'äöüß' in chunk_text
def generate_realistic_python_file(self, num_classes, methods_per_class, lines_per_method,
include_docstrings=True, include_type_hints=True):
"""Generate realistic Python file for testing."""
content = ['#!/usr/bin/env python3']
content.append('"""Large module for testing purposes."""')
content.append('')
content.append('import os')
content.append('import sys')
content.append('from typing import List, Dict, Optional')
content.append('')
for class_idx in range(num_classes):
class_name = f"TestClass{class_idx}"
if include_docstrings:
content.append(f'class {class_name}:')
content.append(f' """Test class number {class_idx}."""')
content.append('')
else:
content.append(f'class {class_name}:')
for method_idx in range(methods_per_class):
method_name = f"method_{method_idx}"
if include_type_hints:
content.append(f' def {method_name}(self, param: str) -> Optional[str]:')
else:
content.append(f' def {method_name}(self, param):')
if include_docstrings:
content.append(f' """Method {method_idx} implementation."""')
for line_idx in range(lines_per_method):
content.append(f' # Line {line_idx} of method {method_name}')
content.append(f' result_{line_idx} = param + str({line_idx})')
content.append(' return result_0')
content.append('')
return '\n'.join(content)
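
    def test_generate_realistic_python_file_structure(self):
        """Hedged sanity check for the generator helper above.

        Only inspects substrings the helper verifiably emits; it makes no
        claims about how the parser will chunk the generated content.
        """
        generated = self.generate_realistic_python_file(
            num_classes=2, methods_per_class=2, lines_per_method=2
        )
        assert 'class TestClass0:' in generated
        assert 'class TestClass1:' in generated
        assert 'def method_0' in generated
        assert 'def method_1' in generated
        # Two classes with two methods each
        assert generated.count('def method_') == 4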
class TestIntegrationAndAdapterCompatibility:
"""Test adapter compatibility and integration points."""
def test_universal_parser_direct_usage(self):
"""Test UniversalParser direct usage without adapter."""
# Test with real parsing workload
code = '''
class RealClass:
def real_method(self, param):
return param * 2
def standalone_function():
return "result"
'''
# Create universal parser directly
factory = get_parser_factory()
universal_parser = factory.create_parser(Language.PYTHON)
# Test direct usage
chunks = universal_parser.parse_content(code)
assert len(chunks) > 0, "Should produce chunks"
assert isinstance(chunks, list), "Should return list"
# Verify Chunk objects
for chunk in chunks:
assert hasattr(chunk, 'symbol'), "Should have symbol attribute"
assert hasattr(chunk, 'start_line'), "Should have start_line attribute"
assert hasattr(chunk, 'code'), "Should have code attribute"
def test_factory_caching_and_reuse(self):
"""Test parser factory caching works correctly."""
factory = get_parser_factory()
# Create same parser twice
parser1 = factory.create_parser(Language.PYTHON)
parser2 = factory.create_parser(Language.PYTHON)
# Should be same instance (cached)
assert parser1 is parser2, "Parser should be cached"
# Test different languages if available
try:
js_parser = factory.create_parser(Language.JAVASCRIPT)
assert parser1 is not js_parser, "Different languages should be different parsers"
assert parser1.language_name == "python"
assert js_parser.language_name == "javascript"
except Exception:
# JavaScript parser may not be available, skip this part
pass
def test_language_detection_and_extensions(self):
"""Test language detection and file extension mapping."""
factory = get_parser_factory()
# Test file detection
test_file = Path("test.py")
parser = factory.create_parser_for_file(test_file)
assert parser.language_name == "python"
# Test language availability
available_languages = factory.get_available_languages()
assert isinstance(available_languages, dict)
assert Language.PYTHON in available_languages
assert available_languages[Language.PYTHON] is True # Python should be available
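

# Optional convenience entry point so this module can be run directly; the
# suite itself only relies on normal pytest discovery, so this is purely a
# shortcut.
if __name__ == "__main__":
    raise SystemExit(pytest.main([__file__, "-v"]))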