ChunkHound

Overview Schema Related Servers Score Discussions

chunkhound
tests

test_parsers.py•12.5 KiB

"""
Parser validation tests.

Tests that all parsers can parse minimal valid code samples.
"""

import pytest

from chunkhound.core.types.common import FileId, Language
from chunkhound.parsers.parser_factory import get_parser_factory
from chunkhound.parsers.universal_engine import SetupError

# Minimal valid code snippets for each language
LANGUAGE_SAMPLES = {
    Language.PYTHON: "def hello(): pass",
    Language.JAVA: "class Test { }",
    Language.CSHARP: "class Test { }",
    Language.TYPESCRIPT: "const x = 1;",
    Language.JAVASCRIPT: "const x = 1;",
    Language.TSX: "const x = <div>hello</div>;",
    Language.JSX: "const x = <div>hello</div>;",
    Language.GROOVY: "def hello() { }",
    Language.KOTLIN: "fun hello() { }",
    Language.GO: "package main\nfunc main() { }",
    Language.HASKELL: "add x y = x + y",
    Language.RUST: "fn main() { }",
    Language.BASH: "echo hello",
    Language.MAKEFILE: "all:\n\techo hello",
    Language.C: "int main() { return 0; }",
    Language.CPP: "int main() { return 0; }",
    Language.OBJC: "@interface MyClass : NSObject\n@end",
    Language.MATLAB: "function result = hello()\nresult = 1;\nend",
    Language.MARKDOWN: "# Hello\nWorld",
    Language.JSON: '{"hello": "world"}',
    Language.YAML: "hello: world",
    Language.TOML: "hello = 'world'",
    Language.HCL: "resource \"aws_s3_bucket\" \"b\" {\n bucket = \"my-bucket\"\n}\n",
    Language.TEXT: "hello world",
    Language.PDF: "hello world",  # PDF parser handles text content
    Language.ZIG: "fn main() void { }",
    Language.VUE: '<template><div>{{ message }}</div></template>\n<script setup lang="ts">\nconst message = "hello"\n</script>',
    Language.SVELTE: '<script lang="ts">\n let message = "hello";\n</script>\n<div>{message}</div>',
    Language.PHP: '<?php\nfunction hello() {\n return "world";\n}\n?>',
    Language.SWIFT: "class MyClass {\n func hello() -> String {\n return \"world\"\n }\n}",
    Language.DART: "void main() { }",
}

# Every language that should have a working parser. UNKNOWN is excluded, as it
# was in the per-test list comprehensions this constant replaces.
PARSEABLE_LANGUAGES = [lang for lang in Language if lang != Language.UNKNOWN]


def _join_items(items):
    """Join array items with a comma and newline so each item gets its own line."""
    return ',\n '.join(items)


def create_large_array_content(language: Language, item_count: int = 150) -> str:
    """Create realistic large array content for testing line calculations.

    This function generates content with large arrays/lists to test that
    parsers correctly handle line number calculations when splitting chunks.

    Args:
        language: Language whose syntax the generated content should use.
        item_count: Number of array/list items to generate.

    Returns:
        Source text containing one large array/list for TOML, JSON, YAML,
        Python, JavaScript, TypeScript and Zig. For any other language the
        minimal sample from ``LANGUAGE_SAMPLES`` is returned, or an empty
        string when no sample is defined.
    """
    if language == Language.TOML:
        # Create a large dependencies array similar to the actual pyproject.toml that broke
        # Include realistic package names, version constraints, and comments
        grammars = ("python", "rust", "go", "java")
        dependencies = []
        for i in range(item_count):
            pkg_type = i % 6
            if pkg_type == 0:
                dependencies.append(f'"duckdb>={i % 3}.{i % 10}.0"')
            elif pkg_type == 1:
                grammar = grammars[i % 4]
                dependencies.append(
                    f'"tree-sitter-{grammar}>={i % 2}.{i % 20}.0,<{(i % 2)+1}.{(i % 20)+5}.0"'
                )
            elif pkg_type == 2:
                dependencies.append(f'"fastapi>={i % 3}.{i % 10}.0"')
            elif pkg_type == 3:
                dependencies.append(f'"pytest>={i % 2}.{i % 15}.0"')
            elif pkg_type == 4:
                # Add a comment line occasionally to match real structure.
                # The comma added by the join lands inside the TOML comment,
                # so the array stays well-formed.
                if i % 10 == 0:
                    dependencies.append('# "commented-package>=1.0.0", # Using alternative instead')
                dependencies.append(f'"package-{i}>=1.{i % 10}.0"')
            else:
                dependencies.append(f'"pydantic>={i % 3}.{i % 8}.0"')

        deps_text = _join_items(dependencies)

        return f'''[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "test-project"
version = "1.0.0"
description = "Test project with large dependency array"
dependencies = [
 {deps_text}
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "black>=23.0.0",
]
'''

    if language == Language.JSON:
        # Create a package.json-style file with large dependencies
        items_text = _join_items(
            f'"package-{i}": ">=1.{i % 10}.0"' for i in range(item_count)
        )
        return f'''{{
  "name": "test-project",
  "dependencies": {{
 {items_text}
  }},
  "devDependencies": {{}}
}}'''

    if language == Language.YAML:
        # Create a YAML file with large list
        items_text = '\n'.join(
            f' - package-{i}>=1.{i % 10}.0' for i in range(item_count)
        )
        return f'''name: test-project
dependencies:
{items_text}
build:
  requires: ["setuptools"]
'''

    if language == Language.PYTHON:
        # Create a Python file with large list literal
        items_text = _join_items(f'"package-{i}"' for i in range(item_count))
        return f'''# Test module with large list
dependencies = [
 {items_text}
]

def setup():
    return dependencies
'''

    if language == Language.JAVASCRIPT:
        # Create a JavaScript file with large array
        items_text = _join_items(f'"package-{i}"' for i in range(item_count))
        return f'''// Test module with large array
const dependencies = [
 {items_text}
];

module.exports = dependencies;
'''

    if language == Language.TYPESCRIPT:
        # Create a TypeScript file with large array
        items_text = _join_items(f'"package-{i}"' for i in range(item_count))
        return f'''// Test module with large array
const dependencies: string[] = [
 {items_text}
];

export default dependencies;
'''

    if language == Language.ZIG:
        # Create a Zig file with large array
        items_text = _join_items(f'"{i}"' for i in range(item_count))
        return f'''// Test module with large array
const dependencies = [_][]const u8{{
 {items_text}
}};

pub fn main() void {{
    const count = dependencies.len;
    _ = count;
}}
'''

    # Fallback - use the minimal sample for unsupported languages
    return LANGUAGE_SAMPLES.get(language, "")


class TestParserValidation:
    """Test that all parsers can parse minimal valid code."""

    @pytest.mark.parametrize("language", PARSEABLE_LANGUAGES)
    def test_parser_can_parse_minimal_code(self, language: Language) -> None:
        """Test that each parser can parse a minimal valid code sample."""
        factory = get_parser_factory()

        # Create parser
        parser = factory.create_parser(language)
        assert parser is not None, f"Failed to create parser for {language.value}"

        # Get sample code
        sample_code = LANGUAGE_SAMPLES.get(language)
        assert sample_code is not None, f"No sample code defined for {language.value}"

        # Parse the sample. Only the parser call is guarded, so a failed
        # assertion below is reported as itself rather than as a parse error.
        try:
            chunks = parser.parse_content(sample_code, "test_file", FileId(1))
        except SetupError as e:
            # SetupError indicates critical parser initialization failure
            # (e.g., version incompatibility). This should cause immediate
            # test failure.
            pytest.fail(f"CRITICAL: Parser setup failed for {language.value}: {e}")
        except Exception as e:
            pytest.fail(f"Parser for {language.value} failed to parse minimal code: {e}")

        # Don't require chunks - some parsers might return empty for minimal code
        assert isinstance(chunks, list), f"Parser for {language.value} didn't return a list"

    @pytest.mark.parametrize("language", PARSEABLE_LANGUAGES)
    def test_parser_initializes_tree_sitter_language(self, language: Language) -> None:
        """Test that parsers can initialize tree-sitter Language objects without version conflicts.

        This test specifically targets the tree-sitter language initialization
        where version compatibility is checked. This was the missing piece that
        allowed incompatible versions to pass CI tests.
        """
        factory = get_parser_factory()

        # Create parser - this should work even with version issues
        try:
            parser = factory.create_parser(language)
        except SetupError as e:
            # SetupError during parser creation indicates missing or incompatible dependencies
            pytest.fail(f"CRITICAL: Cannot create parser for {language.value}: {e}")
        assert parser is not None, f"Failed to create parser for {language.value}"

        # For text and PDF parsers, skip tree-sitter language initialization
        if language in (Language.TEXT, Language.PDF):
            return

        # Force tree-sitter Language object creation by accessing the engine's language
        # This is where version compatibility errors actually occur
        engine = None
        ts_language = None
        try:
            # Parsers without an ``engine`` attribute (or with it set to None)
            # are not checked any further.
            engine = getattr(parser, "engine", None)
            if engine is not None:
                # Access the _language property which contains the Language object
                ts_language = engine._language
        except SetupError as e:
            # This is the critical error we want to catch - version incompatibility
            pytest.fail(f"CRITICAL: Tree-sitter version incompatibility for {language.value}: {e}")
        except Exception as e:
            # Check if this is a version incompatibility error
            if "Incompatible Language version" in str(e):
                pytest.fail(f"CRITICAL: Tree-sitter version incompatibility for {language.value}: {e}")
            else:
                pytest.fail(f"Unexpected error initializing tree-sitter language for {language.value}: {e}")

        if engine is not None:
            assert ts_language is not None, f"Tree-sitter language is None for {language.value}"

    @pytest.mark.parametrize("language,item_count", [
        (Language.TOML, 150),        # Original bug: large dependency arrays
        (Language.JSON, 150),        # Similar structure in package.json
        (Language.YAML, 150),        # Similar structure in YAML configs
        (Language.PYTHON, 100),      # Large list literals
        (Language.JAVASCRIPT, 100),  # Large array literals
        (Language.TYPESCRIPT, 100),  # Large array literals with types
        (Language.ZIG, 100),         # Large array literals in Zig
    ])
    def test_parser_handles_long_arrays(self, language: Language, item_count: int) -> None:
        """Test that parsers correctly handle files with long arrays/lists.

        This test specifically validates the fix for the line calculation bug
        that occurred when parsing TOML files with large dependency arrays.
        The bug would cause invalid line ranges (start_line > end_line) when
        the universal parser split chunks containing large arrays.

        The test ensures:
        1. No chunks are skipped due to invalid line ranges
        2. All chunks have valid line numbers (start_line <= end_line)
        3. Line numbers stay within content bounds
        4. The fix works universally across all parser types
        """
        factory = get_parser_factory()

        # Create parser
        parser = factory.create_parser(language)
        assert parser is not None, f"Failed to create parser for {language.value}"

        # Generate large array content
        large_content = create_large_array_content(language, item_count)
        total_lines = large_content.count('\n') + 1

        print(f"\nTesting {language.value} with {item_count} array items ({total_lines} lines)")

        # Parse the content. Only the parser call is guarded, so problems found
        # while validating chunks are not mislabelled as parse failures.
        try:
            chunks = parser.parse_content(large_content, f"test_large_array.{language.value}", FileId(1))
        except Exception as e:
            pytest.fail(f"Parser for {language.value} failed to parse large array content: {e}")

        assert isinstance(chunks, list), f"Parser for {language.value} didn't return a list"

        # Critical validation: ensure no chunks have invalid line ranges.
        # Each chunk contributes at most one message (first violation found).
        invalid_chunks = []
        for chunk in chunks:
            if chunk.start_line > chunk.end_line:
                invalid_chunks.append(f"{chunk.symbol}: {chunk.start_line} > {chunk.end_line}")
            elif chunk.start_line < 1:
                invalid_chunks.append(f"{chunk.symbol}: start_line < 1 ({chunk.start_line})")
            elif chunk.end_line > total_lines:
                invalid_chunks.append(f"{chunk.symbol}: end_line > total_lines ({chunk.end_line} > {total_lines})")

        # Fail the test if any chunks have invalid line ranges
        if invalid_chunks:
            error_msg = f"Found {len(invalid_chunks)} chunks with invalid line ranges in {language.value}:\n"
            error_msg += "\n".join(f" - {error}" for error in invalid_chunks)
            pytest.fail(error_msg)

        print(f"✓ Successfully parsed {len(chunks)} chunks with valid line ranges")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ofriw/chunkhound'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_parsers.py•12.5 KiB