"""Performance benchmarks for directory discovery.
These tests measure and validate the performance characteristics of parallel vs
sequential file discovery, documenting actual speedup on different directory structures.
"""
import time
import pytest
import pytest_asyncio
from pathlib import Path
from chunkhound.core.types.common import Language
from chunkhound.parsers.parser_factory import create_parser_for_language
from chunkhound.providers.database.duckdb_provider import DuckDBProvider
from chunkhound.services.indexing_coordinator import IndexingCoordinator
# Async fixtures need the pytest-asyncio fixture decorator so the awaited
# coordinator (not a coroutine object) is injected when the plugin runs in
# strict mode.
@pytest_asyncio.fixture
async def coordinator(tmp_path):
"""Create test coordinator."""
db = DuckDBProvider(":memory:", base_directory=tmp_path)
db.connect()
parser = create_parser_for_language(Language.PYTHON)
coordinator = IndexingCoordinator(db, tmp_path, None, {Language.PYTHON: parser})
return coordinator
def create_large_repo(base_path: Path, num_dirs: int, files_per_dir: int):
"""Create a large repository structure for benchmarking.
Args:
base_path: Base directory
num_dirs: Number of top-level directories
files_per_dir: Number of files per directory
"""
for i in range(num_dirs):
dir_path = base_path / f"module_{i}"
dir_path.mkdir()
for j in range(files_per_dir):
file_path = dir_path / f"file_{j}.py"
file_path.write_text(f"# Module {i}, File {j}\ndef func_{i}_{j}():\n pass\n")
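

# A minimal shared timing helper (sketch only). The benchmarks below keep
# their timing inline for readability; this shows the pattern they repeat and
# could be adopted if more benchmark variants are added.
async def _time_discovery(coordinator, root, patterns, excludes, parallel):
    """Run a single discovery pass and return (files, elapsed_seconds)."""
    start = time.perf_counter()
    files = await coordinator._discover_files(
        root, patterns, excludes, parallel_discovery=parallel
    )
    return files, time.perf_counter() - start
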
@pytest.mark.asyncio
async def test_small_repo_performance(coordinator, tmp_path):
"""Benchmark discovery on small repos (<100 files).
EXPECTATION: Sequential may be faster due to process spawning overhead.
"""
# Create small repo (50 files across 2 dirs - below parallel threshold)
create_large_repo(tmp_path, num_dirs=2, files_per_dir=25)
patterns = ["**/*.py"]
exclude_patterns = []
# Measure sequential
start = time.perf_counter()
sequential_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=False
)
sequential_time = time.perf_counter() - start
# Measure parallel (will auto-fallback to sequential due to < 4 dirs)
start = time.perf_counter()
parallel_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=True
)
parallel_time = time.perf_counter() - start
# Verify results match
assert set(sequential_files) == set(parallel_files)
assert len(sequential_files) == 50
print(f"\nSmall repo (50 files, 2 dirs):")
print(f" Sequential: {sequential_time:.4f}s")
print(f" Parallel: {parallel_time:.4f}s (auto-fallback to sequential)")
@pytest.mark.asyncio
async def test_medium_repo_performance(coordinator, tmp_path):
"""Benchmark discovery on medium repos (100-1000 files).
EXPECTATION: Parallel should show measurable speedup (1.5-3x).
"""
# Create medium repo (500 files across 10 dirs - triggers parallel mode)
create_large_repo(tmp_path, num_dirs=10, files_per_dir=50)
patterns = ["**/*.py"]
exclude_patterns = []
# Measure sequential
start = time.perf_counter()
sequential_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=False
)
sequential_time = time.perf_counter() - start
# Measure parallel
start = time.perf_counter()
parallel_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=True
)
parallel_time = time.perf_counter() - start
# Verify results match
assert set(sequential_files) == set(parallel_files)
assert len(sequential_files) == 500
    speedup = sequential_time / parallel_time if parallel_time > 0 else float("inf")
print(f"\nMedium repo (500 files, 10 dirs):")
print(f" Sequential: {sequential_time:.4f}s")
print(f" Parallel: {parallel_time:.4f}s")
print(f" Speedup: {speedup:.2f}x")
    # No hard assertion on speedup: it should exceed 1.0 on multi-core systems,
    # but may be marginal on CI runners or small repos.
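    # Sketch (illustrative only, not part of the benchmark assertions): timer
    # noise can be reduced by taking the best of a few runs, e.g.:
    #     times = []
    #     for _ in range(3):
    #         start = time.perf_counter()
    #         await coordinator._discover_files(
    #             tmp_path, patterns, exclude_patterns, parallel_discovery=True
    #         )
    #         times.append(time.perf_counter() - start)
    #     parallel_time = min(times)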
@pytest.mark.asyncio
async def test_large_repo_performance(coordinator, tmp_path):
"""Benchmark discovery on large repos (>1000 files).
EXPECTATION: Parallel should show significant speedup (2-5x on multi-core).
"""
# Create large repo (2000 files across 20 dirs - enterprise monorepo scale)
create_large_repo(tmp_path, num_dirs=20, files_per_dir=100)
patterns = ["**/*.py"]
exclude_patterns = []
# Measure sequential
start = time.perf_counter()
sequential_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=False
)
sequential_time = time.perf_counter() - start
# Measure parallel
start = time.perf_counter()
parallel_files = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=True
)
parallel_time = time.perf_counter() - start
# Verify results match
assert set(sequential_files) == set(parallel_files)
assert len(sequential_files) == 2000
    speedup = sequential_time / parallel_time if parallel_time > 0 else float("inf")
print(f"\nLarge repo (2000 files, 20 dirs):")
print(f" Sequential: {sequential_time:.4f}s")
print(f" Parallel: {parallel_time:.4f}s")
print(f" Speedup: {speedup:.2f}x")
    # No hard assertion: on multi-core systems parallel should be noticeably
    # faster, but actual speedup depends on CPU cores, I/O speed, and
    # filesystem caching.
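    # Sketch (intentionally not asserted, to keep CI stable): a soft bound
    # could be enabled only on machines with enough cores, e.g.:
    #     import os
    #     if (os.cpu_count() or 1) >= 4:
    #         assert speedup > 1.0, "parallel discovery should win on large repos"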
@pytest.mark.asyncio
async def test_config_driven_discovery(coordinator, tmp_path):
"""Test that discovery mode respects configuration settings."""
create_large_repo(tmp_path, num_dirs=10, files_per_dir=10)
patterns = ["**/*.py"]
exclude_patterns = []
# Test with config defaults (parallel_discovery=True by default)
files_default = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns # Uses config setting
)
# Test explicit override
files_explicit = await coordinator._discover_files(
tmp_path, patterns, exclude_patterns, parallel_discovery=False
)
# Results should be identical regardless of mode
assert set(files_default) == set(files_explicit)
assert len(files_default) == 100
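

@pytest.mark.asyncio
async def test_exclude_patterns_consistency(coordinator, tmp_path):
    """Sketch: both discovery modes must agree when exclude patterns are set.

    Illustrative companion to the benchmarks above. The exclude glob used here
    is an assumption about the accepted syntax; the test only asserts that
    sequential and parallel discovery return identical result sets.
    """
    create_large_repo(tmp_path, num_dirs=4, files_per_dir=10)
    patterns = ["**/*.py"]
    exclude_patterns = ["**/module_0/**"]  # assumed glob syntax for excludes
    sequential_files = await coordinator._discover_files(
        tmp_path, patterns, exclude_patterns, parallel_discovery=False
    )
    parallel_files = await coordinator._discover_files(
        tmp_path, patterns, exclude_patterns, parallel_discovery=True
    )
    # The invariant under test: discovery mode must not change the result set.
    assert set(sequential_files) == set(parallel_files)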