"""
Comprehensive benchmark tests for MCP Server performance validation.
This module provides:
- pytest-benchmark integration tests with SLO validation
- Performance regression tests
- Interface compliance testing (IBenchmarkRunner, IPerformanceMonitor)
- Memory and resource usage tests
- Automated performance baseline generation
"""
import json
import logging
import tempfile
import time
from pathlib import Path

import pytest

from mcp_server.benchmarks import (
    BenchmarkResult,
    BenchmarkRunner,
    BenchmarkSuite,
    PerformanceMetrics,
)
from mcp_server.interfaces.indexing_interfaces import IBenchmarkRunner
from mcp_server.interfaces.metrics_interfaces import IPerformanceMonitor
from mcp_server.interfaces.shared_interfaces import Result
from mcp_server.plugin_base import IPlugin, SearchResult, SymbolDef

logger = logging.getLogger(__name__)
class MockPlugin(IPlugin):
"""Mock plugin for benchmark testing."""
def __init__(self, lang: str = "python", delay_ms: float = 0):
self.lang = lang
self.delay_ms = delay_ms
self._symbols = {}
self._files = {}
def supports(self, path: Path) -> bool:
extensions = {
"python": [".py"],
"javascript": [".js"],
"c": [".c", ".h"],
}
return path.suffix in extensions.get(self.lang, [])
def index(self, path: Path, content: str):
# Simulate indexing delay
if self.delay_ms > 0:
time.sleep(self.delay_ms / 1000)
self._files[str(path)] = content
# Extract simple symbols
if self.lang == "python":
for line in content.split("\n"):
if line.startswith("def "):
name = line.split("(")[0].replace("def ", "").strip()
self._symbols[name] = SymbolDef(
name=name,
type="function",
path=str(path),
line=1,
character=0,
definition=line,
)
elif line.startswith("class "):
name = line.split("(")[0].split(":")[0].replace("class ", "").strip()
self._symbols[name] = SymbolDef(
name=name,
type="class",
path=str(path),
line=1,
character=0,
definition=line,
)
def getDefinition(self, symbol: str) -> SymbolDef | None:
# Simulate lookup delay
if self.delay_ms > 0:
time.sleep(self.delay_ms / 1000)
return self._symbols.get(symbol)
def search(self, query: str, opts: dict):
# Simulate search delay
if self.delay_ms > 0:
time.sleep(self.delay_ms / 1000)
results = []
for name, symbol in self._symbols.items():
if query.lower() in name.lower():
results.append(
SearchResult(
path=symbol.path,
line=symbol.line,
character=symbol.character,
snippet=symbol.definition,
score=1.0,
)
)
return results[: opts.get("limit", 20)]
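
# Example usage of MockPlugin (illustrative only; this mirrors how the
# fixtures below exercise it):
#
#   plugin = MockPlugin("python")
#   plugin.index(Path("example.py"), "def greet():\n    pass\n")
#   assert plugin.getDefinition("greet") is not None
#   assert plugin.search("gre", {"limit": 5})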
@pytest.fixture
def mock_plugins():
"""Create mock plugins for testing."""
return [
MockPlugin("python", delay_ms=5),
MockPlugin("javascript", delay_ms=5),
MockPlugin("c", delay_ms=5),
]
@pytest.fixture
def benchmark_suite(mock_plugins):
"""Create benchmark suite with mock plugins."""
with tempfile.TemporaryDirectory() as tmpdir:
db_path = Path(tmpdir) / "test.db"
suite = BenchmarkSuite(mock_plugins, db_path)
yield suite
@pytest.fixture
def benchmark_runner():
"""Create benchmark runner for tests."""
with tempfile.TemporaryDirectory() as tmpdir:
runner = BenchmarkRunner(Path(tmpdir))
yield runner
class TestPerformanceMetrics:
"""Test the PerformanceMetrics class."""
def test_metrics_initialization(self):
"""Test metric initialization."""
metric = PerformanceMetrics("test_op")
assert metric.operation == "test_op"
assert metric.count == 0
assert metric.mean == 0.0
assert metric.samples == []
def test_add_samples(self):
"""Test adding timing samples."""
metric = PerformanceMetrics("test_op")
samples = [10.5, 15.2, 12.3, 18.7, 11.1]
for sample in samples:
metric.add_sample(sample)
assert metric.count == 5
assert metric.min == 10.5
assert metric.max == 18.7
        assert 13.0 < metric.mean < 14.0  # exact mean is 13.56
def test_percentiles(self):
"""Test percentile calculations."""
metric = PerformanceMetrics("test_op")
# Add 100 samples with known distribution
for i in range(100):
metric.add_sample(i)
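        # With linear interpolation over the samples 0..99, percentile p falls
        # at index p * (n - 1) / 100, so p95 = 94.05 and p99 = 98.01 (this
        # assumes PerformanceMetrics uses numpy-style interpolated percentiles)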
assert metric.median == 49.5
assert metric.p95 == 94.05
assert metric.p99 == 98.01
def test_slo_validation(self):
"""Test SLO validation."""
metric = PerformanceMetrics("test_op")
# Add samples below target
for _ in range(100):
metric.add_sample(50) # All 50ms
assert metric.is_within_slo(100) # Should pass
assert not metric.is_within_slo(40) # Should fail
class TestBenchmarkSuite:
"""Test the BenchmarkSuite class."""
def test_suite_initialization(self, mock_plugins):
"""Test suite initialization."""
with tempfile.TemporaryDirectory() as tmpdir:
db_path = Path(tmpdir) / "test.db"
suite = BenchmarkSuite(mock_plugins, db_path)
assert len(suite.plugins) == 3
assert suite.dispatcher is not None
assert suite.store is not None
assert suite.gateway is not None
def test_symbol_lookup_benchmark(self, benchmark_suite):
"""Test symbol lookup benchmarking."""
# Pre-populate some symbols
for i in range(10):
symbol = SymbolDef(
name=f"function_{i}",
type="function",
path=f"/test/file_{i}.py",
line=1,
character=0,
definition=f"def function_{i}():",
)
benchmark_suite.plugins[0]._symbols[f"function_{i}"] = symbol
metric = benchmark_suite.benchmark_symbol_lookup(iterations=50)
assert metric.operation == "symbol_lookup"
assert metric.count == 50
assert metric.p95 > 0
assert metric.memory_usage_mb > 0
def test_search_benchmark(self, benchmark_suite):
"""Test search benchmarking."""
# Pre-populate symbols
for i in range(20):
symbol = SymbolDef(
name=f"test_function_{i}",
type="function",
path=f"/test/file_{i}.py",
line=1,
character=0,
definition=f"def test_function_{i}():",
)
benchmark_suite.plugins[0]._symbols[f"test_function_{i}"] = symbol
metrics = benchmark_suite.benchmark_search(iterations=10)
assert "fuzzy_search" in metrics
assert "semantic_search" in metrics
assert metrics["fuzzy_search"].count == 10
assert metrics["fuzzy_search"].p95 > 0
def test_indexing_benchmark(self, benchmark_suite):
"""Test indexing throughput benchmark."""
metric = benchmark_suite.benchmark_indexing(file_count=100)
assert metric.operation == "indexing_throughput"
assert metric.count >= 100 # Should have timing for each file
assert hasattr(metric, "files_per_minute")
assert metric.files_per_minute > 0
def test_memory_usage_benchmark(self, benchmark_suite):
"""Test memory usage benchmarking."""
memory_usage = benchmark_suite.benchmark_memory_usage([10, 50, 100])
assert 10 in memory_usage
assert 50 in memory_usage
assert 100 in memory_usage
# Memory should increase with file count
assert memory_usage[100] > memory_usage[10]
def test_cache_performance_benchmark(self, benchmark_suite):
"""Test cache performance benchmarking."""
# Pre-populate some data
for i in range(10):
symbol = SymbolDef(
name=f"cached_function_{i}",
type="function",
path=f"/test/cached_{i}.py",
line=1,
character=0,
definition=f"def cached_function_{i}():",
)
benchmark_suite.plugins[0]._symbols[f"cached_function_{i}"] = symbol
metrics = benchmark_suite.benchmark_cache_performance(iterations=50)
assert "cache_hit" in metrics
assert "cache_miss" in metrics
# Cache hits should generally be faster than misses
# (though with our mock, they're the same)
assert metrics["cache_hit"].count == 50
assert metrics["cache_miss"].count == 50
def test_validate_performance_requirements(self, benchmark_suite):
"""Test performance requirement validation."""
# Create a result with known metrics
result = BenchmarkResult("test")
# Add passing metric
passing_metric = PerformanceMetrics("symbol_lookup")
for _ in range(100):
passing_metric.add_sample(50) # All under 100ms target
result.add_metric("symbol_lookup", passing_metric)
# Add failing metric
failing_metric = PerformanceMetrics("fuzzy_search")
for _ in range(100):
failing_metric.add_sample(600) # All over 500ms target
result.add_metric("fuzzy_search", failing_metric)
validations = benchmark_suite.validate_performance_requirements(result)
assert validations["symbol_lookup_slo"] is True
assert validations["search_slo"] is False
class TestBenchmarkRunner:
"""Test the BenchmarkRunner class."""
def test_runner_initialization(self):
"""Test runner initialization."""
with tempfile.TemporaryDirectory() as tmpdir:
runner = BenchmarkRunner(Path(tmpdir))
assert runner.output_dir.exists()
assert runner.history == []
def test_run_benchmarks(self, benchmark_runner, mock_plugins):
"""Test running full benchmark suite."""
result = benchmark_runner.run_benchmarks(
mock_plugins, save_results=True, compare_with_previous=False
)
assert isinstance(result, BenchmarkResult)
assert len(result.metrics) > 0
assert hasattr(result, "validations")
assert result.duration_seconds > 0
def test_save_and_load_history(self, benchmark_runner):
"""Test saving and loading benchmark history."""
# Create a mock result
result = BenchmarkResult("test")
metric = PerformanceMetrics("test_op")
metric.add_sample(10.5)
result.add_metric("test_op", metric)
result.finalize()
# Save result
benchmark_runner._save_result(result)
# Reload runner and check history
new_runner = BenchmarkRunner(benchmark_runner.output_dir)
assert len(new_runner.history) == 1
assert new_runner.history[0]["suite_name"] == "test"
def test_regression_detection(self, benchmark_runner):
"""Test performance regression detection."""
# Create baseline result
baseline = BenchmarkResult("baseline")
baseline_metric = PerformanceMetrics("test_op")
for _ in range(100):
baseline_metric.add_sample(50)
baseline.add_metric("test_op", baseline_metric)
baseline.finalize()
benchmark_runner._save_result(baseline)
# Create current result with regression
current = BenchmarkResult("current")
current_metric = PerformanceMetrics("test_op")
for _ in range(100):
current_metric.add_sample(100) # 100% slower
current.add_metric("test_op", current_metric)
regression_report = benchmark_runner._check_regressions(current)
assert regression_report["status"] == "checked"
assert len(regression_report["regressions"]) == 1
assert regression_report["regressions"][0]["metric"] == "test_op"
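        # A jump from 50ms to 100ms is a 100% slowdown; allow a small margin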
assert regression_report["regressions"][0]["change_percent"] > 90
# Test improvement detection as well
improved = BenchmarkResult("improved")
improved_metric = PerformanceMetrics("test_op")
for _ in range(100):
improved_metric.add_sample(25) # 50% faster
improved.add_metric("test_op", improved_metric)
improvement_report = benchmark_runner._check_regressions(improved)
assert improvement_report["status"] == "checked"
assert len(improvement_report["improvements"]) == 1
def test_generate_reports(self, benchmark_runner):
"""Test report generation."""
result = BenchmarkResult("test")
metric = PerformanceMetrics("test_op")
for i in range(100):
metric.add_sample(i)
result.add_metric("test_op", metric)
result.validations = {"test_slo": True}
result.finalize()
benchmark_runner._generate_reports(result)
# Check files were created
html_file = benchmark_runner.output_dir / "benchmark_report.html"
text_file = benchmark_runner.output_dir / "benchmark_summary.txt"
assert html_file.exists()
assert text_file.exists()
# Verify content
html_content = html_file.read_text()
assert "MCP Server Benchmark Report" in html_content
assert "test_op" in html_content
text_content = text_file.read_text()
assert "Performance Metrics:" in text_content
assert "test_op" in text_content
def test_export_for_ci(self, benchmark_runner):
"""Test CI export functionality."""
result = BenchmarkResult("test")
metric = PerformanceMetrics("test_op")
metric.add_sample(50)
result.add_metric("test_op", metric)
result.validations = {"test_slo": True}
result.finalize()
ci_data = benchmark_runner.export_for_ci(result)
assert "metrics" in ci_data
assert "test_op" in ci_data["metrics"]
assert ci_data["metrics"]["test_op"]["p95_ms"] > 0
assert ci_data["passed"] is True
@pytest.mark.benchmark(group="symbol_lookup")
def test_benchmark_symbol_lookup_performance(benchmark, mock_plugins):
"""Benchmark symbol lookup with pytest-benchmark and SLO validation."""
suite = BenchmarkSuite(mock_plugins)
# Pre-populate symbols
for i in range(100):
symbol = SymbolDef(
name=f"bench_function_{i}",
type="function",
path=f"/test/bench_{i}.py",
line=1,
character=0,
definition=f"def bench_function_{i}():",
)
suite.plugins[0]._symbols[f"bench_function_{i}"] = symbol
def lookup():
timer_id = suite.start_timer("symbol_lookup", {"symbol": "bench_function_42"})
try:
result = suite.dispatcher.lookup("bench_function_42")
return result
finally:
duration = suite.stop_timer(timer_id)
# Validate against SLO during the benchmark
assert (
duration * 1000 <= BenchmarkSuite.SYMBOL_LOOKUP_TARGET_MS
), f"Symbol lookup took {duration*1000:.2f}ms, exceeds {BenchmarkSuite.SYMBOL_LOOKUP_TARGET_MS}ms target"
result = benchmark(lookup)
assert result is not None
# Validate benchmark meets p95 target
stats = benchmark.stats
if hasattr(stats, "data"):
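        # Nearest-rank approximation of p95: take index 0.95 * n into the
        # sorted per-round timings collected by pytest-benchmark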
p95_time_ms = sorted(stats.data)[int(len(stats.data) * 0.95)] * 1000
assert (
p95_time_ms <= BenchmarkSuite.SYMBOL_LOOKUP_TARGET_MS
), f"P95 {p95_time_ms:.2f}ms exceeds target {BenchmarkSuite.SYMBOL_LOOKUP_TARGET_MS}ms"
@pytest.mark.benchmark(group="search")
def test_benchmark_search_performance(benchmark, mock_plugins):
"""Benchmark search with pytest-benchmark and SLO validation."""
suite = BenchmarkSuite(mock_plugins)
# Pre-populate symbols
for i in range(100):
symbol = SymbolDef(
name=f"search_function_{i}",
type="function",
path=f"/test/search_{i}.py",
line=1,
character=0,
definition=f"def search_function_{i}():",
)
suite.plugins[0]._symbols[f"search_function_{i}"] = symbol
def search():
timer_id = suite.start_timer("fuzzy_search", {"query": "search"})
try:
results = list(suite.dispatcher.search("search", semantic=False))
return results
finally:
duration = suite.stop_timer(timer_id)
# Validate against SLO during the benchmark
assert (
duration * 1000 <= BenchmarkSuite.SEARCH_TARGET_MS
), f"Search took {duration*1000:.2f}ms, exceeds {BenchmarkSuite.SEARCH_TARGET_MS}ms target"
results = benchmark(search)
assert len(results) > 0
# Validate benchmark meets p95 target
stats = benchmark.stats
if hasattr(stats, "data"):
p95_time_ms = sorted(stats.data)[int(len(stats.data) * 0.95)] * 1000
assert (
p95_time_ms <= BenchmarkSuite.SEARCH_TARGET_MS
), f"P95 {p95_time_ms:.2f}ms exceeds target {BenchmarkSuite.SEARCH_TARGET_MS}ms"
@pytest.mark.benchmark(group="indexing")
def test_benchmark_indexing_performance(benchmark, mock_plugins):
"""Benchmark indexing with pytest-benchmark and throughput validation."""
suite = BenchmarkSuite(mock_plugins)
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
f.write(
'''
def benchmark_function():
"""Benchmark test function."""
return 42
class BenchmarkClass:
def __init__(self):
self.value = 100
def calculate_result(self, input_value):
return self.value * input_value
'''
)
f.flush()
path = Path(f.name)
content = path.read_text()
def index():
timer_id = suite.start_timer("indexing", {"file_path": str(path)})
try:
plugin = suite.dispatcher._match_plugin(path)
plugin.index(path, content)
finally:
suite.stop_timer(timer_id)
    try:
        benchmark(index)
        # Validate indexing throughput - an individual file should index quickly
        stats = benchmark.stats
        if hasattr(stats, "data"):
            mean_time_ms = (sum(stats.data) / len(stats.data)) * 1000
            # Individual file indexing should be under 100ms for reasonable throughput
            assert (
                mean_time_ms <= 100
            ), f"Mean indexing time {mean_time_ms:.2f}ms too slow for throughput target"
    finally:
        # Clean up the temporary file even if an assertion fails
        path.unlink()
@pytest.mark.slow
class TestLargeBenchmarks:
"""Large-scale benchmark tests (marked as slow)."""
def test_large_codebase_benchmark(self, benchmark_suite):
"""Test benchmarking with a large number of files."""
# This would be a more extensive test in real scenarios
metric = benchmark_suite.benchmark_indexing(file_count=1000)
assert metric.files_per_minute > 0
assert metric.count == 1000
# Check if meets the 10K files/minute target
meets_target = metric.files_per_minute >= BenchmarkSuite.FILES_PER_MINUTE_TARGET
# Log the actual performance for analysis
logger.info(
f"Indexing throughput: {metric.files_per_minute:.0f} files/minute (target: {BenchmarkSuite.FILES_PER_MINUTE_TARGET})"
)
# Allow some slack in tests (50% of target) but document the gap
min_acceptable = BenchmarkSuite.FILES_PER_MINUTE_TARGET * 0.5
assert (
metric.files_per_minute >= min_acceptable
), f"Throughput {metric.files_per_minute:.0f} below minimum {min_acceptable:.0f} files/minute"
if not meets_target:
logger.warning(
f"Indexing throughput {metric.files_per_minute:.0f} files/minute below target {BenchmarkSuite.FILES_PER_MINUTE_TARGET}"
)
def test_memory_scaling(self, benchmark_suite):
"""Test memory usage scaling with file count."""
memory_usage = benchmark_suite.benchmark_memory_usage([100, 500, 1000])
# Check linear or sub-linear scaling
ratio_500_100 = memory_usage[500] / memory_usage[100]
ratio_1000_500 = memory_usage[1000] / memory_usage[500]
# Memory usage should scale sub-linearly
logger.info(
f"Memory scaling: 100→500 files: {ratio_500_100:.2f}x, 500→1000 files: {ratio_1000_500:.2f}x"
)
        # Allow headroom for fixed overhead; the thresholds sit just above the
        # raw file-count ratios (5x and 2x), so grossly super-linear growth fails
assert ratio_500_100 < 6.0, f"Memory scaling 100→500 files too high: {ratio_500_100:.2f}x"
assert (
ratio_1000_500 < 3.0
), f"Memory scaling 500→1000 files too high: {ratio_1000_500:.2f}x"
class TestInterfaceCompliance:
"""Test interface compliance and contract validation."""
def test_benchmark_runner_implements_interface(self, benchmark_runner):
"""Test that BenchmarkRunner implements IBenchmarkRunner interface."""
assert isinstance(benchmark_runner, IBenchmarkRunner)
# Verify all required methods are implemented
required_methods = [
"run_indexing_benchmark",
"run_search_benchmark",
"run_memory_benchmark",
"generate_benchmark_report",
]
for method_name in required_methods:
assert hasattr(benchmark_runner, method_name)
method = getattr(benchmark_runner, method_name)
assert callable(method)
def test_benchmark_suite_implements_interfaces(self, benchmark_suite):
"""Test that BenchmarkSuite implements performance monitoring interfaces."""
assert isinstance(benchmark_suite, IPerformanceMonitor)
# Verify IPerformanceMonitor methods
performance_methods = [
"start_timer",
"stop_timer",
"record_duration",
"get_performance_stats",
]
for method_name in performance_methods:
assert hasattr(benchmark_suite, method_name)
method = getattr(benchmark_suite, method_name)
assert callable(method)
@pytest.mark.asyncio
async def test_result_pattern_compliance(self, benchmark_runner, mock_plugins):
"""Test that interface methods return Result[T] as specified."""
# Test run_indexing_benchmark returns Result
with tempfile.TemporaryDirectory() as tmpdir:
test_file = Path(tmpdir) / "test.py"
test_file.write_text("def test_function(): pass")
result = await benchmark_runner.run_indexing_benchmark([str(test_file)])
assert isinstance(result, Result)
assert hasattr(result, "success")
assert hasattr(result, "value")
assert hasattr(result, "error")
if result.success:
assert result.value is not None
assert isinstance(result.value, dict)
assert "indexed_files" in result.value
else:
assert result.error is not None
# Test run_search_benchmark returns Result
result = await benchmark_runner.run_search_benchmark(["test", "function", "class"])
assert isinstance(result, Result)
# Test generate_benchmark_report returns Result
result = await benchmark_runner.generate_benchmark_report()
assert isinstance(result, Result)
@pytest.mark.performance_baseline
class TestPerformanceBaseline:
"""Test performance baseline generation and validation."""
def test_baseline_generation(self, benchmark_runner, mock_plugins):
"""Test generation of performance baseline."""
# Run benchmarks to generate baseline
result = benchmark_runner.run_benchmarks(
mock_plugins, save_results=True, compare_with_previous=False
)
assert isinstance(result, BenchmarkResult)
assert len(result.metrics) > 0
assert hasattr(result, "validations")
# Verify baseline files are created
assert benchmark_runner.output_dir.exists()
history_file = benchmark_runner.output_dir / "benchmark_history.json"
assert history_file.exists()
# Load and verify baseline data
        with open(history_file, "r") as f:
history = json.load(f)
assert len(history) > 0
baseline = history[-1]
assert "suite_name" in baseline
assert "timestamp" in baseline
assert "metrics" in baseline
# Verify key metrics are present
expected_metrics = ["symbol_lookup", "fuzzy_search", "indexing"]
for metric_name in expected_metrics:
if metric_name in baseline["metrics"]:
metric_data = baseline["metrics"][metric_name]
assert "p95" in metric_data
assert "mean" in metric_data
assert isinstance(metric_data["p95"], (int, float))
def test_slo_validation_comprehensive(self, benchmark_suite):
"""Test comprehensive SLO validation."""
# Create a comprehensive benchmark result
result = BenchmarkResult("SLO Validation Test")
# Add symbol lookup metric (should pass)
symbol_metric = PerformanceMetrics("symbol_lookup")
for _ in range(100):
symbol_metric.add_sample(80) # 80ms, under 100ms target
result.add_metric("symbol_lookup", symbol_metric)
# Add search metric (should pass)
search_metric = PerformanceMetrics("fuzzy_search")
for _ in range(100):
search_metric.add_sample(300) # 300ms, under 500ms target
result.add_metric("fuzzy_search", search_metric)
# Add indexing metric with throughput
indexing_metric = PerformanceMetrics("indexing")
for _ in range(1000):
indexing_metric.add_sample(5) # 5ms per file
indexing_metric.files_per_minute = 12000 # Above 10K target
result.add_metric("indexing", indexing_metric)
# Add memory metric
memory_metric = PerformanceMetrics("memory_usage")
memory_metric.memory_per_file_count = {
10000: 1800
        }  # 1.8GB for 10K files, within the 2GB memory target
result.add_metric("memory_usage", memory_metric)
# Validate against requirements
validations = benchmark_suite.validate_performance_requirements(result)
# All SLOs should pass
assert validations["symbol_lookup_slo"] is True
assert validations["search_slo"] is True
assert validations["indexing_throughput"] is True
assert validations["memory_usage"] is True
print(f"SLO Validation Results: {validations}")
# Utility functions for test data generation
def generate_test_symbols(count: int, name_prefix: str = "test") -> list[SymbolDef]:
"""Generate test symbols for benchmarking."""
symbols = []
for i in range(count):
symbol = SymbolDef(
name=f"{name_prefix}_{i}",
type="function" if i % 2 == 0 else "class",
path=f"/test/{name_prefix}_{i // 10}.py",
line=i % 100 + 1,
character=0,
definition=(f"def {name_prefix}_{i}():" if i % 2 == 0 else f"class {name_prefix}_{i}:"),
)
symbols.append(symbol)
return symbols
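

# A companion helper (an illustrative sketch, not part of the original suite):
# it emits Python source whose top-level "def ..." / "class ..." lines match
# what MockPlugin.index() knows how to parse, so it can seed indexing runs.
def generate_test_file_content(symbol_count: int, name_prefix: str = "test") -> str:
    """Generate Python source text containing simple symbols for indexing."""
    lines = []
    for i in range(symbol_count):
        if i % 2 == 0:
            lines.append(f"def {name_prefix}_{i}():")
            lines.append("    pass")
        else:
            lines.append(f"class {name_prefix}_{i}:")
            lines.append("    pass")
    return "\n".join(lines)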