"""Indexing performance benchmarks for codebase-mcp.
This module provides pytest-benchmark tests for measuring indexing performance
of the codebase-mcp server against constitutional performance targets.
**Performance Targets** (from specs/011-performance-validation-multi/spec.md):
- FR-001: Index 10,000 files in <60s (p95) across 5 consecutive runs
**Constitutional Compliance**:
- Principle VIII: Type Safety (full mypy --strict compliance)
- Principle IV: Performance Guarantees (<60s p95 for 10,000 files)
- Principle VII: TDD (benchmarks serve as performance regression tests)
**Usage**:
# Run indexing benchmarks only
pytest tests/benchmarks/test_indexing_perf.py --benchmark-only
# Save results to JSON baseline
pytest tests/benchmarks/test_indexing_perf.py --benchmark-only \
--benchmark-json=performance_baselines/indexing_benchmark.json
# Compare against baseline
pytest tests/benchmarks/test_indexing_perf.py --benchmark-only \
--benchmark-compare=performance_baselines/indexing_baseline.json
**Benchmark Structure**:
- Uses pytest-benchmark fixture with pedantic mode for accurate measurements
- 5 iterations with 1 warmup round (per FR-001 requirements)
- Validates against constitutional target: <60s (60,000ms) p95 latency
- Returns PerformanceBenchmarkResult model for validation
"""
from __future__ import annotations

import asyncio
from datetime import datetime, timezone
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING
from uuid import uuid4

import pytest
from sqlalchemy.ext.asyncio import AsyncSession

from src.models.performance import PerformanceBenchmarkResult
from src.services.indexer import index_repository
from tests.fixtures.test_repository import generate_benchmark_repository

if TYPE_CHECKING:
    from pytest_benchmark.fixture import BenchmarkFixture  # type: ignore[import-untyped]

# ==============================================================================
# Constants
# ==============================================================================

# Constitutional target from FR-001
CONSTITUTIONAL_TARGET_MS: Decimal = Decimal("60000.0")  # 60 seconds = 60,000ms

# Test parameters from FR-001 (5 consecutive timed runs, 1 warmup)
BENCHMARK_ROUNDS: int = 5
WARMUP_ROUNDS: int = 1
TARGET_FILE_COUNT: int = 10_000
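
# Variance threshold from SC-001: the coefficient of variation across the
# timed runs must stay below 5% (validated by test_indexing_variance_validation).
VARIANCE_THRESHOLD_PERCENT: float = 5.0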

# ==============================================================================
# Fixtures
# ==============================================================================

@pytest.fixture(scope="function")
def benchmark_repository(tmp_path: Path) -> Path:
"""Generate 10,000-file benchmark repository for performance testing.
Uses test_repository fixtures to create realistic codebase with:
- 10,000 files (60% Python, 40% JavaScript)
- File sizes: 100 bytes to 50KB
- Directory depth: up to 5 levels
- Code complexity: functions, classes, imports
Args:
tmp_path: pytest tmp_path fixture (function-scoped)
Returns:
Path to generated repository root
Note:
Function-scoped to ensure each test gets fresh repository.
Generation may take ~10 seconds for 10,000 files.
"""
repo_path = generate_benchmark_repository(base_path=tmp_path)
return repo_path


# ==============================================================================
# Helper Functions
# ==============================================================================

async def _run_indexing(repo_path: Path, session: AsyncSession) -> float:
"""Run indexing operation and return duration in seconds.
This is the core operation being benchmarked. It performs:
1. Repository scanning (file discovery)
2. Change detection (or force reindex)
3. File chunking with AST parsing
4. Embedding generation via Ollama
5. Database persistence
Args:
repo_path: Path to repository to index
session: Database session for persistence
Returns:
Duration in seconds (from IndexResult.duration_seconds)
Raises:
RuntimeError: If indexing fails
"""
result = await index_repository(
repo_path=repo_path,
name="benchmark_repo",
db=session,
force_reindex=True, # Force full reindex for consistent benchmarking
)
if result.status == "failed":
raise RuntimeError(f"Indexing failed: {result.errors}")
return result.duration_seconds
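

# Shared seconds-to-milliseconds conversion for the six latency fields built
# in _create_benchmark_result. Routing through str() keeps binary float
# artifacts out of the Decimal (Constitutional Principle VIII).
def _seconds_to_ms(seconds: float) -> Decimal:
    """Convert a duration in seconds to milliseconds with Decimal precision."""
    return Decimal(str(seconds * 1000))

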
def _create_benchmark_result(
benchmark_stats: dict[str, float],
test_parameters: dict[str, str | int | float],
) -> PerformanceBenchmarkResult:
"""Create PerformanceBenchmarkResult from pytest-benchmark statistics.
Converts pytest-benchmark stats dict to Pydantic model for validation
against constitutional targets.
Args:
benchmark_stats: Statistics from pytest-benchmark (stats dict)
test_parameters: Test configuration (file_count, iterations, etc.)
Returns:
PerformanceBenchmarkResult with all latency percentiles
Note:
- All latencies converted from seconds to milliseconds
- Uses Decimal for precision (Constitutional Principle VIII)
"""
    # Percentiles from the benchmark stats are in seconds. pytest-benchmark's
    # Stats object reports only quartiles (q1/q3), so "q95"/"q99" are expected
    # to be injected by the caller (see _percentile below); fall back to the
    # conservative "max" rather than a silently-passing 0.
    stats = benchmark_stats
    p50_s = stats.get("median", 0.0)
    p95_s = stats.get("q95", stats.get("max", 0.0))
    p99_s = stats.get("q99", stats.get("max", 0.0))
    mean_s = stats.get("mean", 0.0)
    min_s = stats.get("min", 0.0)
    max_s = stats.get("max", 0.0)

    # Convert to milliseconds with Decimal precision
    latency_p50_ms = _seconds_to_ms(p50_s)
    latency_p95_ms = _seconds_to_ms(p95_s)
    latency_p99_ms = _seconds_to_ms(p99_s)
    latency_mean_ms = _seconds_to_ms(mean_s)
    latency_min_ms = _seconds_to_ms(min_s)
    latency_max_ms = _seconds_to_ms(max_s)
    # Determine pass status: pass under the target, warning within 10% of it
    if latency_p95_ms < CONSTITUTIONAL_TARGET_MS:
        pass_status = "pass"
    elif latency_p95_ms < CONSTITUTIONAL_TARGET_MS * Decimal("1.1"):
        pass_status = "warning"
    else:
        pass_status = "fail"

    # Extract the round count as int for sample_size
    rounds = test_parameters.get("rounds", BENCHMARK_ROUNDS)
    if not isinstance(rounds, int):
        rounds = int(rounds)
    return PerformanceBenchmarkResult(
        benchmark_id=str(uuid4()),
        server_id="codebase-mcp",
        operation_type="index",
        timestamp=datetime.now(timezone.utc),
        latency_p50_ms=latency_p50_ms,
        latency_p95_ms=latency_p95_ms,
        latency_p99_ms=latency_p99_ms,
        latency_mean_ms=latency_mean_ms,
        latency_min_ms=latency_min_ms,
        latency_max_ms=latency_max_ms,
        sample_size=rounds,
        test_parameters=test_parameters,
        pass_status=pass_status,  # type: ignore[arg-type]
        target_threshold_ms=CONSTITUTIONAL_TARGET_MS,
    )
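

# pytest-benchmark's Stats object exposes min/max/mean/median and the
# quartiles q1/q3, but no p95/p99, so those are derived here from the raw
# per-round timings. A minimal nearest-rank sketch; with only 5 rounds,
# p95 and p99 both resolve to the slowest run.
def _percentile(samples: list[float], fraction: float) -> float:
    """Return the nearest-rank percentile of raw benchmark timings.

    Args:
        samples: Raw per-round durations in seconds
        fraction: Percentile expressed as a fraction (0.95 for p95)

    Returns:
        The sample at the nearest rank, or 0.0 for an empty list
    """
    if not samples:
        return 0.0
    ordered = sorted(samples)
    index = min(len(ordered) - 1, int(len(ordered) * fraction))
    return ordered[index]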


# ==============================================================================
# Benchmark Tests
# ==============================================================================

@pytest.mark.benchmark(group="indexing")
def test_indexing_10k_files_performance(
    benchmark: BenchmarkFixture,
    benchmark_repository: Path,
    session: AsyncSession,
) -> None:
"""Benchmark indexing performance for 10,000-file repository.
Validates FR-001: System MUST validate that codebase-mcp indexing completes
in under 60 seconds (p95) for a 10,000-file repository across 5 consecutive
benchmark runs.
**What is measured**:
- File scanning and change detection
- AST-based code chunking
- Embedding generation via Ollama
- Database persistence (chunks + embeddings)
**What is NOT measured**:
- Test fixture setup (repository generation)
- Database schema creation
- Session/engine initialization
**Performance Target**:
- p95 latency < 60,000ms (60 seconds)
- 5 iterations with 1 warmup round
- Constitutional compliance required
Args:
benchmark: pytest-benchmark fixture
benchmark_repository: Generated 10,000-file repository
session: Database session for indexing
Raises:
AssertionError: If p95 latency exceeds constitutional target
"""

    def run_sync() -> float:
        """Synchronous wrapper for the async indexing operation.

        pytest-benchmark requires a synchronous callable, and asyncio.run()
        raises RuntimeError when called from an already-running event loop,
        which is why this test is synchronous rather than asyncio-marked.
        Each timed round drives the coroutine to completion on a fresh event
        loop; this assumes the session fixture is usable from that loop.

        Returns:
            Duration in seconds
        """
        return asyncio.run(_run_indexing(benchmark_repository, session))

    # Run the benchmark in pedantic mode for controlled measurements.
    # pedantic() returns the target's return value, not statistics;
    # timing data accumulates on the benchmark fixture itself.
    benchmark.pedantic(
        run_sync,
        rounds=BENCHMARK_ROUNDS,  # 5 independently timed runs (FR-001)
        iterations=1,  # one call per timed round (each run is expensive)
        warmup_rounds=WARMUP_ROUNDS,
    )

    # Extract statistics from the fixture and derive p95/p99 from raw timings
    stats = benchmark.stats.stats.as_dict()
    timings = list(benchmark.stats.stats.data)
    stats["q95"] = _percentile(timings, 0.95)
    stats["q99"] = _percentile(timings, 0.99)

    # Create a PerformanceBenchmarkResult for validation
    test_params: dict[str, str | int | float] = {
        "file_count": TARGET_FILE_COUNT,
        "rounds": BENCHMARK_ROUNDS,
        "warmup_rounds": WARMUP_ROUNDS,
        "force_reindex": 1,  # int stands in for bool (1 = True)
    }
perf_result = _create_benchmark_result(stats, test_params)
# Validate against constitutional target
assert perf_result.latency_p95_ms < CONSTITUTIONAL_TARGET_MS, (
f"Indexing p95 {perf_result.latency_p95_ms}ms exceeds "
f"constitutional target {CONSTITUTIONAL_TARGET_MS}ms (60 seconds)"
)

    # Log result details for analysis
    print(f"\n{'=' * 60}")
    print("Indexing Performance Benchmark Results")
    print(f"{'=' * 60}")
    print(f"File Count: {TARGET_FILE_COUNT:,}")
    print(f"Rounds: {BENCHMARK_ROUNDS}")
    print(f"Warmup Rounds: {WARMUP_ROUNDS}")
    print()
    print("Latency Statistics (milliseconds):")
    print(f"  p50 (median): {perf_result.latency_p50_ms:>10.2f} ms")
    print(f"  p95:          {perf_result.latency_p95_ms:>10.2f} ms")
    print(f"  p99:          {perf_result.latency_p99_ms:>10.2f} ms")
    print(f"  mean:         {perf_result.latency_mean_ms:>10.2f} ms")
    print(f"  min:          {perf_result.latency_min_ms:>10.2f} ms")
    print(f"  max:          {perf_result.latency_max_ms:>10.2f} ms")
    print()
    print(f"Constitutional Target: {CONSTITUTIONAL_TARGET_MS} ms (60 seconds)")
    print(f"Status: {perf_result.pass_status.upper()}")
    print(f"{'=' * 60}\n")


@pytest.mark.benchmark(group="indexing")
def test_indexing_variance_validation(
    benchmark: BenchmarkFixture,
    benchmark_repository: Path,
    session: AsyncSession,
) -> None:
"""Validate indexing performance variance is within acceptable limits.
Validates SC-001: Codebase-mcp indexes 10,000 files in under 60 seconds (p95)
with less than 5% variance across 5 runs (variance calculated as coefficient
of variation: standard deviation / mean × 100%).
**Variance Calculation**:
- Coefficient of Variation (CV) = (std_dev / mean) × 100%
- Target: CV < 5% for consistent performance
Args:
benchmark: pytest-benchmark fixture
benchmark_repository: Generated 10,000-file repository
session: Database session for indexing
Raises:
AssertionError: If variance exceeds 5% threshold
"""

    def run_sync() -> float:
        """Synchronous wrapper for the async indexing operation (see above)."""
        return asyncio.run(_run_indexing(benchmark_repository, session))

    # Run the benchmark: 5 timed rounds so that stddev and CV are meaningful
    benchmark.pedantic(
        run_sync,
        rounds=BENCHMARK_ROUNDS,
        iterations=1,
        warmup_rounds=WARMUP_ROUNDS,
    )

    # Extract statistics from the fixture
    stats = benchmark.stats.stats.as_dict()
    mean_s = stats.get("mean", 0.0)
    stddev_s = stats.get("stddev", 0.0)

    # Calculate the coefficient of variation (CV)
    if mean_s > 0:
        cv_percent = (stddev_s / mean_s) * 100.0
    else:
        cv_percent = 0.0

    # Validate that variance is within the 5% threshold
    assert cv_percent < VARIANCE_THRESHOLD_PERCENT, (
        f"Indexing variance {cv_percent:.2f}% exceeds the "
        f"{VARIANCE_THRESHOLD_PERCENT:.0f}% threshold. "
        f"Mean: {mean_s:.2f}s, StdDev: {stddev_s:.2f}s"
    )

    print("\nVariance Validation:")
    print(f"  Mean:               {mean_s:.2f} s")
    print(f"  Standard Deviation: {stddev_s:.2f} s")
    print(f"  Coefficient of Var: {cv_percent:.2f}% (target: <{VARIANCE_THRESHOLD_PERCENT:.0f}%)")
    # The assert above guarantees CV < threshold when this line is reached
    print("  Status: PASS\n")