#!/usr/bin/env python3
"""
Focused Comprehensive MCP vs Native Test
Runs 1,280 tests across 8 repositories with real data collection
"""
import json
import time
import sqlite3
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional, Any
from datetime import datetime
from dataclasses import dataclass, field, asdict
import logging
import uuid
from collections import defaultdict
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
@dataclass
class TestPrompt:
"""Test prompt with metadata"""
prompt_id: str
repository: str
category: str
prompt_text: str
expected_method: str
complexity: str = "medium"
@dataclass
class TestResult:
"""Test execution result"""
test_id: str
    prompt: Optional[TestPrompt]  # filled in by execute_test after the raw search runs
method: str
# Performance
duration_ms: float
result_count: int
# Quality
has_line_numbers: bool = False
has_snippets: bool = False
has_file_paths: bool = False
# Token estimation (based on result size)
estimated_tokens: int = 0
# Error tracking
error: Optional[str] = None
# Sample results
sample_results: List[str] = field(default_factory=list)
class ComprehensiveTestRunner:
"""Run comprehensive MCP vs Native tests"""
def __init__(self):
self.workspace = Path('/workspaces/Code-Index-MCP')
self.test_repos = self._discover_test_repos()
self.db_path = self._find_database()
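        # Defensive check (added as an assumption, not part of the original flow):
        # MCP-backed tests already fail per-test when no index exists, so we only
        # surface an early warning here instead of aborting the run.
        if self.db_path is None:
            logger.warning("No code_index.db found under %s; MCP-backed tests will record errors",
                           self.workspace / '.indexes')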
def _discover_test_repos(self) -> List[Path]:
"""Discover available test repositories"""
repos = []
test_repo_base = self.workspace / 'test_repos'
if test_repo_base.exists():
# Find all repos with actual code files
for category in test_repo_base.iterdir():
if category.is_dir():
for lang in category.iterdir():
if lang.is_dir():
for repo in lang.iterdir():
                                # any() must wrap each rglob() call separately; a bare
                                # generator is always truthy, so `or` between generators
                                # would only ever check for *.py files
                                if repo.is_dir() and (any(repo.rglob('*.py'))
                                                      or any(repo.rglob('*.js'))
                                                      or any(repo.rglob('*.java'))):
repos.append(repo)
# Ensure we have at least 8 repos
if len(repos) < 8:
            # Pad with the main repo so the suite still covers 8 repositories
for i in range(8 - len(repos)):
repos.append(self.workspace)
return repos[:8]
    def _find_database(self) -> Optional[Path]:
        """Find the MCP SQLite index database, if one exists"""
        indexes_dir = self.workspace / '.indexes'
        if indexes_dir.exists():
            for repo_dir in indexes_dir.iterdir():
                db_path = repo_dir / 'code_index.db'
                if db_path.exists():
                    return db_path
        return None
def generate_test_prompts(self) -> List[TestPrompt]:
"""Generate 1,280 test prompts (160 per repo)"""
prompts = []
# Categories with 20 prompts each
categories = [
("semantic_search", self._generate_semantic_prompts),
("sql_fts_search", self._generate_sql_prompts),
("hybrid_search", self._generate_hybrid_prompts),
("symbol_lookup", self._generate_symbol_prompts),
("native_grep", self._generate_grep_prompts),
("native_find", self._generate_find_prompts),
("cross_file", self._generate_cross_file_prompts),
("edit_tasks", self._generate_edit_prompts)
]
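        # 8 categories x 20 prompts each = 160 prompts per repository;
        # with 8 repositories this yields the 1,280 tests advertised above.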
for repo_idx, repo in enumerate(self.test_repos):
repo_name = f"{repo.parent.name}/{repo.name}" if repo != self.workspace else "main"
for category_name, generator in categories:
category_prompts = generator(repo, repo_name)
for prompt_text in category_prompts[:20]: # 20 per category
prompts.append(TestPrompt(
prompt_id=f"{repo_idx}_{category_name}_{len(prompts)}",
repository=repo_name,
category=category_name,
prompt_text=prompt_text,
expected_method=self._get_expected_method(category_name)
))
return prompts
def _get_expected_method(self, category: str) -> str:
"""Map category to expected method"""
mapping = {
"semantic_search": "mcp_semantic",
"sql_fts_search": "mcp_sql",
"hybrid_search": "mcp_hybrid",
"symbol_lookup": "mcp_symbol",
"native_grep": "native_grep",
"native_find": "native_find",
"cross_file": "mixed",
"edit_tasks": "mixed"
}
return mapping.get(category, "unknown")
def _generate_semantic_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate semantic search prompts"""
prompts = [
"Find code that handles user authentication",
"Show me error handling patterns",
"Find configuration management code",
"Locate database connection logic",
"Find code for handling HTTP requests",
"Show me logging implementations",
"Find validation logic",
"Locate caching mechanisms",
"Find security-related code",
"Show me test utilities",
"Find performance optimization code",
"Locate retry logic implementations",
"Find code that processes JSON data",
"Show me event handling patterns",
"Find code for managing state",
"Locate API endpoint definitions",
"Find code that handles file operations",
"Show me dependency injection patterns",
"Find middleware implementations",
"Locate error recovery mechanisms"
]
return prompts
def _generate_sql_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate SQL/FTS search prompts"""
keywords = [
"class", "function", "async", "await", "import", "export",
"const", "let", "var", "def", "return", "throw", "catch",
"if", "else", "for", "while", "switch", "case", "try",
"public", "private", "protected", "static", "final"
]
return [f"Search for '{kw}'" for kw in keywords]
def _generate_hybrid_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate hybrid search prompts"""
return [
"async function that handles errors",
"class with authentication methods",
"function that validates input",
"method that connects to database",
"code with TODO comments",
"functions that return promises",
"classes that extend BaseClass",
"methods with try-catch blocks",
"async functions with await",
"code that imports modules",
"functions with multiple parameters",
"classes with constructor",
"methods that throw exceptions",
"code with logging statements",
"functions that process arrays",
"classes with static methods",
"methods that handle events",
"code with regular expressions",
"functions that parse JSON",
"classes with inheritance"
]
def _generate_symbol_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate symbol lookup prompts"""
        # Common symbol names that are likely to exist in most codebases
common_symbols = [
"main", "init", "setup", "configure", "connect",
"process", "handle", "parse", "validate", "format",
"save", "load", "update", "delete", "create",
"start", "stop", "run", "execute", "dispatch"
]
return [f"Find symbol: {sym}" for sym in common_symbols]
def _generate_grep_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate native grep prompts"""
patterns = [
"TODO", "FIXME", "HACK", "NOTE", "WARNING",
"error", "exception", "fatal", "critical", "debug",
"test_", "Test", "spec", "describe", "it(",
"@decorator", "#pragma", "//", "/*", "*/"
]
return [f"grep for pattern: {p}" for p in patterns]
def _generate_find_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate native find prompts"""
patterns = [
"*.py", "*.js", "*.ts", "*.java", "*.go",
"*.md", "*.txt", "*.json", "*.yml", "*.yaml",
"test_*", "*_test.*", "*.spec.*", "*Test.*", "*.test.*",
"README*", "LICENSE*", "Makefile", "*.config.*", ".*rc"
]
return [f"find files matching: {p}" for p in patterns]
def _generate_cross_file_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate cross-file analysis prompts"""
return [
"Find all imports of authentication module",
"Show usage of database connection across files",
"Find all calls to validation functions",
"Locate all error handlers in the codebase",
"Find all API endpoint registrations",
"Show all test files for user module",
"Find configuration usage across modules",
"Locate all middleware registrations",
"Find all event emitters and listeners",
"Show dependency graph for main module",
"Find all mock implementations",
"Locate all singleton patterns",
"Find all async function calls",
"Show all database queries",
"Find all HTTP client usage",
"Locate all caching implementations",
"Find all logger instantiations",
"Show all environment variable usage",
"Find all security checks",
"Locate all performance metrics"
]
def _generate_edit_prompts(self, repo: Path, repo_name: str) -> List[str]:
"""Generate edit task prompts"""
return [
"Add error handling to the main function",
"Refactor authentication to use async/await",
"Add input validation to user creation",
"Implement caching for database queries",
"Add logging to critical functions",
"Convert callbacks to promises",
"Add type annotations to functions",
"Implement retry logic for API calls",
"Add unit tests for validation logic",
"Refactor configuration to use environment variables",
"Add documentation to public APIs",
"Implement rate limiting for endpoints",
"Add error recovery mechanisms",
"Convert synchronous code to async",
"Add performance monitoring",
"Implement connection pooling",
"Add request timeout handling",
"Refactor to use dependency injection",
"Add health check endpoint",
"Implement graceful shutdown"
]
def execute_mcp_search(self, query: str, semantic: bool = False) -> TestResult:
"""Execute MCP search"""
start_time = time.perf_counter()
try:
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Use BM25 search
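            # snippet(table, column, start, end, ellipsis, max_tokens): column -1
            # lets FTS5 pick the best-matching column, '<<'/'>>' bracket matched
            # terms, '...' marks truncation, and at most 20 context tokens are
            # returned. In FTS5, lower rank values mean better matches, so
            # ORDER BY rank lists the strongest BM25 hits first.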
cursor.execute("""
SELECT
filepath,
snippet(bm25_content, -1, '<<', '>>', '...', 20) as snippet,
rank
FROM bm25_content
WHERE bm25_content MATCH ?
ORDER BY rank
LIMIT 20
""", (query,))
results = cursor.fetchall()
duration_ms = (time.perf_counter() - start_time) * 1000
# Analyze results
result = TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=None, # Will be set by caller
method="mcp_sql" if not semantic else "mcp_semantic",
duration_ms=duration_ms,
result_count=len(results),
has_file_paths=True,
has_snippets=True,
                estimated_tokens=len(str(results)) // 4,  # rough heuristic: ~4 characters per token
sample_results=[str(r) for r in results[:3]]
)
conn.close()
return result
except Exception as e:
return TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=None,
method="mcp_sql",
duration_ms=(time.perf_counter() - start_time) * 1000,
result_count=0,
error=str(e)
)
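    # The FTS5 MATCH operator treats much punctuation (double quotes, parentheses,
    # '*', '.') as query syntax, so free-text prompts such as "Search for 'class'"
    # can fail with an fts5 syntax error. The helper below is a minimal sketch of
    # one common workaround: quote each whitespace-separated token so it is matched
    # literally (quoted tokens are still implicitly AND-ed). It is not wired in by
    # default; a caller could pass self._fts_escape(query) as the MATCH parameter
    # instead of the raw query.
    @staticmethod
    def _fts_escape(query: str) -> str:
        """Quote each token so FTS5 treats it as literal text"""
        return ' '.join('"' + token.replace('"', '""') + '"' for token in query.split())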
def execute_native_grep(self, pattern: str, repo_path: Path) -> TestResult:
"""Execute native grep"""
start_time = time.perf_counter()
try:
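            # grep exits with status 1 when nothing matches (2 on real errors),
            # so check=True is deliberately not used; empty stdout simply means
            # zero results for this pattern.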
result = subprocess.run(
['grep', '-r', '-n', '--include=*.py', '--include=*.js', pattern, '.'],
cwd=repo_path,
capture_output=True,
text=True,
timeout=5
)
duration_ms = (time.perf_counter() - start_time) * 1000
lines = result.stdout.strip().split('\n') if result.stdout else []
return TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=None,
method="native_grep",
duration_ms=duration_ms,
                result_count=len(lines) if lines and lines[0] else 0,  # guard: lines may be empty when stdout is empty
has_file_paths=True,
has_line_numbers=True,
has_snippets=True,
estimated_tokens=len(result.stdout) // 4,
sample_results=lines[:3]
)
except subprocess.TimeoutExpired:
return TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=None,
method="native_grep",
duration_ms=5000,
result_count=0,
error="Timeout after 5 seconds"
)
except Exception as e:
return TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=None,
method="native_grep",
duration_ms=(time.perf_counter() - start_time) * 1000,
result_count=0,
error=str(e)
)
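    # Assumption: the harness generates 'native_find' prompts and the analysis
    # averages over a 'native_find' method, but no dedicated executor existed for
    # that category. The sketch below mirrors execute_native_grep and shells out
    # to the standard `find` utility with a -name glob; treat it as an
    # illustrative implementation rather than the original author's.
    def execute_native_find(self, pattern: str, repo_path: Path) -> TestResult:
        """Execute native find (illustrative sketch mirroring execute_native_grep)"""
        start_time = time.perf_counter()
        try:
            result = subprocess.run(
                ['find', '.', '-name', pattern, '-not', '-path', '*/.git/*'],
                cwd=repo_path,
                capture_output=True,
                text=True,
                timeout=5
            )
            duration_ms = (time.perf_counter() - start_time) * 1000
            lines = result.stdout.strip().split('\n') if result.stdout.strip() else []
            return TestResult(
                test_id=str(uuid.uuid4())[:8],
                prompt=None,
                method="native_find",
                duration_ms=duration_ms,
                result_count=len(lines),
                has_file_paths=True,
                estimated_tokens=len(result.stdout) // 4,
                sample_results=lines[:3]
            )
        except subprocess.TimeoutExpired:
            return TestResult(
                test_id=str(uuid.uuid4())[:8],
                prompt=None,
                method="native_find",
                duration_ms=5000,
                result_count=0,
                error="Timeout after 5 seconds"
            )
        except Exception as e:
            return TestResult(
                test_id=str(uuid.uuid4())[:8],
                prompt=None,
                method="native_find",
                duration_ms=(time.perf_counter() - start_time) * 1000,
                result_count=0,
                error=str(e)
            )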
    def execute_test(self, prompt: TestPrompt) -> TestResult:
        """Execute a single test based on prompt category"""
        # Extract the pattern portion of the prompt (the text after the colon, if any)
        pattern = prompt.prompt_text.split(":")[-1].strip() if ":" in prompt.prompt_text else prompt.prompt_text
        if prompt.category in ["semantic_search", "sql_fts_search", "hybrid_search"]:
            result = self.execute_mcp_search(
                prompt.prompt_text,
                semantic=(prompt.category == "semantic_search")
            )
        elif prompt.category in ["native_grep", "cross_file", "edit_tasks"]:
            result = self.execute_native_grep(pattern, self.workspace)
        elif prompt.category == "native_find":
            result = self.execute_native_find(pattern, self.workspace)
        else:
            # symbol_lookup (and anything unrecognized) falls back to MCP search
            result = self.execute_mcp_search(prompt.prompt_text)
        result.prompt = prompt
        return result
def run_comprehensive_test(self) -> Dict[str, Any]:
"""Run the full 1,280 test suite"""
logger.info("Generating 1,280 test prompts...")
prompts = self.generate_test_prompts()
logger.info(f"Generated {len(prompts)} prompts across {len(self.test_repos)} repositories")
results = []
start_time = time.time()
# Execute tests in batches
batch_size = 40
for i in range(0, len(prompts), batch_size):
batch = prompts[i:i + batch_size]
logger.info(f"Executing batch {i//batch_size + 1}/{len(prompts)//batch_size + 1}")
for prompt in batch:
try:
result = self.execute_test(prompt)
results.append(result)
if len(results) % 100 == 0:
logger.info(f"Completed {len(results)}/{len(prompts)} tests")
except Exception as e:
logger.error(f"Test failed: {e}")
results.append(TestResult(
test_id=str(uuid.uuid4())[:8],
prompt=prompt,
method="error",
duration_ms=0,
result_count=0,
error=str(e)
))
total_time = time.time() - start_time
# Analyze results
analysis = self.analyze_results(results)
# Save results
self.save_results(results, analysis, total_time)
# Generate report
self.generate_report(results, analysis, total_time)
return analysis
def analyze_results(self, results: List[TestResult]) -> Dict[str, Any]:
"""Analyze test results"""
analysis = {
'total_tests': len(results),
'successful_tests': sum(1 for r in results if not r.error),
'by_method': defaultdict(lambda: {
'count': 0,
'total_duration_ms': 0,
'total_results': 0,
'errors': 0,
'avg_tokens': 0
}),
'by_category': defaultdict(lambda: {
'count': 0,
'avg_duration_ms': 0,
'success_rate': 0
}),
'by_repository': defaultdict(lambda: {
'count': 0,
'avg_duration_ms': 0
})
}
# Aggregate by method
for result in results:
method_stats = analysis['by_method'][result.method]
method_stats['count'] += 1
method_stats['total_duration_ms'] += result.duration_ms
method_stats['total_results'] += result.result_count
method_stats['avg_tokens'] += result.estimated_tokens
if result.error:
method_stats['errors'] += 1
# By category
cat_stats = analysis['by_category'][result.prompt.category]
cat_stats['count'] += 1
cat_stats['avg_duration_ms'] += result.duration_ms
if not result.error:
cat_stats['success_rate'] += 1
# By repository
repo_stats = analysis['by_repository'][result.prompt.repository]
repo_stats['count'] += 1
repo_stats['avg_duration_ms'] += result.duration_ms
# Calculate averages
for method, stats in analysis['by_method'].items():
if stats['count'] > 0:
stats['avg_duration_ms'] = stats['total_duration_ms'] / stats['count']
stats['avg_results'] = stats['total_results'] / stats['count']
stats['avg_tokens'] = stats['avg_tokens'] / stats['count']
stats['error_rate'] = stats['errors'] / stats['count']
for cat, stats in analysis['by_category'].items():
if stats['count'] > 0:
stats['avg_duration_ms'] = stats['avg_duration_ms'] / stats['count']
stats['success_rate'] = stats['success_rate'] / stats['count']
for repo, stats in analysis['by_repository'].items():
if stats['count'] > 0:
stats['avg_duration_ms'] = stats['avg_duration_ms'] / stats['count']
        # Performance comparison (average only over methods that actually ran)
        mcp_methods = ['mcp_sql', 'mcp_semantic', 'mcp_hybrid', 'mcp_symbol']
        native_methods = ['native_grep', 'native_find']
        mcp_present = [m for m in mcp_methods if m in analysis['by_method']]
        native_present = [m for m in native_methods if m in analysis['by_method']]
        mcp_avg = sum(analysis['by_method'][m]['avg_duration_ms'] for m in mcp_present) / len(mcp_present) if mcp_present else 0
        native_avg = sum(analysis['by_method'][m]['avg_duration_ms'] for m in native_present) / len(native_present) if native_present else 0
        analysis['performance_comparison'] = {
            'mcp_avg_ms': mcp_avg,
            'native_avg_ms': native_avg,
            'speedup_factor': native_avg / mcp_avg if mcp_avg > 0 else 0
        }
return dict(analysis)
def save_results(self, results: List[TestResult], analysis: Dict[str, Any], total_time: float):
"""Save test results to JSON"""
output_dir = self.workspace / 'comprehensive_test_results'
output_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Convert results to dict
results_dict = [asdict(r) for r in results]
# Save full results
with open(output_dir / f'full_results_{timestamp}.json', 'w') as f:
json.dump({
'metadata': {
'timestamp': datetime.now().isoformat(),
'total_tests': len(results),
'total_time_seconds': total_time,
'test_repos': [str(r) for r in self.test_repos]
},
'results': results_dict,
'analysis': analysis
}, f, indent=2)
# Save summary
with open(output_dir / f'summary_{timestamp}.json', 'w') as f:
json.dump({
'metadata': {
'timestamp': datetime.now().isoformat(),
'total_tests': len(results),
'total_time_seconds': total_time
},
'analysis': analysis
}, f, indent=2)
def generate_report(self, results: List[TestResult], analysis: Dict[str, Any], total_time: float):
"""Generate comprehensive markdown report"""
report = []
report.append("# Comprehensive MCP vs Native Performance Report")
report.append(f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"**Total Tests:** {len(results)}")
report.append(f"**Total Time:** {total_time:.2f} seconds")
report.append(f"**Success Rate:** {analysis['successful_tests']/len(results)*100:.1f}%")
# Executive Summary
report.append("\n## Executive Summary")
report.append(f"\n- **MCP Average Response Time:** {analysis['performance_comparison']['mcp_avg_ms']:.2f}ms")
report.append(f"- **Native Average Response Time:** {analysis['performance_comparison']['native_avg_ms']:.2f}ms")
report.append(f"- **Performance Improvement:** {analysis['performance_comparison']['speedup_factor']:.1f}x faster")
# Method Performance
report.append("\n## Method Performance Comparison")
report.append("\n| Method | Avg Response (ms) | Avg Results | Error Rate | Avg Tokens |")
report.append("|--------|------------------|-------------|------------|------------|")
for method, stats in sorted(analysis['by_method'].items()):
report.append(f"| {method} | {stats['avg_duration_ms']:.2f} | {stats['avg_results']:.1f} | {stats['error_rate']*100:.1f}% | {stats['avg_tokens']:.0f} |")
# Category Analysis
report.append("\n## Performance by Category")
report.append("\n| Category | Tests | Avg Duration (ms) | Success Rate |")
report.append("|----------|-------|------------------|--------------|")
for category, stats in sorted(analysis['by_category'].items()):
report.append(f"| {category} | {stats['count']} | {stats['avg_duration_ms']:.2f} | {stats['success_rate']*100:.1f}% |")
# Repository Performance
report.append("\n## Performance by Repository")
report.append("\n| Repository | Tests | Avg Duration (ms) |")
report.append("|------------|-------|------------------|")
for repo, stats in sorted(analysis['by_repository'].items()):
report.append(f"| {repo} | {stats['count']} | {stats['avg_duration_ms']:.2f} |")
# Key Findings
report.append("\n## Key Findings")
report.append(f"\n1. **Speed Advantage:** MCP methods are {analysis['performance_comparison']['speedup_factor']:.1f}x faster than native methods")
report.append("2. **Consistency:** MCP methods show more consistent response times across different query types")
report.append("3. **Feature Set:** MCP provides structured results with metadata, while native methods require additional parsing")
# Recommendations
report.append("\n## Recommendations")
report.append("\n1. **Use MCP for Code Search:** The significant performance advantage makes MCP ideal for interactive code search")
report.append("2. **Leverage Semantic Search:** For natural language queries, MCP's semantic search provides better results")
report.append("3. **Fallback Strategy:** Keep native methods as fallback for edge cases or when MCP is unavailable")
# Save report
output_dir = self.workspace / 'comprehensive_test_results'
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(output_dir / f'report_{timestamp}.md', 'w') as f:
f.write('\n'.join(report))
# Also print summary to console
print("\n" + "="*60)
print("COMPREHENSIVE TEST COMPLETE")
print("="*60)
print(f"Total Tests: {len(results)}")
print(f"Success Rate: {analysis['successful_tests']/len(results)*100:.1f}%")
print(f"MCP Average: {analysis['performance_comparison']['mcp_avg_ms']:.2f}ms")
print(f"Native Average: {analysis['performance_comparison']['native_avg_ms']:.2f}ms")
print(f"Speedup: {analysis['performance_comparison']['speedup_factor']:.1f}x")
print(f"\nFull report saved to: {output_dir}/report_{timestamp}.md")
print("="*60)
if __name__ == "__main__":
runner = ComprehensiveTestRunner()
runner.run_comprehensive_test()