Code-Index-MCP

test_repository_indexing.py•15.2 KiB

#!/usr/bin/env python3
"""
Real-world repository indexing tests for Code-Index-MCP.

Tests indexing performance and accuracy on actual GitHub repositories.
"""

import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import pytest

from mcp_server.dispatcher.dispatcher import Dispatcher
from mcp_server.interfaces.shared_interfaces import Result
from mcp_server.plugin_system.plugin_manager import PluginManager
from mcp_server.storage.sqlite_store import SQLiteStore


def _benchmark_mean_seconds(benchmark):
    """Return the mean time recorded by the ``benchmark`` fixture, or ``None``.

    The fixture's ``stats`` attribute may be ``None`` (nothing recorded), may
    expose ``mean`` directly, or may wrap the timings in a nested ``stats``
    object. All three layouts are tolerated so the timing assertions do not
    fail with ``AttributeError`` instead of a meaningful message.
    NOTE(review): pytest-benchmark appears to use the nested layout
    (``benchmark.stats.stats.mean``) -- confirm against the pinned version.
    """
    recorded = getattr(benchmark, "stats", None)
    if recorded is None:
        return None
    timings = getattr(recorded, "stats", recorded)
    return getattr(timings, "mean", None)


class TestRepositoryIndexing:
    """Test indexing performance on real-world repositories."""

    # File extension -> name of the plugin registered on the dispatcher.
    _PLUGIN_BY_EXTENSION = {
        ".py": "python_plugin",
        ".js": "js_plugin",
        ".jsx": "js_plugin",
        ".ts": "js_plugin",
        ".tsx": "js_plugin",
        ".c": "c_plugin",
        ".h": "c_plugin",
        ".cpp": "cpp_plugin",
        ".cc": "cpp_plugin",
        ".cxx": "cpp_plugin",
        ".hpp": "cpp_plugin",
    }

    @pytest.fixture
    def workspace_dir(self):
        """Provide workspace directory for test repositories."""
        workspace = Path("test_workspace/real_repos")
        workspace.mkdir(parents=True, exist_ok=True)
        return workspace

    @pytest.fixture
    def test_db(self):
        """Provide clean test database backed by a temporary file."""
        # delete=False: only the path is needed; the store opens the file itself.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name

        store = SQLiteStore(db_path)
        yield store

        # Cleanup is best-effort: the file may still be locked or already gone.
        try:
            os.unlink(db_path)
        except OSError:
            pass

    @pytest.fixture
    def dispatcher(self):
        """Provide configured dispatcher with plugins."""
        dispatcher = Dispatcher()
        plugin_manager = PluginManager()

        # Load all available plugins; a plugin that fails to load is reported
        # and skipped so the remaining languages can still be exercised.
        for plugin_name in ["python_plugin", "js_plugin", "c_plugin", "cpp_plugin"]:
            try:
                plugin = plugin_manager.load_plugin(plugin_name)
                dispatcher.register_plugin(plugin_name, plugin)
            except Exception as e:
                print(f"Failed to load {plugin_name}: {e}")

        return dispatcher

    def ensure_repository_exists(self, workspace_dir: Path, repo_name: str, repo_url: str) -> Path:
        """Ensure repository exists in workspace, download if needed.

        Args:
            workspace_dir: Directory that holds the cloned repositories.
            repo_name: Sub-directory name to clone into.
            repo_url: Git URL to clone from when the directory is absent.

        Returns:
            Path to the repository checkout.

        The calling test is skipped (not failed) when the clone cannot be
        completed: git missing, clone error, or timeout.
        """
        repo_path = workspace_dir / repo_name
        if repo_path.exists():
            return repo_path

        print(f"Downloading {repo_name} from {repo_url}...")
        try:
            subprocess.run(
                ["git", "clone", "--depth=1", repo_url, str(repo_path)],
                check=True,
                capture_output=True,
                timeout=300,
            )
        except FileNotFoundError:
            # git itself is not installed; nothing was created on disk.
            pytest.skip(f"git executable not found, cannot download {repo_name}")
        except subprocess.CalledProcessError as e:
            # Remove any partial checkout so the next run retries the clone
            # instead of indexing an incomplete tree.
            shutil.rmtree(repo_path, ignore_errors=True)
            pytest.skip(f"Failed to download {repo_name}: {e}")
        except subprocess.TimeoutExpired:
            shutil.rmtree(repo_path, ignore_errors=True)
            pytest.skip(f"Timeout downloading {repo_name}")

        print(f"Successfully downloaded {repo_name}")
        return repo_path

    def index_file_with_plugin(
        self, file_path: str, content: str, dispatcher: Dispatcher
    ) -> Result:
        """Index a file using the appropriate plugin.

        The plugin is chosen from the lower-cased file extension. Returns a
        failed ``Result`` when no registered plugin handles the extension, when
        the plugin returns a falsy value, or when the plugin raises.
        """
        try:
            # Detect language from file extension
            extension = Path(file_path).suffix.lower()
            plugin_name = self._PLUGIN_BY_EXTENSION.get(extension)

            if not plugin_name or plugin_name not in dispatcher.plugins:
                return Result.error(f"No plugin available for {extension}")

            plugin = dispatcher.plugins[plugin_name]
            result = plugin.indexFile(file_path, content)

            return (
                Result.success(result) if result else Result.error("Plugin returned empty result")
            )
        except Exception as e:
            # Any plugin failure is reported as an error Result so callers
            # can count it instead of aborting the whole run.
            return Result.error(f"Error indexing {file_path}: {str(e)}")

    @pytest.mark.performance
    @pytest.mark.parametrize(
        "repo_name,repo_url,expected_files,expected_symbols",
        [
            ("requests", "https://github.com/psf/requests.git", 50, 500),
            ("terminal_subset", "https://github.com/microsoft/terminal.git", 200, 2000),
        ],
    )
    def test_small_repository_indexing(
        self,
        repo_name,
        repo_url,
        expected_files,
        expected_symbols,
        workspace_dir,
        test_db,
        dispatcher,
        benchmark,
    ):
        """Test indexing performance on small real-world repositories."""
        repo_path = self.ensure_repository_exists(workspace_dir, repo_name, repo_url)

        def index_repository():
            repo_id = test_db.create_repository(str(repo_path), repo_name, {"type": "test"})

            indexed_files = 0
            total_symbols = 0
            errors = []

            # Find relevant source files
            extensions = [".py", ".js", ".ts", ".c", ".cpp", ".h", ".hpp"]
            source_files = []
            for ext in extensions:
                source_files.extend(repo_path.rglob(f"*{ext}"))

            # Limit to reasonable number for testing
            source_files = source_files[:1000]

            for file_path in source_files:
                if not file_path.is_file():
                    continue
                try:
                    content = file_path.read_text(encoding="utf-8", errors="ignore")
                    if not content.strip():
                        continue

                    result = self.index_file_with_plugin(str(file_path), content, dispatcher)
                    if not result.success:
                        errors.append(f"{file_path}: {result.error}")
                        continue

                    indexed_files += 1
                    symbols = result.value.get("symbols", [])
                    total_symbols += len(symbols)

                    # Store in database
                    file_id = test_db.store_file(
                        repo_id,
                        str(file_path.relative_to(repo_path)),
                        content,
                        file_path.suffix[1:],
                        file_path.stat().st_mtime,
                    )

                    for symbol in symbols:
                        # The single reported line/column is passed for both
                        # positions of each pair, as the plugin result only
                        # provides one of each.
                        test_db.store_symbol(
                            file_id,
                            symbol.get("name", ""),
                            symbol.get("type", ""),
                            symbol.get("line", 0),
                            symbol.get("line", 0),
                            symbol.get("column", 0),
                            symbol.get("column", 0),
                            symbol.get("definition", ""),
                        )
                except Exception as e:
                    errors.append(f"{file_path}: {str(e)}")

            return indexed_files, total_symbols, len(errors)

        # Benchmark the indexing process
        # NOTE(review): the benchmark fixture may invoke index_repository more
        # than once, and every invocation writes to test_db -- confirm the
        # store tolerates repeated create_repository/store_file calls.
        result = benchmark(index_repository)
        indexed_files, total_symbols, error_count = result

        # Validate results
        assert (
            indexed_files >= expected_files * 0.6
        ), f"Expected at least {expected_files * 0.6} files, got {indexed_files}"
        assert (
            total_symbols >= expected_symbols * 0.6
        ), f"Expected at least {expected_symbols * 0.6} symbols, got {total_symbols}"
        assert error_count < indexed_files * 0.2, f"Too many errors: {error_count}/{indexed_files}"

        # Performance assertions (only when timings were actually recorded)
        mean_seconds = _benchmark_mean_seconds(benchmark)
        if mean_seconds is not None:
            assert mean_seconds < 60.0, f"Indexing took too long: {mean_seconds:.2f}s"

        print(f"Indexed {indexed_files} files with {total_symbols} symbols, {error_count} errors")
        attempted = indexed_files + error_count
        if attempted:
            print(f"Success rate: {(indexed_files / attempted) * 100:.1f}%")

    @pytest.mark.performance
    @pytest.mark.slow
    def test_large_repository_indexing(self, workspace_dir, test_db, dispatcher, benchmark):
        """Test indexing performance on Django (large Python codebase)."""
        repo_path = self.ensure_repository_exists(
            workspace_dir, "django", "https://github.com/django/django.git"
        )

        def index_django():
            # Called for its side effect only; the returned id is not needed here.
            test_db.create_repository(str(repo_path), "django", {"type": "large_test"})

            indexed_files = 0
            total_symbols = 0
            errors = []

            # Index only Python files to focus on our plugin
            python_files = list(repo_path.rglob("*.py"))
            print(f"Found {len(python_files)} Python files")

            # Limit to 1000 files for testing performance
            test_files = python_files[:1000]

            for file_path in test_files:
                try:
                    # Skip very large files
                    if file_path.stat().st_size > 1024 * 1024:  # 1MB
                        continue

                    content = file_path.read_text(encoding="utf-8", errors="ignore")
                    if not content.strip():
                        continue

                    result = self.index_file_with_plugin(str(file_path), content, dispatcher)
                    if result.success:
                        indexed_files += 1
                        symbols = result.value.get("symbols", [])
                        total_symbols += len(symbols)
                    else:
                        errors.append(f"{file_path}: {result.error}")
                except Exception as e:
                    errors.append(f"{file_path}: {str(e)}")

            return indexed_files, total_symbols, len(errors)

        result = benchmark(index_django)
        indexed_files, total_symbols, error_count = result

        # Performance and quality assertions
        assert indexed_files >= 800, f"Should index most files successfully, got {indexed_files}"
        assert total_symbols >= 10000, f"Django should have many symbols, got {total_symbols}"
        assert (
            error_count < indexed_files * 0.1
        ), f"Error rate too high: {error_count}/{indexed_files}"

        mean_seconds = _benchmark_mean_seconds(benchmark)
        if mean_seconds is not None:
            assert mean_seconds < 180.0, f"Indexing took too long: {mean_seconds:.2f}s"

        print(
            f"Django indexing: {indexed_files} files, {total_symbols} symbols, {error_count} errors"
        )
        # Guard against a missing or zero mean before dividing.
        if mean_seconds:
            print(f"Rate: {indexed_files / mean_seconds:.1f} files/second")

    @pytest.mark.integration
    def test_multi_language_repository(self, workspace_dir, test_db, dispatcher):
        """Test indexing a multi-language repository like VS Code."""
        repo_path = self.ensure_repository_exists(
            workspace_dir, "vscode_subset", "https://github.com/microsoft/vscode.git"
        )

        # Called for its side effect only; the returned id is not needed here.
        test_db.create_repository(str(repo_path), "vscode_subset", {"type": "multi_lang_test"})

        language_stats = {}
        total_indexed = 0
        failures = []

        # Find files for each language
        language_extensions = {
            "typescript": [".ts", ".tsx"],
            "javascript": [".js", ".jsx"],
            "python": [".py"],
            "c": [".c", ".h"],
            "cpp": [".cpp", ".hpp"],
        }

        for language, extensions in language_extensions.items():
            language_files = []
            for ext in extensions:
                language_files.extend(list(repo_path.rglob(f"*{ext}"))[:100])  # Limit per language

            indexed_count = 0
            symbol_count = 0

            for file_path in language_files:
                try:
                    if file_path.stat().st_size > 512 * 1024:  # Skip files > 512KB
                        continue

                    content = file_path.read_text(encoding="utf-8", errors="ignore")
                    if not content.strip():
                        continue

                    result = self.index_file_with_plugin(str(file_path), content, dispatcher)
                    if result.success:
                        indexed_count += 1
                        symbols = result.value.get("symbols", [])
                        symbol_count += len(symbols)
                except Exception as e:
                    # Still best-effort, but keep a record so problems are
                    # visible in the test output instead of vanishing.
                    failures.append(f"{file_path}: {e}")
                    continue

            language_stats[language] = {
                "files": len(language_files),
                "indexed": indexed_count,
                "symbols": symbol_count,
            }
            total_indexed += indexed_count

        if failures:
            print(f"{len(failures)} files raised while indexing, first: {failures[0]}")

        # Validate multi-language support
        languages_with_results = [
            lang for lang, stats in language_stats.items() if stats["indexed"] > 0
        ]
        assert (
            len(languages_with_results) >= 2
        ), f"Should support multiple languages, got: {languages_with_results}"
        assert total_indexed >= 50, f"Should index reasonable number of files, got {total_indexed}"

        for language, stats in language_stats.items():
            if stats["files"] > 0:
                success_rate = stats["indexed"] / stats["files"]
                print(
                    f"{language}: {stats['indexed']}/{stats['files']} files ({success_rate:.1%}), {stats['symbols']} symbols"
                )

    @pytest.mark.performance
    def test_indexing_memory_efficiency(self, workspace_dir, test_db, dispatcher):
        """Test memory efficiency during large repository indexing."""
        # psutil is an optional dependency: skip rather than error when absent.
        psutil = pytest.importorskip("psutil")

        repo_path = self.ensure_repository_exists(
            workspace_dir, "requests", "https://github.com/psf/requests.git"
        )

        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB

        # Called for its side effect only; the returned id is not needed here.
        test_db.create_repository(
            str(repo_path), "requests_memory_test", {"type": "memory_test"}
        )

        indexed_files = 0
        memory_samples = [initial_memory]
        failures = []

        python_files = list(repo_path.rglob("*.py"))

        for i, file_path in enumerate(python_files):
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                result = self.index_file_with_plugin(str(file_path), content, dispatcher)

                if result.success:
                    indexed_files += 1

                # Sample memory every 10 files
                if i % 10 == 0:
                    current_memory = process.memory_info().rss / 1024 / 1024
                    memory_samples.append(current_memory)
            except Exception as e:
                # Best-effort: record the failure and keep measuring.
                failures.append(f"{file_path}: {e}")
                continue

        if failures:
            print(f"{len(failures)} files raised while indexing, first: {failures[0]}")

        final_memory = process.memory_info().rss / 1024 / 1024
        memory_increase = final_memory - initial_memory
        max_memory = max(memory_samples)

        # Memory efficiency assertions
        assert memory_increase < 100, f"Memory increase too high: {memory_increase:.1f}MB"
        assert max_memory < initial_memory + 150, f"Peak memory too high: {max_memory:.1f}MB"
        assert indexed_files >= 20, f"Should index reasonable number of files: {indexed_files}"

        print(
            f"Memory test: {indexed_files} files, {memory_increase:.1f}MB increase, {max_memory:.1f}MB peak"
        )

        # Check for memory leaks: compare the last sample with the first one
        # taken inside the loop (index 0 is the pre-indexing baseline).
        if len(memory_samples) > 2:
            memory_trend = memory_samples[-1] - memory_samples[1]
            assert memory_trend < 50, f"Possible memory leak detected: {memory_trend:.1f}MB trend"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_repository_indexing.py•15.2 KiB