mcp-skills

Overview Schema Related Servers Score Discussions

mcp-skills
tests
benchmarks

test_performance_benchmarks.py•19.2 KiB

"""Comprehensive performance benchmarks for mcp-skillset. Design Decision: Multi-Scale Performance Testing Rationale: Test performance across different scales (100, 1000, 10000 skills) to identify scalability bottlenecks and ensure performance doesn't degrade non-linearly with dataset growth. Baseline Thresholds (target performance): - Index 100 skills: < 10 seconds - Index 1000 skills: < 100 seconds - Search query (p50): < 100ms - Search query (p95): < 500ms - SQLite lookup by ID: < 1ms - SQLite query with filters: < 10ms Performance Monitoring Strategy: 1. Track metrics across scales to detect O(n²) behavior 2. Use pytest-benchmark for consistent measurement 3. Save baseline for regression detection 4. Monitor memory usage for large-scale operations Error Handling: - Benchmark failures should not crash test suite - Out-of-memory scenarios should be handled gracefully - Timeout protection for large-scale benchmarks """ import gc import time from datetime import UTC from pathlib import Path import pytest from mcp_skills.models.skill import Skill from mcp_skills.services.indexing import IndexingEngine from mcp_skills.services.metadata_store import MetadataStore from mcp_skills.services.skill_manager import SkillManager class TestIndexingPerformance: """Benchmark indexing performance across different scales. Tests measure: - Time to index N skills - Memory usage during indexing - Throughput (skills/second) - Scalability characteristics """ def test_index_100_skills_baseline( self, benchmark, benchmark_storage_path: Path, benchmark_skills_100: list[Skill] ): """Benchmark indexing 100 skills (baseline scale). Target: < 10 seconds Performance Metrics: - Total time to index 100 skills - Average time per skill (~100ms expected) - Memory overhead Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path benchmark_skills_100: List of 100 sample skills """ def setup(): """Setup fresh engine for each benchmark round.""" skill_manager = SkillManager() skill_manager._skill_cache = { skill.id: skill for skill in benchmark_skills_100 } skill_manager._skill_paths = { skill.id: skill.file_path for skill in benchmark_skills_100 } engine = IndexingEngine( vector_backend="chromadb", graph_backend="networkx", skill_manager=skill_manager, storage_path=benchmark_storage_path / "index_100", ) return (engine, benchmark_skills_100), {} def index_skills(engine, skills, *args): """Index all skills.""" for skill in skills: engine.index_skill(skill) # Run benchmark with setup benchmark.pedantic(index_skills, setup=setup, rounds=3, iterations=1) # Verify all skills were indexed # (verification happens outside benchmark timing) def test_index_1000_skills_moderate_scale( self, benchmark, benchmark_storage_path: Path, benchmark_skills_1000: list[Skill], ): """Benchmark indexing 1000 skills (moderate scale). Target: < 100 seconds (10x skills should be ~10x time if O(n)) Performance Metrics: - Total time to index 1000 skills - Scalability factor vs. 100 skills - Detect O(n²) behavior if present Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path benchmark_skills_1000: List of 1000 sample skills """ def setup(): """Setup fresh engine for each benchmark round.""" skill_manager = SkillManager() skill_manager._skill_cache = { skill.id: skill for skill in benchmark_skills_1000 } skill_manager._skill_paths = { skill.id: skill.file_path for skill in benchmark_skills_1000 } engine = IndexingEngine( vector_backend="chromadb", graph_backend="networkx", skill_manager=skill_manager, storage_path=benchmark_storage_path / "index_1000", ) return (engine, benchmark_skills_1000), {} def index_skills(engine, skills, *args): """Index all skills.""" for skill in skills: engine.index_skill(skill) # Run benchmark with setup (fewer rounds due to time) benchmark.pedantic(index_skills, setup=setup, rounds=2, iterations=1) @pytest.mark.slow def test_index_10000_skills_large_scale( self, benchmark, benchmark_storage_path: Path, benchmark_skills_10000: list[Skill], ): """Benchmark indexing 10000 skills (large scale). Target: < 1000 seconds (100x skills should be ~100x time if O(n)) Performance Metrics: - Total time to index 10000 skills - Scalability characteristics - Memory usage at scale Note: Marked as 'slow' - skip in normal test runs. Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path benchmark_skills_10000: List of 10000 sample skills """ def setup(): """Setup fresh engine for each benchmark round.""" # Clear memory before large operation gc.collect() skill_manager = SkillManager() skill_manager._skill_cache = { skill.id: skill for skill in benchmark_skills_10000 } skill_manager._skill_paths = { skill.id: skill.file_path for skill in benchmark_skills_10000 } engine = IndexingEngine( vector_backend="chromadb", graph_backend="networkx", skill_manager=skill_manager, storage_path=benchmark_storage_path / "index_10000", ) return (engine, benchmark_skills_10000), {} def index_skills(engine, skills, *args): """Index all skills.""" for skill in skills: engine.index_skill(skill) # Run benchmark with single round due to time benchmark.pedantic(index_skills, setup=setup, rounds=1, iterations=1) def test_reindex_all_performance( self, benchmark, benchmark_storage_path: Path, benchmark_skills_100: list[Skill] ): """Benchmark reindex_all() operation. Tests the batch reindexing performance, which should be optimized compared to individual indexing. Target: Similar to individual indexing (~10 seconds for 100 skills) Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path benchmark_skills_100: List of 100 sample skills """ def setup(): """Setup engine with skill manager.""" skill_manager = SkillManager() skill_manager._skill_cache = { skill.id: skill for skill in benchmark_skills_100 } skill_manager._skill_paths = { skill.id: skill.file_path for skill in benchmark_skills_100 } # Mock discover_skills to return our benchmark skills skill_manager.discover_skills = lambda: benchmark_skills_100 engine = IndexingEngine( vector_backend="chromadb", graph_backend="networkx", skill_manager=skill_manager, storage_path=benchmark_storage_path / "reindex", ) return (engine,), {} def reindex_all(engine, *args): """Reindex all skills.""" engine.reindex_all(force=True) benchmark.pedantic(reindex_all, setup=setup, rounds=3, iterations=1) class TestSearchPerformance: """Benchmark search performance across different scales. Tests measure: - Query latency (p50, p95, p99) - ChromaDB vector search time - NetworkX graph search time - Hybrid search combination time """ def test_vector_search_latency_100_skills( self, benchmark, indexed_engine_100: IndexingEngine ): """Benchmark vector search latency with 100 indexed skills. Target: < 100ms (p50 latency) Performance Metrics: - Median query time (p50) - 95th percentile (p95) - 99th percentile (p99) Args: benchmark: pytest-benchmark fixture indexed_engine_100: Pre-indexed engine with 100 skills """ def search_query(): """Execute search query.""" results = indexed_engine_100._vector_search( query="python testing tools", top_k=10 ) return results # Run many iterations to get accurate percentile measurements benchmark(search_query) # Benchmark automatically tracks min, max, mean, median, stddev def test_vector_search_latency_1000_skills( self, benchmark, indexed_engine_1000: IndexingEngine ): """Benchmark vector search latency with 1000 indexed skills. Target: < 200ms (p50 latency, scaling sub-linearly) Performance Metrics: - Compare to 100-skill benchmark - Should scale sub-linearly due to indexing Args: benchmark: pytest-benchmark fixture indexed_engine_1000: Pre-indexed engine with 1000 skills """ def search_query(): """Execute search query.""" results = indexed_engine_1000._vector_search( query="python testing tools", top_k=10 ) return results benchmark(search_query) def test_graph_search_latency(self, benchmark, indexed_engine_100: IndexingEngine): """Benchmark graph search latency. Target: < 10ms (graph traversal should be very fast) Performance Metrics: - BFS traversal time - Graph query efficiency Args: benchmark: pytest-benchmark fixture indexed_engine_100: Pre-indexed engine with 100 skills """ def graph_search(): """Execute graph search.""" # Use first skill as seed seed_id = "benchmark-repo/skill-00000" results = indexed_engine_100._graph_search(seed_id, max_depth=2) return results benchmark(graph_search) def test_hybrid_search_end_to_end( self, benchmark, indexed_engine_100: IndexingEngine ): """Benchmark full hybrid search (vector + graph). Target: < 200ms (combined vector + graph + reranking) Performance Metrics: - End-to-end search latency - Hybrid combination overhead Args: benchmark: pytest-benchmark fixture indexed_engine_100: Pre-indexed engine with 100 skills """ def hybrid_search(): """Execute full hybrid search.""" results = indexed_engine_100.search(query="python testing tools", top_k=10) return results benchmark(hybrid_search) def test_search_with_filters(self, benchmark, indexed_engine_100: IndexingEngine): """Benchmark search with category and toolchain filters. Target: Similar to unfiltered search (~200ms) Performance Metrics: - Filter application overhead - ChromaDB metadata filtering efficiency Args: benchmark: pytest-benchmark fixture indexed_engine_100: Pre-indexed engine with 100 skills """ def filtered_search(): """Execute search with filters.""" results = indexed_engine_100.search( query="python testing", category="testing", toolchain="python", top_k=10, ) return results benchmark(filtered_search) def test_related_skills_traversal( self, benchmark, indexed_engine_100: IndexingEngine ): """Benchmark get_related_skills() graph traversal. Target: < 20ms (graph traversal + skill loading) Performance Metrics: - Graph traversal time - Skill loading overhead Args: benchmark: pytest-benchmark fixture indexed_engine_100: Pre-indexed engine with 100 skills """ def get_related(): """Get related skills.""" results = indexed_engine_100.get_related_skills( skill_id="benchmark-repo/skill-00000", max_depth=2 ) return results benchmark(get_related) class TestDatabasePerformance: """Benchmark SQLite metadata store performance. Tests measure: - Lookup by ID (indexed) - Query with filters - Batch insert performance - Index effectiveness """ def test_lookup_by_id_single(self, benchmark, metadata_store_100: MetadataStore): """Benchmark SQLite lookup by ID (indexed column). Target: < 1ms (indexed lookup should be very fast) Performance Metrics: - Single row retrieval time - Index effectiveness Args: benchmark: pytest-benchmark fixture metadata_store_100: Metadata store with 100 repositories """ def lookup_repository(): """Lookup repository by ID.""" repo_id = "benchmark/repo-00050" repo = metadata_store_100.get_repository(repo_id) return repo benchmark(lookup_repository) def test_list_all_repositories(self, benchmark, metadata_store_100: MetadataStore): """Benchmark SQLite query to list all repositories. Target: < 10ms for 100 repositories Performance Metrics: - Full table scan performance - Result materialization time Args: benchmark: pytest-benchmark fixture metadata_store_100: Metadata store with 100 repositories """ def list_all(): """List all repositories.""" repos = metadata_store_100.list_repositories() return list(repos) benchmark(list_all) def test_update_repository_metadata( self, benchmark, metadata_store_100: MetadataStore ): """Benchmark updating repository metadata. Target: < 5ms per update Performance Metrics: - Update operation time - Index maintenance overhead Args: benchmark: pytest-benchmark fixture metadata_store_100: Metadata store with 100 repositories """ def update_repo(): """Update repository metadata.""" repo = metadata_store_100.get_repository("benchmark/repo-00050") if repo: repo.skill_count = 42 metadata_store_100.update_repository(repo) benchmark(update_repo) def test_list_1000_repositories( self, benchmark, metadata_store_1000: MetadataStore ): """Benchmark listing all repositories (no filters). Target: < 50ms for 1000 repositories Performance Metrics: - Full table scan performance - Result materialization time Args: benchmark: pytest-benchmark fixture metadata_store_1000: Metadata store with 1000 repositories """ def list_all(): """List all repositories.""" repos = metadata_store_1000.list_repositories() return list(repos) benchmark(list_all) def test_batch_insert_performance(self, benchmark, benchmark_storage_path: Path): """Benchmark batch insert of 100 repositories. Target: < 1 second for 100 inserts Performance Metrics: - Insert throughput (repositories/second) - Transaction overhead Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path """ from datetime import datetime from mcp_skills.models.repository import Repository # Create a unique database for this benchmark timestamp = int(time.time() * 1000000) db_path = benchmark_storage_path / f"batch_insert_{timestamp}.db" store = MetadataStore(db_path=db_path) # Counter to make IDs unique across benchmark rounds round_counter = [0] def batch_insert(): """Insert all repositories.""" offset = round_counter[0] * 100 for i in range(100): repo = Repository( id=f"benchmark/repo-{(offset + i):06d}", url=f"https://github.com/benchmark/repo-{(offset + i):06d}.git", local_path=benchmark_storage_path / "repos" / f"repo-{(offset + i):06d}", priority=50 + (i % 50), last_updated=datetime(2024, 1, 1, 12, 0, 0, tzinfo=UTC), skill_count=10 + (i % 20), license="MIT", ) store.add_repository(repo) round_counter[0] += 1 benchmark(batch_insert) class TestMemoryUsage: """Memory usage benchmarks for large-scale operations. Tests measure: - Memory usage during indexing - Memory growth with dataset size - Memory cleanup after operations """ def test_memory_usage_index_1000_skills( self, benchmark, benchmark_storage_path: Path, benchmark_skills_1000: list[Skill], ): """Benchmark memory usage when indexing 1000 skills. Performance Metrics: - Peak memory usage - Memory growth pattern - Post-operation cleanup Args: benchmark: pytest-benchmark fixture benchmark_storage_path: Temporary storage path benchmark_skills_1000: List of 1000 sample skills """ # Note: pytest-benchmark doesn't directly measure memory, # but we can track it manually in the benchmark function def setup(): """Setup with memory tracking.""" gc.collect() # Clean memory before measurement skill_manager = SkillManager() skill_manager._skill_cache = { skill.id: skill for skill in benchmark_skills_1000 } skill_manager._skill_paths = { skill.id: skill.file_path for skill in benchmark_skills_1000 } engine = IndexingEngine( vector_backend="chromadb", graph_backend="networkx", skill_manager=skill_manager, storage_path=benchmark_storage_path / "memory_test", ) return (engine, benchmark_skills_1000), {} def index_with_memory_tracking(engine, skills, *args): """Index and track memory.""" for i, skill in enumerate(skills): engine.index_skill(skill) # Sample memory every 100 skills if i % 100 == 0: gc.collect() # Force collection to get accurate reading benchmark.pedantic( index_with_memory_tracking, setup=setup, rounds=1, iterations=1 )

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/bobmatnyc/mcp-skills'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_performance_benchmarks.py•19.2 KiB