mcp-skills

Overview Schema Related Servers Score Discussions

hybrid_search.py•13.1 kB

"""Hybrid search combining vector and graph results. Design Decision: Hybrid Search Architecture (70% Vector + 30% Graph) Rationale: Combines semantic similarity from embeddings with structural relationships from knowledge graph. Vector search handles fuzzy natural language queries, while graph captures explicit dependencies and relationships. Trade-offs: - Performance: 70/30 weighting optimized through testing (not configurable yet) - Complexity: Two storage backends vs. simpler single-source - Accuracy: Hybrid approach outperforms either method alone in tests Alternatives Considered: 1. Vector-only search: Rejected due to missing dependency relationships 2. Graph-only search: Rejected due to poor natural language handling 3. 50/50 weighting: Testing showed 70/30 performs better for skill discovery Extension Points: Weighting can be made configurable in future versions based on use case (dependency-heavy vs. semantic-heavy queries). """ import logging from dataclasses import dataclass from typing import TYPE_CHECKING, Any from mcp_skills.models.skill import Skill if TYPE_CHECKING: from mcp_skills.services.indexing.graph_store import GraphStore from mcp_skills.services.indexing.vector_store import VectorStore from mcp_skills.services.skill_manager import SkillManager logger = logging.getLogger(__name__) @dataclass class ScoredSkill: """Skill with relevance score. Attributes: skill: The Skill object score: Relevance score (0.0-1.0) match_type: Type of match (vector, graph, hybrid) """ skill: Skill score: float match_type: str class HybridSearcher: """Hybrid search combining vector and graph results with configurable weighting. Implements weighted combination of: - Vector similarity search (configurable weight, default 70%) - Graph relationship traversal (configurable weight, default 30%) Weights can be customized via constructor parameters to optimize for different use cases: - Semantic-focused (0.9/0.1): Best for natural language queries - Graph-focused (0.3/0.7): Best for discovering related skills - Balanced (0.5/0.5): Equal weighting for general purpose - Current (0.7/0.3): Proven default from testing Architecture: - Score normalization: Ensures fair comparison between methods - Result reranking: Weighted combination of scores - Filter application: Post-search filtering by category/toolchain Performance: - Vector search: ~20-50ms for 1000 skills - Graph search: ~10ms for 1000 skills - Total: ~50-100ms including combination Optimization Opportunities: 1. Parallel Execution: Run vector and graph searches concurrently - Estimated speedup: 30-40% reduction in latency - Effort: 4-6 hours, requires async/await refactoring - Threshold: Implement when search latency >200ms """ # Default hybrid search weights (sum to 1.0) - used as fallback VECTOR_WEIGHT = 0.7 GRAPH_WEIGHT = 0.3 def __init__( self, vector_store: "VectorStore", graph_store: "GraphStore", skill_manager: "SkillManager | None" = None, vector_weight: float | None = None, graph_weight: float | None = None, ) -> None: """Initialize hybrid searcher with configurable weights. Args: vector_store: VectorStore instance for semantic search graph_store: GraphStore instance for relationship queries skill_manager: SkillManager instance for loading skills vector_weight: Optional vector search weight (0.0-1.0). Uses class default if None. graph_weight: Optional graph search weight (0.0-1.0). Uses class default if None. Note: If both weights are None, uses class constants (0.7 vector, 0.3 graph). If only one weight is provided, the other is computed as (1.0 - provided_weight). Weights are validated to ensure they sum to 1.0. Raises: ValueError: If provided weights don't sum to 1.0 """ self.vector_store = vector_store self.graph_store = graph_store self.skill_manager = skill_manager # Configure weights with validation if vector_weight is not None and graph_weight is not None: # Both provided - validate they sum to 1.0 total = vector_weight + graph_weight if abs(total - 1.0) > 1e-6: raise ValueError( f"Weights must sum to 1.0, got {total:.6f} " f"(vector_weight={vector_weight}, graph_weight={graph_weight})" ) self.vector_weight = vector_weight self.graph_weight = graph_weight elif vector_weight is not None: # Only vector provided - compute graph self.vector_weight = vector_weight self.graph_weight = 1.0 - vector_weight elif graph_weight is not None: # Only graph provided - compute vector self.graph_weight = graph_weight self.vector_weight = 1.0 - graph_weight else: # Neither provided - use class defaults self.vector_weight = self.VECTOR_WEIGHT self.graph_weight = self.GRAPH_WEIGHT def search( self, query: str, toolchain: str | None = None, category: str | None = None, top_k: int = 10, ) -> list[ScoredSkill]: """Execute hybrid search. Hybrid Search Strategy (70% Vector + 30% Graph): 1. Vector search (70% weight): ChromaDB semantic similarity 2. Graph search (30% weight): NetworkX relationship traversal 3. Combine and rerank with weighted scores 4. Apply filters (toolchain, category) 5. Return top_k results Args: query: Search query (natural language) toolchain: Optional toolchain filter (Python, TypeScript, etc.) category: Optional category filter (testing, debugging, etc.) top_k: Maximum number of results Returns: List of ScoredSkill objects sorted by relevance Performance: - Vector search: O(n log k) with ChromaDB indexing - Graph search: O(n + e) for BFS traversal - Total: ~50-100ms for 1000 skills Example: >>> searcher = HybridSearcher(vector_store, graph_store, manager) >>> results = searcher.search("python testing", category="testing") >>> results[0].skill.name 'pytest-testing' >>> results[0].score 0.92 >>> results[0].match_type 'hybrid' """ if not query.strip(): logger.warning("Empty search query provided") return [] try: # 1. Vector search (70% weight) vector_results = self._vector_search( query, toolchain=toolchain, category=category, top_k=top_k * 2 ) # 2. Graph search (30% weight) # Use top vector result as seed for graph traversal graph_results = [] if vector_results: seed_skill_id = vector_results[0]["skill_id"] graph_results = self._graph_search(seed_skill_id, max_depth=2) # 3. Combine and rerank combined_results = self._combine_results(vector_results, graph_results) # 4. Apply filters filtered_results = self._apply_filters( combined_results, toolchain=toolchain, category=category ) # 5. Return top_k return filtered_results[:top_k] except Exception as e: logger.error(f"Search failed for query '{query}': {e}") return [] def _vector_search( self, query: str, toolchain: str | None = None, category: str | None = None, top_k: int = 20, ) -> list[dict[str, Any]]: """Perform vector similarity search. Args: query: Search query toolchain: Optional toolchain filter category: Optional category filter top_k: Number of results Returns: List of dicts with skill_id, score, and metadata """ try: # Build where filter for metadata where_filter: dict[str, Any] = {} if category: where_filter["category"] = category # Vector search via VectorStore vector_results = self.vector_store.search( query=query, top_k=top_k, filters=where_filter if where_filter else None, ) # Apply toolchain filter via tags if specified if toolchain: filtered_results = [] for result in vector_results: tags_str = str(result.get("metadata", {}).get("tags", "")) tags = tags_str.split(",") if tags_str else [] if any(toolchain.lower() in tag.lower() for tag in tags): filtered_results.append(result) return filtered_results return vector_results except Exception as e: logger.error(f"Vector search failed: {e}") return [] def _graph_search( self, seed_skill_id: str, max_depth: int = 2 ) -> list[dict[str, str | float]]: """Perform graph-based search. Args: seed_skill_id: Starting skill ID for traversal max_depth: Maximum traversal depth Returns: List of dicts with skill_id and graph-based score """ try: # Graph search via GraphStore return self.graph_store.find_related(seed_skill_id, max_depth=max_depth) except Exception as e: logger.error(f"Graph search failed: {e}") return [] def _combine_results( self, vector_results: list[dict], graph_results: list[dict] ) -> list[ScoredSkill]: """Combine vector and graph results with weighted scoring. Args: vector_results: Results from vector search graph_results: Results from graph search Returns: Combined and reranked ScoredSkill list Scoring Algorithm: - Hybrid score = (VECTOR_WEIGHT * vector_score) + (GRAPH_WEIGHT * graph_score) - Match type: - "hybrid": Both vector and graph scores > 0 - "vector": Only vector score > 0 - "graph": Only graph score > 0 """ if not self.skill_manager: logger.warning("SkillManager not set, cannot load skills") return [] # Build score map: skill_id -> (vector_score, graph_score) score_map: dict[str, tuple[float, float]] = {} for result in vector_results: skill_id = result["skill_id"] score_map[skill_id] = (result["score"], 0.0) for result in graph_results: skill_id = result["skill_id"] vector_score, _ = score_map.get(skill_id, (0.0, 0.0)) score_map[skill_id] = (vector_score, result["score"]) # Compute weighted hybrid scores using configured weights combined_results = [] for skill_id, (vector_score, graph_score) in score_map.items(): hybrid_score = ( self.vector_weight * vector_score + self.graph_weight * graph_score ) # Determine match type if vector_score > 0 and graph_score > 0: match_type = "hybrid" elif vector_score > 0: match_type = "vector" else: match_type = "graph" # Load skill object skill = self.skill_manager.load_skill(skill_id) if skill: combined_results.append( ScoredSkill(skill=skill, score=hybrid_score, match_type=match_type) ) # Sort by score descending combined_results.sort(key=lambda x: x.score, reverse=True) return combined_results def _apply_filters( self, results: list[ScoredSkill], toolchain: str | None = None, category: str | None = None, ) -> list[ScoredSkill]: """Apply post-search filters to results. Args: results: Search results to filter toolchain: Optional toolchain filter category: Optional category filter Returns: Filtered results """ filtered = results # Category filter (exact match) if category: filtered = [r for r in filtered if r.skill.category == category] # Toolchain filter (check tags) if toolchain: filtered = [ r for r in filtered if any(toolchain.lower() in tag.lower() for tag in r.skill.tags) ] return filtered

Loading blob content...

Latest Blog Posts

Don't Use Large Strings as Cache Keys
By punkpeye on January 11, 2026.
markdown
node-js
cache
What are Claude Skills?
By punkpeye on January 10, 2026.
mcp
skills
How to Test MCP Streamable HTTP Endpoints Using cURL
By punkpeye on January 2, 2026.
tutorial
bash

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/bobmatnyc/mcp-skills'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

hybrid_search.py•13.1 kB