Documentation MCP Server

search.py•9.46 KiB

"""Search functionality for documentation.""" import re from typing import Literal, cast from docs_mcp.models.document import Document from docs_mcp.models.navigation import Category, SearchResult from docs_mcp.security.sanitizer import sanitize_query from docs_mcp.services.cache import get_cache from docs_mcp.services.hierarchy import get_breadcrumbs from docs_mcp.services.vector import get_vector_store from docs_mcp.utils.logger import logger class SearchError(Exception): """Raised when search operations fail.""" pass def search_content( query: str, documents: list[Document], categories: dict[str, Category], limit: int = 10, category_filter: str | None = None, ) -> list[SearchResult]: """Search documentation content with full-text search. Args: query: Search query string documents: List of documents to search categories: Category tree for context limit: Maximum number of results category_filter: Optional category to filter results Returns: List of SearchResult objects, sorted by relevance Raises: SearchError: If search fails """ try: # Sanitize query sanitized_query = sanitize_query(query, allow_regex=False) if not sanitized_query: return [] logger.debug(f"Searching for: {sanitized_query}") # Check cache cache = get_cache() cache_key = f"search:{sanitized_query}:{category_filter}:{limit}" cached = cache.get(cache_key) if cached: return cast(list[SearchResult], cached) # 1. 
Keyword Search keyword_results: dict[str, SearchResult] = {} # Compile regex for case-insensitive search try: pattern = re.compile(sanitized_query, re.IGNORECASE) except re.error as e: raise SearchError(f"Invalid search pattern: {e}") from e for doc in documents: if category_filter and not doc.uri.startswith(f"docs://{category_filter}"): continue title_score = 1.0 if pattern.search(doc.title) else 0.0 content_matches = pattern.findall(doc.content) content_score = min(len(content_matches) / 10.0, 1.0) if content_matches else 0.0 metadata_text = " ".join(doc.tags) + " " + (doc.category or "") metadata_score = 0.5 if pattern.search(metadata_text) else 0.0 relevance = title_score * 0.5 + content_score * 0.3 + metadata_score * 0.2 if relevance > 0: match_type: Literal["full_text", "metadata", "title", "semantic"] if title_score > 0: match_type = "title" elif metadata_score > 0: match_type = "metadata" else: match_type = "full_text" excerpt = _extract_excerpt(doc.content, sanitized_query) highlighted = _highlight_matches(excerpt, sanitized_query) breadcrumbs = [crumb["name"] for crumb in get_breadcrumbs(doc.uri)] category = breadcrumbs[0] if breadcrumbs else "docs" keyword_results[doc.uri] = SearchResult( document_uri=doc.uri, title=doc.title, excerpt=excerpt, breadcrumbs=breadcrumbs, category=category, relevance_score=relevance, match_type=match_type, highlighted_excerpt=highlighted, ) # 2. Semantic Search vector_scores: dict[str, float] = {} try: # Fetch more results than limit to allow good intersection vector_hits = get_vector_store().search(sanitized_query, limit=limit * 2) for hit in vector_hits: uri = hit.get("uri") score = hit.get("score", 0.0) if uri: vector_scores[uri] = score except Exception as e: logger.warning(f"Semantic search unavailable: {e}") # 3. 
Merge Results final_results = [] all_uris = set(keyword_results.keys()) | set(vector_scores.keys()) # Map for quick doc lookup doc_map = {d.uri: d for d in documents} for uri in all_uris: doc = doc_map.get(uri) if not doc: continue if category_filter and not uri.startswith(f"docs://{category_filter}"): continue k_result = keyword_results.get(uri) v_score = vector_scores.get(uri, 0.0) # Get base keyword score or 0 k_score = k_result.relevance_score if k_result else 0.0 # Weighted Combination if k_score > 0 and v_score > 0: # Strong signal: appears in both final_score = max(k_score, v_score) + 0.3 else: final_score = max(k_score, v_score) # Construct result if it wasn't in keyword results if k_result: result = k_result result.relevance_score = final_score else: # Pure semantic match breadcrumbs = [crumb["name"] for crumb in get_breadcrumbs(doc.uri)] category = breadcrumbs[0] if breadcrumbs else "docs" excerpt = doc.excerpt(200) result = SearchResult( document_uri=doc.uri, title=doc.title, excerpt=excerpt, breadcrumbs=breadcrumbs, category=category, relevance_score=final_score, match_type="semantic", highlighted_excerpt=excerpt, ) final_results.append(result) # Sort by relevance final_results.sort(key=lambda r: r.relevance_score, reverse=True) results = final_results[:limit] logger.info(f"Search found {len(results)} results for: {sanitized_query}") # Cache results cache.set(cache_key, results, ttl=1800) # 30 minutes return results except SearchError: raise except Exception as e: raise SearchError(f"Search failed: {e}") from e def search_by_metadata( tags: list[str] | None = None, category: str | None = None, documents: list[Document] | None = None, limit: int = 10, ) -> list[SearchResult]: """Search documentation by metadata (tags, category). 
Args: tags: Tags to filter by (OR logic) category: Category to filter by documents: List of documents to search limit: Maximum number of results Returns: List of SearchResult objects """ results: list[SearchResult] = [] if documents is None: return results for doc in documents: # Use AND logic: document must match ALL specified filters tag_match = not tags or any(tag in doc.tags for tag in tags) cat_match = not category or (doc.category == category or category in doc.uri) matches = tag_match and cat_match if matches: breadcrumbs = [crumb["name"] for crumb in get_breadcrumbs(doc.uri)] cat = breadcrumbs[0] if breadcrumbs else "docs" result = SearchResult( document_uri=doc.uri, title=doc.title, excerpt=doc.excerpt(200), breadcrumbs=breadcrumbs, category=cat, relevance_score=1.0, # All metadata matches are equally relevant match_type="metadata", highlighted_excerpt="", ) results.append(result) # Limit results results = results[:limit] logger.info( f"Metadata search found {len(results)} results (tags: {tags}, category: {category})" ) return results def _extract_excerpt(content: str, query: str, context_chars: int = 100) -> str: """Extract an excerpt showing the query match with context. Args: content: Document content query: Search query context_chars: Characters of context before/after match Returns: Excerpt string """ try: # Find first match pattern = re.compile(query, re.IGNORECASE) match = pattern.search(content) if not match: # Return first N characters if no match return content[: context_chars * 2] + "..." # Extract context around match start = max(0, match.start() - context_chars) end = min(len(content), match.end() + context_chars) excerpt = content[start:end] # Add ellipsis if truncated if start > 0: excerpt = "..." + excerpt if end < len(content): excerpt = excerpt + "..." return excerpt.strip() except Exception as e: logger.warning(f"Failed to extract excerpt: {e}") return content[: context_chars * 2] + "..." 
def _highlight_matches(text: str, query: str, highlight: str = "**") -> str: """Highlight query matches in text. Args: text: Text to highlight query: Query to highlight highlight: Marker to use for highlighting (e.g., "**" for bold) Returns: Text with matches highlighted """ try: pattern = re.compile(f"({re.escape(query)})", re.IGNORECASE) return pattern.sub(f"{highlight}\\1{highlight}", text) except Exception as e: logger.warning(f"Failed to highlight matches: {e}") return text

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/esola-thomas/your-docs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

search.py•9.46 KiB