Documentation Search MCP Server

site_index_builder.py (8.88 kB)
#!/usr/bin/env python3 """Build a preindexed docs site index for Serper-free search. This module is used by CI to generate `docs_site_index.json` (+ optional `.gz`) assets that are published to GitHub Releases and auto-downloaded by the server at startup. """ from __future__ import annotations import asyncio import argparse import gzip import json import os import sys from dataclasses import dataclass from datetime import datetime from typing import Any, Dict, Iterable, Mapping, Optional from urllib.parse import urlparse import httpx from . import site_search @dataclass(frozen=True) class SiteIndexBuildSettings: user_agent: str max_concurrent_sites: int = 5 sitemap_mode: str = "missing" # none|missing|all max_sitemap_urls: int = 5_000 timeout_seconds: float = 60.0 def load_docs_urls_from_config(config_path: str) -> Dict[str, str]: if not config_path: raise ValueError("config_path must be non-empty") with open(config_path, "r", encoding="utf-8") as fh: data = json.load(fh) raw = data.get("docs_urls", {}) if not isinstance(raw, dict): return {} docs_urls: Dict[str, str] = {} for name, value in raw.items(): if not isinstance(name, str): continue if isinstance(value, dict): url = str(value.get("url") or "").strip() else: url = str(value or "").strip() if url: docs_urls[name] = url return docs_urls def _parse_libraries(value: Optional[str]) -> Optional[list[str]]: if not value: return None parts = [part.strip() for part in value.split(",")] libraries = [part for part in parts if part] return libraries or None def _origin_from_url(url: str) -> Optional[str]: parsed = urlparse(url) if not parsed.scheme or not parsed.netloc: return None return f"{parsed.scheme}://{parsed.netloc}" async def build_site_index_file( docs_urls: Mapping[str, str], *, output_path: str, gzip_output: bool, settings: SiteIndexBuildSettings, client: Optional[httpx.AsyncClient] = None, ) -> Dict[str, Any]: if not output_path: raise ValueError("output_path must be non-empty") if settings.sitemap_mode not in {"none", "missing", "all"}: raise ValueError("sitemap_mode must be one of: none, missing, all") site_search._sitemap_cache.clear() site_search._sitemap_locks.clear() site_search._index_cache.clear() site_search._index_locks.clear() original_max_sitemap_urls = getattr(site_search, "_MAX_SITEMAP_URLS", None) if settings.max_sitemap_urls and settings.max_sitemap_urls > 0: site_search._MAX_SITEMAP_URLS = int(settings.max_sitemap_urls) created_client = client is None if client is None: client = httpx.AsyncClient(timeout=httpx.Timeout(settings.timeout_seconds)) try: concurrency = max(1, min(int(settings.max_concurrent_sites), 20)) semaphore = asyncio.Semaphore(concurrency) results: list[dict[str, Any]] = [] async def run_one(library: str, url: str) -> None: async with semaphore: summary = await site_search.preindex_site( url, client, user_agent=settings.user_agent, include_sitemap=False, ) has_index = bool( summary.get("mkdocs_index") or summary.get("sphinx_index") ) include_sitemap = settings.sitemap_mode == "all" or ( settings.sitemap_mode == "missing" and not has_index ) if include_sitemap: origin = summary.get("origin") or _origin_from_url(url) try: urls = await site_search._load_site_sitemap_urls( # type: ignore[attr-defined] client, url, user_agent=settings.user_agent, ) except Exception as e: summary.setdefault("errors", []).append(f"sitemap:{e}") else: if origin and urls: site_search._sitemap_cache[origin] = ( site_search._SitemapCacheEntry( # type: ignore[attr-defined] fetched_at=datetime.now(), urls=tuple(urls), ) ) 
summary["sitemap"] = {"urls": len(urls)} summary["library"] = library results.append(summary) tasks = [ run_one(library, url) for library, url in sorted(docs_urls.items(), key=lambda item: item[0]) if url ] if tasks: await asyncio.gather(*tasks) site_search.save_preindexed_state(output_path) gz_path: Optional[str] = None if gzip_output: gz_path = f"{output_path}.gz" with open(output_path, "rb") as fh: blob = fh.read() with gzip.open(gz_path, "wb", compresslevel=9) as fh: fh.write(blob) indexed_sites = sum( 1 for summary in results if summary.get("mkdocs_index") or summary.get("sphinx_index") or summary.get("sitemap") ) return { "status": "ok" if indexed_sites else "error", "output_path": output_path, "gzip_path": gz_path, "total_libraries": len(docs_urls), "indexed_libraries": indexed_sites, "results": sorted(results, key=lambda item: str(item.get("library") or "")), } finally: if original_max_sitemap_urls is not None: site_search._MAX_SITEMAP_URLS = original_max_sitemap_urls if created_client: await client.aclose() def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description="Build docs_site_index.json for releases." ) parser.add_argument( "--config", default=os.path.join("src", "documentation_search_enhanced", "config.json"), help="Path to config.json (default: bundled src config).", ) parser.add_argument( "--output", default="docs_site_index.json", help="Path to write the index JSON (default: docs_site_index.json).", ) parser.add_argument( "--gzip", action="store_true", help="Also write a .gz next to the JSON file.", ) parser.add_argument( "--libraries", default=None, help="Comma-separated list of libraries to index (default: all).", ) parser.add_argument( "--sitemap", choices=("none", "missing", "all"), default="missing", help="Whether to include sitemap URLs: none, missing, or all (default: missing).", ) parser.add_argument( "--max-sitemap-urls", type=int, default=5_000, help="Max sitemap URLs per site when included (default: 5000).", ) parser.add_argument( "--max-concurrency", type=int, default=5, help="Max concurrent sites to index (default: 5).", ) parser.add_argument( "--timeout-seconds", type=float, default=60.0, help="HTTP timeout (default: 60s).", ) parser.add_argument( "--user-agent", default="documentation-search-enhanced/index-builder", help="User-Agent header to use for fetches.", ) return parser def main(argv: Optional[Iterable[str]] = None) -> int: args = _build_parser().parse_args(list(argv) if argv is not None else None) docs_urls = load_docs_urls_from_config(args.config) libraries = _parse_libraries(args.libraries) if libraries: docs_urls = {lib: docs_urls[lib] for lib in libraries if lib in docs_urls} settings = SiteIndexBuildSettings( user_agent=args.user_agent, max_concurrent_sites=args.max_concurrency, sitemap_mode=args.sitemap, max_sitemap_urls=args.max_sitemap_urls, timeout_seconds=args.timeout_seconds, ) if not docs_urls: print("No documentation sources found to index.", file=sys.stderr) return 2 result = asyncio.run( build_site_index_file( docs_urls, output_path=args.output, gzip_output=bool(args.gzip), settings=settings, ) ) print(json.dumps(result, indent=2, sort_keys=True), file=sys.stderr) return 0 if result.get("status") == "ok" else 1 if __name__ == "__main__": raise SystemExit(main())
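The file above is both importable and runnable as a CLI. Below is a minimal sketch of how it might be driven from a CI job: the import path documentation_search_enhanced.site_index_builder is an assumption inferred from the default --config value, and the library names are hypothetical entries from config.json; every flag used is defined in _build_parser() above.

# Hypothetical CI driver script (sketch, not part of the repository).
# The module path is assumed from the default --config value and may differ
# depending on how the package is installed.
from documentation_search_enhanced.site_index_builder import main

exit_code = main([
    "--output", "docs_site_index.json",
    "--gzip",                           # also write docs_site_index.json.gz
    "--libraries", "fastapi,httpx",     # hypothetical library names from config.json
    "--sitemap", "missing",             # crawl sitemaps only for sites without a built-in index
    "--max-concurrency", "5",
])
raise SystemExit(exit_code)  # 0 = indexed, 1 = nothing indexed, 2 = no sources found

The summary that main() prints to stderr mirrors the dictionary returned by build_site_index_file (status, output_path, gzip_path, total_libraries, indexed_libraries, and per-library results).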

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/anton-prosterity/documentation-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.