#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "llama-index",
# "llama-index-embeddings-huggingface",
# "llama-index-embeddings-openai",
# "sentence-transformers",
# "transformers",
# ]
# ///
"""Build a LlamaIndex vector store for one or more documentation directories."""
from __future__ import annotations
import argparse
import shutil
from pathlib import Path
from typing import Dict, List, Sequence, Tuple
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
try:
from llama_index.embeddings.openai import OpenAIEmbedding
except ImportError: # pragma: no cover - optional dependency
OpenAIEmbedding = None
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Ingest Markdown documentation into a LlamaIndex vector store",
)
parser.add_argument(
"--docs-root",
action="append",
type=str,
default=None,
metavar="PATH[:LANG]",
        help=(
            "Documentation root plus optional language tag (e.g. docs/en:en, "
            "docs/zh:zh); repeat the flag to ingest multiple languages "
            "(default: docs/en:en)"
        ),
)
parser.add_argument(
"--persist-dir",
type=Path,
default=Path("storage/llamaindex"),
help="Directory where the index will be persisted",
)
parser.add_argument(
"--chunk-size",
type=int,
default=750,
help="Approximate token count per chunk",
)
parser.add_argument(
"--chunk-overlap",
type=int,
default=120,
help="Token overlap between chunks",
)
parser.add_argument(
"--extensions",
nargs="+",
default=(".md", ".mdx"),
help="File extensions to ingest (default: .md .mdx)",
)
parser.add_argument(
"--embedding-model",
type=str,
default="BAAI/bge-base-zh-v1.5",
help="Embedding model identifier (interpreted per backend)",
)
parser.add_argument(
"--embed-backend",
type=str,
choices=("huggingface", "openai"),
default="huggingface",
help="Embedding backend to use; huggingface runs fully local",
)
parser.add_argument(
"--clean",
action="store_true",
help="Remove the persist directory before rebuilding the index",
)
return parser.parse_args()
def _parse_docs_roots(values: Sequence[str] | None) -> List[Tuple[Path, str]]:
    """Normalize --docs-root inputs into (path, language) tuples."""
    raw_values = values or ["docs/en:en"]
    normalized: List[Tuple[Path, str]] = []
    for entry in raw_values:
        # Split on the *last* colon so a Windows drive letter (e.g. C:\docs\en)
        # is not mistaken for a language tag; a real tag never contains a
        # path separator.
        path_part, sep, lang = entry.rpartition(":")
        if not sep or "/" in lang or "\\" in lang:
            path_part, lang = entry, ""
        path = Path(path_part).expanduser()
        # Fall back to the directory name (docs/en -> "en"), then to "en".
        resolved_lang = (lang or path.name or "en").lower()
        normalized.append((path.resolve(), resolved_lang))
    return normalized
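
# For example, _parse_docs_roots(["docs/zh:zh", "docs/en"]) yields
# [(Path("docs/zh").resolve(), "zh"), (Path("docs/en").resolve(), "en")];
# the second entry has no tag, so the directory name supplies the language.
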
def _metadata_factory(root: Path, lang: str):
    """Build a file_metadata callback that tags each file relative to *root*."""

    def build_metadata(file_path: str) -> Dict[str, str]:
rel = Path(file_path).resolve().relative_to(root)
section = rel.parts[0] if len(rel.parts) > 1 else "root"
return {
"path": str(rel).replace("\\", "/"),
"lang": lang,
"section": section,
}
return build_metadata
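
# For a hypothetical file docs/en/guide/install.md under root docs/en this
# callback produces {"path": "guide/install.md", "lang": "en", "section":
# "guide"}; files sitting directly in the root get section "root".
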
def configure_embedding(args: argparse.Namespace) -> None:
if args.embed_backend == "huggingface":
Settings.embed_model = HuggingFaceEmbedding(model_name=args.embedding_model)
elif args.embed_backend == "openai":
if OpenAIEmbedding is None:
raise SystemExit(
"OpenAI embedding backend requested but llama-index-embeddings-openai is not installed",
)
Settings.embed_model = OpenAIEmbedding(model=args.embedding_model)
else: # pragma: no cover - argparse guards choices
raise SystemExit(f"Unsupported embed backend: {args.embed_backend}")
def main() -> None:
args = parse_args()
docs_roots = _parse_docs_roots(args.docs_root)
missing = [str(path) for path, _ in docs_roots if not path.exists()]
if missing:
raise SystemExit(f"Docs root(s) do not exist: {', '.join(missing)}")
    # Normalize extensions so both ".md" and "md" spellings are accepted.
    target_exts = tuple(
        ext.lower() if ext.startswith(".") else f".{ext.lower()}"
        for ext in args.extensions
    )
if args.clean and args.persist_dir.exists():
shutil.rmtree(args.persist_dir)
args.persist_dir.mkdir(parents=True, exist_ok=True)
configure_embedding(args)
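    # chunk_size/chunk_overlap are approximate token budgets: SentenceSplitter
    # prefers paragraph and sentence boundaries over exact cut points.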
Settings.node_parser = SentenceSplitter(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
paragraph_separator="\n\n",
)
documents = []
total_files = 0
unique_langs = set()
for root, lang in docs_roots:
md_files = sorted(
path for path in root.rglob("*") if path.suffix.lower() in target_exts
)
if not md_files:
raise SystemExit(f"No markdown files found under {root}")
reader = SimpleDirectoryReader(
input_files=[str(path) for path in md_files],
file_metadata=_metadata_factory(root, lang),
)
documents.extend(reader.load_data())
total_files += len(md_files)
unique_langs.add(lang)
    # Build the index against a fresh storage context, then persist its
    # docstore, index store, and vector store under persist_dir.
    storage_context = StorageContext.from_defaults()
    VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    storage_context.persist(persist_dir=str(args.persist_dir))
langs_str = ", ".join(sorted(unique_langs))
print(
f"Indexed {len(documents)} documents from {total_files} markdown files "
f"across {len(docs_roots)} root(s) ({langs_str}) into {args.persist_dir}",
)
if __name__ == "__main__":
main()
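

# A minimal sketch for querying the persisted index later (assumes the same
# embedding backend/model is configured before loading):
#
#   from llama_index.core import StorageContext, load_index_from_storage
#   Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-zh-v1.5")
#   ctx = StorageContext.from_defaults(persist_dir="storage/llamaindex")
#   index = load_index_from_storage(ctx)
#   retriever = index.as_retriever(similarity_top_k=5)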