ingest_docs.py (5.85 kB)
```python
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "llama-index",
#   "llama-index-embeddings-huggingface",
#   "llama-index-embeddings-openai",
#   "sentence-transformers",
#   "transformers",
# ]
# ///
"""Build a LlamaIndex vector store for one or more documentation directories."""

from __future__ import annotations

import argparse
import shutil
from pathlib import Path
from typing import Dict, List, Sequence, Tuple

from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

try:
    from llama_index.embeddings.openai import OpenAIEmbedding
except ImportError:  # pragma: no cover - optional dependency
    OpenAIEmbedding = None


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Ingest Markdown documentation into a LlamaIndex vector store",
    )
    parser.add_argument(
        "--docs-root",
        action="append",
        type=str,
        default=None,
        metavar="PATH[:LANG]",
        help=(
            "Documentation root plus optional language tag (e.g. docs/en:en, "
            "docs/zh:zh); repeat flag to ingest multiple languages"
        ),
    )
    parser.add_argument(
        "--persist-dir",
        type=Path,
        default=Path("storage/llamaindex"),
        help="Directory where the index will be persisted",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=750,
        help="Approximate token count per chunk",
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=120,
        help="Token overlap between chunks",
    )
    parser.add_argument(
        "--extensions",
        nargs="+",
        default=(".md", ".mdx"),
        help="File extensions to ingest (default: .md .mdx)",
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="BAAI/bge-base-zh-v1.5",
        help="Embedding model identifier (interpreted per backend)",
    )
    parser.add_argument(
        "--embed-backend",
        type=str,
        choices=("huggingface", "openai"),
        default="huggingface",
        help="Embedding backend to use; huggingface runs fully local",
    )
    parser.add_argument(
        "--clean",
        action="store_true",
        help="Remove the persist directory before rebuilding the index",
    )
    return parser.parse_args()


def _parse_docs_roots(values: Sequence[str] | None) -> List[Tuple[Path, str]]:
    """Normalize --docs-root inputs into (path, language) tuples."""
    raw_values = values or ["docs/en:en"]
    normalized: List[Tuple[Path, str]] = []
    for entry in raw_values:
        path_part, _, lang = entry.partition(":")
        path = Path(path_part).expanduser()
        resolved_lang = (lang or path.name or "en").lower()
        normalized.append((path.resolve(), resolved_lang))
    return normalized


def _metadata_factory(root: Path, lang: str):
    def build_metadata(file_path: str) -> Dict[str, str]:
        rel = Path(file_path).resolve().relative_to(root)
        section = rel.parts[0] if len(rel.parts) > 1 else "root"
        return {
            "path": str(rel).replace("\\", "/"),
            "lang": lang,
            "section": section,
        }

    return build_metadata


def configure_embedding(args: argparse.Namespace) -> None:
    if args.embed_backend == "huggingface":
        Settings.embed_model = HuggingFaceEmbedding(model_name=args.embedding_model)
    elif args.embed_backend == "openai":
        if OpenAIEmbedding is None:
            raise SystemExit(
                "OpenAI embedding backend requested but llama-index-embeddings-openai is not installed",
            )
        Settings.embed_model = OpenAIEmbedding(model=args.embedding_model)
    else:  # pragma: no cover - argparse guards choices
        raise SystemExit(f"Unsupported embed backend: {args.embed_backend}")


def main() -> None:
    args = parse_args()
    docs_roots = _parse_docs_roots(args.docs_root)
    missing = [str(path) for path, _ in docs_roots if not path.exists()]
    if missing:
        raise SystemExit(f"Docs root(s) do not exist: {', '.join(missing)}")

    target_exts = tuple(ext.lower() for ext in args.extensions)

    if args.clean and args.persist_dir.exists():
        shutil.rmtree(args.persist_dir)
    args.persist_dir.mkdir(parents=True, exist_ok=True)

    configure_embedding(args)
    Settings.node_parser = SentenceSplitter(
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        paragraph_separator="\n\n",
    )

    documents = []
    total_files = 0
    unique_langs = set()
    for root, lang in docs_roots:
        md_files = sorted(
            path for path in root.rglob("*") if path.suffix.lower() in target_exts
        )
        if not md_files:
            raise SystemExit(f"No markdown files found under {root}")
        reader = SimpleDirectoryReader(
            input_files=[str(path) for path in md_files],
            file_metadata=_metadata_factory(root, lang),
        )
        documents.extend(reader.load_data())
        total_files += len(md_files)
        unique_langs.add(lang)

    storage_context = StorageContext.from_defaults()
    VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    storage_context.persist(persist_dir=str(args.persist_dir))

    langs_str = ", ".join(sorted(unique_langs))
    print(
        f"Indexed {len(documents)} documents from {total_files} markdown files "
        f"across {len(docs_roots)} root(s) ({langs_str}) into {args.persist_dir}",
    )


if __name__ == "__main__":
    main()
```
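Because of the inline `# /// script` metadata block at the top, the file can be run directly with `uv run ingest_docs.py`, which resolves the listed dependencies on the fly. The script only builds and persists the index; reading it back happens elsewhere. Below is a minimal sketch (not part of this repo) of what the query side might look like, assuming the default `--persist-dir` and embedding model shown above; the example query string and `similarity_top_k` value are illustrative.

```python
"""Sketch: reload the index persisted by ingest_docs.py and retrieve chunks."""

from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# The embed model must match the one used at ingest time, otherwise query
# vectors will not live in the same space as the stored document vectors.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-zh-v1.5")

# Load the vector store that ingest_docs.py wrote to --persist-dir.
storage_context = StorageContext.from_defaults(persist_dir="storage/llamaindex")
index = load_index_from_storage(storage_context)

# Retrieve raw chunks instead of building a query engine, so no LLM is needed.
retriever = index.as_retriever(similarity_top_k=4)
for result in retriever.retrieve("How do I schedule a script?"):
    print(result.node.metadata.get("path"), result.score)
```

The metadata attached during ingestion (`path`, `lang`, `section`) travels with each node, so a caller can filter or label retrieved chunks by language or documentation section.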
