#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "llama-index",
# "llama-index-embeddings-huggingface",
# "llama-index-embeddings-openai",
# "sentence-transformers",
# "transformers",
# ]
# ///
"""Build a LlamaIndex vector store for one or more documentation directories."""
from __future__ import annotations
import argparse
import shutil
from pathlib import Path
from typing import Dict, List, Sequence, Tuple
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
try:
from llama_index.embeddings.openai import OpenAIEmbedding
except ImportError: # pragma: no cover - optional dependency
OpenAIEmbedding = None
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Ingest Markdown documentation into a LlamaIndex vector store",
)
parser.add_argument(
"--docs-root",
action="append",
type=str,
default=None,
metavar="PATH[:LANG]",
        help=(
            "Documentation root plus optional language tag (e.g. docs/en:en, "
            "docs/zh:zh); repeat the flag to ingest multiple languages "
            "(default: docs/en:en)"
        ),
)
parser.add_argument(
"--persist-dir",
type=Path,
default=Path("storage/llamaindex"),
help="Directory where the index will be persisted",
)
parser.add_argument(
"--chunk-size",
type=int,
default=750,
help="Approximate token count per chunk",
)
parser.add_argument(
"--chunk-overlap",
type=int,
default=120,
help="Token overlap between chunks",
)
parser.add_argument(
"--extensions",
nargs="+",
default=(".md", ".mdx"),
help="File extensions to ingest (default: .md .mdx)",
)
parser.add_argument(
"--embedding-model",
type=str,
default="BAAI/bge-base-zh-v1.5",
help="Embedding model identifier (interpreted per backend)",
)
parser.add_argument(
"--embed-backend",
type=str,
choices=("huggingface", "openai"),
default="huggingface",
help="Embedding backend to use; huggingface runs fully local",
)
parser.add_argument(
"--clean",
action="store_true",
help="Remove the persist directory before rebuilding the index",
)
return parser.parse_args()
def _parse_docs_roots(values: Sequence[str] | None) -> List[Tuple[Path, str]]:
    """Normalize --docs-root inputs into (path, language) tuples."""
    raw_values = values or ["docs/en:en"]
    normalized: List[Tuple[Path, str]] = []
    for entry in raw_values:
        # Split on the *last* colon so a Windows drive letter (e.g. C:\docs\en)
        # is not mistaken for a language tag; a real tag never contains a
        # path separator.
        path_part, sep, lang = entry.rpartition(":")
        if not sep or "/" in lang or "\\" in lang:
            path_part, lang = entry, ""
        path = Path(path_part).expanduser()
        # Fall back to the directory name (docs/en -> "en"), then to "en".
        resolved_lang = (lang or path.name or "en").lower()
        normalized.append((path.resolve(), resolved_lang))
    return normalized
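
# For example, _parse_docs_roots(["docs/zh:zh", "docs/en"]) yields
# [(Path("docs/zh").resolve(), "zh"), (Path("docs/en").resolve(), "en")];
# the second entry has no tag, so the directory name supplies the language.
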
def _metadata_factory(root: Path, lang: str):
    """Build a file_metadata callback that tags each file relative to *root*."""

    def build_metadata(file_path: str) -> Dict[str, str]:
rel = Path(file_path).resolve().relative_to(root)
section = rel.parts[0] if len(rel.parts) > 1 else "root"
return {
"path": str(rel).replace("\\", "/"),
"lang": lang,
"section": section,
}
return build_metadata
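
# For a hypothetical file docs/en/guide/install.md under root docs/en this
# callback produces {"path": "guide/install.md", "lang": "en", "section":
# "guide"}; files sitting directly in the root get section "root".
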
def configure_embedding(args: argparse.Namespace) -> None:
if args.embed_backend == "huggingface":
Settings.embed_model = HuggingFaceEmbedding(model_name=args.embedding_model)
elif args.embed_backend == "openai":
if OpenAIEmbedding is None:
raise SystemExit(
"OpenAI embedding backend requested but llama-index-embeddings-openai is not installed",
)
Settings.embed_model = OpenAIEmbedding(model=args.embedding_model)
else: # pragma: no cover - argparse guards choices
raise SystemExit(f"Unsupported embed backend: {args.embed_backend}")
def main() -> None:
args = parse_args()
docs_roots = _parse_docs_roots(args.docs_root)
missing = [str(path) for path, _ in docs_roots if not path.exists()]
if missing:
raise SystemExit(f"Docs root(s) do not exist: {', '.join(missing)}")
    # Normalize extensions so both ".md" and "md" spellings are accepted.
    target_exts = tuple(
        ext.lower() if ext.startswith(".") else f".{ext.lower()}"
        for ext in args.extensions
    )
if args.clean and args.persist_dir.exists():
shutil.rmtree(args.persist_dir)
args.persist_dir.mkdir(parents=True, exist_ok=True)
configure_embedding(args)
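    # chunk_size/chunk_overlap are approximate token budgets: SentenceSplitter
    # prefers paragraph and sentence boundaries over exact cut points.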
Settings.node_parser = SentenceSplitter(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
paragraph_separator="\n\n",
)
documents = []
total_files = 0
unique_langs = set()
for root, lang in docs_roots:
md_files = sorted(
path for path in root.rglob("*") if path.suffix.lower() in target_exts
)
if not md_files:
raise SystemExit(f"No markdown files found under {root}")
reader = SimpleDirectoryReader(
input_files=[str(path) for path in md_files],
file_metadata=_metadata_factory(root, lang),
)
documents.extend(reader.load_data())
total_files += len(md_files)
unique_langs.add(lang)
    # Build the index against a fresh storage context, then persist its
    # docstore, index store, and vector store under persist_dir.
    storage_context = StorageContext.from_defaults()
    VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    storage_context.persist(persist_dir=str(args.persist_dir))
langs_str = ", ".join(sorted(unique_langs))
print(
f"Indexed {len(documents)} documents from {total_files} markdown files "
f"across {len(docs_roots)} root(s) ({langs_str}) into {args.persist_dir}",
)
if __name__ == "__main__":
main()
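

# A minimal sketch for querying the persisted index later (assumes the same
# embedding backend/model is configured before loading):
#
#   from llama_index.core import StorageContext, load_index_from_storage
#   Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-zh-v1.5")
#   ctx = StorageContext.from_defaults(persist_dir="storage/llamaindex")
#   index = load_index_from_storage(ctx)
#   retriever = index.as_retriever(similarity_top_k=5)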