#!/usr/bin/env python3
"""
Build a beginner-friendly Pybricks RAG corpus into ChromaDB.
- Crawls/loads selected Pybricks docs/tutorials + key GitHub changelogs.
- Normalizes & chunks (HTML/Markdown-aware; preserves code blocks).
- Creates multi-view docs (full text + headings/signatures + identifiers).
- Indexes into ChromaDB collections: pybricks_docs and pybricks_snippets.
Requires: requests, beautifulsoup4, lxml, markdownify, tiktoken, chromadb, sentence-transformers
pip install requests beautifulsoup4 lxml markdownify tiktoken chromadb sentence-transformers
"""
import re
import argparse
import time
import hashlib
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import tiktoken
import chromadb
from chromadb.utils import embedding_functions
############################
# Config
############################
SEED_URLS = [
# API docs (latest & versioned)
"https://docs.pybricks.com/en/latest/robotics.html", # DriveBase, etc.
"https://docs.pybricks.com/en/latest/parameters/", # Port, Direction, Stop, etc.
"https://docs.pybricks.com/", # root (to discover other key pages)
"https://docs.pybricks.com/en/v3.5.0/robotics.html", # older ref (version contrast)
"https://docs.pybricks.com/en/v3.3.0/robotics.html",
# Tutorials / Getting started
"https://pybricks.com/learn/getting-started/pybricks-environment/",
"https://pybricks.com/learn/getting-started/install-pybricks/",
"https://pybricks.com/projects/tutorials/wireless/hub-to-device/pc-communication/",
"https://pybricks.com/projects/tutorials/wireless/hub-to-hub/broadcast/",
"https://pybricks.com/projects/tutorials/wireless/remote-control/button-basics/",
"https://pybricks.com/projects/sets/mindstorms-robot-inventor/other-models/quick-reference/",
"https://code.pybricks.com/",
# Changelogs (version awareness)
"https://github.com/pybricks/pybricks-micropython/blob/master/CHANGELOG.md",
"https://github.com/pybricks/pybricks-code/blob/master/CHANGELOG.md",
# pybricksdev (optional power tool docs)
"https://docs.pybricks.com/projects/pybricksdev/en/latest/api/",
]
ALLOWED_DOMAINS = {
"docs.pybricks.com",
"pybricks.com",
"code.pybricks.com",
"github.com",
}
MAX_PAGES = 80 # keep crawl polite; bump when needed
CRAWL_TIMEOUT = 15
USER_AGENT = "pybricks-rag-ingestor/1.0 (+for educational/FLL use)"
# Chunking defaults
DOC_CHUNK_TOKENS = 600
DOC_OVERLAP_TOKENS = 100
# Chroma config
CHROMA_DIR = "./chroma_pybricks"
COLL_DOCS = "pybricks_docs"
COLL_SNIPPETS = "pybricks_snippets" # short task-oriented code templates/snips
# Embeddings: sentence-transformers (local)
EMBED_MODEL = "all-MiniLM-L6-v2"
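# (all-MiniLM-L6-v2 is a small, fast 384-dimension model; swap in a larger
# sentence-transformers model here if retrieval recall proves too weak.)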
############################
# Helpers
############################
tok = tiktoken.get_encoding("cl100k_base")
def count_tokens(text: str) -> int:
return len(tok.encode(text))
def hash_id(s: str) -> str:
return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32]
def within_domain(url: str) -> bool:
    try:
        host = urlparse(url).netloc
        # exact match or true subdomain; a bare endswith() would also
        # accept look-alike hosts such as "notpybricks.com"
        return any(host == d or host.endswith("." + d) for d in ALLOWED_DOMAINS)
    except Exception:
        return False
def get(url: str) -> requests.Response:
return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=CRAWL_TIMEOUT)
def extract_links(url: str, soup: BeautifulSoup):
for a in soup.select("a[href]"):
href = a["href"]
if href.startswith("#"):
continue
u = urljoin(url, href)
if within_domain(u):
yield u.split("#")[0]
def html_to_markdown_keep_code(html: str) -> str:
    # Convert HTML to Markdown, keeping code blocks and headings intact.
    # markdownify handles most of the conversion; we just compact blank lines.
text = md(html, heading_style="ATX", strip=["nav","footer","script","style","noscript"])
# compact extra blank lines
text = re.sub(r"\n{3,}", "\n\n", text).strip()
return text
def normalize_markdown(text: str) -> str:
    # strip trailing whitespace from each line
text = re.sub(r"[ \t]+\n", "\n", text)
return text.strip()
############################
# Chunkers
############################
HEADING_PATTERN = re.compile(r"^(#{1,3})\s+(.*)", re.MULTILINE)
def split_by_headings(markdown: str):
"""
Split markdown by H1–H3. Keep fenced code blocks with their section.
"""
lines = markdown.splitlines()
sections = []
current = {"title": None, "content": []}
def push():
if current["content"]:
sections.append({
"title": current["title"] or "",
"text": "\n".join(current["content"]).strip()
})
    for line in lines:
        m = HEADING_PATTERN.match(line)
        if m:
            push()  # close out the previous section, if any
            current = {"title": m.group(2), "content": [line]}
        else:
            current["content"].append(line)
push()
return [s for s in sections if s["text"]]
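# Illustrative example: a page shaped like "# Robotics" / intro text /
# "## DriveBase" / fenced code splits into two sections ("Robotics",
# "DriveBase"), each keeping its heading line and the code that follows it.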
def window_tokens(text: str, size_tokens: int, overlap_tokens: int):
ids = tok.encode(text)
n = len(ids)
if n <= size_tokens:
yield text
return
start = 0
while start < n:
end = min(n, start + size_tokens)
chunk_ids = ids[start:end]
yield tok.decode(chunk_ids)
if end == n:
break
start = max(0, end - overlap_tokens)
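# Illustrative example: with size_tokens=600 and overlap_tokens=100, a
# 1000-token section yields two windows over token ranges [0, 600) and
# [500, 1000), i.e. ~100 tokens of shared context across the boundary.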
def identifiers_view(text: str) -> str:
    # Pull likely "identifier-ish" tokens: backticked names, dotted names
    # (Port.A, Direction.CLOCKWISE, DriveBase methods), UPPER_CASE constants,
    # and CamelCase class names. (HTML id="..." attributes are gone by this
    # point; the text was already converted to Markdown.)
    pattern = re.compile(
        r"`([^`]+)`"                                                # backticked
        r"|([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+)"   # dotted names
        r"|\b([A-Z_]{2,})\b"                                        # UPPER_CASE
        r"|\b([A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+)\b"                 # CamelCase
    )
    flat = []
    for groups in pattern.findall(text):
        flat.extend(g for g in groups if g)
    # de-noise trivial matches
    flat = [w for w in flat if len(w) > 2]
    return "\n".join(sorted(set(flat)))
############################
# Crawl
############################
def crawl(max_pages: int = MAX_PAGES):
seen = set()
queue = list(SEED_URLS)
pages = []
while queue and len(pages) < max_pages:
url = queue.pop(0)
if url in seen or not within_domain(url):
continue
seen.add(url)
try:
r = get(url)
if r.status_code != 200:
continue
            ct = r.headers.get("Content-Type", "")
            if "html" not in ct.lower():
                continue  # skip non-HTML responses (images, raw downloads)
            soup = BeautifulSoup(r.text, "lxml")
            # strip obvious boilerplate
            for tag in soup(["nav", "footer", "script", "style", "noscript"]):
                tag.decompose()
body = soup.body or soup
main = body.select_one("main") or body
# Many docs are Sphinx → good structure + headings
# Convert to markdown for easier heading split
markdown = html_to_markdown_keep_code(str(main))
markdown = normalize_markdown(markdown)
pages.append({
"url": url,
"markdown": markdown
})
# discover more links from key roots (docs/tutorials)
if any(url.startswith(prefix) for prefix in [
"https://docs.pybricks.com/",
"https://pybricks.com/learn/",
"https://pybricks.com/projects/",
]):
for u in set(extract_links(url, soup)):
if u not in seen and within_domain(u):
                        # cap the frontier so the total crawl stays bounded
                        if len(pages) + len(queue) < max_pages:
queue.append(u)
time.sleep(0.3) # be polite
except Exception:
continue
return pages
############################
# Build records & index
############################
def build_records(pages):
records = []
for p in pages:
url = p["url"]
mdtext = p["markdown"]
        # version tag from the URL path (e.g. /en/v3.5.0/ -> "3.5.0")
        m = re.search(r"/en/v?(\d+\.\d+\.\d+)/", url)
        if "/en/latest/" in url:
            version = "latest"
        elif m:
            version = m.group(1)
        else:
            version = "stable"
# sectionize by H1–H3
sections = split_by_headings(mdtext) or [{"title":"","text":mdtext}]
for sec_idx, sec in enumerate(sections):
# windowing to stay within token budget
for win_idx, win in enumerate(window_tokens(sec["text"], DOC_CHUNK_TOKENS, DOC_OVERLAP_TOKENS)):
# Include indices to avoid duplicates when titles and leading text repeat
rec_key = f"{url}|sec={sec_idx}|win={win_idx}|title={sec['title']}"
rec_id = hash_id(rec_key)
records.append({
"id": rec_id,
"text": win,
"meta": {
"url": url,
"title": sec["title"],
"version": version,
"source": urlparse(url).netloc,
"breadcrumbs": sec["title"],
"type": "doc"
},
"identifiers": identifiers_view(win)
})
return records
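# Illustrative record shape for a section of
# https://docs.pybricks.com/en/v3.5.0/robotics.html:
#   {"id": "<32-char sha256 prefix>",
#    "text": "## DriveBase\n...",
#    "meta": {"url": "...", "title": "DriveBase", "version": "3.5.0",
#             "source": "docs.pybricks.com", "breadcrumbs": "DriveBase",
#             "type": "doc"},
#    "identifiers": "DriveBase\nPort.A\n..."}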
############################
# ChromaDB
############################
def upsert_chroma(records):
client = chromadb.PersistentClient(path=CHROMA_DIR)
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)
docs = client.get_or_create_collection(name=COLL_DOCS, embedding_function=ef, metadata={"hnsw:space":"cosine"})
ids = [r["id"] for r in records]
texts = [r["text"] for r in records]
metas = [r["meta"] for r in records]
# Write in small batches and use upsert for idempotency
BATCH = 128
for i in range(0, len(ids), BATCH):
sl = slice(i, i+BATCH)
try:
docs.upsert(ids=ids[sl], documents=texts[sl], metadatas=metas[sl])
except Exception:
# Fallback for older chromadb without upsert or if a validation error occurs
try:
docs.delete(ids=ids[sl])
except Exception:
pass
docs.add(ids=ids[sl], documents=texts[sl], metadatas=metas[sl])
    # Optional: a lightweight identifiers collection to help rerank exact symbol hits
    ident = client.get_or_create_collection(name=f"{COLL_DOCS}_identifiers", embedding_function=ef, metadata={"hnsw:space":"cosine"})
    ident_ids = [f"I_{rid}" for rid in ids]
    ident_docs = [r["identifiers"] for r in records]
    ident_metas = [{**r["meta"], "type": "identifiers"} for r in records]
    for i in range(0, len(ident_ids), BATCH):
        sl = slice(i, i + BATCH)
        try:
            ident.upsert(ids=ident_ids[sl], documents=ident_docs[sl], metadatas=ident_metas[sl])
        except Exception:
            # Same fallback as above for older chromadb without upsert
            try:
                ident.delete(ids=ident_ids[sl])
            except Exception:
                pass
            ident.add(ids=ident_ids[sl], documents=ident_docs[sl], metadatas=ident_metas[sl])
# Starter snippet/templates (task-oriented)
snippets = [
{
"id": "snip_drivebase_basic",
"text": """# Drive straight and turn
from pybricks.hubs import PrimeHub
from pybricks.pupdevices import Motor
from pybricks.parameters import Port
from pybricks.robotics import DriveBase
hub = PrimeHub()
left = Motor(Port.A)
right = Motor(Port.B)
bot = DriveBase(left, right, wheel_diameter=56, axle_track=114)  # mm; measure your robot
bot.straight(300) # mm
bot.turn(90) # + is clockwise/right
""",
"meta": {"topic":"drivebase","version":"stable","url":"https://docs.pybricks.com/en/latest/robotics.html"}
},
]
sn = client.get_or_create_collection(name=COLL_SNIPPETS, embedding_function=ef, metadata={"hnsw:space":"cosine"})
try:
sn.upsert(
ids=[s["id"] for s in snippets],
documents=[s["text"] for s in snippets],
metadatas=[s["meta"] for s in snippets]
)
except Exception:
try:
sn.delete(ids=[s["id"] for s in snippets])
except Exception:
pass
sn.add(
ids=[s["id"] for s in snippets],
documents=[s["text"] for s in snippets],
metadatas=[s["meta"] for s in snippets]
)
return client
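############################
# Retrieval (optional sketch)
############################
def query_corpus(question, n_results=5, version=None):
    """Minimal query-side sketch; not called by the ingest pipeline.

    Assumes the collections built above. The optional `version` filter
    matches the metadata written by build_records() ("latest", "stable",
    or an explicit version such as "3.5.0").
    """
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)
    docs = client.get_or_create_collection(name=COLL_DOCS, embedding_function=ef)
    where = {"version": version} if version else None
    return docs.query(query_texts=[question], n_results=n_results, where=where)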
def main():
parser = argparse.ArgumentParser(description="Build Pybricks RAG corpus into ChromaDB")
parser.add_argument("--test", action="store_true", help="Enable quick test mode: limit crawl pages for fast debug")
parser.add_argument("--max-pages", type=int, default=None, help="Override max pages to crawl")
args = parser.parse_args()
# Determine crawl size
max_pages = args.max_pages if args.max_pages is not None else (8 if args.test else MAX_PAGES)
if args.test:
print(f"[TEST MODE] Limiting crawl to {max_pages} pages for rapid debugging")
print("Crawling…")
pages = crawl(max_pages=max_pages)
print(f"Crawled {len(pages)} pages")
records = build_records(pages)
print(f"Built {len(records)} records")
    upsert_chroma(records)
print("ChromaDB ready at", CHROMA_DIR)
if __name__ == "__main__":
main()