# lktxt.py: crawl https://docs.livekit.io/ and dump the page text to livekit_docs.txt
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from collections import deque
START_URL = "https://docs.livekit.io/"
ALLOWED_NETLOC = urlparse(START_URL).netloc
OUTPUT_FILE = "livekit_docs.txt"
HEADERS = {"User-Agent": "livekit-docs-dumper/1.0 (+https://github.com/yourname)"}
# Crawl settings
MAX_PAGES = 300 # limit for safety; change if you need more
DELAY_BETWEEN_REQUESTS = 0.8 # seconds
ONLY_PATH_PREFIX = "/" # restrict to paths under root; adjust if needed
def is_same_site(url):
    p = urlparse(url)
    return (p.scheme in ("http", "https")) and (p.netloc == ALLOWED_NETLOC)
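# Illustrative check: is_same_site("https://docs.livekit.io/agents/") is True,
# while any http(s) URL on a different host (e.g. github.com) is False.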
def normalize_url(base, link):
    if not link:
        return None
    # remove fragment, join relative to base
    joined = urljoin(base, link)
    nofrag, _ = urldefrag(joined)
    return nofrag
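# Illustration: normalize_url("https://docs.livekit.io/agents/", "../home#intro")
# returns "https://docs.livekit.io/home" (relative path resolved, fragment dropped).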
def extract_text_from_html(html, page_url):
    """Return a readable text blob (title, URL, headings, body text) for one page."""
    soup = BeautifulSoup(html, "html.parser")
    # remove scripts, styles, nav, footer (common noisy elements)
    for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
        tag.decompose()
    texts = []
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    if title:
        texts.append(f"PAGE TITLE: {title}")
    texts.append(f"URL: {page_url}")
    # Keep h1-h3 as section headings
    for h in soup.find_all(["h1", "h2", "h3"]):
        t = h.get_text(separator=" ", strip=True)
        if t:
            texts.append("\n== " + t + " ==")
    # Main textual content: trim whitespace and drop blank lines
    body = soup.get_text(separator="\n", strip=True)
    lines = [ln.strip() for ln in body.splitlines() if ln.strip()]
    # Avoid duplicating the title line inside the body
    filtered = [ln for ln in lines if ln != title]
    texts.append("\n".join(filtered))
    texts.append("\n" + "=" * 80 + "\n")
    return "\n".join(texts)
def crawl_and_dump(start_url, output_file, max_pages=MAX_PAGES):
    """Breadth-first crawl of same-site pages, writing extracted text to output_file."""
    visited = set()
    queue = deque([start_url])
    pages_processed = 0
    with open(output_file, "w", encoding="utf-8") as out:
        while queue and pages_processed < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            if not is_same_site(url):
                continue
            visited.add(url)
            try:
                resp = requests.get(url, headers=HEADERS, timeout=20)
                if resp.status_code != 200 or "text/html" not in resp.headers.get("Content-Type", ""):
                    # skip non-html pages and error responses
                    continue
                html = resp.text
            except Exception as e:
                print(f"[WARN] Failed to fetch {url}: {e}")
                continue
            text_blob = extract_text_from_html(html, url)
            out.write(text_blob + "\n")
            pages_processed += 1
            print(f"[INFO] Saved ({pages_processed}): {url}")
            # Discover links to follow
            soup = BeautifulSoup(html, "html.parser")
            for a in soup.find_all("a", href=True):
                normalized = normalize_url(url, a["href"])
                if not normalized:
                    continue
                # restrict to the same host and the chosen path prefix (e.g. keep docs pages only)
                parsed = urlparse(normalized)
                if parsed.netloc == ALLOWED_NETLOC and parsed.path.startswith(ONLY_PATH_PREFIX):
                    if normalized not in visited:
                        queue.append(normalized)
            # be polite: pause between requests
            time.sleep(DELAY_BETWEEN_REQUESTS)
    print(f"[DONE] Wrote {pages_processed} pages to {output_file}")
if __name__ == "__main__":
    crawl_and_dump(START_URL, OUTPUT_FILE)
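# Run as a script (`python lktxt.py`): pages are written one after another to
# livekit_docs.txt (OUTPUT_FILE) in the current working directory.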