LiveKit RAG Assistant

by THENABILMAN
ingest_from_file.py (4.44 kB)
""" Ingest LiveKit documentation from livekit_docs.txt file Chunks the text and stores in Pinecone vector database """ import os import time from dotenv import load_dotenv from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_huggingface import HuggingFaceEmbeddings from pinecone import Pinecone, ServerlessSpec load_dotenv() PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME") or os.environ.get("PINECONE_INDEX", "livekit-docs") def read_docs_from_file(filepath: str) -> str: """Read documentation from local text file""" try: with open(filepath, 'r', encoding='utf-8') as f: content = f.read() print(f"✓ Read {len(content):,} characters from {filepath}") return content except Exception as e: print(f"❌ Error reading file: {str(e)}") return None def main(): if not PINECONE_API_KEY: print("❌ PINECONE_API_KEY not found in .env") return print("📖 Ingesting LiveKit documentation from file...") print("=" * 60) # Read documentation from file doc_content = read_docs_from_file("livekit_docs.txt") if not doc_content: return print("\n📝 Splitting documents into chunks...") text_splitter = RecursiveCharacterTextSplitter( chunk_size=500, # Smaller chunks for maximum accuracy chunk_overlap=150, # More overlap for better context preservation separators=["\n\n", "\n", ". ", " ", ""] ) docs_chunks = text_splitter.split_text(doc_content) print(f"✓ Created {len(docs_chunks)} document chunks") # Filter out very small chunks docs_chunks = [chunk for chunk in docs_chunks if len(chunk.strip()) > 50] print(f"✓ After filtering: {len(docs_chunks)} quality chunks") print("\n🔤 Creating embeddings with HuggingFace...") embeddings = HuggingFaceEmbeddings( model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"} ) print("✓ HuggingFace embeddings model loaded") print(f"\n📍 Connecting to Pinecone index: {PINECONE_INDEX_NAME}...") # Initialize Pinecone pc = Pinecone(api_key=PINECONE_API_KEY) # Check if index exists, if not create it try: index = pc.Index(PINECONE_INDEX_NAME) print(f"✓ Connected to existing index") except: print(f"📌 Creating new index: {PINECONE_INDEX_NAME}") pc.create_index( name=PINECONE_INDEX_NAME, dimension=384, # HuggingFace embeddings dimension metric="cosine", spec=ServerlessSpec( cloud="aws", region="us-east-1" ) ) time.sleep(2) index = pc.Index(PINECONE_INDEX_NAME) print("✓ Index created and ready") print(f"\n⬆️ Uploading {len(docs_chunks)} chunks to Pinecone...") # Upload in batches batch_size = 50 total_uploaded = 0 for i in range(0, len(docs_chunks), batch_size): batch = docs_chunks[i:i+batch_size] # Create embeddings for this batch batch_embeddings = embeddings.embed_documents(batch) # Create vectors for upsert vectors = [] for j, (text, embedding) in enumerate(zip(batch, batch_embeddings)): vector_id = f"livekit-doc-{i+j}" vectors.append({ "id": vector_id, "values": embedding, "metadata": { "text": text[:500], # Store first 500 chars as preview "source": "livekit_docs.txt", "chunk_index": i+j } }) # Upsert to Pinecone index.upsert(vectors=vectors) total_uploaded += len(vectors) print(f" ⬆️ Uploaded batch {i//batch_size + 1}/{(len(docs_chunks)-1)//batch_size + 1} ({total_uploaded} total)") time.sleep(0.5) # Rate limiting print(f"\n✅ Successfully uploaded {total_uploaded} chunks to Pinecone!") print(f"📊 Index: {PINECONE_INDEX_NAME}") print(f"📈 Total chunks: {len(docs_chunks)}") print(f"💾 Total content: {sum(len(chunk) for chunk in docs_chunks):,} characters") print(f"\n🎉 LiveKit 
documentation is now ready for searching!") if __name__ == "__main__": main()
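
Once the script has run, the index can be queried with the same embedding model. The following is a minimal retrieval sketch, not part of the repository: the question text and top_k value are illustrative, and it assumes the same .env variables the ingest script uses.

# Retrieval sketch (illustrative, not from the repository): embed a question
# with the same model used at ingest time, then query the Pinecone index for
# the closest chunks. The question and top_k are placeholder choices.
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone

load_dotenv()
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(os.environ.get("PINECONE_INDEX_NAME", "livekit-docs"))

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
query_vector = embeddings.embed_query("How do I connect to a LiveKit room?")

results = index.query(vector=query_vector, top_k=3, include_metadata=True)
for match in results.matches:
    # Each match carries the chunk text stored in metadata at ingest time
    print(f"{match.score:.3f}  {match.metadata['text'][:80]}...")

Because cosine similarity was chosen at index creation, higher scores indicate closer semantic matches between the question and the stored chunks.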

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/THENABILMAN/THENABILMAN_LiveKit_MCP_Assistant'
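
The same endpoint can be called from code. A short sketch in Python using the requests library, assuming the endpoint is public and returns JSON as the curl example suggests:

# Fetch this server's MCP directory entry (sketch; assumes an unauthenticated
# JSON endpoint, as implied by the curl example above).
import requests

url = "https://glama.ai/api/mcp/v1/servers/THENABILMAN/THENABILMAN_LiveKit_MCP_Assistant"
response = requests.get(url, timeout=10)
response.raise_for_status()
print(response.json())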

If you have feedback or need assistance with the MCP directory API, please join our Discord server.