"""
Vector Store Module
ChromaDB wrapper for storing and querying embeddings.
Handles persistence, upserts, deletions, and similarity search.
"""
import os
from dataclasses import dataclass
from pathlib import Path
import chromadb
from text_chunker import Chunk
# Default storage location
DEFAULT_DB_PATH = os.path.expanduser("~/.semantic-search/chroma")
@dataclass
class SearchResult:
"""Represents a search result with chunk and similarity score."""
chunk: Chunk # Original chunk with all metadata
score: float # Similarity score (0-1, higher = more similar)
distance: float # Raw distance from query
def make_chunk_id(chunk: Chunk) -> str:
"""
    Create a unique, deterministic ID for a chunk.
    Backslashes are normalized to forward slashes so IDs are stable across platforms.
"""
normalized_path = chunk.source_path.replace('\\', '/')
return f"{normalized_path}::chunk_{chunk.chunk_index}"
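# Chroma metadata values must be scalars (str/int/float/bool), so the header
# list is flattened to a single delimited string in chunk_to_metadata() below.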
def chunk_to_metadata(chunk: Chunk) -> dict:
"""Convert chunk to ChromaDB metadata dict."""
return {
"source_path": chunk.source_path,
"file_name": Path(chunk.source_path).name,
"file_type": chunk.file_type,
"chunk_index": chunk.chunk_index,
"total_chunks": chunk.total_chunks,
"modified": chunk.modified,
"headers": " > ".join(chunk.headers) if chunk.headers else "",
"token_count": chunk.token_count,
"char_start": chunk.char_start,
"char_end": chunk.char_end
}
def metadata_to_chunk(metadata: dict, content: str) -> Chunk:
"""Reconstruct Chunk from ChromaDB metadata and content."""
headers = metadata.get("headers", "")
return Chunk(
content=content,
source_path=metadata["source_path"],
file_type=metadata["file_type"],
chunk_index=metadata["chunk_index"],
total_chunks=metadata["total_chunks"],
modified=metadata["modified"],
char_start=metadata["char_start"],
char_end=metadata["char_end"],
headers=headers.split(" > ") if headers else [],
token_count=metadata["token_count"]
)
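# Note: chunk_to_metadata()/metadata_to_chunk() round-trip losslessly as long
# as no header string itself contains " > ", the delimiter used to flatten
# the header list.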
def init_db(path: str = DEFAULT_DB_PATH) -> chromadb.Collection:
"""
Initialize ChromaDB with persistent storage.
Args:
path: Directory for database persistence
Returns:
ChromaDB collection ready for use
"""
# Ensure directory exists
Path(path).mkdir(parents=True, exist_ok=True)
# Initialize client with persistence
client = chromadb.PersistentClient(path=path)
# Get or create collection
collection = client.get_or_create_collection(
name="semantic_search",
metadata={"description": "Personal semantic search index"}
)
return collection
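# Usage sketch:
#   collection = init_db()               # persists under ~/.semantic-search/chroma
#   collection = init_db("/tmp/scratch") # or point at any writable directory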
def upsert_chunks(
collection: chromadb.Collection,
chunks: list[Chunk],
embeddings: list[list[float]]
) -> int:
"""
Insert or update chunks in the database.
Args:
collection: ChromaDB collection
chunks: List of Chunk objects
embeddings: Corresponding embeddings
Returns:
Number of chunks upserted
"""
if not chunks:
return 0
ids = [make_chunk_id(c) for c in chunks]
documents = [c.content for c in chunks]
metadatas = [chunk_to_metadata(c) for c in chunks]
collection.upsert(
ids=ids,
documents=documents,
embeddings=embeddings,
metadatas=metadatas
)
return len(chunks)
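# Note: upsert() overwrites records with matching IDs, so re-indexing a changed
# file replaces its chunks in place. If the file now produces *fewer* chunks,
# stale higher-index chunks survive; call delete_by_source() first to be safe.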
def delete_by_source(collection: chromadb.Collection, source_path: str) -> int:
"""
Delete all chunks from a specific source file.
Args:
collection: ChromaDB collection
source_path: Path to the source file
Returns:
Number of chunks deleted
"""
    # Look up the IDs of every chunk from this source; include=[] skips
    # documents and metadatas, since IDs are always returned.
    results = collection.get(
        where={"source_path": source_path},
        include=[]
    )
if results["ids"]:
collection.delete(ids=results["ids"])
return len(results["ids"])
return 0
def search(
collection: chromadb.Collection,
query_embedding: list[float],
n_results: int = 10,
    filters: dict | None = None
) -> list[SearchResult]:
"""
Search for similar chunks.
Args:
collection: ChromaDB collection
query_embedding: Query vector
n_results: Maximum number of results
filters: Optional metadata filters (e.g., {"file_type": "md"})
Returns:
List of SearchResult objects sorted by relevance
"""
# Build query parameters
query_params = {
"query_embeddings": [query_embedding],
"n_results": n_results,
"include": ["documents", "metadatas", "distances"]
}
if filters:
query_params["where"] = filters
results = collection.query(**query_params)
# Convert to SearchResult objects
search_results = []
if results["ids"] and results["ids"][0]:
for i, doc_id in enumerate(results["ids"][0]):
content = results["documents"][0][i]
metadata = results["metadatas"][0][i]
distance = results["distances"][0][i]
            # Convert distance to a similarity score in (0, 1]; higher = more similar.
            # ChromaDB's default space is squared L2, so 0 distance means an exact match.
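            # e.g. distance 0.0 -> score 1.0, 1.0 -> 0.5, 4.0 -> 0.2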
score = 1 / (1 + distance)
chunk = metadata_to_chunk(metadata, content)
search_results.append(SearchResult(
chunk=chunk,
score=score,
distance=distance
))
return search_results
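# Usage sketch: restrict a query to markdown files
#   hits = search(collection, query_vec, n_results=5, filters={"file_type": "md"})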
def get_indexed_files(collection: chromadb.Collection) -> dict[str, float]:
"""
Get map of source_path -> modified timestamp for all indexed files.
Returns:
Dict mapping file paths to their indexed modification times
"""
# Get all unique source paths and their modified times
results = collection.get(include=["metadatas"])
file_map = {}
if results["metadatas"]:
for metadata in results["metadatas"]:
source_path = metadata["source_path"]
modified = metadata["modified"]
# Keep the most recent modified time for each file
if source_path not in file_map or modified > file_map[source_path]:
file_map[source_path] = modified
return file_map
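# Incremental re-index sketch: given a list of candidate file `paths`, re-embed
# only files whose on-disk mtime is newer than the indexed one:
#   indexed = get_indexed_files(collection)
#   stale = [p for p in paths if os.path.getmtime(p) > indexed.get(p, 0.0)]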
def get_collection_stats(collection: chromadb.Collection) -> dict:
"""
Get statistics about the collection.
Returns:
Dict with count, file types, etc.
"""
count = collection.count()
if count == 0:
return {"total_chunks": 0, "total_files": 0, "by_type": {}}
results = collection.get(include=["metadatas"])
files = set()
by_type = {}
for metadata in results["metadatas"]:
files.add(metadata["source_path"])
file_type = metadata["file_type"]
by_type[file_type] = by_type.get(file_type, 0) + 1
return {
"total_chunks": count,
"total_files": len(files),
"by_type": by_type
}
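# Example return value (illustrative numbers):
#   {"total_chunks": 42, "total_files": 7, "by_type": {"md": 30, "txt": 12}}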
# CLI for testing
if __name__ == '__main__':
import tempfile
from embedding_engine import get_embedding, get_embeddings_batch
print("Testing vector store...")
# Use temp directory for test
with tempfile.TemporaryDirectory() as tmpdir:
db_path = os.path.join(tmpdir, "test_chroma")
collection = init_db(db_path)
print(f"✓ Database initialized at {db_path}")
# Create test chunks
test_chunks = [
Chunk(
content="How to schedule meetings in the calendar app",
source_path="/test/calendar.md",
file_type="md",
chunk_index=0,
total_chunks=1,
modified=1234567890.0,
char_start=0,
char_end=50,
headers=["# Calendar"],
token_count=10
),
Chunk(
content="Setting up task reminders and notifications",
source_path="/test/tasks.md",
file_type="md",
chunk_index=0,
total_chunks=1,
modified=1234567890.0,
char_start=0,
char_end=50,
headers=["# Tasks"],
token_count=8
)
]
# Generate embeddings and upsert
texts = [c.content for c in test_chunks]
embeddings = get_embeddings_batch(texts)
count = upsert_chunks(collection, test_chunks, embeddings)
print(f"✓ Upserted {count} chunks")
# Test search
query = "How do I add events to my calendar?"
query_embedding = get_embedding(query)
results = search(collection, query_embedding, n_results=2)
print(f"✓ Search returned {len(results)} results")
for r in results:
print(f" - {r.chunk.content[:50]}... (score: {r.score:.3f})")
# Test stats
stats = get_collection_stats(collection)
print(f"✓ Stats: {stats}")
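        # Test indexed-file map
        indexed = get_indexed_files(collection)
        print(f"✓ Indexed files: {sorted(indexed)}")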
# Test delete
deleted = delete_by_source(collection, "/test/calendar.md")
print(f"✓ Deleted {deleted} chunks")
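        # Verify only the tasks.md chunk remains
        remaining = collection.count()
        print(f"✓ {remaining} chunk(s) remaining after delete")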
print("\n✓ Vector store working correctly!")