import chromadb
from chromadb.config import Settings
from typing import List, Dict
from .config import Config
# Optional dependency: prefer OpenAI embeddings when the integration package
# is installed; otherwise fall back to local HuggingFace embeddings.
try:
    from langchain_openai import OpenAIEmbeddings
    USE_OPENAI = True
except ImportError:
    # Only a missing package should trigger the fallback. A bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit and real bugs raised
    # during import, hiding them behind the local-embeddings path.
    USE_OPENAI = False

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
class VectorStore:
def __init__(self, use_local=False):
Config.validate()
if use_local or not Config.OPENAI_API_KEY:
print("Using local embeddings (sentence-transformers)")
self.embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-L6-v2"
)
else:
self.embeddings = OpenAIEmbeddings(
model=Config.EMBEDDING_MODEL,
openai_api_key=Config.OPENAI_API_KEY
)
self.client = chromadb.PersistentClient(
path=Config.CHROMA_DB_PATH
)
self.vectorstore = Chroma(
client=self.client,
collection_name=Config.COLLECTION_NAME,
embedding_function=self.embeddings
)
def add_documents(self, chunks: List[Dict]) -> int:
"""Add document chunks to vector store"""
texts = [chunk["text"] for chunk in chunks]
metadatas = [chunk["metadata"] for chunk in chunks]
self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
return len(chunks)
def search(self, query: str, top_k: int = None, filters: Dict = None) -> List[Dict]:
"""Semantic search with optional metadata filters"""
k = top_k or Config.TOP_K_RESULTS
results = self.vectorstore.similarity_search_with_score(
query=query,
k=k,
filter=filters
)
return [
{
"text": doc.page_content,
"metadata": doc.metadata,
"score": float(score)
}
for doc, score in results
]
def get_collection_stats(self) -> Dict:
"""Get collection statistics"""
collection = self.client.get_collection(Config.COLLECTION_NAME)
return {
"total_chunks": collection.count(),
"collection_name": Config.COLLECTION_NAME
}
def delete_by_source(self, source_path: str):
"""Delete all chunks from a document"""
self.vectorstore.delete(
where={"source_path": source_path}
)