# setup_db.py: Initializes Chroma vector database with sample medical documents
# Located in C:\Users\sniki\OneDrive\Desktop\health_rag_mcp\mcprag\
from langchain_community.document_loaders import TextLoader # Updated import
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
# --- Configuration ----------------------------------------------------------
# Embedding model used to vectorize every chunk before it is stored.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Both paths are relative, so they resolve against the current working
# directory of the process running this script.
persist_directory = "./chroma_db"
documents_dir = "./documents"
os.makedirs(documents_dir, exist_ok=True)

# --- Seed the documents directory -------------------------------------------
# (filename, content) pairs written to `documents_dir`; an existing file with
# the same name is overwritten because the file is opened in "w" mode.
sample_docs = [
    ("doc1.txt", "Flu symptoms include fever, cough, sore throat, and fatigue. Treatment involves rest and hydration."),
    ("doc2.txt", "Asthma is characterized by wheezing, shortness of breath, and chest tightness. Use inhalers for management."),
]
for filename, content in sample_docs:
    with open(os.path.join(documents_dir, filename), "w", encoding="utf-8") as f:
        f.write(content)

# --- Load every file found in the documents directory -----------------------
docs = []
# Sorted so the load (and therefore insert) order is the same on every run;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, filename)
    # Skip subdirectories and other non-regular entries: TextLoader can only
    # read files, and a stray folder would otherwise abort the whole run.
    if not os.path.isfile(file_path):
        continue
    loader = TextLoader(file_path, encoding="utf-8")
    docs.extend(loader.load())

# --- Split into chunks and index ---------------------------------------------
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(docs)
# NOTE(review): no explicit ids are passed, so re-running this script against
# an existing `persist_directory` presumably adds the same chunks again --
# confirm, and clear the directory first if duplicates matter.
# NOTE(review): `Chroma` is imported from `langchain.vectorstores`, which newer
# LangChain releases appear to have moved to `langchain_community.vectorstores`
# -- confirm against the installed version.
vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
print("Vector database initialized with sample medical documents.")