import streamlit as st
import sys
import os
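# Put the directory two levels above this file's folder at the front of
# sys.path so the `rag_pipeline` import below can be resolved.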
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(current_dir))
sys.path.insert(0, parent_dir)
from rag_pipeline import DocumentProcessor, TextChunker, VectorStore, MetadataExtractor
import tempfile
# --- Page header -----------------------------------------------------------
st.title("📤 Upload Documents")

# --- Pipeline components used by the indexing step further down ------------
# NOTE(review): these are constructed at module top level, so they are built
# every time this script executes; confirm that is acceptable for VectorStore.
processor = DocumentProcessor()
chunker = TextChunker()
vector_store = VectorStore(use_local=True)

# --- Upload widget ---------------------------------------------------------
# Multiple files may be selected at once; only the listed extensions are
# offered by the widget.
uploaded_files = st.file_uploader(
    label="Choose files",
    type=["pdf", "docx", "html", "md", "txt"],
    accept_multiple_files=True,
)
# Index every uploaded file once the user presses the button.
#
# For each upload the flow is: spill the bytes to a temporary file (the
# processor is called with a filesystem path), extract text + metadata,
# enhance the metadata, chunk the document, and add the chunks to the vector
# store. A failure on one file is reported in the UI and does not stop the
# remaining files from being processed.
if uploaded_files and st.button("Index Documents"):
    progress_bar = st.progress(0)
    status = st.empty()
    total_files = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        status.text(f"Processing {file.name}...")

        # Tracked outside the try so the finally clause can always clean up,
        # even when the failure happens before processing starts.
        tmp_path = None
        try:
            # Keep the original extension on the temp file name.
            suffix = os.path.splitext(file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path before writing so a failed write is still
                # cleaned up below.
                tmp_path = tmp.name
                # getvalue() returns the whole buffer regardless of the
                # current read position, unlike read().
                tmp.write(file.getvalue())

            # Extract text and metadata, then enrich the metadata.
            doc_data = processor.extract_text(tmp_path)
            doc_data["metadata"] = MetadataExtractor.enhance_metadata(
                doc_data["metadata"],
                file.name,
                doc_data["text"],
            )

            # Chunk and index.
            chunks = chunker.chunk_document(doc_data, file.name)
            added = vector_store.add_documents(chunks)
            st.success(f"✓ {file.name}: {added} chunks indexed")
        except Exception as exc:
            # Per-file boundary: surface the error and continue with the
            # next upload.
            st.error(f"✗ {file.name}: {exc}")
        finally:
            # delete=False means the temp file is ours to remove; do it on
            # both the success and the failure path so nothing is leaked.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as cleanup_err:
                    st.warning(
                        f"Could not remove temporary file {tmp_path}: {cleanup_err}"
                    )

        progress_bar.progress((idx + 1) / total_files)

    status.text("✅ Complete!")
    st.balloons()