import os
import sys
import argparse
from pathlib import Path
from tqdm import tqdm
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from rag_pipeline import DocumentProcessor, TextChunker, VectorStore, MetadataExtractor
def index_documents(folder_path: str, recursive: bool = False, force: bool = False):
    """Index every supported document found in a folder.

    Each file is passed through text extraction, metadata enhancement and
    chunking, and the resulting chunks are added to the vector store. A
    failure on one file is reported and counted; the remaining files are
    still processed. A summary is printed once all files have been handled.

    Args:
        folder_path: Directory to scan for documents.
        recursive: When true, subdirectories are scanned as well.
        force: Accepted so the CLI flag can be passed through.
            NOTE(review): nothing in this function reads it, so it has no
            effect here -- TODO wire it up to the vector store.
    """
    # Initialize components
    processor = DocumentProcessor()
    chunker = TextChunker()
    vector_store = VectorStore()

    # Get all supported files
    supported_exts = ('.pdf', '.docx', '.html', '.md', '.txt')
    root = Path(folder_path)
    find = root.rglob if recursive else root.glob
    # Keep regular files only (a directory named "notes.md" also matches the
    # pattern) and sort so the processing order is reproducible across runs.
    files = sorted(
        path
        for ext in supported_exts
        for path in find(f'*{ext}')
        if path.is_file()
    )

    print(f"Found {len(files)} documents to index\n")

    total_chunks = 0
    successful = 0
    failed = 0

    for filepath in tqdm(files, desc="Indexing documents"):
        try:
            filepath_str = str(filepath)

            # Extract text and metadata
            doc_data = processor.extract_text(filepath_str)

            # Enhance metadata
            doc_data["metadata"] = MetadataExtractor.enhance_metadata(
                doc_data["metadata"],
                filepath_str,
                doc_data["text"],
            )

            # Chunk document
            chunks = chunker.chunk_document(doc_data, filepath_str)

            # Add to vector store
            added = vector_store.add_documents(chunks)
            total_chunks += added
            successful += 1

            # tqdm.write prints the line without corrupting the progress bar.
            tqdm.write(f"✓ {filepath.name}: {added} chunks")
        except Exception as e:
            # Deliberately broad: one bad document must not abort the batch.
            failed += 1
            tqdm.write(f"✗ {filepath.name}: {e}")

    # Print summary
    separator = '=' * 60
    print(f"\n{separator}")
    print("Indexing Complete!")
    print(separator)
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")
    print(f"📦 Total chunks: {total_chunks}")

    # Get collection stats
    stats = vector_store.get_collection_stats()
    print(f"🗄️ Collection total: {stats['total_chunks']} chunks")
    print(separator)
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate the folder,
    # then run the indexing.
    parser = argparse.ArgumentParser(description="Index documents into vector store")
    parser.add_argument("--folder", required=True, help="Folder containing documents")
    parser.add_argument("--recursive", action="store_true", help="Index subdirectories")
    parser.add_argument("--force", action="store_true", help="Re-index existing documents")
    args = parser.parse_args()

    # Require a directory: a path to a regular file passes an existence
    # check but cannot be scanned for documents.
    if not os.path.isdir(args.folder):
        print(
            f"Error: Folder '{args.folder}' does not exist or is not a directory",
            file=sys.stderr,
        )
        sys.exit(1)

    index_documents(args.folder, args.recursive, args.force)