# index_move_files.py
#!/usr/bin/env python
"""
Script to index Move files that are already in the docs directory.
"""
import os
import logging
import argparse
import numpy as np
from mcp_server.utils.document_processor import DocumentProcessor
from mcp_server.models.vector_store import FAISSVectorStore
# Configure logging
# Root-logger setup: INFO level with timestamped, per-module formatted messages.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
def index_move_files(docs_dir="docs/move_files", index_file="data/faiss_index.bin"):
    """
    Index Move files from a local directory into a FAISS vector store.

    Args:
        docs_dir: Directory containing Move files.
        index_file: Path to save the FAISS index.

    Returns:
        Number of document chunks indexed (0 if the directory is missing
        or no documents could be processed).
    """
    if not os.path.isdir(docs_dir):
        logger.error(f"Directory not found: {docs_dir}")
        return 0

    logger.info("Initializing document processor and vector store")
    doc_processor = DocumentProcessor()

    # Probe the model with a throwaway sentence to discover its embedding
    # size, so the FAISS index is created with a matching dimension.
    test_embedding = doc_processor.get_embedding("This is a test")
    embed_dim = test_embedding.shape[0]
    logger.info(f"Embedding dimension of the model: {embed_dim}")

    vector_store = FAISSVectorStore(dimension=embed_dim)

    # Load an existing index (if any) so new documents are appended to it.
    if os.path.exists(index_file):
        logger.info(f"Loading existing index from {index_file}")
        try:
            vector_store.load(index_file)
            logger.info(f"Loaded existing index with {len(vector_store.documents)} documents")
        except Exception as e:
            logger.error(f"Error loading existing index: {str(e)}")
            logger.info("Will create a new index instead")
            # Discard the possibly partially-loaded store so we really do
            # start from a fresh index, as the message above promises.
            vector_store = FAISSVectorStore(dimension=embed_dim)

    logger.info(f"Processing documents from {docs_dir}")
    documents = doc_processor.process_documents(docs_dir)
    if not documents:
        logger.warning("No documents processed. Check the file formats and content.")
        return 0

    # Sanity-check embedding dimensions. np.asarray tolerates both plain
    # lists and ndarrays, so every chunk is handled uniformly.
    sample_embedding = np.asarray(documents[0]['embedding'])
    logger.info(f"Sample document embedding dimension: {sample_embedding.shape}")
    dims = {np.asarray(doc['embedding']).shape[0] for doc in documents}
    if len(dims) > 1:
        logger.warning(f"Different embedding dimensions found: {dims}")

    logger.info(f"Indexing {len(documents)} document chunks")
    vector_store.index_documents(documents)

    # Save the index; guard against a bare filename, where
    # os.path.dirname returns '' and makedirs('') would raise.
    index_dir = os.path.dirname(index_file)
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)
    vector_store.save(index_file)
    logger.info(f"Index saved to {index_file} with {len(vector_store.documents)} total documents")
    return len(documents)
def main():
    """Entry point for command-line execution and pipx.

    Parses CLI arguments, ensures the target directories exist, runs the
    indexing step, and returns the number of document chunks indexed.
    """
    parser = argparse.ArgumentParser(description="Index Move files for the MCP Server")
    parser.add_argument("--docs-dir", default="docs/move_files",
                        help="Directory containing Move files (default: docs/move_files)")
    parser.add_argument("--index-file", default="data/faiss_index.bin",
                        help="Path to save the FAISS index (default: data/faiss_index.bin)")
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose logging")
    args = parser.parse_args()

    # --verbose promotes the root logger to DEBUG for all modules.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create directories up front. Skip makedirs when --index-file is a
    # bare filename: os.path.dirname returns '' and makedirs('') raises.
    index_dir = os.path.dirname(args.index_file)
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)
    os.makedirs(args.docs_dir, exist_ok=True)

    num_indexed = index_move_files(docs_dir=args.docs_dir, index_file=args.index_file)
    print(f"Indexed {num_indexed} document chunks from Move files")
    return num_indexed
if __name__ == "__main__":
main()