#!/usr/bin/env python
"""
Script to download Move files from GitHub and index them using the MCP server.
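
Example usage (values shown are illustrative):
    python download_move_files.py --query "use sui" --max-results 50 --token "$GITHUB_TOKEN"
    python download_move_files.py --use-api --server-url http://localhost:8000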
"""
import os
import sys
import argparse
import logging
import requests
from dotenv import load_dotenv
from mcp_server.utils.github_extractor import extract_and_index_move_files
from mcp_server.utils.document_processor import DocumentProcessor
from mcp_server.models.vector_store import FAISSVectorStore
from mcp_server.index_move_files import index_move_files
# Configure logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Load environment variables
load_dotenv()
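# Programmatic helper: performs the download + indexing flow in a single call.
# main() below implements the same steps with CLI arguments and can optionally
# delegate indexing to a running MCP server instead.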
def download_and_index_move_files(query="use sui",
output_dir="docs/move_files",
index_file="data/faiss_index.bin",
github_token=None,
use_scraping=True,
max_results=100,
merge_with_existing=True):
"""
Download Move files from GitHub and index them in FAISS.
Args:
query: GitHub search query
output_dir: Directory to save downloaded files
index_file: Path to save/load FAISS index
github_token: GitHub API token
use_scraping: Whether to use web scraping as fallback
max_results: Maximum number of files to download
merge_with_existing: Whether to merge with existing index
Returns:
Number of files indexed
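
    Example (illustrative values):
        download_and_index_move_files(query="use sui", max_results=25,
                                      github_token=os.getenv("GITHUB_TOKEN"))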
"""
# Extract Move files from GitHub
num_files, file_paths = extract_and_index_move_files(
query=query,
output_dir=output_dir,
github_token=github_token,
use_scraping=use_scraping,
max_results=max_results
)
if num_files == 0:
logger.warning("No Move files found. Check your search query and GitHub token.")
return 0
logger.info(f"Downloaded {num_files} Move files to {output_dir}")
# Initialize document processor and vector store
doc_processor = DocumentProcessor()
vector_store = FAISSVectorStore()
# Load existing index if it exists and merge_with_existing is True
if os.path.exists(index_file) and merge_with_existing:
logger.info(f"Loading existing index from {index_file}")
try:
vector_store.load(index_file)
logger.info(f"Loaded existing index with {len(vector_store.documents)} documents")
except Exception as e:
logger.error(f"Error loading existing index: {str(e)}")
logger.info("Will create a new index instead")
# Process document directory
logger.info(f"Processing documents from {output_dir}")
documents = doc_processor.process_documents(output_dir)
if not documents:
logger.warning("No documents processed. Check the file formats and content.")
return 0
# Index documents
logger.info(f"Indexing {len(documents)} document chunks")
vector_store.index_documents(documents)
# Save index
os.makedirs(os.path.dirname(index_file), exist_ok=True)
vector_store.save(index_file)
logger.info(f"Index saved to {index_file} with {len(vector_store.documents)} total documents")
return len(documents)
def check_mcp_server(server_url="http://localhost:8000"):
    """Check whether the MCP server is reachable by probing its /docs endpoint."""
    try:
        response = requests.get(f"{server_url}/docs", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
def main():
"""Main function to parse arguments and run the script"""
parser = argparse.ArgumentParser(
description="Download Move files from GitHub and index them for the MCP Server"
)
parser.add_argument(
"--query",
default="use sui",
help="GitHub search query (default: 'use sui')"
)
parser.add_argument(
"--output-dir",
default="docs/move_files",
help="Directory to save downloaded files (default: docs/move_files)"
)
parser.add_argument(
"--index-file",
default="data/faiss_index.bin",
help="Path to save/load FAISS index (default: data/faiss_index.bin)"
)
parser.add_argument(
"--token",
default=os.getenv("GITHUB_TOKEN"),
help="GitHub personal access token (default: from GITHUB_TOKEN env var)"
)
parser.add_argument(
"--no-scraping",
action="store_true",
help="Disable web scraping fallback, use only GitHub API"
)
parser.add_argument(
"--max-results",
type=int,
default=100,
help="Maximum number of files to download (default: 100)"
)
parser.add_argument(
"--new-index",
action="store_true",
help="Create a new index instead of merging with existing one"
)
parser.add_argument(
"--server-url",
default="http://localhost:8000",
help="MCP Server URL if already running (default: http://localhost:8000)"
)
parser.add_argument(
"--use-api",
action="store_true",
help="Use MCP server API for indexing if server is running"
)
parser.add_argument(
"--verbose",
action="store_true",
help="Enable verbose logging"
)
args = parser.parse_args()
# Set logging level
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
# Create data directory for index if needed
os.makedirs(os.path.dirname(args.index_file), exist_ok=True)
# Check if server is running and we should use the API
server_running = check_mcp_server(args.server_url) if args.use_api else False
# Step 1: Download files from GitHub first
num_files, file_paths = extract_and_index_move_files(
query=args.query,
output_dir=args.output_dir,
github_token=args.token,
use_scraping=not args.no_scraping,
max_results=args.max_results
)
if num_files == 0:
logger.warning("No Move files found. Check your search query and GitHub token.")
return 1
logger.info(f"Downloaded {num_files} Move files to {args.output_dir}")
# Step 2: Index the files
if server_running and args.use_api:
logger.info(f"MCP Server is running at {args.server_url}, using API for indexing")
# Call the server's index endpoint with the directory
try:
response = requests.post(
f"{args.server_url}/index",
params={"directory_path": args.output_dir}
)
            if response.status_code == 200:
                result = response.json()
                logger.info(f"API indexing successful: {result.get('message', result)}")
else:
logger.error(f"API indexing failed: {response.status_code} - {response.text}")
# Fall back to local indexing if API fails
logger.info("Falling back to local indexing...")
num_indexed = index_move_files(docs_dir=args.output_dir, index_file=args.index_file)
logger.info(f"Indexed {num_indexed} document chunks locally")
except Exception as e:
logger.error(f"Error using API for indexing: {str(e)}")
# Fall back to local indexing if API fails
logger.info("Falling back to local indexing...")
num_indexed = index_move_files(docs_dir=args.output_dir, index_file=args.index_file)
logger.info(f"Indexed {num_indexed} document chunks locally")
else:
# Do local processing
logger.info("Using local processing for indexing")
num_indexed = index_move_files(docs_dir=args.output_dir, index_file=args.index_file)
logger.info(f"Indexed {num_indexed} document chunks to the index")
    # Verify that the local index file can be loaded. When indexing was done through
    # the server API, the local index file may be missing or stale, so a failure here
    # is logged but not treated as fatal.
try:
vector_store = FAISSVectorStore()
vector_store.load(args.index_file)
logger.info(f"Successfully loaded index with {len(vector_store.documents)} documents")
except Exception as e:
logger.error(f"Error loading index after building: {str(e)}")
logger.info("Processing complete")
return 0
if __name__ == "__main__":
    sys.exit(main())