index_codebase.py
#!/usr/bin/env python3
"""Command-line tool for indexing a Python codebase."""

import sys
import argparse
import logging
from pathlib import Path

# Add the parent directory to the path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from chunking.multi_language_chunker import MultiLanguageChunker
from embeddings.embedder import CodeEmbedder
from search.indexer import CodeIndexManager


def setup_logging(verbose: bool = False):
    """Setup logging configuration."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )


def main():
    parser = argparse.ArgumentParser(
        description="Index a Python codebase for semantic search"
    )
    parser.add_argument(
        "directory",
        help="Directory containing Python files to index"
    )
    parser.add_argument(
        "--storage-dir",
        default=str(Path.home() / ".claude_code_search"),
        help="Directory to store index and embeddings (default: ~/.claude_code_search)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=8,
        help="Batch size for embedding generation (default: 8)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear existing index before indexing"
    )

    args = parser.parse_args()

    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    # Validate directory
    directory_path = Path(args.directory).resolve()
    if not directory_path.exists():
        logger.error(f"Directory does not exist: {directory_path}")
        sys.exit(1)

    if not directory_path.is_dir():
        logger.error(f"Path is not a directory: {directory_path}")
        sys.exit(1)

    # Setup storage
    storage_dir = Path(args.storage_dir)
    storage_dir.mkdir(parents=True, exist_ok=True)

    try:
        logger.info(f"Indexing directory: {directory_path}")
        logger.info(f"Storage directory: {storage_dir}")

        # Initialize components
        logger.info("Initializing components...")
        chunker = MultiLanguageChunker(str(directory_path))

        # Initialize embedder with cache in storage directory
        models_dir = storage_dir / "models"
        models_dir.mkdir(exist_ok=True)
        embedder = CodeEmbedder(cache_dir=str(models_dir))

        # Initialize index manager
        index_dir = storage_dir / "index"
        index_manager = CodeIndexManager(str(index_dir))

        # Clear existing index if requested
        if args.clear:
            logger.info("Clearing existing index...")
            index_manager.clear_index()

        # Chunk the codebase
        logger.info("Parsing and chunking Python files...")
        chunks = chunker.chunk_directory()

        if not chunks:
            logger.error("No Python files found or no chunks extracted")
            sys.exit(1)

        logger.info(f"Generated {len(chunks)} chunks from Python files")

        # Display some statistics
        chunk_types = {}
        file_count = {}
        for chunk in chunks:
            # Count chunk types
            chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
            # Count files
            file_count[chunk.relative_path] = file_count.get(chunk.relative_path, 0) + 1

        logger.info(f"Chunk types: {dict(chunk_types)}")
        logger.info(f"Files processed: {len(file_count)}")

        # Generate embeddings
        logger.info("Generating embeddings (this may take a while)...")
        embedding_results = embedder.embed_chunks(chunks, batch_size=args.batch_size)
        logger.info(f"Generated {len(embedding_results)} embeddings")

        # Add to index
        logger.info("Building search index...")
        index_manager.add_embeddings(embedding_results)

        # Save index
        logger.info("Saving index to disk...")
        index_manager.save_index()

        # Display final statistics
        stats = index_manager.get_stats()
        model_info = embedder.get_model_info()

        logger.info("=" * 50)
        logger.info("INDEXING COMPLETED SUCCESSFULLY")
        logger.info("=" * 50)
        logger.info(f"Total chunks indexed: {stats['total_chunks']}")
        logger.info(f"Files processed: {stats['files_indexed']}")
        logger.info(f"Embedding dimension: {stats['embedding_dimension']}")
        logger.info(f"Index type: {stats['index_type']}")
        logger.info(f"Model: {model_info['model_name']}")

        if stats.get('chunk_types'):
            logger.info("\nChunk type distribution:")
            for chunk_type, count in stats['chunk_types'].items():
                logger.info(f"  {chunk_type}: {count}")

        if stats.get('top_tags'):
            logger.info("\nTop semantic tags:")
            for tag, count in list(stats['top_tags'].items())[:10]:
                logger.info(f"  {tag}: {count}")

        logger.info(f"\nStorage location: {storage_dir}")
        logger.info("\nYou can now use the MCP server for Claude Code integration:")
        logger.info(f"  python {Path(__file__).parent.parent / 'mcp_server' / 'server.py'}")

    except KeyboardInterrupt:
        logger.info("\nIndexing interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Indexing failed: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()
