#!/usr/bin/env python
"""
Store Code Component Relationships in Vector Database
This script analyzes the codebase to extract relationships between components
and stores them in the vector database for use in build verification.
"""
import os
import sys
import json
import logging
import asyncio
import argparse
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
import uuid
# Add the project root to the Python path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.mcp_codebase_insight.core.vector_store import VectorStore
from src.mcp_codebase_insight.core.embeddings import SentenceTransformerEmbedding
from qdrant_client.http import models as rest
# Configure logging (create the log directory up front so the FileHandler does not
# fail when this module is imported, before main() runs)
Path('logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(Path('logs/code_relationships.log'))
    ]
)
logger = logging.getLogger('code_relationships')
class CodeRelationshipAnalyzer:
"""Code relationship analyzer for storing component relationships in vector database."""
    def __init__(self, config_path: Optional[str] = None):
"""Initialize the code relationship analyzer.
Args:
config_path: Path to configuration file (optional)
"""
self.config = self._load_config(config_path)
self.vector_store = None
self.embedder = None
self.dependency_map = {}
self.critical_components = set()
self.source_files = []
    def _load_config(self, config_path: Optional[str]) -> Dict[str, Any]:
"""Load configuration from file or environment variables.
Args:
config_path: Path to configuration file
Returns:
Configuration dictionary
"""
config = {
'qdrant_url': os.environ.get('QDRANT_URL', 'http://localhost:6333'),
'qdrant_api_key': os.environ.get('QDRANT_API_KEY', ''),
'collection_name': os.environ.get('COLLECTION_NAME', 'mcp-codebase-insight'),
'embedding_model': os.environ.get('EMBEDDING_MODEL', 'sentence-transformers/all-MiniLM-L6-v2'),
'source_dirs': ['src'],
'exclude_dirs': ['__pycache__', '.git', '.venv', 'test_env', 'dist', 'build'],
'critical_modules': [
'mcp_codebase_insight.core.vector_store',
'mcp_codebase_insight.core.knowledge',
'mcp_codebase_insight.server'
]
}
# Override with config file if provided
if config_path:
try:
with open(config_path, 'r') as f:
file_config = json.load(f)
config.update(file_config)
except Exception as e:
logger.error(f"Failed to load config from {config_path}: {e}")
return config
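    # Illustrative shape of an overriding config file (every key is optional and
    # falls back to the defaults above; the values shown are examples, not project
    # requirements):
    #
    #   {
    #       "qdrant_url": "http://localhost:6333",
    #       "collection_name": "mcp-codebase-insight",
    #       "source_dirs": ["src"],
    #       "exclude_dirs": ["__pycache__", ".git"],
    #       "critical_modules": ["mcp_codebase_insight.server"]
    #   }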
async def initialize(self):
"""Initialize the analyzer."""
logger.info("Initializing code relationship analyzer...")
# Initialize embedder
logger.info("Initializing embedder...")
self.embedder = SentenceTransformerEmbedding(model_name=self.config['embedding_model'])
await self.embedder.initialize()
# Initialize vector store
logger.info(f"Connecting to vector store at {self.config['qdrant_url']}...")
self.vector_store = VectorStore(
url=self.config['qdrant_url'],
embedder=self.embedder,
collection_name=self.config['collection_name'],
api_key=self.config.get('qdrant_api_key'),
vector_name="default" # Specify a vector name for the collection
)
await self.vector_store.initialize()
# Set critical components
self.critical_components = set(self.config.get('critical_modules', []))
logger.info("Code relationship analyzer initialized successfully")
def find_source_files(self) -> List[Path]:
"""Find all source files to analyze.
Returns:
List of source file paths
"""
logger.info("Finding source files...")
source_files = []
source_dirs = [Path(dir_name) for dir_name in self.config['source_dirs']]
exclude_dirs = self.config['exclude_dirs']
for source_dir in source_dirs:
if not source_dir.exists():
logger.warning(f"Source directory {source_dir} does not exist")
continue
for root, dirs, files in os.walk(source_dir):
# Skip excluded directories
dirs[:] = [d for d in dirs if d not in exclude_dirs]
for file in files:
if file.endswith('.py'):
source_files.append(Path(root) / file)
logger.info(f"Found {len(source_files)} source files")
self.source_files = source_files
return source_files
def analyze_file_dependencies(self, file_path: Path) -> Dict[str, List[str]]:
"""Analyze dependencies for a single file.
Args:
file_path: Path to the file to analyze
Returns:
            Single-entry dictionary mapping the file's module name to the modules it imports
"""
dependencies = []
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# Extract imports
lines = content.split('\n')
for line in lines:
line = line.strip()
# Skip comments
if line.startswith('#'):
continue
                # Handle import statements
                if line.startswith('import '):
                    # "import module" or "import module as alias"
                    import_part = line[len('import '):].strip()
                    if ' as ' in import_part:
                        import_part = import_part.split(' as ')[0].strip()
                    dependencies.append(import_part)
                elif line.startswith('from ') and ' import ' in line:
                    # "from module import something"
                    from_part = line[len('from '):].split(' import ')[0].strip()
                    dependencies.append(from_part)
            # Convert the file path to a dotted module name (strip the .py suffix
            # first so it is only removed at the end of the path)
            module_name = str(file_path.with_suffix('')).replace('/', '.').replace('\\', '.')
for source_dir in self.config['source_dirs']:
prefix = f"{source_dir}."
if module_name.startswith(prefix):
module_name = module_name[len(prefix):]
return {module_name: dependencies}
except Exception as e:
logger.error(f"Error analyzing file {file_path}: {e}")
return {}
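    # Example of the returned mapping for a hypothetical file
    # src/mcp_codebase_insight/core/knowledge.py (module and import names are
    # illustrative only):
    #
    #   {'mcp_codebase_insight.core.knowledge': ['os', 'json', 'typing', ...]}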
def analyze_all_dependencies(self) -> Dict[str, List[str]]:
"""Analyze dependencies for all source files.
Returns:
Dictionary mapping module names to lists of dependencies
"""
logger.info("Analyzing dependencies for all source files...")
if not self.source_files:
self.find_source_files()
dependency_map = {}
for file_path in self.source_files:
file_dependencies = self.analyze_file_dependencies(file_path)
dependency_map.update(file_dependencies)
logger.info(f"Analyzed dependencies for {len(dependency_map)} modules")
self.dependency_map = dependency_map
return dependency_map
def identify_critical_components(self) -> Set[str]:
"""Identify critical components in the codebase.
Returns:
Set of critical component names
"""
logger.info("Identifying critical components...")
# Start with configured critical modules
critical_components = set(self.critical_components)
# Add modules with many dependents
if self.dependency_map:
# Count how many times each module is a dependency
dependent_count = {}
for module, dependencies in self.dependency_map.items():
for dependency in dependencies:
if dependency in dependent_count:
dependent_count[dependency] += 1
else:
dependent_count[dependency] = 1
# Add modules with more than 3 dependents to critical components
for module, count in dependent_count.items():
if count > 3:
critical_components.add(module)
logger.info(f"Identified {len(critical_components)} critical components")
self.critical_components = critical_components
return critical_components
async def store_in_vector_database(self):
"""Store code relationships in vector database."""
try:
# Store dependency map
dependency_text = json.dumps({
'type': 'dependency_map',
'dependencies': self.dependency_map
})
dependency_vector = await self.vector_store.embedder.embed(dependency_text)
            dependency_data = {
                'id': str(uuid.uuid4()),
                'vector': dependency_vector,
                'payload': {
                    'type': 'dependency_map',
                    'timestamp': datetime.now().isoformat(),
                    'module_count': len(self.dependency_map),
                    # Store the map itself so it can be retrieved during build verification
                    'dependencies': self.dependency_map
                }
            }
# Store critical components
critical_text = json.dumps({
'type': 'critical_components',
'components': list(self.critical_components)
})
critical_vector = await self.vector_store.embedder.embed(critical_text)
            critical_data = {
                'id': str(uuid.uuid4()),
                'vector': critical_vector,
                'payload': {
                    'type': 'critical_components',
                    'timestamp': datetime.now().isoformat(),
                    'component_count': len(self.critical_components),
                    # Store the component list itself so it can be retrieved later
                    'components': list(self.critical_components)
                }
            }
# Store build verification criteria
criteria_text = json.dumps({
'type': 'build_criteria',
'critical_modules': list(self.critical_components),
'min_test_coverage': 80.0,
'max_allowed_failures': 0
})
criteria_vector = await self.vector_store.embedder.embed(criteria_text)
            criteria_data = {
                'id': str(uuid.uuid4()),
                'vector': criteria_vector,
                'payload': {
                    'type': 'build_criteria',
                    'timestamp': datetime.now().isoformat(),
                    # Store the criteria themselves so they can be retrieved later
                    'critical_modules': list(self.critical_components),
                    'min_test_coverage': 80.0,
                    'max_allowed_failures': 0
                }
            }
# Store all data points
data_points = [dependency_data, critical_data, criteria_data]
            self.vector_store.client.upsert(
                collection_name=self.vector_store.collection_name,
                points=[rest.PointStruct(
                    id=data['id'],
                    # Named-vector form: map the collection's vector name to the embedding
                    vector={self.vector_store.vector_name: data['vector']},
                    payload=data['payload']
                ) for data in data_points]
            )
logger.info("Successfully stored code relationships in vector database")
except Exception as e:
logger.error(f"Error storing in vector database: {e}")
raise
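    # Sketch of how a build-verification consumer might read these points back,
    # assuming the standard qdrant-client scroll API and the payload "type" field
    # written above (illustrative, not part of this script):
    #
    #   points, _ = self.vector_store.client.scroll(
    #       collection_name=self.vector_store.collection_name,
    #       scroll_filter=rest.Filter(must=[
    #           rest.FieldCondition(key="type", match=rest.MatchValue(value="build_criteria"))
    #       ]),
    #       limit=1
    #   )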
async def analyze_and_store(self):
"""Analyze code relationships and store them in the vector database."""
try:
# Find source files
self.find_source_files()
# Analyze dependencies
self.analyze_all_dependencies()
# Identify critical components
self.identify_critical_components()
# Store in vector database
await self.store_in_vector_database()
logger.info("Analysis and storage completed successfully")
return True
except Exception as e:
logger.error(f"Error analyzing and storing code relationships: {e}")
return False
async def cleanup(self):
"""Clean up resources."""
if self.vector_store:
await self.vector_store.cleanup()
await self.vector_store.close()
async def main():
"""Main function."""
parser = argparse.ArgumentParser(description="Code Relationship Analyzer")
parser.add_argument("--config", help="Path to configuration file")
args = parser.parse_args()
# Create logs directory if it doesn't exist
os.makedirs("logs", exist_ok=True)
analyzer = CodeRelationshipAnalyzer(args.config)
try:
await analyzer.initialize()
success = await analyzer.analyze_and_store()
if success:
logger.info("Code relationship analysis completed successfully")
return 0
else:
logger.error("Code relationship analysis failed")
return 1
except Exception as e:
logger.error(f"Error in code relationship analysis: {e}")
return 1
finally:
await analyzer.cleanup()
if __name__ == "__main__":
sys.exit(asyncio.run(main()))