Skip to main content
Glama
vrppaul
by vrppaul

index_codebase

Index Python codebases for semantic search by scanning files, extracting functions and classes, and generating embeddings to enable natural language queries for finding relevant code snippets.

Instructions

Index a codebase for semantic search.

Scans Python files, extracts functions/classes/methods, generates embeddings, and stores them for fast semantic search.

Use force=True to re-index everything even if files haven't changed. Otherwise, only new and modified files are indexed (incremental).

Args: project_path: Absolute path to the project root directory. force: If True, re-index all files regardless of changes.

Returns: Statistics about the indexing operation.

Input Schema

Input schema fields:

| Name         | Required | Description                                        | Default |
|--------------|----------|----------------------------------------------------|---------|
| project_path | Yes      | Absolute path to the project root directory.       | —       |
| force        | No       | If True, re-index all files regardless of changes. | False   |

Implementation Reference

  • The main handler function for index_codebase tool - decorated with @mcp.tool() for registration, validates project path, creates IndexService via container, executes indexing with progress reporting, and returns IndexCodebaseResponse with statistics.
    @mcp.tool()
    @profile_async("index_codebase")
    async def index_codebase(
        project_path: str,
        ctx: Context[ServerSession, None],
        force: bool = False,
    ) -> IndexCodebaseResponse | ErrorResponse:
        """Index a codebase for semantic search.
    
        Scans Python files, extracts functions/classes/methods, generates embeddings,
        and stores them for fast semantic search.
    
        Use force=True to re-index everything even if files haven't changed.
        Otherwise, only new and modified files are indexed (incremental).
    
        Args:
            project_path: Absolute path to the project root directory.
            force: If True, re-index all files regardless of changes.
    
        Returns:
            Statistics about the indexing operation.
        """
        await ctx.info(f"Indexing: {project_path}")
    
        # Guard clause: return a structured error instead of raising when the
        # requested directory is missing.
        root = Path(project_path)
        if not root.exists():
            await ctx.warning(f"Project path does not exist: {project_path}")
            return ErrorResponse(error=f"Path does not exist: {project_path}")
    
        # Resolve a per-project IndexService from the DI container, then run the
        # pipeline, forwarding MCP progress notifications straight to the client.
        service = get_container().create_index_service(root)
        stats = await service.index(root, force=force, on_progress=ctx.report_progress)
    
        await ctx.info(
            f"Indexed {stats.files_indexed} files, {stats.chunks_indexed} chunks "
            f"in {stats.duration_seconds:.2f}s"
        )
    
        # Re-shape the internal result into the tool's public response model.
        return IndexCodebaseResponse(
            files_indexed=stats.files_indexed,
            chunks_indexed=stats.chunks_indexed,
            files_deleted=stats.files_deleted,
            duration_seconds=stats.duration_seconds,
        )
  • Response schema for index_codebase tool - defines the structure returned containing files_indexed, chunks_indexed, files_deleted, and duration_seconds fields.
    class IndexCodebaseResponse(BaseModel):
        """Response from index_codebase tool.

        Mirrors the counts produced by IndexService.index(); serialized back
        to the MCP client as the tool's structured result.
        """
    
        # Number of files (re-)indexed in this run.
        files_indexed: int
        # Number of extracted code chunks embedded and stored.
        chunks_indexed: int
        # Number of previously indexed files removed from the index.
        files_deleted: int
        # Wall-clock duration of the whole indexing operation, in seconds.
        duration_seconds: float
  • Core IndexService.index() method - orchestrates the full indexing pipeline including file scanning, change detection, chunking, embedding, storage, and cache updates with progress callbacks.
    async def index(
        self,
        project_path: Path,
        force: bool = False,
        on_progress: ProgressCallback | None = None,
    ) -> IndexResult:
        """Run the full indexing pipeline: scan, detect changes, chunk, embed.
    
        Args:
            project_path: Root directory of the project.
            force: If True, re-index all files regardless of changes.
            on_progress: Optional callback matching ctx.report_progress(progress, total, message).
    
        Returns:
            IndexResult with counts and total duration.
        """
        started_at = time.perf_counter()
    
        def _elapsed() -> float:
            # Wall-clock seconds since the pipeline started, rounded for reporting.
            return round(time.perf_counter() - started_at, 3)
    
        async def _report(pct: float, msg: str) -> None:
            # Progress is always expressed as pct out of a fixed total of 100.
            if on_progress is None:
                return
            await on_progress(pct, 100, msg)
    
        await _report(5, "Scanning files...")
        # scan_files does blocking filesystem work; keep the event loop free.
        files = await asyncio.to_thread(self.scan_files, project_path)
    
        await _report(10, f"Found {len(files)} files, detecting changes...")
        plan = self.detect_changes(project_path, files, force=force)
    
        # Nothing new, modified, or deleted — short-circuit with zeroed stats.
        if not plan.has_work:
            return IndexResult(
                files_indexed=0,
                chunks_indexed=0,
                files_deleted=0,
                duration_seconds=_elapsed(),
            )
    
        await _report(20, f"Chunking {len(plan.files_to_index)} files...")
        chunks = await self.chunk_files(plan.files_to_index)
    
        await _report(70, "Embedding and storing...")
        await self.indexer.embed_and_store(plan, chunks)
    
        # Only touch the change cache after embed+store succeeded, so a failed
        # run leaves the cache untouched and is retried on the next invocation.
        cache = FileChangeCache(resolve_cache_dir(self.settings, project_path, self._cache_dir))
        if plan.files_to_delete:
            cache.remove_files(plan.files_to_delete)
        if plan.files_to_index:
            cache.update_files(plan.files_to_index)
    
        return IndexResult(
            files_indexed=len(plan.files_to_index),
            chunks_indexed=len(chunks),
            files_deleted=len(plan.files_to_delete),
            duration_seconds=_elapsed(),
        )
  • File scanning logic - scans for source files with supported extensions using git ls-files when available, falling back to os.walk with directory pruning and .gitignore support.
    def scan_files(self, project_path: Path) -> list[str]:
        """Scan for source files with supported extensions.
    
        Prefers `git ls-files` (fast, respects .gitignore); falls back to an
        os.walk traversal with directory pruning when git is unavailable.
    
        Args:
            project_path: Root directory to scan.
    
        Returns:
            List of absolute file paths.
        """
        root = project_path.resolve()
    
        # Fast path: delegate to git when the project is a repository. The git
        # scan may still return None (e.g. the command fails), in which case we
        # fall through to the manual walk.
        git_files = self._scan_with_git(root) if self._is_git_repo(root) else None
        if git_files is not None:
            log.debug("scanned_files_git", project=str(root), count=len(git_files))
            return git_files
    
        # Slow path: walk the filesystem ourselves.
        walked = self._scan_with_walk(root)
        log.debug("scanned_files_walk", project=str(root), count=len(walked))
        return walked
  • Dependency injection method - creates IndexService wired with cached vector store, embedder, chunker, and cache directory for efficient resource sharing across requests.
    def create_index_service(self, project_path: Path) -> IndexService:
        """Create an IndexService wired to cached store/embedder."""
        # Reuse the container's shared embedder and the per-project vector
        # store so repeated requests don't rebuild heavy resources.
        store = self.get_store(project_path)
        return IndexService(
            settings=self.settings,
            indexer=Indexer(embedder=self.embedder, store=store),
            chunker=self.create_chunker(),
            cache_dir=get_index_path(self.settings, project_path),
        )
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vrppaul/semantic-code-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server