Qdrant MCP Server

Overview Schema Related Servers Score Discussions

indexer.ts•26.7 KiB

/** * CodeIndexer - Main orchestrator for code vectorization */ import { execFile } from "node:child_process"; import { createHash } from "node:crypto"; import { promises as fs } from "node:fs"; import { extname, join, relative, resolve } from "node:path"; import { promisify } from "node:util"; import picomatch from "picomatch"; import type { EmbeddingProvider } from "../embeddings/base.js"; import { BM25SparseVectorGenerator } from "../embeddings/sparse.js"; import { normalizeRemoteUrl } from "../git/extractor.js"; import type { QdrantManager } from "../qdrant/client.js"; import { TreeSitterChunker } from "./chunker/tree-sitter-chunker.js"; import { MetadataExtractor } from "./metadata.js"; import { FileScanner } from "./scanner.js"; import { FileSynchronizer } from "./sync/synchronizer.js"; import type { ChangeStats, CodeChunk, CodeConfig, CodeSearchResult, IndexOptions, IndexStats, IndexStatus, ProgressCallback, SearchOptions, } from "./types.js"; const execFileAsync = promisify(execFile); /** Reserved ID for storing indexing metadata in the collection */ const INDEXING_METADATA_ID = "__indexing_metadata__"; export class CodeIndexer { constructor( private qdrant: QdrantManager, private embeddings: EmbeddingProvider, private config: CodeConfig, ) {} /** * Validate that a path doesn't attempt directory traversal * @throws Error if path traversal is detected */ private async validatePath(path: string): Promise<string> { const absolutePath = resolve(path); try { // Resolve the real path (follows symlinks) const realPath = await fs.realpath(absolutePath); // For now, we just ensure the path exists and is resolved // In a more restrictive environment, you could check against an allowlist return realPath; } catch (error) { // If realpath fails, the path doesn't exist yet or is invalid // For operations like indexing, we still need to accept non-existent paths // so we just return the resolved absolute path return absolutePath; } } /** * Index a codebase from scratch or force re-index */ async indexCodebase( path: string, options?: IndexOptions, progressCallback?: ProgressCallback, ): Promise<IndexStats> { const startTime = Date.now(); const stats: IndexStats = { filesScanned: 0, filesIndexed: 0, chunksCreated: 0, durationMs: 0, status: "completed", errors: [], }; const absolutePath = await this.validatePath(path); const collectionName = await this.getCollectionName(absolutePath); try { // 1. Scan files progressCallback?.({ phase: "scanning", current: 0, total: 100, percentage: 0, message: "Scanning files...", }); const scanner = new FileScanner({ supportedExtensions: options?.extensions || this.config.supportedExtensions, ignorePatterns: this.config.ignorePatterns, customIgnorePatterns: options?.ignorePatterns || this.config.customIgnorePatterns, }); await scanner.loadIgnorePatterns(absolutePath); const files = await scanner.scanDirectory(absolutePath); stats.filesScanned = files.length; if (files.length === 0) { stats.status = "completed"; stats.durationMs = Date.now() - startTime; return stats; } // 2. Create or verify collection const collectionExists = await this.qdrant.collectionExists(collectionName); if (options?.forceReindex && collectionExists) { await this.qdrant.deleteCollection(collectionName); } if (!collectionExists || options?.forceReindex) { const vectorSize = this.embeddings.getDimensions(); await this.qdrant.createCollection( collectionName, vectorSize, "Cosine", this.config.enableHybridSearch, ); } // Store "indexing in progress" marker immediately after collection is ready await this.storeIndexingMarker(collectionName, false); // 3. Process files and create chunks const chunker = new TreeSitterChunker({ chunkSize: this.config.chunkSize, chunkOverlap: this.config.chunkOverlap, maxChunkSize: this.config.chunkSize * 2, }); const metadataExtractor = new MetadataExtractor(); const allChunks: Array<{ chunk: CodeChunk; id: string }> = []; for (const [index, filePath] of files.entries()) { try { progressCallback?.({ phase: "chunking", current: index + 1, total: files.length, percentage: Math.round(((index + 1) / files.length) * 40), // 0-40% message: `Chunking file ${index + 1}/${files.length}`, }); const code = await fs.readFile(filePath, "utf-8"); // Check for secrets (basic detection) if (metadataExtractor.containsSecrets(code)) { stats.errors?.push( `Skipped ${filePath}: potential secrets detected`, ); continue; } const language = metadataExtractor.extractLanguage(filePath); const chunks = await chunker.chunk(code, filePath, language); // Apply chunk limits if configured const chunksToAdd = this.config.maxChunksPerFile ? chunks.slice(0, this.config.maxChunksPerFile) : chunks; for (const chunk of chunksToAdd) { const id = metadataExtractor.generateChunkId(chunk); allChunks.push({ chunk, id }); // Check total chunk limit if ( this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks ) { break; } } stats.filesIndexed++; // Check total chunk limit if ( this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks ) { break; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); stats.errors?.push(`Failed to process ${filePath}: ${errorMessage}`); } } stats.chunksCreated = allChunks.length; // Save snapshot for incremental updates (even if no chunks were created) try { const synchronizer = new FileSynchronizer(absolutePath, collectionName); await synchronizer.updateSnapshot(files); } catch (error) { // Snapshot failure shouldn't fail the entire indexing const errorMessage = error instanceof Error ? error.message : String(error); console.error("Failed to save snapshot:", errorMessage); stats.errors?.push(`Snapshot save failed: ${errorMessage}`); } if (allChunks.length === 0) { // Still store completion marker even with no chunks await this.storeIndexingMarker(collectionName, true); stats.status = "completed"; stats.durationMs = Date.now() - startTime; return stats; } // 4. Generate embeddings and store in batches const batchSize = this.config.batchSize; for (let i = 0; i < allChunks.length; i += batchSize) { const batch = allChunks.slice(i, i + batchSize); progressCallback?.({ phase: "embedding", current: i + batch.length, total: allChunks.length, percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30), // 40-70% message: `Generating embeddings ${i + batch.length}/${allChunks.length}`, }); try { const texts = batch.map((b) => b.chunk.content); const embeddings = await this.embeddings.embedBatch(texts); // 5. Store to Qdrant const points = batch.map((b, idx) => ({ id: b.id, vector: embeddings[idx].embedding, payload: { content: b.chunk.content, relativePath: relative(absolutePath, b.chunk.metadata.filePath), startLine: b.chunk.startLine, endLine: b.chunk.endLine, fileExtension: extname(b.chunk.metadata.filePath), language: b.chunk.metadata.language, codebasePath: absolutePath, chunkIndex: b.chunk.metadata.chunkIndex, ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }), ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType, }), }, })); progressCallback?.({ phase: "storing", current: i + batch.length, total: allChunks.length, percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30), // 70-100% message: `Storing chunks ${i + batch.length}/${allChunks.length}`, }); if (this.config.enableHybridSearch) { // Generate sparse vectors for hybrid search const sparseGenerator = new BM25SparseVectorGenerator(); const hybridPoints = batch.map((b, idx) => ({ id: b.id, vector: embeddings[idx].embedding, sparseVector: sparseGenerator.generate(b.chunk.content), payload: { content: b.chunk.content, relativePath: relative(absolutePath, b.chunk.metadata.filePath), startLine: b.chunk.startLine, endLine: b.chunk.endLine, fileExtension: extname(b.chunk.metadata.filePath), language: b.chunk.metadata.language, codebasePath: absolutePath, chunkIndex: b.chunk.metadata.chunkIndex, ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }), ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType, }), }, })); await this.qdrant.addPointsWithSparse(collectionName, hybridPoints); } else { await this.qdrant.addPoints(collectionName, points); } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); stats.errors?.push( `Failed to process batch at index ${i}: ${errorMessage}`, ); stats.status = "partial"; } } // Store completion marker to indicate indexing is complete await this.storeIndexingMarker(collectionName, true); stats.durationMs = Date.now() - startTime; return stats; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); stats.status = "failed"; stats.errors?.push(`Indexing failed: ${errorMessage}`); stats.durationMs = Date.now() - startTime; return stats; } } /** * Store an indexing status marker in the collection. * Called at the start of indexing with complete=false, and at the end with complete=true. */ private async storeIndexingMarker( collectionName: string, complete: boolean, ): Promise<void> { try { // Create a dummy vector of zeros (required by Qdrant) const vectorSize = this.embeddings.getDimensions(); const zeroVector = new Array(vectorSize).fill(0); // Check if collection uses hybrid mode const collectionInfo = await this.qdrant.getCollectionInfo(collectionName); const payload = { _type: "indexing_metadata", indexingComplete: complete, ...(complete ? { completedAt: new Date().toISOString() } : { startedAt: new Date().toISOString() }), }; if (collectionInfo.hybridEnabled) { await this.qdrant.addPointsWithSparse(collectionName, [ { id: INDEXING_METADATA_ID, vector: zeroVector, sparseVector: { indices: [], values: [] }, payload, }, ]); } else { await this.qdrant.addPoints(collectionName, [ { id: INDEXING_METADATA_ID, vector: zeroVector, payload, }, ]); } } catch (error) { // Non-fatal: log but don't fail the indexing console.error("Failed to store indexing marker:", error); } } /** * Search code semantically */ async searchCode( path: string, query: string, options?: SearchOptions, ): Promise<CodeSearchResult[]> { const absolutePath = await this.validatePath(path); const collectionName = await this.getCollectionName(absolutePath); // Check if collection exists const exists = await this.qdrant.collectionExists(collectionName); if (!exists) { throw new Error(`Codebase not indexed: ${path}`); } // Check if collection has hybrid search enabled const collectionInfo = await this.qdrant.getCollectionInfo(collectionName); const useHybrid = (options?.useHybrid ?? this.config.enableHybridSearch) && collectionInfo.hybridEnabled; // Generate query embedding const { embedding } = await this.embeddings.embed(query); // Build filter for Qdrant (only fileTypes - pathPattern uses post-filtering) let filter: any; if (options?.fileTypes && options.fileTypes.length > 0) { filter = { must: [ { key: "fileExtension", match: { any: options.fileTypes }, }, ], }; } // Prepare pathPattern matcher for post-filtering // Qdrant doesn't support regex/glob filtering, so we filter results in JS const pathMatcher = options?.pathPattern ? picomatch(options.pathPattern, { dot: true }) : null; // When using pathPattern, fetch more results to account for filtering const fetchLimit = pathMatcher ? Math.min((options?.limit || this.config.defaultSearchLimit) * 5, 100) : options?.limit || this.config.defaultSearchLimit; // Search with hybrid or standard search let results; if (useHybrid) { const sparseGenerator = new BM25SparseVectorGenerator(); const sparseVector = sparseGenerator.generate(query); results = await this.qdrant.hybridSearch( collectionName, embedding, sparseVector, fetchLimit, filter, ); } else { results = await this.qdrant.search( collectionName, embedding, fetchLimit, filter, ); } // Apply pathPattern post-filtering if specified let filteredResults = results; if (pathMatcher) { filteredResults = results.filter((r) => { const relativePath = r.payload?.relativePath || ""; return pathMatcher(relativePath); }); } // Apply score threshold if specified if (options?.scoreThreshold) { filteredResults = filteredResults.filter( (r) => r.score >= (options.scoreThreshold || 0), ); } // Apply the requested limit after all filtering const requestedLimit = options?.limit || this.config.defaultSearchLimit; const finalResults = filteredResults.slice(0, requestedLimit); // Format results return finalResults.map((r) => ({ content: r.payload?.content || "", filePath: r.payload?.relativePath || "", startLine: r.payload?.startLine || 0, endLine: r.payload?.endLine || 0, language: r.payload?.language || "unknown", score: r.score, fileExtension: r.payload?.fileExtension || "", })); } /** * Get indexing status for a codebase */ async getIndexStatus(path: string): Promise<IndexStatus> { const absolutePath = await this.validatePath(path); const collectionName = await this.getCollectionName(absolutePath); const exists = await this.qdrant.collectionExists(collectionName); if (!exists) { return { isIndexed: false, status: "not_indexed" }; } // Check for indexing marker in Qdrant (persisted across instances) const indexingMarker = await this.qdrant.getPoint( collectionName, INDEXING_METADATA_ID, ); const info = await this.qdrant.getCollectionInfo(collectionName); // Check marker status const isComplete = indexingMarker?.payload?.indexingComplete === true; const isInProgress = indexingMarker?.payload?.indexingComplete === false; // Subtract 1 from points count if marker exists (metadata point doesn't count as a chunk) const actualChunksCount = indexingMarker ? Math.max(0, info.pointsCount - 1) : info.pointsCount; if (isInProgress) { // Indexing in progress - marker exists with indexingComplete=false return { isIndexed: false, status: "indexing", collectionName, chunksCount: actualChunksCount, }; } if (isComplete) { // Indexing completed - marker exists with indexingComplete=true return { isIndexed: true, status: "indexed", collectionName, chunksCount: actualChunksCount, lastUpdated: indexingMarker.payload?.completedAt ? new Date(indexingMarker.payload.completedAt) : undefined, }; } // Legacy collection (no marker) - check if it has content // If it has chunks, assume it's indexed (backwards compatibility) if (actualChunksCount > 0) { return { isIndexed: true, status: "indexed", collectionName, chunksCount: actualChunksCount, }; } // Collection exists but no chunks and no marker - not indexed return { isIndexed: false, status: "not_indexed", collectionName, chunksCount: 0, }; } /** * Incrementally re-index only changed files */ async reindexChanges( path: string, progressCallback?: ProgressCallback, ): Promise<ChangeStats> { const startTime = Date.now(); const stats: ChangeStats = { filesAdded: 0, filesModified: 0, filesDeleted: 0, chunksAdded: 0, chunksDeleted: 0, durationMs: 0, }; try { const absolutePath = await this.validatePath(path); const collectionName = await this.getCollectionName(absolutePath); // Check if collection exists const exists = await this.qdrant.collectionExists(collectionName); if (!exists) { throw new Error(`Codebase not indexed: ${path}`); } // Initialize synchronizer const synchronizer = new FileSynchronizer(absolutePath, collectionName); const hasSnapshot = await synchronizer.initialize(); if (!hasSnapshot) { throw new Error( "No previous snapshot found. Use index_codebase for initial indexing.", ); } // Scan current files progressCallback?.({ phase: "scanning", current: 0, total: 100, percentage: 0, message: "Scanning for changes...", }); const scanner = new FileScanner({ supportedExtensions: this.config.supportedExtensions, ignorePatterns: this.config.ignorePatterns, customIgnorePatterns: this.config.customIgnorePatterns, }); await scanner.loadIgnorePatterns(absolutePath); const currentFiles = await scanner.scanDirectory(absolutePath); // Detect changes const changes = await synchronizer.detectChanges(currentFiles); stats.filesAdded = changes.added.length; stats.filesModified = changes.modified.length; stats.filesDeleted = changes.deleted.length; if ( stats.filesAdded === 0 && stats.filesModified === 0 && stats.filesDeleted === 0 ) { stats.durationMs = Date.now() - startTime; return stats; } const chunker = new TreeSitterChunker({ chunkSize: this.config.chunkSize, chunkOverlap: this.config.chunkOverlap, maxChunkSize: this.config.chunkSize * 2, }); const metadataExtractor = new MetadataExtractor(); // Delete chunks for modified and deleted files BEFORE adding new ones const filesToDelete = [...changes.modified, ...changes.deleted]; if (filesToDelete.length > 0) { progressCallback?.({ phase: "scanning", current: 0, total: filesToDelete.length, percentage: 5, message: `Deleting old chunks for ${filesToDelete.length} files...`, }); for (const relativePath of filesToDelete) { try { const filter = { must: [{ key: "relativePath", match: { value: relativePath } }], }; await this.qdrant.deletePointsByFilter(collectionName, filter); } catch (error) { // Log but don't fail - file might not have any chunks console.error( `Failed to delete chunks for ${relativePath}:`, error, ); } } } const filesToIndex = [...changes.added, ...changes.modified]; const allChunks: Array<{ chunk: CodeChunk; id: string }> = []; for (const [index, filePath] of filesToIndex.entries()) { try { progressCallback?.({ phase: "chunking", current: index + 1, total: filesToIndex.length, percentage: Math.round(((index + 1) / filesToIndex.length) * 40), message: `Processing file ${index + 1}/${filesToIndex.length}`, }); const absoluteFilePath = join(absolutePath, filePath); const code = await fs.readFile(absoluteFilePath, "utf-8"); // Check for secrets if (metadataExtractor.containsSecrets(code)) { continue; } const language = metadataExtractor.extractLanguage(absoluteFilePath); const chunks = await chunker.chunk(code, absoluteFilePath, language); for (const chunk of chunks) { const id = metadataExtractor.generateChunkId(chunk); allChunks.push({ chunk, id }); } } catch (error) { console.error(`Failed to process ${filePath}:`, error); } } stats.chunksAdded = allChunks.length; // Generate embeddings and store in batches const batchSize = this.config.batchSize; for (let i = 0; i < allChunks.length; i += batchSize) { const batch = allChunks.slice(i, i + batchSize); progressCallback?.({ phase: "embedding", current: i + batch.length, total: allChunks.length, percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30), message: `Generating embeddings ${i + batch.length}/${allChunks.length}`, }); const texts = batch.map((b) => b.chunk.content); const embeddings = await this.embeddings.embedBatch(texts); const points = batch.map((b, idx) => ({ id: b.id, vector: embeddings[idx].embedding, payload: { content: b.chunk.content, relativePath: relative(absolutePath, b.chunk.metadata.filePath), startLine: b.chunk.startLine, endLine: b.chunk.endLine, fileExtension: extname(b.chunk.metadata.filePath), language: b.chunk.metadata.language, codebasePath: absolutePath, chunkIndex: b.chunk.metadata.chunkIndex, ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }), ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType, }), }, })); progressCallback?.({ phase: "storing", current: i + batch.length, total: allChunks.length, percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30), message: `Storing chunks ${i + batch.length}/${allChunks.length}`, }); if (this.config.enableHybridSearch) { const sparseGenerator = new BM25SparseVectorGenerator(); const hybridPoints = points.map((point, idx) => ({ ...point, sparseVector: sparseGenerator.generate( allChunks[i + idx].chunk.content, ), })); await this.qdrant.addPointsWithSparse(collectionName, hybridPoints); } else { await this.qdrant.addPoints(collectionName, points); } } // Update snapshot await synchronizer.updateSnapshot(currentFiles); stats.durationMs = Date.now() - startTime; return stats; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); throw new Error(`Incremental re-indexing failed: ${errorMessage}`); } } /** * Clear all indexed data for a codebase */ async clearIndex(path: string): Promise<void> { const absolutePath = await this.validatePath(path); const collectionName = await this.getCollectionName(absolutePath); const exists = await this.qdrant.collectionExists(collectionName); if (exists) { await this.qdrant.deleteCollection(collectionName); } // Also delete snapshot try { const synchronizer = new FileSynchronizer(absolutePath, collectionName); await synchronizer.deleteSnapshot(); } catch (_error) { // Ignore snapshot deletion errors } } /** * Generate deterministic collection name from codebase path. * Uses git remote URL for consistent naming across machines, with fallback to directory name. */ private async getCollectionName(path: string): Promise<string> { const absolutePath = resolve(path); // Try git remote URL for consistent naming // Check if THIS directory is the git root (not just inside a git repo) try { // Clear git environment variables that may be set during pre-commit hooks // These variables cause git commands to use the wrong repository const cleanEnv = { ...process.env }; delete cleanEnv.GIT_DIR; delete cleanEnv.GIT_WORK_TREE; delete cleanEnv.GIT_INDEX_FILE; const { stdout: gitRootResult } = await execFileAsync( "git", ["rev-parse", "--show-toplevel"], { cwd: absolutePath, env: cleanEnv }, ); const gitRoot = gitRootResult.trim(); // Only use git remote if this path IS the git root if (gitRoot === absolutePath) { const { stdout } = await execFileAsync( "git", ["remote", "get-url", "origin"], { cwd: absolutePath, env: cleanEnv }, ); const normalized = normalizeRemoteUrl(stdout.trim()); if (normalized) { const hash = createHash("md5").update(normalized).digest("hex"); return `code_${hash.substring(0, 8)}`; } } } catch { // Not a git repo or no remote } // Fallback: full absolute path (consistent with original behavior) const hash = createHash("md5").update(absolutePath).digest("hex"); return `code_${hash.substring(0, 8)}`; } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mhalder/qdrant-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

indexer.ts•26.7 KiB