Skip to main content
Glama

DocuMCP

by YannickTM
IndexDirTool.ts (12.9 kB)
import path from "path"; import chalk from "chalk"; import * as filesystem from "../services/filesystem.js"; import { createEmbedding, getEmbeddingDimension, } from "../services/embeddings.js"; import { logger } from "../services/logger.js"; import { collectionExists, createCollection, createPoint, upsertPoints, } from "../services/vectordb.js"; import { createSmartChunks } from "../helper/chunk.js"; /** * Result interface for indexed directory */ interface IndexDirectoryResult { success: boolean; dirPath: string; recursive: boolean; processedFiles: number; totalChunks: number; totalEmbeddingsGenerated: number; totalErrors: number; fileResults: any[]; embeddingDimension: number; filteredExtensions: string[] | "all"; collectionName: string; includeHidden: boolean; message: string; } /** * Tool for recursively indexing all files in a directory */ class IndexDirTool { /** * Process a single file and store its chunks in VectorDB */ private async processFile( filePath: string, collectionName: string, chunkSize: number, chunkOverlap: number, ): Promise<any> { try { // Read file content const readResult = await filesystem.readFile(filePath, "utf-8"); if (!readResult.success) { return { filePath, error: readResult.message, success: false, }; } if (!readResult.data || !readResult.data.content) { logger.warn("File content is empty or not found"); readResult.data = { content: "File content is empty or not found", metadata: { size: 0, created: new Date(), modified: new Date(), accessed: new Date(), extension: path.extname(filePath), filename: path.basename(filePath), directory: path.dirname(filePath), }, }; } const { content, metadata: fileMetadata } = readResult.data; // Create chunks using smart chunking based on file type const fileExtension = path.extname(filePath); const chunks = createSmartChunks(content, fileExtension, { chunkSize, overlapSize: chunkOverlap, paragraphsPerChunk: Math.floor(chunkSize / 150), // Rough estimate: ~150 chars per paragraph overlapParagraphs: 
Math.max(1, Math.floor(chunkOverlap / 150)), }); if (chunks.length === 0) { return { filePath, totalChunks: 0, embeddingsGenerated: 0, embeddingErrors: 0, storedInDatabase: false, sizeInBytes: Buffer.byteLength(content, "utf-8"), message: "File is empty or contains only whitespace", }; } /*logger.info( chalk.gray( `Processing: ${path.basename(filePath)} (${chunks.length} chunks)` ) );*/ // Generate embeddings for chunks const points = []; let successCount = 0; let errorCount = 0; for (let i = 0; i < chunks.length; i++) { try { const embeddingResult = await createEmbedding(chunks[i]); if (embeddingResult.error) { errorCount++; continue; } // Create point with metadata // Use a numeric ID as required by Qdrant const pointId = Date.now() + i; // Using timestamp + index to ensure uniqueness const point = createPoint(pointId, embeddingResult.embedding, { // Store the original filename-based ID in the payload for reference fileNameId: `${path.basename(filePath).replace(/[^a-z0-9]/gi, "_")}-${i}`, filePath, chunkIndex: i, content: chunks[i], startPosition: i * (chunkSize - chunkOverlap), fileType: path.extname(filePath).slice(1), filename: path.basename(filePath), extension: path.extname(filePath), directory: path.dirname(filePath), ...fileMetadata, }); points.push(point); successCount++; } catch (error) { errorCount++; logger.warn( chalk.yellow( `Warning: Error generating embedding for chunk ${i} in ${filePath}: ${(error as Error).message}`, ), ); } } // Store points in VectorDB const storedInDatabase = points.length > 0 ? 
await upsertPoints(collectionName, points) : false; return { filePath, totalChunks: chunks.length, embeddingsGenerated: successCount, embeddingErrors: errorCount, storedInDatabase, sizeInBytes: Buffer.byteLength(content, "utf-8"), }; } catch (error) { return { filePath, error: (error as Error).message, success: false, }; } } /** * Indexes all files in a directory for RAG retrieval */ async indexDirectory( dirPath: string, recursive: boolean = true, fileExtensions?: string[], includeHidden: boolean = false, chunkSize: number = 512, chunkOverlap: number = 50, collectionName: string = "codebase", ): Promise<IndexDirectoryResult> { try { const absolutePath = filesystem.resolvePath(dirPath); // Check if directory exists const dirExists = await filesystem.directoryExists(absolutePath); if (!dirExists) { throw new Error(`Directory not found: ${absolutePath}`); } // Get all files in the directory const readResult = await filesystem.readDirectory(absolutePath, { recursive, includeHidden, extensions: fileExtensions || [], }); if (!readResult.success) { throw new Error(readResult.message); } if (!readResult.data || !Array.isArray(readResult.data.entries)) { throw new Error("Invalid directory listing result"); } // Filter to only include files (not directories) const files = readResult.data.entries .filter((entry) => entry.type === "file") .map((entry) => path.join(absolutePath, entry.path)); if (files.length === 0) { return { success: true, dirPath: absolutePath, recursive, processedFiles: 0, totalChunks: 0, totalEmbeddingsGenerated: 0, totalErrors: 0, fileResults: [], embeddingDimension: 0, filteredExtensions: fileExtensions || "all", collectionName, includeHidden, message: `No files found to index in directory: ${absolutePath}`, }; } /*logger.info( chalk.blue(`Found ${files.length} files to process in ${absolutePath}`) );*/ // Get embedding dimension from configuration const embeddingDimension = getEmbeddingDimension(); // Ensure collection exists if (!(await 
collectionExists(collectionName))) { const created = await createCollection( collectionName, embeddingDimension, ); if (!created) { throw new Error(`Failed to create collection ${collectionName}`); } } // Process files const fileResults = []; let totalChunks = 0; let totalEmbeddingsGenerated = 0; let totalErrors = 0; for (const filePath of files) { const result = await this.processFile( filePath, collectionName, chunkSize, chunkOverlap, ); fileResults.push(result); if (result.totalChunks) totalChunks += result.totalChunks; if (result.embeddingsGenerated) totalEmbeddingsGenerated += result.embeddingsGenerated; if (result.embeddingErrors) totalErrors += result.embeddingErrors; if (result.error) totalErrors++; } return { success: true, dirPath: absolutePath, recursive, processedFiles: files.length, totalChunks, totalEmbeddingsGenerated, totalErrors, fileResults: fileResults.length <= 10 ? fileResults : [`${fileResults.length} files processed`], embeddingDimension, filteredExtensions: fileExtensions || "all", collectionName, includeHidden, message: `Directory indexed successfully: ${files.length} files with ${totalChunks} chunks and ${totalEmbeddingsGenerated} embeddings`, }; } catch (error) { logger.error( chalk.red(`Error indexing directory ${dirPath}:`), error as Error, ); throw new Error(`Failed to index directory: ${(error as Error).message}`); } } /** * Process index directory request - main entry point for the tool */ processIndexDirectory(input: any) { try { const { dirPath, recursive = true, fileExtensions, includeHidden = false, chunkSize = 512, chunkOverlap = 50, collectionName = "codebase", } = input; if (!dirPath || typeof dirPath !== "string") { throw new Error("Invalid dirPath: must be a string"); } // Log formatted information logger.info(` ${chalk.blue("🔍 Indexing Directory:")} ${dirPath} ${chalk.gray("├─")} Recursive: ${recursive ? chalk.green("✓") : chalk.red("✗")} ${chalk.gray("├─")} Include Hidden: ${ includeHidden ? 
chalk.green("✓") : chalk.red("✗") } ${chalk.gray("├─")} Chunk Size: ${chalk.yellow(chunkSize)} ${chalk.gray("├─")} Chunk Overlap: ${chalk.yellow(chunkOverlap)} ${chalk.gray("├─")} Collection: ${chalk.yellow(collectionName)} ${chalk.gray("└─")} Extensions: ${ fileExtensions ? chalk.yellow(fileExtensions.join(", ")) : chalk.gray("all") } `); // Execute the directory indexing operation return this.indexDirectory( dirPath, recursive, fileExtensions, includeHidden, chunkSize, chunkOverlap, collectionName, ) .then((result) => ({ content: [ { type: "text", text: JSON.stringify(result, null, 2), }, ], })) .catch((error) => ({ content: [ { type: "text", text: JSON.stringify( { error: error instanceof Error ? error.message : String(error), status: "failed", }, null, 2, ), }, ], isError: true, })); } catch (error) { return { content: [ { type: "text", text: JSON.stringify( { error: error instanceof Error ? error.message : String(error), status: "failed", }, null, 2, ), }, ], isError: true, }; } } } // Tool definition with improved description const INDEX_DIR_TOOL = { name: "index_directory", description: `Index all files in a directory for retrieval-augmented generation (RAG). Recursively processes files in a directory, chunks their content, and stores in the vector database. Key features: - Recursive directory processing - File type filtering by extension - Automatic chunking with configurable parameters - Progress tracking and error reporting - Hidden file inclusion control Use when you need to: - Index an entire codebase or documentation - Build a comprehensive knowledge base - Prepare multiple files for semantic search Parameters explained: - dirPath: Path to the directory to index (absolute or relative to project root) - recursive: Whether to recursively process subdirectories (default: true) - fileExtensions: Optional array of file extensions to include (e.g. 
[".js", ".ts"]) - includeHidden: Whether to include hidden files and directories (default: false)`, inputSchema: { type: "object", properties: { dirPath: { type: "string", description: "Directory path to index (absolute or relative to project root)", }, recursive: { type: "boolean", description: "Whether to recursively index subdirectories (default: true)", default: true, }, fileExtensions: { type: "array", items: { type: "string" }, description: 'Array of file extensions to include (e.g., [".js", ".ts"]). Indexes all files if omitted.', }, includeHidden: { type: "boolean", description: "Whether to include hidden files and directories (default: false)", default: false, }, }, required: ["dirPath"], }, }; export { IndexDirTool, INDEX_DIR_TOOL };

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/YannickTM/docu-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.