// IndexFileTool.ts
import path from "path";
import chalk from "chalk";
import * as filesystem from "../services/filesystem.js";
import {
createEmbedding,
getEmbeddingDimension,
} from "../services/embeddings.js";
import { logger } from "../services/logger.js";
import {
collectionExists,
createCollection,
createPoint,
upsertPoints,
} from "../services/vectordb.js";
import { createSmartChunks } from "../helper/chunk.js";
/**
* Result interface for indexed file
*/
interface IndexFileResult {
success: boolean;
filePath: string;
fileType: string;
totalChunks: number;
embeddingsGenerated: number;
embeddingErrors: number;
embeddingDimension: number;
storedInDatabase: boolean;
collectionName: string;
metadata: Record<string, any>;
sizeInBytes: number;
message: string;
}
/**
* Tool for indexing a file's content into the vector database
*/
class IndexFileTool {
/**
* Indexes a file for RAG retrieval
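 *
 * @param filePath - Path to the file to index (absolute or relative to the project root)
 * @param chunkSize - Target chunk size in characters (default 512)
 * @param chunkOverlap - Overlap between consecutive chunks in characters (default 50)
 * @param collectionName - Vector database collection to store the chunks in (default "codebase")
 * @returns A summary of the indexing run, including chunk and embedding counts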
*/
async indexFile(
filePath: string,
chunkSize: number = 512,
chunkOverlap: number = 50,
collectionName: string = "codebase",
): Promise<IndexFileResult> {
try {
// Read file content using filesystem functions
const readResult = await filesystem.readFile(filePath, "utf-8");
if (!readResult.success) {
throw new Error(readResult.message);
}
      if (!readResult.data || !readResult.data.content) {
        logger.warn("File content is empty or not found");
        // Fall back to an empty content string so the empty-file early return
        // below is taken instead of indexing placeholder text
        readResult.data = {
          content: "",
          metadata: {
            size: 0,
            created: new Date(),
            modified: new Date(),
            accessed: new Date(),
            extension: path.extname(filePath),
            filename: path.basename(filePath),
            directory: path.dirname(filePath),
          },
        };
      }
// Extract content and metadata
const { content, metadata: fileMetadata } = readResult.data;
const absolutePath = readResult.data.metadata?.directory
? path.join(
readResult.data.metadata.directory,
readResult.data.metadata.filename || "",
)
: filesystem.resolvePath(filePath);
// Create chunks using smart chunking based on file type
const fileExtension = path.extname(absolutePath);
const chunks = createSmartChunks(content, fileExtension, {
chunkSize,
overlapSize: chunkOverlap,
paragraphsPerChunk: Math.floor(chunkSize / 150), // Rough estimate: ~150 chars per paragraph
overlapParagraphs: Math.max(1, Math.floor(chunkOverlap / 150)),
});
// Early return if no chunks
if (chunks.length === 0) {
return {
success: true,
filePath: absolutePath,
fileType: path.extname(absolutePath).slice(1),
totalChunks: 0,
embeddingsGenerated: 0,
embeddingErrors: 0,
embeddingDimension: 0,
storedInDatabase: false,
collectionName,
metadata: fileMetadata || {},
sizeInBytes: Buffer.byteLength(content, "utf-8"),
message: "File is empty or contains only whitespace",
};
}
// Get embedding dimension from configuration
const embeddingDimension = getEmbeddingDimension();
// Ensure collection exists
if (!(await collectionExists(collectionName))) {
const created = await createCollection(
collectionName,
embeddingDimension,
);
if (!created) {
throw new Error(`Failed to create collection ${collectionName}`);
}
}
// Generate embeddings and create points
const points = [];
let successCount = 0;
let errorCount = 0;
for (let i = 0; i < chunks.length; i++) {
try {
const embeddingResult = await createEmbedding(chunks[i]);
if (embeddingResult.error) {
logger.warn(
chalk.yellow(
`Warning: Error generating embedding for chunk ${i}: ${embeddingResult.error}`,
),
);
errorCount++;
continue;
}
          // Create point with all necessary metadata
          // Qdrant accepts unsigned-integer or UUID point IDs; use a numeric one here
          const pointId = Date.now() + i; // Unique within this run, but not guaranteed across concurrent runs
const point = createPoint(pointId, embeddingResult.embedding, {
// Store the original filename-based ID in the payload for reference
fileNameId: `${path.basename(absolutePath).replace(/[^a-z0-9]/gi, "_")}-${i}`,
filePath: absolutePath,
chunkIndex: i,
content: chunks[i],
startPosition: i * (chunkSize - chunkOverlap),
fileType: path.extname(absolutePath).slice(1),
filename: path.basename(absolutePath),
extension: path.extname(absolutePath),
directory: path.dirname(absolutePath),
...fileMetadata,
});
points.push(point);
successCount++;
} catch (error) {
          logger.error(
            chalk.red(
              `Error processing chunk ${i}: ${error instanceof Error ? error.message : String(error)}`,
            ),
          );
errorCount++;
}
}
// Store points in VectorDB
const storedInDatabase =
points.length > 0 ? await upsertPoints(collectionName, points) : false;
if (storedInDatabase) {
        logger.info(
chalk.green(
`Successfully stored ${points.length} points in VectorDB`,
),
);
}
return {
success: true,
filePath: absolutePath,
fileType: path.extname(absolutePath).slice(1),
totalChunks: chunks.length,
embeddingsGenerated: successCount,
embeddingErrors: errorCount,
embeddingDimension,
storedInDatabase,
collectionName,
metadata: fileMetadata || {},
sizeInBytes: Buffer.byteLength(content, "utf-8"),
        message: `File indexed successfully (chunk size ${chunkSize}, overlap ${chunkOverlap})`,
};
} catch (error) {
      logger.error(
        chalk.red(
          `Error indexing file ${filePath}: ${error instanceof Error ? error.message : String(error)}`,
        ),
      );
throw new Error(`Failed to index file: ${(error as Error).message}`);
}
}
/**
* Process index file request - main entry point for the tool
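 * Validates the input, delegates to indexFile, and returns a result object whose
 * content array holds the JSON-serialized outcome; failures are reported with
 * isError set to true instead of being thrown.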
*/
processIndexFile(input: any) {
try {
const {
filePath,
chunkSize = 512,
chunkOverlap = 50,
collectionName = "codebase",
} = input;
if (!filePath || typeof filePath !== "string") {
throw new Error("Invalid filePath: must be a string");
}
// Log formatted information
logger.info(`
${chalk.blue("🔍 Indexing File:")} ${filePath}
${chalk.gray("├─")} Chunk Size: ${chalk.yellow(chunkSize)}
${chalk.gray("├─")} Chunk Overlap: ${chalk.yellow(chunkOverlap)}
${chalk.gray("└─")} Collection: ${chalk.yellow(collectionName)}
`);
// Execute the indexing operation
return this.indexFile(filePath, chunkSize, chunkOverlap, collectionName)
.then((result) => ({
content: [
{
type: "text",
text: JSON.stringify(result, null, 2),
},
],
}))
.catch((error) => ({
content: [
{
type: "text",
text: JSON.stringify(
{
error: error instanceof Error ? error.message : String(error),
status: "failed",
},
null,
2,
),
},
],
isError: true,
}));
} catch (error) {
return {
content: [
{
type: "text",
text: JSON.stringify(
{
error: error instanceof Error ? error.message : String(error),
status: "failed",
},
null,
2,
),
},
],
isError: true,
};
}
}
}
// Tool definition with improved description
const INDEX_FILE_TOOL = {
name: "index_file",
description: `Index a file for retrieval-augmented generation (RAG).
Processes a file into chunks and stores them in the vector database for semantic search.
Key features:
- Automatic chunking with configurable size and overlap
- Preserves file metadata for better context
- Generates embeddings using the configured provider
- Stores chunks in the vector database for fast similarity search
- Handles a wide range of text file types (read as UTF-8)
Use when you need to:
- Make file content searchable
- Prepare files for RAG operations
- Build a knowledge base
Parameters explained:
- filePath: Path to the file to index (absolute or relative to project root)
- chunkSize: Target chunk size in characters (optional, default 512)
- chunkOverlap: Overlap between consecutive chunks in characters (optional, default 50)
- collectionName: Vector database collection to store the chunks in (optional, default "codebase")`,
  inputSchema: {
    type: "object",
    properties: {
      filePath: {
        type: "string",
        description:
          "Path to the file to index (absolute or relative to project root)",
      },
      chunkSize: { type: "number", description: "Chunk size in characters (default: 512)" },
      chunkOverlap: { type: "number", description: "Chunk overlap in characters (default: 50)" },
      collectionName: { type: "string", description: 'Collection name (default: "codebase")' },
    },
    required: ["filePath"],
  },
};
export { IndexFileTool, INDEX_FILE_TOOL };
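
// Illustrative usage sketch (not part of the tool itself): how a caller might
// invoke the tool directly. The file path below is hypothetical, and in a real
// deployment the call is typically made by whatever server dispatches
// INDEX_FILE_TOOL rather than by hand.
//
//   const tool = new IndexFileTool();
//   const result = await tool.processIndexFile({
//     filePath: "src/example.ts", // hypothetical path
//     chunkSize: 512,
//     chunkOverlap: 50,
//     collectionName: "codebase",
//   });
//   // result.content[0].text contains the JSON-serialized IndexFileResult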