index.ts (26.9 kB)
#!/usr/bin/env node
/**
 * IndexFoundry-MCP: Main Server Entry Point
 *
 * A deterministic vector index factory for MCP.
 * Five-phase pipeline: Connect → Extract → Normalize → Index → Serve
 *
 * Copyright (c) 2024 vario.automation
 * Proprietary and confidential. All rights reserved.
 *
 * This source code is the property of vario.automation and is protected
 * by trade secret and copyright law. Unauthorized copying, modification,
 * distribution, or use of this software is strictly prohibited.
 */

import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";

// Tool implementations
import {
  connectUrl,
  connectSitemap,
  connectFolder,
  connectPdf,
} from "./tools/connect.js";
import {
  extractPdf,
  extractHtml,
  extractDocument,
} from "./tools/extract.js";
import {
  normalizeChunk,
  normalizeEnrich,
  normalizeDedupe,
} from "./tools/normalize.js";
import {
  indexEmbed,
  indexUpsert,
  indexBuildProfile,
} from "./tools/index.js";
import {
  serveOpenapi,
  serveStart,
  serveStop,
  serveStatus,
  serveQuery,
} from "./tools/serve.js";
import {
  runStatus,
  runList,
  runDiff,
  runCleanup,
} from "./tools/utilities.js";
import {
  classifyQuery,
  ClassifyQueryInputSchema,
} from "./tools/classify.js";
import {
  extractTables,
  ExtractTableInputSchema,
} from "./tools/tables.js";
import {
  debugQuery,
  DebugQueryInputSchema,
} from "./tools/debug.js";
import {
  projectCreate,
  projectList,
  projectGet,
  projectDelete,
  projectAddSource,
  projectBuild,
  projectQuery,
  projectExport,
  projectDeploy,
  projectServe,
  projectServeStop,
  projectServeStatus,
  initProjectManager,
} from "./tools/projects.js";

// Schemas
import {
  ConnectUrlSchema,
  ConnectSitemapSchema,
  ConnectFolderSchema,
  ConnectPdfSchema,
  ExtractPdfSchema,
  ExtractHtmlSchema,
  ExtractDocumentSchema,
  NormalizeChunkSchema,
  NormalizeEnrichSchema,
  NormalizeDedupeSchema,
  IndexEmbedSchema,
  IndexUpsertSchema,
  IndexBuildProfileSchema,
  ServeOpenapiSchema,
  ServeStartSchema,
  ServeStopSchema,
  ServeStatusSchema,
  ServeQuerySchema,
  RunStatusSchema,
  RunListSchema,
  RunDiffSchema,
  RunCleanupSchema,
} from "./schemas.js";
import {
  ProjectCreateSchema,
  ProjectListSchema,
  ProjectGetSchema,
  ProjectDeleteSchema,
  ProjectAddSourceSchema,
  ProjectBuildSchema,
  ProjectQuerySchema,
  ProjectExportSchema,
  ProjectDeploySchema,
  ProjectServeSchema,
  ProjectServeStopSchema,
  ProjectServeStatusSchema,
} from "./schemas-projects.js";

import { initRunManager } from "./run-manager.js";
import { fileURLToPath } from "url";
import * as path from "path";

// Get the directory where the MCP server is installed
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const SERVER_BASE_DIR = path.resolve(__dirname, "..");

// Initialize the MCP server
const server = new McpServer({
  name: "indexfoundry-mcp",
  version: "0.1.0",
});

// ============================================================================
// PHASE 1: CONNECT TOOLS
// ============================================================================

server.tool(
  "indexfoundry_connect_url",
  "Fetch a single URL and store raw content. Supports domain allowlisting, timeout configuration, and content validation.",
  ConnectUrlSchema.shape,
  async (args) => {
    const result = await connectUrl(args as z.infer<typeof ConnectUrlSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
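
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): the connect_url description
// above mentions "domain allowlisting". A minimal check could look like the
// function below; the actual option names and matching rules inside
// ConnectUrlSchema/connectUrl are not visible in this file, so treat this as
// an assumption rather than the implementation.
// ----------------------------------------------------------------------------
function isDomainAllowed(rawUrl: string, allowedDomains: string[]): boolean {
  // An empty allowlist is treated here as "allow everything".
  if (allowedDomains.length === 0) return true;
  try {
    const { hostname } = new URL(rawUrl);
    // Allow exact matches and subdomains of an allowed domain.
    return allowedDomains.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    );
  } catch {
    // Malformed URLs are rejected rather than fetched.
    return false;
  }
}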
server.tool(
  "indexfoundry_connect_sitemap",
  "Crawl a sitemap XML file and fetch all linked pages. Supports URL pattern filtering, concurrent fetching, and depth limits.",
  ConnectSitemapSchema.shape,
  async (args) => {
    const result = await connectSitemap(args as z.infer<typeof ConnectSitemapSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_connect_folder",
  "Load files from a local folder using glob patterns. Validates file sizes and content types.",
  ConnectFolderSchema.shape,
  async (args) => {
    const result = await connectFolder(args as z.infer<typeof ConnectFolderSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_connect_pdf",
  "Fetch a PDF file from URL or local path with specialized validation and metadata extraction.",
  ConnectPdfSchema.shape,
  async (args) => {
    const result = await connectPdf(args as z.infer<typeof ConnectPdfSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// PHASE 2: EXTRACT TOOLS
// ============================================================================

server.tool(
  "indexfoundry_extract_pdf",
  "Extract text from PDF files, producing page-by-page JSONL output. Handles multi-column layouts and embedded fonts.",
  ExtractPdfSchema.shape,
  async (args) => {
    const result = await extractPdf(args as z.infer<typeof ExtractPdfSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_extract_html",
  "Extract text and structure from HTML content. Preserves headings, tables, and semantic markup. Outputs clean text or markdown.",
  ExtractHtmlSchema.shape,
  async (args) => {
    const result = await extractHtml(args as z.infer<typeof ExtractHtmlSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_extract_document",
  "Generic document extractor for markdown, plain text, CSV, and JSON files. Normalizes encoding and line endings.",
  ExtractDocumentSchema.shape,
  async (args) => {
    const result = await extractDocument(args as z.infer<typeof ExtractDocumentSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// PHASE 3: NORMALIZE TOOLS
// ============================================================================

server.tool(
  "indexfoundry_normalize_chunk",
  "Split extracted text into semantic chunks. Supports strategies: recursive (default), hierarchical (parent-child from markdown headings), paragraph, heading, page, sentence, fixed. Produces deterministic SHA256 chunk IDs.",
  NormalizeChunkSchema.shape,
  async (args) => {
    const result = await normalizeChunk(args as z.infer<typeof NormalizeChunkSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_normalize_enrich",
  "Enrich chunks with metadata: language detection, regex-based tagging, section classification, and taxonomy mapping.",
  NormalizeEnrichSchema.shape,
  async (args) => {
    const result = await normalizeEnrich(args as z.infer<typeof NormalizeEnrichSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
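
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): normalize_chunk above says it
// "produces deterministic SHA256 chunk IDs". A common way to get that property
// is to hash the chunk's content together with its position; which fields the
// real implementation hashes is not visible in this file, so this is an
// assumption. The import hoists to module scope; node:crypto is built in.
// ----------------------------------------------------------------------------
import { createHash } from "node:crypto";

function exampleChunkId(sourceId: string, chunkIndex: number, text: string): string {
  // Same source + index + text always yields the same ID, so re-running the
  // pipeline over unchanged inputs produces identical chunk IDs.
  return createHash("sha256")
    .update(`${sourceId}\u0000${chunkIndex}\u0000${text}`)
    .digest("hex");
}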
server.tool(
  "indexfoundry_normalize_dedupe",
  "Deduplicate chunks by exact content hash or fuzzy similarity (simhash). Preserves the first occurrence and tracks duplicates.",
  NormalizeDedupeSchema.shape,
  async (args) => {
    const result = await normalizeDedupe(args as z.infer<typeof NormalizeDedupeSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// PHASE 4: INDEX TOOLS
// ============================================================================

server.tool(
  "indexfoundry_index_embed",
  "Generate vector embeddings for chunks using OpenAI or local models. Batch processing with retry logic and rate limiting.",
  IndexEmbedSchema.shape,
  async (args) => {
    const result = await indexEmbed(args as z.infer<typeof IndexEmbedSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_index_upsert",
  "Upsert vectors to a vector database. Supports local file-based storage and external providers (Pinecone, Weaviate, Qdrant, Milvus, Chroma).",
  IndexUpsertSchema.shape,
  async (args) => {
    const result = await indexUpsert(args as z.infer<typeof IndexUpsertSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_index_build_profile",
  "Define retrieval configuration: top_k, hybrid search settings, reranking, metadata filters, and scoring adjustments.",
  IndexBuildProfileSchema.shape,
  async (args) => {
    const result = await indexBuildProfile(args as z.infer<typeof IndexBuildProfileSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// PHASE 5: SERVE TOOLS
// ============================================================================

server.tool(
  "indexfoundry_serve_openapi",
  "Generate an OpenAPI 3.1 specification for the index API. Configurable endpoints: search_semantic, search_hybrid, get_chunk, health, stats.",
  ServeOpenapiSchema.shape,
  async (args) => {
    const result = await serveOpenapi(args as z.infer<typeof ServeOpenapiSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_serve_start",
  "Start an HTTP search API server for a run. Loads vectors and chunks into memory, serves semantic/hybrid/keyword search endpoints.",
  ServeStartSchema.shape,
  async (args) => {
    const result = await serveStart(args as z.infer<typeof ServeStartSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_serve_stop",
  "Stop a running search API server for a run. Returns server uptime and request count.",
  ServeStopSchema.shape,
  async (args) => {
    const result = await serveStop(args as z.infer<typeof ServeStopSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_serve_status",
  "Get status of running search servers. Shows endpoint, uptime, request count, and loaded vector/chunk counts.",
  ServeStatusSchema.shape,
  async (args) => {
    const result = await serveStatus(args as z.infer<typeof ServeStatusSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
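
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): the serve_* tools above load
// vectors into memory and answer "semantic" searches. Semantic scoring is
// typically cosine similarity between the query embedding and each chunk
// embedding; whether this server uses raw cosine or a normalized variant is
// not visible in this file.
// ----------------------------------------------------------------------------
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // Return 0 for degenerate (zero-magnitude) vectors instead of NaN.
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  return denominator === 0 ? 0 : dot / denominator;
}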
server.tool(
  "indexfoundry_serve_query",
  "Query a running search server directly (without HTTP). Supports semantic, keyword, and hybrid search modes.",
  ServeQuerySchema.shape,
  async (args) => {
    const result = await serveQuery(args as z.infer<typeof ServeQuerySchema>);
    return {
      content: [{ type: "text", text: formatQueryResults(result) }],
    };
  }
);

// ============================================================================
// UTILITY TOOLS
// ============================================================================

server.tool(
  "indexfoundry_run_status",
  "Get detailed status of a run including phase completion, timing, errors, and artifact counts.",
  RunStatusSchema.shape,
  async (args) => {
    const result = await runStatus(args as z.infer<typeof RunStatusSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_run_list",
  "List all runs with optional filtering by status, date range, and sorting options.",
  RunListSchema.shape,
  async (args) => {
    const result = await runList(args as z.infer<typeof RunListSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_run_diff",
  "Compare two runs: configuration differences, source changes, chunk deltas, and timing comparisons.",
  RunDiffSchema.shape,
  async (args) => {
    const result = await runDiff(args as z.infer<typeof RunDiffSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_run_cleanup",
  "Delete old runs with optional manifest preservation. Supports age-based and count-based retention policies.",
  RunCleanupSchema.shape,
  async (args) => {
    const result = await runCleanup(args as z.infer<typeof RunCleanupSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// PROJECT TOOLS
// ============================================================================

server.tool(
  "indexfoundry_project_create",
  `🏗️ [STEP 1/5: CREATE] Create a new RAG project.

PROJECT PIPELINE OVERVIEW:
1. project_create → Initialize project structure
2. project_add_source → Add URLs, PDFs, folders, or sitemaps
3. project_build → Chunk and embed content
4. project_export → Generate deployment files
5. project_serve → Start local server for testing

WHAT THIS DOES:
- Creates project directory structure (data/, src/, frontend/)
- Initializes project.json manifest with embedding config
- Generates deployment boilerplate (Dockerfile, package.json)
- Creates frontend/index.html chat interface

NEXT STEPS:
- Use project_add_source to add your content (URLs, PDFs, folders)
- Multiple sources can be added before building`,
  ProjectCreateSchema.shape,
  async (args) => {
    const result = await projectCreate(args as z.infer<typeof ProjectCreateSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_list",
  `📋 List all IndexFoundry projects.

USE WHEN: You need to see what projects exist or check their stats
RETURNS: Array of { project_id, name, created_at, stats? }`,
  ProjectListSchema.shape,
  async (args) => {
    const result = await projectList(args as z.infer<typeof ProjectListSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
server.tool(
  "indexfoundry_project_get",
  `📖 Get detailed information about a specific project.

USE WHEN: You need to check project status, sources, or configuration
RETURNS: { manifest, sources[], path } - full project state`,
  ProjectGetSchema.shape,
  async (args) => {
    const result = await projectGet(args as z.infer<typeof ProjectGetSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_delete",
  `🗑️ Delete a project and all its data. Requires confirm: true for safety.`,
  ProjectDeleteSchema.shape,
  async (args) => {
    const result = await projectDelete(args as z.infer<typeof ProjectDeleteSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_add_source",
  `📥 [STEP 2/5: ADD] Add a data source to a project.

PREREQUISITES: project_create must have been run first

SOURCE TYPES (use exactly one):
- url: Single webpage (HTML content extracted)
- sitemap_url: Crawl all pages in sitemap.xml
- folder_path: Local folder with text/markdown/PDF files
- pdf_path: Single PDF file (local path or URL)

WHAT THIS DOES:
- Validates source is accessible
- Creates source record in sources.jsonl
- Queues source for processing by project_build

NEXT STEPS:
- Add more sources with additional calls to project_add_source
- Run project_build when all sources are added`,
  ProjectAddSourceSchema.shape,
  async (args) => {
    const result = await projectAddSource(args as z.infer<typeof ProjectAddSourceSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_build",
  `⚙️ [STEP 3/5: BUILD] Process all pending sources into searchable chunks.

PREREQUISITES:
- project_create must have been run
- project_add_source must have added at least one source

WHAT THIS DOES:
1. Fetches content from each pending source
2. Extracts text (HTML parsing, PDF extraction, etc.)
3. Chunks text with overlap for context continuity
4. Generates embeddings using OpenAI API (requires OPENAI_API_KEY)
5. Appends chunks and vectors to data/chunks.jsonl and data/vectors.jsonl

COST: ~$0.02 per 1M tokens embedded (text-embedding-3-small)

NEXT STEPS:
- Use project_query to test search quality
- Run project_export to generate server code
- Run project_serve to start local server for testing`,
  ProjectBuildSchema.shape,
  async (args) => {
    const result = await projectBuild(args as z.infer<typeof ProjectBuildSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
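
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): project_build above quotes
// ~$0.02 per 1M embedded tokens (text-embedding-3-small). A rough estimator,
// using the common ~4 characters-per-token heuristic for English prose (an
// approximation, not something this codebase defines):
// ----------------------------------------------------------------------------
function estimateEmbeddingCostUsd(totalChars: number, usdPerMillionTokens = 0.02): number {
  const approxTokens = totalChars / 4;
  return (approxTokens / 1_000_000) * usdPerMillionTokens;
}
// Example: a ~10 MB text corpus is roughly 2.5M tokens, so about $0.05:
// estimateEmbeddingCostUsd(10_000_000) ≈ 0.05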
// Format query results (project_query, serve_query) as clean markdown for LLM consumption
function formatQueryResults(result: unknown): string {
  if (!result || typeof result !== 'object') {
    return JSON.stringify(result, null, 2);
  }

  const r = result as Record<string, unknown>;

  // Handle errors
  if (r.success === false) {
    return `## ❌ Query Failed\n\n**Error:** ${r.error || 'Unknown error'}\n**Code:** ${r.code || 'UNKNOWN'}`;
  }

  // Format successful results
  const results = r.results as Array<{
    chunk_id?: string;
    id?: string;
    score: number;
    text?: string;
    source_id?: string;
    metadata?: Record<string, unknown>;
  }> | undefined;

  if (!results || results.length === 0) {
    return `## 🔍 Query Results\n\n**No results found.**\n\nTry:\n- Different search terms\n- Using hybrid mode for better recall\n- Checking if the project has been built`;
  }

  const tookMs = r.took_ms as number | undefined;
  const timing = tookMs ? ` | **Time:** ${tookMs}ms` : '';

  const lines: string[] = [
    `## 🔍 Query Results`,
    ``,
    `**Found:** ${results.length} result${results.length !== 1 ? 's' : ''} | **Mode:** ${r.mode || 'unknown'}${timing}`,
    ``,
    `---`,
  ];

  for (let i = 0; i < results.length; i++) {
    const item = results[i];
    const scorePercent = (item.score * 100).toFixed(1);
    const chunkId = item.chunk_id || item.id || 'unknown';
    const sourceId = item.source_id || (item.metadata?.source_id as string) || chunkId;

    lines.push(``);
    lines.push(`### Result ${i + 1} — Score: ${scorePercent}%`);
    lines.push(``);
    lines.push(`**Source:** \`${sourceId}\``);

    if (item.text) {
      lines.push(``);
      lines.push(item.text);
    } else {
      lines.push(``);
      lines.push(`*[Text not included - set include_text=true]*`);
    }

    lines.push(``);
    lines.push(`---`);
  }

  return lines.join('\n');
}

server.tool(
  "indexfoundry_project_query",
  `🔍 Search a project's vector database (for testing).

PREREQUISITES: project_build must have processed sources

MODES:
- keyword: Fast exact-match search
- semantic: Embedding similarity (requires query embedding)
- hybrid: Combines keyword + semantic with RRF fusion

USE WHEN: You want to test search quality before deploying`,
  ProjectQuerySchema.shape,
  async (args) => {
    const result = await projectQuery(args as z.infer<typeof ProjectQuerySchema>);
    return {
      content: [{ type: "text", text: formatQueryResults(result) }],
    };
  }
);

server.tool(
  "indexfoundry_project_export",
  `📦 [STEP 4/5: EXPORT] Generate deployment files for the project.

PREREQUISITES: project_build must have processed sources

WHAT THIS DOES:
1. Generates src/index.ts - MCP server with HTTP endpoints
2. Creates Dockerfile for containerized deployment
3. Creates railway.toml for Railway deployment
4. Generates DEPLOYMENT.md with step-by-step instructions
5. Creates frontend/ with chat UI

NEXT STEPS:
- Run project_serve to test locally
- Push to GitHub and deploy to Railway
- Or use project_deploy for automated Railway deployment`,
  ProjectExportSchema.shape,
  async (args) => {
    const result = await projectExport(args as z.infer<typeof ProjectExportSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_deploy",
  `☁️ Deploy a project to Railway (production deployment).

PREREQUISITES:
- project_export must have generated deployment files
- Railway CLI must be installed and authenticated
- OPENAI_API_KEY must be available for /chat endpoint

USE dry_run=true FIRST to preview commands without executing

WHAT THIS DOES:
1. Initializes Railway project in project directory
2. Sets environment variables (OPENAI_API_KEY, etc.)
3. Deploys to Railway using Dockerfile
4. Returns public URL

ALTERNATIVE: Use project_serve for local testing first`,
  ProjectDeploySchema.shape,
  async (args) => {
    const result = await projectDeploy(args as z.infer<typeof ProjectDeploySchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
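
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): project_query's hybrid mode
// above combines keyword and semantic results with "RRF fusion". Standard
// Reciprocal Rank Fusion scores each document as the sum of 1 / (k + rank)
// across the input rankings, with k = 60 as the conventional default. Whether
// this server uses exactly this formula or constant is not visible here.
// ----------------------------------------------------------------------------
function reciprocalRankFusion(rankings: string[][], k = 60): Map<string, number> {
  const fused = new Map<string, number>();
  for (const ranking of rankings) {
    ranking.forEach((chunkId, index) => {
      const rank = index + 1; // ranks are 1-based in the usual formulation
      fused.set(chunkId, (fused.get(chunkId) ?? 0) + 1 / (k + rank));
    });
  }
  return fused; // sort entries by descending score to get the fused ranking
}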
server.tool(
  "indexfoundry_project_serve",
  `🚀 [STEP 5/5: SERVE] Start a local development server for testing a project.

PREREQUISITES:
- project_create must have been run
- project_add_source must have added at least one source
- project_build must have processed sources into chunks/vectors
- project_export must have generated the server code

WHAT THIS DOES:
1. Checks if dependencies are installed (runs npm install if needed)
2. For mode='dev': uses tsx for hot reload during development
3. For mode='build': compiles TypeScript then runs production Node.js
4. Polls /health endpoint to confirm server is ready
5. Optionally opens frontend/index.html in browser

RETURNS: { endpoint, pid, port, mode } - use endpoint to test the chat UI

NEXT STEPS:
- Open frontend/index.html in browser to test the chat interface
- Use project_serve_status to check server health
- Use project_serve_stop when done testing
- Use project_deploy to deploy to Railway for production`,
  ProjectServeSchema.shape,
  async (args) => {
    const result = await projectServe(args as z.infer<typeof ProjectServeSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_serve_stop",
  `🛑 Stop a running project development server.

USE WHEN: You need to stop a server started with project_serve

WHAT THIS DOES:
1. Sends SIGTERM for graceful shutdown
2. Waits 2 seconds for cleanup
3. Uses SIGKILL if force=true or process won't stop
4. Cleans up PID file and tracking state

RETURNS: { pid, uptime_seconds } - confirms server was stopped`,
  ProjectServeStopSchema.shape,
  async (args) => {
    const result = await projectServeStop(args as z.infer<typeof ProjectServeStopSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

server.tool(
  "indexfoundry_project_serve_status",
  `📊 Get status of running project servers.

USE WHEN: You need to check if a server is running or find its endpoint

WHAT THIS DOES:
- If project_id provided: checks status of that specific project's server
- If project_id omitted: scans all projects for running servers
- Validates process is actually running (handles stale PID files)

RETURNS: Array of running servers with { endpoint, pid, port, mode, uptime_seconds }`,
  ProjectServeStatusSchema.shape,
  async (args) => {
    const result = await projectServeStatus(args as z.infer<typeof ProjectServeStatusSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// CLASSIFICATION TOOLS
// ============================================================================

server.tool(
  "indexfoundry_classify_query",
  "🔍 Classify a query to determine if RAG retrieval is needed and what type of query it is. Returns query type (factual/procedural/conceptual/navigational/conversational), complexity, confidence, and retrieval hints.",
  ClassifyQueryInputSchema.shape,
  async (args) => {
    const result = await classifyQuery(args as z.infer<typeof ClassifyQueryInputSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// TABLE PROCESSING TOOLS
// ============================================================================

server.tool(
  "indexfoundry_extract_tables",
  "📊 Extract and linearize tables from markdown, HTML, or CSV content. Produces structured table data, linearized text for vector embedding, and chunks for RAG retrieval.",
  ExtractTableInputSchema.shape,
  async (args) => {
    const result = await extractTables(args as z.infer<typeof ExtractTableInputSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);
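
// ----------------------------------------------------------------------------
// Illustrative sketch (not used by this module): extract_tables above produces
// "linearized text for vector embedding". One common linearization renders each
// row as "Header: value" pairs so cell values stay attached to their column
// names; the exact format this tool emits is not visible in this file.
// ----------------------------------------------------------------------------
function linearizeTableRow(headers: string[], row: string[]): string {
  return headers
    .map((header, i) => `${header}: ${row[i] ?? ""}`)
    .join("; ");
}
// Example: linearizeTableRow(["Model", "Price"], ["text-embedding-3-small", "$0.02/1M tokens"])
// → "Model: text-embedding-3-small; Price: $0.02/1M tokens"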
// ============================================================================
// DEBUG TOOLS
// ============================================================================

server.tool(
  "indexfoundry_debug_query",
  "🔍 Debug retrieval queries with pipeline tracing, similarity scores, and expected/actual comparison. Diagnose why queries don't return expected results.",
  DebugQueryInputSchema.shape,
  async (args) => {
    const result = await debugQuery(args as z.infer<typeof DebugQueryInputSchema>);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };
  }
);

// ============================================================================
// SERVER STARTUP
// ============================================================================

async function main() {
  // Initialize the run manager with absolute path to server directory
  // The runs_dir in config will create ./runs under this base
  initRunManager(SERVER_BASE_DIR, {
    storage: {
      runs_dir: "runs", // Just "runs", not "./runs" to avoid path issues
      max_runs: 100,
      cleanup_policy: "fifo",
    },
  });

  // Initialize the project manager
  initProjectManager(SERVER_BASE_DIR);

  // Connect via stdio transport
  const transport = new StdioServerTransport();
  await server.connect(transport);

  console.error("IndexFoundry-MCP server started");
}

main().catch((error) => {
  console.error("Fatal error:", error);
  process.exit(1);
});
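
// ----------------------------------------------------------------------------
// Illustrative sketch (never called by this module): how an MCP client might
// exercise this server over stdio. The client-side entry points are the
// standard @modelcontextprotocol/sdk ones, but the build output path and the
// argument shape for indexfoundry_connect_url are assumptions — check
// ConnectUrlSchema for the real fields. Dynamic imports keep this sketch from
// affecting server startup.
// ----------------------------------------------------------------------------
async function exampleClientSession(): Promise<void> {
  const { Client } = await import("@modelcontextprotocol/sdk/client/index.js");
  const { StdioClientTransport } = await import(
    "@modelcontextprotocol/sdk/client/stdio.js"
  );

  const transport = new StdioClientTransport({
    command: "node",
    args: ["dist/index.js"], // assumed build output path
  });
  const client = new Client({ name: "indexfoundry-example-client", version: "0.0.1" });
  await client.connect(transport);

  const result = await client.callTool({
    name: "indexfoundry_connect_url",
    arguments: { url: "https://example.com" }, // assumed argument shape
  });
  console.error(JSON.stringify(result, null, 2));

  await client.close();
}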
