IndexFoundry MCP

connect.ts•16.9 KiB

/**
 * IndexFoundry-MCP: Connect Tools (Phase 1)
 *
 * Fetchers for various content sources: URLs, sitemaps, folders, PDFs.
 * All tools are idempotent and produce deterministic outputs.
 *
 * Copyright (c) 2024 vario.automation
 * Proprietary and confidential. All rights reserved.
 */

import * as path from "path";
import * as fs from "fs/promises";
import { XMLParser } from "fast-xml-parser";
import { glob } from "glob";

import type { RawArtifact, ToolError } from "../types.js";
import type {
  ConnectUrlInput,
  ConnectSitemapInput,
  ConnectFolderInput,
  ConnectPdfInput
} from "../schemas.js";
import {
  sha256,
  pathExists,
  ensureDir,
  extensionFromContentType,
  contentTypeFromExtension,
  appendJsonl,
  writeJson,
  createToolError,
  now,
} from "../utils.js";
import { getRunManager } from "../run-manager.js";

// ============================================================================
// Shared helpers
// ============================================================================

/** File name of the JSONL manifest written next to the raw artifacts. */
const RAW_MANIFEST_FILE = "raw_manifest.jsonl";

/** Multiplier used to turn the configured `max_file_size_mb` into bytes. */
const BYTES_PER_MB = 1024 * 1024;

/** Timeout applied to the sitemap document itself and to every page it lists. */
const SITEMAP_FETCH_TIMEOUT_MS = 30000;

/** Timeout used when a PDF is fetched over HTTP (longer than for pages). */
const PDF_FETCH_TIMEOUT_MS = 60000;

/**
 * Case-insensitive membership test for a hostname in a domain list.
 *
 * `URL.hostname` is already lower-cased by the URL parser, so list entries are
 * lower-cased before comparison; otherwise an entry such as "Example.com"
 * would never match.
 */
function hostnameListed(domains: readonly string[] | undefined, hostname: string): boolean {
  const needle = hostname.toLowerCase();
  return domains?.some((domain) => domain.toLowerCase() === needle) ?? false;
}

/** Strips parameters (e.g. `; charset=utf-8`) from a Content-Type header value. */
function mediaTypeOf(contentType: string): string {
  return (contentType.split(";")[0] ?? contentType).trim();
}

/** Returns true when "%PDF" appears within the first five bytes of the buffer. */
function hasPdfSignature(content: Buffer): boolean {
  return content.toString("utf8", 0, 5).includes("%PDF");
}

/**
 * Collects the `<loc>` values of parsed `<url>` / `<sitemap>` entries.
 *
 * The parser yields a single object for one entry and an array for several.
 * Entries without a usable string `loc` are dropped instead of being passed
 * on as bogus URLs.
 */
function collectLocs(entries: unknown): string[] {
  const list: unknown[] = Array.isArray(entries) ? entries : [entries];
  const locs: string[] = [];

  for (const entry of list) {
    if (typeof entry !== "object" || entry === null) continue;
    const loc: unknown = (entry as { loc?: unknown }).loc;

    if (typeof loc === "string") {
      if (loc.length > 0) locs.push(loc);
      continue;
    }

    // With `ignoreAttributes: false`, an element carrying attributes is parsed
    // into an object whose text lives under "#text" (the parser's default
    // text-node name).
    if (typeof loc === "object" && loc !== null) {
      const text: unknown = (loc as Record<string, unknown>)["#text"];
      if (typeof text === "string" && text.length > 0) locs.push(text);
    }
  }

  return locs;
}

// ============================================================================
// Connect URL
// ============================================================================

export interface ConnectUrlResult {
  success: boolean;
  artifact: {
    path: string;
    sha256: string;
    size_bytes: number;
    content_type: string;
    fetched_at: string;
  };
  skipped?: boolean;
  error?: string;
}

/**
 * Fetches a single URL and stores the response body in the run's raw directory.
 *
 * The stored file is named `<sha256><ext>`, so identical content maps to the
 * same file. If that file already exists and `input.force` is not set, nothing
 * is written and the result carries `skipped: true`.
 *
 * @param input - Run id, URL, optional extra headers / allowlist, timeout and force flag.
 * @returns The stored artifact description, or a `ToolError`:
 *   `DOMAIN_BLOCKED` (blocklist hit or allowlist miss), `FETCH_FAILED`
 *   (non-2xx response or any other failure), `FILE_TOO_LARGE`, `FETCH_TIMEOUT`.
 */
export async function connectUrl(input: ConnectUrlInput): Promise<ConnectUrlResult | ToolError> {
  const manager = getRunManager();
  const config = manager.getConfig();

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const rawDir = manager.getRawDir(input.run_id);

  try {
    // Check domain blocklist / allowlist
    const url = new URL(input.url);

    if (hostnameListed(config.security.blocked_domains, url.hostname)) {
      return createToolError("DOMAIN_BLOCKED", `Domain ${url.hostname} is blocked`, {
        recoverable: false,
        suggestion: "Remove domain from blocked_domains in config",
      });
    }

    if (input.allowed_domains?.length && !hostnameListed(input.allowed_domains, url.hostname)) {
      return createToolError("DOMAIN_BLOCKED", `Domain ${url.hostname} not in allowlist`, {
        recoverable: false,
        suggestion: "Add domain to allowed_domains parameter",
      });
    }

    const maxSizeMb = config.defaults.connect.max_file_size_mb;
    const maxBytes = maxSizeMb * BYTES_PER_MB;

    // Fetch the URL
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), input.timeout_ms);

    const headers: Record<string, string> = {
      "User-Agent": config.defaults.connect.user_agent,
      ...input.headers,
    };

    let buffer: Buffer;
    let contentType: string;

    try {
      const response = await fetch(input.url, {
        headers,
        signal: controller.signal,
      });

      if (!response.ok) {
        return createToolError("FETCH_FAILED", `HTTP ${response.status}: ${response.statusText}`, {
          details: { url: input.url, status: response.status },
          recoverable: response.status >= 500,
          suggestion: response.status >= 500 ? "Retry later" : "Check URL validity",
        });
      }

      // Reject early when the server already declares an oversized body, so it
      // is never buffered in memory. A missing or malformed header yields
      // 0/NaN and falls through to the post-download check below.
      const declaredBytes = Number(response.headers.get("content-length"));
      if (declaredBytes > maxBytes) {
        controller.abort(); // stop the body download
        return createToolError("FILE_TOO_LARGE", `File exceeds max size of ${maxSizeMb}MB`, {
          details: { size_mb: declaredBytes / BYTES_PER_MB },
          recoverable: false,
          suggestion: "Increase max_file_size_mb in config",
        });
      }

      // The timer stays armed while the body is read: clearing it as soon as
      // the headers arrive would let a stalled body download hang forever.
      buffer = Buffer.from(await response.arrayBuffer());
      contentType = response.headers.get("content-type") || "application/octet-stream";
    } finally {
      clearTimeout(timeout);
    }

    const mediaType = mediaTypeOf(contentType);
    const hash = sha256(buffer);
    const ext = extensionFromContentType(contentType);
    const filename = `${hash}${ext}`;
    const localPath = `raw/${filename}`;
    const outputPath = path.join(rawDir, filename);

    // Check if already exists (idempotency)
    if (await pathExists(outputPath) && !input.force) {
      const stats = await fs.stat(outputPath);
      return {
        success: true,
        skipped: true,
        artifact: {
          path: localPath,
          sha256: hash,
          size_bytes: stats.size,
          content_type: mediaType,
          fetched_at: now(),
        },
      };
    }

    // Check file size
    if (buffer.length > maxBytes) {
      return createToolError("FILE_TOO_LARGE", `File exceeds max size of ${maxSizeMb}MB`, {
        details: { size_mb: buffer.length / BYTES_PER_MB },
        recoverable: false,
        suggestion: "Increase max_file_size_mb in config",
      });
    }

    // Write to disk
    await fs.writeFile(outputPath, buffer);

    // Record in raw manifest
    const artifact: RawArtifact = {
      uri: input.url,
      sha256: hash,
      fetched_at: now(),
      size_bytes: buffer.length,
      content_type: mediaType,
      local_path: localPath,
    };

    await appendJsonl(path.join(rawDir, RAW_MANIFEST_FILE), [artifact]);

    return {
      success: true,
      artifact: {
        path: localPath,
        sha256: hash,
        size_bytes: buffer.length,
        content_type: mediaType,
        fetched_at: artifact.fetched_at,
      },
    };
  } catch (err) {
    // An abort can only originate from the timeout timer above.
    if (err instanceof Error && err.name === "AbortError") {
      return createToolError("FETCH_TIMEOUT", `Request timed out after ${input.timeout_ms}ms`, {
        details: { url: input.url },
        recoverable: true,
        suggestion: "Increase timeout_ms or check network",
      });
    }

    return createToolError("FETCH_FAILED", `Failed to fetch URL: ${err}`, {
      details: { url: input.url, error: String(err) },
      recoverable: true,
      suggestion: "Check URL and network connectivity",
    });
  }
}

// ============================================================================
// Connect Sitemap
// ============================================================================

export interface ConnectSitemapResult {
  success: boolean;
  urls_discovered: number;
  urls_fetched: number;
  urls_skipped: number;
  urls_failed: number;
  artifacts: Array<{
    url: string;
    path: string;
    sha256: string;
  }>;
  errors: Array<{ url: string; error: string }>;
}

/**
 * Downloads a sitemap, filters its URLs and fetches each one via `connectUrl`.
 *
 * URLs are filtered by `include_patterns` / `exclude_patterns` (regular
 * expressions), sorted, truncated to `max_pages`, and fetched with at most
 * `concurrency` requests in flight. `artifacts` and `errors` are reported in
 * sorted URL order regardless of which request finishes first.
 *
 * For a sitemap index only the first level is read: the `<loc>` values of the
 * child sitemaps are themselves treated as the URLs to fetch.
 *
 * @param input - Run id, sitemap URL, filters, page limit, concurrency, allowlist and force flag.
 * @returns Per-URL counts and artifacts, or a `ToolError` when the sitemap
 *   itself cannot be fetched or processed.
 */
export async function connectSitemap(input: ConnectSitemapInput): Promise<ConnectSitemapResult | ToolError> {
  const manager = getRunManager();
  const config = manager.getConfig();

  try {
    // Fetch sitemap XML, bounded by a timeout so a stalled server cannot hang the tool
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), SITEMAP_FETCH_TIMEOUT_MS);
    let xml: string;

    try {
      const response = await fetch(input.sitemap_url, {
        headers: { "User-Agent": config.defaults.connect.user_agent },
        signal: controller.signal,
      });

      if (!response.ok) {
        return createToolError("FETCH_FAILED", `Failed to fetch sitemap: HTTP ${response.status}`, {
          recoverable: response.status >= 500,
        });
      }

      xml = await response.text();
    } finally {
      clearTimeout(timer);
    }

    const parser = new XMLParser({ ignoreAttributes: false });
    const parsed = parser.parse(xml);

    // Extract URLs from sitemap.
    // Handle both regular sitemaps and sitemap indexes.
    let urls: string[] = [];
    if (parsed.sitemapindex?.sitemap) {
      // Sitemap index - for now just get URLs from first level
      urls = collectLocs(parsed.sitemapindex.sitemap);
    } else if (parsed.urlset?.url) {
      urls = collectLocs(parsed.urlset.url);
    }

    // Apply include/exclude patterns
    if (input.include_patterns?.length) {
      const includes = input.include_patterns.map((pattern) => new RegExp(pattern));
      urls = urls.filter((url) => includes.some((regex) => regex.test(url)));
    }

    if (input.exclude_patterns?.length) {
      const excludes = input.exclude_patterns.map((pattern) => new RegExp(pattern));
      urls = urls.filter((url) => !excludes.some((regex) => regex.test(url)));
    }

    // Sort for determinism
    urls.sort();

    // Limit URLs
    urls = urls.slice(0, input.max_pages);

    const result: ConnectSitemapResult = {
      success: true,
      urls_discovered: urls.length,
      urls_fetched: 0,
      urls_skipped: 0,
      urls_failed: 0,
      artifacts: [],
      errors: [],
    };

    // A concurrency below 1 (or NaN) would start no work at all, so clamp it
    // to at least one worker.
    const requested = Math.floor(Number(input.concurrency));
    const workerCount = Math.min(
      urls.length,
      Number.isFinite(requested) && requested >= 1 ? requested : 1
    );

    // Each worker pulls the next unclaimed index; outcomes are stored by index
    // so reporting order does not depend on completion order.
    const outcomes: Array<ConnectUrlResult | ToolError | undefined> =
      new Array(urls.length).fill(undefined);
    let nextIndex = 0;
    let aborted = false;

    const worker = async (): Promise<void> => {
      while (!aborted) {
        const index = nextIndex++;
        const url = urls[index];
        if (url === undefined) return; // queue drained

        try {
          outcomes[index] = await connectUrl({
            run_id: input.run_id,
            url,
            allowed_domains: input.allowed_domains,
            timeout_ms: SITEMAP_FETCH_TIMEOUT_MS,
            force: input.force,
          });
        } catch (err) {
          // connectUrl reports fetch problems as ToolError values; a throw means
          // something more fundamental failed, so stop handing out further work.
          aborted = true;
          throw err;
        }
      }
    };

    await Promise.all(Array.from({ length: workerCount }, () => worker()));

    // Tally in sorted URL order
    for (let index = 0; index < urls.length; index++) {
      const url = urls[index];
      const outcome = outcomes[index];
      if (url === undefined || outcome === undefined) continue;

      if ("isError" in outcome) {
        result.urls_failed++;
        result.errors.push({ url, error: outcome.message });
        continue;
      }

      if (outcome.skipped) {
        result.urls_skipped++;
      } else {
        result.urls_fetched++;
      }
      result.artifacts.push({
        url,
        path: outcome.artifact.path,
        sha256: outcome.artifact.sha256,
      });
    }

    return result;
  } catch (err) {
    // connectUrl converts its own timeouts into ToolError values, so an
    // AbortError reaching this point comes from the sitemap request itself.
    if (err instanceof Error && err.name === "AbortError") {
      return createToolError("FETCH_TIMEOUT", `Request timed out after ${SITEMAP_FETCH_TIMEOUT_MS}ms`, {
        details: { url: input.sitemap_url },
        recoverable: true,
        suggestion: "Check network or retry later",
      });
    }

    return createToolError("FETCH_FAILED", `Failed to process sitemap: ${err}`, {
      recoverable: true,
    });
  }
}

// ============================================================================
// Connect Folder
// ============================================================================

export interface ConnectFolderResult {
  success: boolean;
  files_discovered: number;
  files_copied: number;
  files_skipped: number;
  files_failed: number;
  artifacts: Array<{
    source_path: string;
    path: string;
    sha256: string;
    size_bytes: number;
  }>;
  errors: Array<{ path: string; error: string }>;
}

/**
 * Copies files matching a glob under a local folder into the run's raw directory.
 *
 * Files are processed in sorted order. Each is stored as `<sha256><ext>`; a
 * file whose target already exists is counted as skipped unless `input.force`
 * is set. Files above the configured size limit are counted as skipped and
 * also listed in `errors`. A failure on one file is recorded and does not stop
 * the remaining files.
 *
 * @param input - Run id, folder path, glob, optional exclude regexes and force flag.
 * @returns Per-file counts and artifacts, or a `ToolError` when the folder is
 *   missing or processing fails as a whole.
 */
export async function connectFolder(input: ConnectFolderInput): Promise<ConnectFolderResult | ToolError> {
  const manager = getRunManager();
  const config = manager.getConfig();

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const rawDir = manager.getRawDir(input.run_id);

  try {
    // Verify source path exists
    if (!await pathExists(input.path)) {
      return createToolError("FETCH_FAILED", `Folder not found: ${input.path}`, {
        recoverable: false,
      });
    }

    // Find matching files (the pattern is normalised to forward slashes)
    const pattern = path.join(input.path, input.glob).replace(/\\/g, "/");
    let files = await glob(pattern, { nodir: true });

    // Apply exclude patterns
    if (input.exclude_patterns?.length) {
      const excludes = input.exclude_patterns.map((p) => new RegExp(p));
      files = files.filter((file) => !excludes.some((regex) => regex.test(file)));
    }

    // Sort for determinism
    files.sort();

    const result: ConnectFolderResult = {
      success: true,
      files_discovered: files.length,
      files_copied: 0,
      files_skipped: 0,
      files_failed: 0,
      artifacts: [],
      errors: [],
    };

    const maxSizeMb = config.defaults.connect.max_file_size_mb;
    const maxBytes = maxSizeMb * BYTES_PER_MB;
    const manifestPath = path.join(rawDir, RAW_MANIFEST_FILE);

    for (const filePath of files) {
      try {
        const stats = await fs.stat(filePath);

        // Skip large files (checked via stat so they are never read into memory)
        if (stats.size > maxBytes) {
          result.files_skipped++;
          result.errors.push({
            path: filePath,
            error: `File exceeds max size of ${maxSizeMb}MB`
          });
          continue;
        }

        // Read file and hash
        const content = await fs.readFile(filePath);
        const hash = sha256(content);
        const ext = path.extname(filePath);
        const contentType = contentTypeFromExtension(ext);
        const filename = `${hash}${ext}`;
        const localPath = `raw/${filename}`;
        const outputPath = path.join(rawDir, filename);

        // Size of the bytes that were actually hashed, so size and hash agree
        // even if the source file changes after the stat call.
        const sizeBytes = content.length;

        // Check if exists
        if (await pathExists(outputPath) && !input.force) {
          result.files_skipped++;
          result.artifacts.push({
            source_path: filePath,
            path: localPath,
            sha256: hash,
            size_bytes: sizeBytes,
          });
          continue;
        }

        // Store the buffer that was hashed rather than re-reading the source
        // with copyFile: a file modified in between would otherwise be saved
        // under a name that does not match its content.
        await fs.writeFile(outputPath, content);

        // Record in manifest
        const artifact: RawArtifact = {
          uri: `file://${filePath}`,
          sha256: hash,
          fetched_at: now(),
          size_bytes: sizeBytes,
          content_type: contentType,
          local_path: localPath,
        };

        await appendJsonl(manifestPath, [artifact]);

        result.files_copied++;
        result.artifacts.push({
          source_path: filePath,
          path: localPath,
          sha256: hash,
          size_bytes: sizeBytes,
        });
      } catch (err) {
        result.files_failed++;
        result.errors.push({ path: filePath, error: String(err) });
      }
    }

    return result;
  } catch (err) {
    return createToolError("FETCH_FAILED", `Failed to process folder: ${err}`, {
      recoverable: false,
    });
  }
}

// ============================================================================
// Connect PDF
// ============================================================================

export interface ConnectPdfResult {
  success: boolean;
  artifact: {
    path: string;
    sha256: string;
    size_bytes: number;
    page_count?: number;
    pdf_version?: string;
    has_ocr_layer?: boolean;
    metadata?: {
      title?: string;
      author?: string;
      created?: string;
      modified?: string;
    };
  };
  skipped?: boolean;
}

/** Error returned when the fetched or read bytes do not start with a PDF signature. */
function invalidPdfError(): ToolError {
  return createToolError("PARSE_ERROR", "File does not appear to be a valid PDF", {
    recoverable: false,
  });
}

/**
 * Fetches a PDF over HTTP(S) through `connectUrl` and validates its signature.
 *
 * NOTE(review): validation runs after `connectUrl` has already stored the
 * download, so a non-PDF response remains in the raw directory and manifest
 * even though an error is returned.
 */
async function connectPdfFromUrl(input: ConnectPdfInput): Promise<ConnectPdfResult | ToolError> {
  const manager = getRunManager();

  const fetched = await connectUrl({
    run_id: input.run_id,
    url: input.source,
    timeout_ms: PDF_FETCH_TIMEOUT_MS, // Longer timeout for PDFs
    force: input.force,
  });

  if ("isError" in fetched) {
    return fetched;
  }

  // Read the fetched file back for validation
  const storedPath = path.join(manager.getRunDir(input.run_id), fetched.artifact.path);
  const content = await fs.readFile(storedPath);

  if (!hasPdfSignature(content)) {
    return invalidPdfError();
  }

  return {
    success: true,
    skipped: fetched.skipped,
    artifact: {
      path: fetched.artifact.path,
      sha256: fetched.artifact.sha256,
      size_bytes: fetched.artifact.size_bytes,
    },
  };
}

/** Validates a local PDF and stores it in the raw directory as `<sha256>.pdf`. */
async function connectPdfFromFile(input: ConnectPdfInput, rawDir: string): Promise<ConnectPdfResult | ToolError> {
  if (!await pathExists(input.source)) {
    return createToolError("FETCH_FAILED", `PDF file not found: ${input.source}`, {
      recoverable: false,
    });
  }

  const content = await fs.readFile(input.source);

  // Basic PDF validation
  if (!hasPdfSignature(content)) {
    return invalidPdfError();
  }

  const hash = sha256(content);
  const filename = `${hash}.pdf`;
  const localPath = `raw/${filename}`;
  const outputPath = path.join(rawDir, filename);

  // Check if exists
  if (await pathExists(outputPath) && !input.force) {
    return {
      success: true,
      skipped: true,
      artifact: {
        path: localPath,
        sha256: hash,
        size_bytes: content.length,
      },
    };
  }

  // Copy to raw directory
  await fs.writeFile(outputPath, content);

  // Record in manifest
  const artifact: RawArtifact = {
    uri: `file://${input.source}`,
    sha256: hash,
    fetched_at: now(),
    size_bytes: content.length,
    content_type: "application/pdf",
    local_path: localPath,
  };

  await appendJsonl(path.join(rawDir, RAW_MANIFEST_FILE), [artifact]);

  return {
    success: true,
    artifact: {
      path: localPath,
      sha256: hash,
      size_bytes: content.length,
    },
  };
}

/**
 * Ingests a PDF from an HTTP(S) URL or a local path into the run's raw directory.
 *
 * A source beginning with `http://` or `https://` (matched case-insensitively)
 * is fetched through `connectUrl`; anything else is read as a local file. In
 * both cases the content must contain "%PDF" within its first five bytes.
 *
 * @param input - Run id, source (URL or path) and force flag.
 * @returns The stored artifact (with `skipped: true` when it already existed),
 *   or a `ToolError` (`PARSE_ERROR` for non-PDF content, `FETCH_FAILED` otherwise,
 *   or whatever error `connectUrl` produced).
 */
export async function connectPdf(input: ConnectPdfInput): Promise<ConnectPdfResult | ToolError> {
  const manager = getRunManager();

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const rawDir = manager.getRawDir(input.run_id);

  // Determine if source is URL or local path
  const isUrl = /^https?:\/\//i.test(input.source);

  try {
    return isUrl
      ? await connectPdfFromUrl(input)
      : await connectPdfFromFile(input, rawDir);
  } catch (err) {
    return createToolError("FETCH_FAILED", `Failed to fetch PDF: ${err}`, {
      recoverable: true,
    });
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

connect.ts•16.9 KiB