Skip to main content
Glama
connect.ts18.9 kB
/** * IndexFoundry-MCP Connect Phase Tools * * Phase 1: Fetching raw content from various sources. * All connectors produce artifacts in runs/<run_id>/raw/ */ import type { ConnectUrlInput, ConnectSitemapInput, ConnectFolderInput, ConnectPdfInput } from "../schemas/index.js"; import type { RawArtifact, PdfArtifact } from "../types.js"; import { getRunDirectory, hashBuffer, getExtension, appendJsonl, fileExists, copyFileWithHash, getSortedFiles, Timer, formatError, RunLogger, } from "../utils.js"; import { promises as fs } from "fs"; import { join, basename } from "path"; import { lookup } from "mime-types"; // Configuration const RUNS_DIR = process.env.INDEXFOUNDRY_RUNS_DIR ?? "./runs"; const USER_AGENT = "IndexFoundry/0.1.0"; // ============================================================================= // Connect URL // ============================================================================= export async function handleConnectUrl( params: ConnectUrlInput ): Promise<{ content: Array<{ type: "text"; text: string }>; structuredContent?: unknown }> { const timer = new Timer(); // Get run directory const runDir = await getRunDirectory(RUNS_DIR, params.run_id); if (!runDir) { return { content: [{ type: "text", text: `Error: Run ${params.run_id} not found` }], structuredContent: { success: false, error: formatError("RUN_NOT_FOUND", `Run directory ${params.run_id} does not exist`, { recoverable: false, suggestion: "Create a run first or check the run_id" }) } }; } const logger = new RunLogger(runDir.paths.logs); logger.setContext("connect", "connect_url"); try { // Domain check if (params.allowed_domains && params.allowed_domains.length > 0) { const url = new URL(params.url); if (!params.allowed_domains.includes(url.hostname)) { await logger.warn(`Domain ${url.hostname} not in allowlist`); return { content: [{ type: "text", text: `Error: Domain ${url.hostname} not allowed` }], structuredContent: { success: false, error: formatError("DOMAIN_BLOCKED", `Domain ${url.hostname} is not in the allowed domains list`, { recoverable: false, suggestion: `Add ${url.hostname} to allowed_domains or remove the restriction` }) } }; } } await logger.info(`Fetching ${params.url}`); // Fetch with timeout const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), params.timeout_ms); const response = await fetch(params.url, { signal: controller.signal, headers: { "User-Agent": USER_AGENT, ...params.headers, }, }); clearTimeout(timeoutId); if (!response.ok) { await logger.error(`HTTP ${response.status}: ${response.statusText}`); return { content: [{ type: "text", text: `Error: HTTP ${response.status} ${response.statusText}` }], structuredContent: { success: false, error: formatError("FETCH_FAILED", `HTTP ${response.status}: ${response.statusText}`, { details: { status: response.status, url: params.url }, recoverable: response.status >= 500, suggestion: response.status === 404 ? "Check if the URL is correct" : "Retry later" }) } }; } // Get content const buffer = Buffer.from(await response.arrayBuffer()); const contentType = response.headers.get("content-type") ?? "application/octet-stream"; const hash = hashBuffer(buffer); const ext = getExtension(contentType); // Check if already exists const artifactPath = join(runDir.paths.raw, `${hash}${ext}`); if (!params.force && await fileExists(artifactPath)) { await logger.info(`Skipped (already exists): ${hash}${ext}`); const artifact: RawArtifact = { path: artifactPath, sha256: hash, size_bytes: buffer.length, content_type: contentType, fetched_at: new Date().toISOString(), source_uri: params.url, }; return { content: [{ type: "text", text: `Skipped (already exists): ${hash}${ext}` }], structuredContent: { success: true, artifact, skipped: true, duration_ms: timer.elapsedMs() } }; } // Write file await fs.writeFile(artifactPath, buffer); // Append to raw manifest const manifestEntry = { uri: params.url, sha256: hash, fetched_at: new Date().toISOString(), size_bytes: buffer.length, content_type: contentType, }; await appendJsonl(join(runDir.paths.raw, "raw_manifest.jsonl"), manifestEntry); await logger.info(`Fetched ${hash}${ext} (${buffer.length} bytes)`); const artifact: RawArtifact = { path: artifactPath, sha256: hash, size_bytes: buffer.length, content_type: contentType, fetched_at: new Date().toISOString(), source_uri: params.url, }; return { content: [{ type: "text", text: `Fetched: ${hash}${ext} (${buffer.length} bytes)` }], structuredContent: { success: true, artifact, skipped: false, duration_ms: timer.elapsedMs() } }; } catch (error) { const message = error instanceof Error ? error.message : String(error); await logger.error(`Fetch failed: ${message}`); return { content: [{ type: "text", text: `Error: ${message}` }], structuredContent: { success: false, error: formatError("FETCH_FAILED", message, { details: { url: params.url }, recoverable: message.includes("timeout") || message.includes("network"), suggestion: "Check network connectivity and try again" }) } }; } } // ============================================================================= // Connect Sitemap // ============================================================================= export async function handleConnectSitemap( params: ConnectSitemapInput ): Promise<{ content: Array<{ type: "text"; text: string }>; structuredContent?: unknown }> { const timer = new Timer(); const runDir = await getRunDirectory(RUNS_DIR, params.run_id); if (!runDir) { return { content: [{ type: "text", text: `Error: Run ${params.run_id} not found` }], structuredContent: { success: false, error: formatError("RUN_NOT_FOUND", `Run directory ${params.run_id} does not exist`) } }; } const logger = new RunLogger(runDir.paths.logs); logger.setContext("connect", "connect_sitemap"); try { await logger.info(`Fetching sitemap: ${params.sitemap_url}`); // Fetch sitemap const response = await fetch(params.sitemap_url, { headers: { "User-Agent": USER_AGENT }, }); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const sitemapXml = await response.text(); // Parse URLs from sitemap (simple regex extraction for now) // TODO: Proper XML parsing with xml2js const urlMatches = sitemapXml.matchAll(/<loc>([^<]+)<\/loc>/g); let urls = Array.from(urlMatches, m => m[1]); await logger.info(`Found ${urls.length} URLs in sitemap`); // Apply include patterns if (params.include_patterns && params.include_patterns.length > 0) { const includeRegexes = params.include_patterns.map(p => new RegExp(p)); urls = urls.filter(url => includeRegexes.some(r => r.test(url))); } // Apply exclude patterns if (params.exclude_patterns && params.exclude_patterns.length > 0) { const excludeRegexes = params.exclude_patterns.map(p => new RegExp(p)); urls = urls.filter(url => !excludeRegexes.some(r => r.test(url))); } // Sort for determinism urls.sort(); // Limit urls = urls.slice(0, params.max_pages); await logger.info(`Processing ${urls.length} URLs after filtering`); // Fetch URLs with concurrency control const results: Array<{ url: string; path?: string; sha256?: string; error?: string }> = []; let urlsFetched = 0; let urlsSkipped = 0; let urlsFailed = 0; // Simple sequential processing for now // TODO: Add proper concurrency with p-limit or similar for (const url of urls) { try { const result = await handleConnectUrl({ run_id: params.run_id, url, timeout_ms: 30000, force: params.force, }); const structured = result.structuredContent as { success: boolean; artifact?: RawArtifact; skipped?: boolean }; if (structured.success && structured.artifact) { results.push({ url, path: structured.artifact.path, sha256: structured.artifact.sha256, }); if (structured.skipped) { urlsSkipped++; } else { urlsFetched++; } } else { urlsFailed++; results.push({ url, error: "Fetch failed" }); } } catch (error) { urlsFailed++; results.push({ url, error: error instanceof Error ? error.message : String(error) }); } } await logger.info(`Completed: ${urlsFetched} fetched, ${urlsSkipped} skipped, ${urlsFailed} failed`); return { content: [{ type: "text", text: `Sitemap crawl complete: ${urlsFetched} fetched, ${urlsSkipped} skipped, ${urlsFailed} failed` }], structuredContent: { success: true, urls_discovered: urls.length, urls_fetched: urlsFetched, urls_skipped: urlsSkipped, urls_failed: urlsFailed, artifacts: results.filter(r => r.path).map(r => ({ url: r.url, path: r.path, sha256: r.sha256 })), errors: results.filter(r => r.error).map(r => ({ url: r.url, error: r.error })), duration_ms: timer.elapsedMs() } }; } catch (error) { const message = error instanceof Error ? error.message : String(error); await logger.error(`Sitemap crawl failed: ${message}`); return { content: [{ type: "text", text: `Error: ${message}` }], structuredContent: { success: false, error: formatError("FETCH_FAILED", message, { recoverable: true }) } }; } } // ============================================================================= // Connect Folder // ============================================================================= export async function handleConnectFolder( params: ConnectFolderInput ): Promise<{ content: Array<{ type: "text"; text: string }>; structuredContent?: unknown }> { const timer = new Timer(); const runDir = await getRunDirectory(RUNS_DIR, params.run_id); if (!runDir) { return { content: [{ type: "text", text: `Error: Run ${params.run_id} not found` }], structuredContent: { success: false, error: formatError("RUN_NOT_FOUND", `Run directory ${params.run_id} does not exist`) } }; } const logger = new RunLogger(runDir.paths.logs); logger.setContext("connect", "connect_folder"); try { await logger.info(`Loading folder: ${params.path}`); // Get sorted file list const files = await getSortedFiles(params.path, params.glob); await logger.info(`Found ${files.length} files matching pattern`); const maxBytes = params.max_file_size_mb * 1024 * 1024; const artifacts: RawArtifact[] = []; let filesCopied = 0; let filesSkipped = 0; let filesTooLarge = 0; for (const file of files) { const stat = await fs.stat(file); // Skip directories if (stat.isDirectory()) continue; // Check size if (stat.size > maxBytes) { filesTooLarge++; await logger.warn(`Skipped (too large): ${basename(file)} (${stat.size} bytes)`); continue; } // Copy file const contentType = lookup(file) || "application/octet-stream"; const ext = getExtension(file); const { hash, size } = await copyFileWithHash( file, join(runDir.paths.raw, `placeholder${ext}`) // Temp path ); const artifactPath = join(runDir.paths.raw, `${hash}${ext}`); // Check if already exists if (!params.force && await fileExists(artifactPath)) { filesSkipped++; // Remove the temp copy await fs.unlink(join(runDir.paths.raw, `placeholder${ext}`)).catch(() => {}); continue; } // Rename to hash-based name await fs.rename(join(runDir.paths.raw, `placeholder${ext}`), artifactPath); const artifact: RawArtifact = { path: artifactPath, sha256: hash, size_bytes: size, content_type: contentType, fetched_at: new Date().toISOString(), source_uri: `file://${file}`, }; artifacts.push(artifact); // Append to manifest await appendJsonl(join(runDir.paths.raw, "raw_manifest.jsonl"), { uri: artifact.source_uri, sha256: hash, fetched_at: artifact.fetched_at, size_bytes: size, content_type: contentType, }); filesCopied++; } await logger.info(`Completed: ${filesCopied} copied, ${filesSkipped} skipped, ${filesTooLarge} too large`); return { content: [{ type: "text", text: `Folder load complete: ${filesCopied} copied, ${filesSkipped} skipped, ${filesTooLarge} too large` }], structuredContent: { success: true, files_found: files.length, files_copied: filesCopied, files_skipped: filesSkipped, files_too_large: filesTooLarge, artifacts, duration_ms: timer.elapsedMs() } }; } catch (error) { const message = error instanceof Error ? error.message : String(error); await logger.error(`Folder load failed: ${message}`); return { content: [{ type: "text", text: `Error: ${message}` }], structuredContent: { success: false, error: formatError("FILE_ERROR", message, { recoverable: false }) } }; } } // ============================================================================= // Connect PDF // ============================================================================= export async function handleConnectPdf( params: ConnectPdfInput ): Promise<{ content: Array<{ type: "text"; text: string }>; structuredContent?: unknown }> { const timer = new Timer(); const runDir = await getRunDirectory(RUNS_DIR, params.run_id); if (!runDir) { return { content: [{ type: "text", text: `Error: Run ${params.run_id} not found` }], structuredContent: { success: false, error: formatError("RUN_NOT_FOUND", `Run directory ${params.run_id} does not exist`) } }; } const logger = new RunLogger(runDir.paths.logs); logger.setContext("connect", "connect_pdf"); try { let buffer: Buffer; let sourceUri: string; // Check if URL or local path if (params.source.startsWith("http://") || params.source.startsWith("https://")) { await logger.info(`Fetching PDF from URL: ${params.source}`); const response = await fetch(params.source, { headers: { "User-Agent": USER_AGENT }, }); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } buffer = Buffer.from(await response.arrayBuffer()); sourceUri = params.source; } else { await logger.info(`Loading PDF from file: ${params.source}`); buffer = await fs.readFile(params.source); sourceUri = `file://${params.source}`; } const hash = hashBuffer(buffer); const artifactPath = join(runDir.paths.raw, `${hash}.pdf`); // Check if already exists if (!params.force && await fileExists(artifactPath)) { await logger.info(`Skipped (already exists): ${hash}.pdf`); // Still need to extract metadata for response // TODO: Read from cached metadata or extract again return { content: [{ type: "text", text: `Skipped (already exists): ${hash}.pdf` }], structuredContent: { success: true, skipped: true, artifact: { path: artifactPath, sha256: hash, size_bytes: buffer.length, content_type: "application/pdf", fetched_at: new Date().toISOString(), source_uri: sourceUri, // PDF metadata would go here page_count: 0, // TODO: Extract pdf_version: "unknown", has_ocr_layer: false, pdf_metadata: {} }, duration_ms: timer.elapsedMs() } }; } // Write file await fs.writeFile(artifactPath, buffer); // Extract PDF metadata // TODO: Use pdf-parse or similar for metadata extraction const pageCount = 0; // Placeholder const pdfVersion = "unknown"; const hasOcrLayer = false; const artifact: PdfArtifact = { path: artifactPath, sha256: hash, size_bytes: buffer.length, content_type: "application/pdf", fetched_at: new Date().toISOString(), source_uri: sourceUri, page_count: pageCount, pdf_version: pdfVersion, has_ocr_layer: hasOcrLayer, pdf_metadata: { // TODO: Extract from PDF }, }; // Append to manifest await appendJsonl(join(runDir.paths.raw, "raw_manifest.jsonl"), { uri: sourceUri, sha256: hash, fetched_at: artifact.fetched_at, size_bytes: buffer.length, content_type: "application/pdf", page_count: pageCount, }); await logger.info(`Fetched PDF: ${hash}.pdf (${buffer.length} bytes, ${pageCount} pages)`); return { content: [{ type: "text", text: `Fetched PDF: ${hash}.pdf (${buffer.length} bytes)` }], structuredContent: { success: true, artifact, skipped: false, duration_ms: timer.elapsedMs() } }; } catch (error) { const message = error instanceof Error ? error.message : String(error); await logger.error(`PDF fetch failed: ${message}`); return { content: [{ type: "text", text: `Error: ${message}` }], structuredContent: { success: false, error: formatError("FETCH_FAILED", message, { recoverable: true }) } }; } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server