/**
* IndexFoundry-MCP: Extract Tools (Phase 2)
*
* Parsers for extracting text from various document formats.
* All extractors produce deterministic outputs with detailed reports.
*
* Copyright (c) 2024 vario.automation
* Proprietary and confidential. All rights reserved.
*/
import * as path from "path";
import * as fs from "fs/promises";
import * as cheerio from "cheerio";
import type { PageExtraction, ExtractionReport, ToolError } from "../types.js";
import type {
ExtractPdfInput,
ExtractHtmlInput,
ExtractDocumentInput
} from "../schemas.js";
import {
sha256,
pathExists,
appendJsonl,
writeJson,
readJson,
normalizeText,
createToolError,
} from "../utils.js";
import { getRunManager } from "../run-manager.js";
// ============================================================================
// Constants
// ============================================================================
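// Extractor versions recorded in extraction reports for provenance; these are
// pinned by hand here, so keep them in sync with package.json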
const PDF_PARSE_VERSION = "1.1.1";
const CHEERIO_VERSION = "1.0.0";
// ============================================================================
// Extract PDF
// ============================================================================
export interface ExtractPdfResult {
success: boolean;
artifacts: {
pages_jsonl: string;
full_text?: string;
};
stats: {
pages_processed: number;
pages_empty: number;
pages_ocr_fallback: number;
chars_extracted: number;
};
extraction_report: ExtractionReport;
}
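/**
* Extract text from a PDF inside the run directory, writing one JSONL record
* per (approximated) page and caching outputs by content hash.
*
* A minimal usage sketch; the run id and path are illustrative, the exact
* input fields are defined by ExtractPdfInput in ../schemas.js, and the
* "success" guard assumes ToolError carries no such flag:
*
* @example
* const result = await extractPdf({
*   run_id: "run-001",
*   pdf_path: "fetched/report.pdf",
*   mode: "text", // illustrative; allowed modes come from the schema
*   force: false,
* });
* if ("success" in result && result.success) {
*   console.log(`${result.stats.pages_processed} pages extracted`);
* }
*/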
export async function extractPdf(input: ExtractPdfInput): Promise<ExtractPdfResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const extractedDir = manager.getExtractedDir(input.run_id);
try {
// Resolve input path
const pdfPath = path.join(runDir, input.pdf_path);
if (!await pathExists(pdfPath)) {
return createToolError("PARSE_ERROR", `PDF file not found: ${input.pdf_path}`, {
recoverable: false,
});
}
// Get file hash for output naming
const pdfContent = await fs.readFile(pdfPath);
const pdfHash = sha256(pdfContent);
// Check if already extracted
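// (outputs are keyed by the file's content hash, so identical bytes reuse a
// previous extraction regardless of filename)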
const pagesPath = path.join(extractedDir, `${pdfHash}.pages.jsonl`);
if (await pathExists(pagesPath) && !input.force) {
// Return existing extraction
const report = await readJson<ExtractionReport>(
path.join(extractedDir, `${pdfHash}.report.json`)
).catch(() => ({
extractor_version: `pdf-parse@${PDF_PARSE_VERSION}`,
mode_used: input.mode,
warnings: [],
pages_processed: 0,
pages_empty: 0,
chars_extracted: 0,
}));
return {
success: true,
artifacts: {
pages_jsonl: `extracted/${pdfHash}.pages.jsonl`,
full_text: `extracted/${pdfHash}.txt`,
},
stats: {
pages_processed: report.pages_processed,
pages_empty: report.pages_empty,
pages_ocr_fallback: 0,
chars_extracted: report.chars_extracted,
},
extraction_report: report,
};
}
// Import pdf-parse dynamically
const pdfParse = (await import("pdf-parse")).default;
// Parse PDF
const data = await pdfParse(pdfContent, {
// Limit page range if specified
max: input.page_range?.end,
});
// Process pages - pdf-parse gives us full text, we need to approximate pages
const fullText = normalizeText(data.text);
const pageCount = data.numpages;
// Split text into approximate pages by average length (pdf-parse returns one
// text blob, so per-page boundaries are a heuristic, not exact pagination)
const pages: PageExtraction[] = [];
const avgCharsPerPage = Math.ceil(fullText.length / Math.max(pageCount, 1));
let offset = 0;
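// e.g. a 30,000-char document reported as 10 pages gives avgCharsPerPage =
// 3,000, so "page 3" is approximated as the slice [6000, 9000)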
for (let i = 1; i <= pageCount; i++) {
// Skip pages before range
if (input.page_range && i < input.page_range.start) {
offset += avgCharsPerPage;
continue;
}
// Stop after range
if (input.page_range && i > input.page_range.end) {
break;
}
// Extract page text (approximate)
const pageText = fullText.slice(offset, offset + avgCharsPerPage);
offset += avgCharsPerPage;
const page: PageExtraction = {
page: i,
text: pageText,
char_count: pageText.length,
is_empty: pageText.trim().length === 0,
ocr_used: false,
};
pages.push(page);
}
// Write pages JSONL (remove any stale file first so a forced re-extraction
// does not append duplicate page records)
if (input.force) {
await fs.rm(pagesPath, { force: true });
}
await appendJsonl(pagesPath, pages);
// Write full text
const fullTextPath = path.join(extractedDir, `${pdfHash}.txt`);
await fs.writeFile(fullTextPath, fullText, "utf-8");
// Generate report
const report: ExtractionReport = {
extractor_version: `pdf-parse@${PDF_PARSE_VERSION}`,
mode_used: input.mode,
warnings: [],
pages_processed: pages.length,
pages_empty: pages.filter(p => p.is_empty).length,
chars_extracted: fullText.length,
};
await writeJson(path.join(extractedDir, `${pdfHash}.report.json`), report);
return {
success: true,
artifacts: {
pages_jsonl: `extracted/${pdfHash}.pages.jsonl`,
full_text: `extracted/${pdfHash}.txt`,
},
stats: {
pages_processed: pages.length,
pages_empty: pages.filter(p => p.is_empty).length,
pages_ocr_fallback: 0,
chars_extracted: fullText.length,
},
extraction_report: report,
};
} catch (err) {
return createToolError("PARSE_ERROR", `Failed to extract PDF: ${err}`, {
recoverable: false,
});
}
}
// ============================================================================
// Extract HTML
// ============================================================================
export interface ExtractHtmlResult {
success: boolean;
artifacts: {
text_file: string;
markdown_file?: string;
};
stats: {
chars_extracted: number;
headings_found: number;
links_found: number;
tables_found: number;
};
extraction_report: ExtractionReport;
}
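/**
* Extract readable text (and optionally markdown) from a fetched HTML file,
* stripping boilerplate elements and caching outputs by content hash.
*
* A minimal usage sketch; the run id, path, and selectors are illustrative,
* the exact input fields are defined by ExtractHtmlInput in ../schemas.js,
* and the "success" guard assumes ToolError carries no such flag:
*
* @example
* const result = await extractHtml({
*   run_id: "run-001",
*   html_path: "fetched/page.html",
*   remove_selectors: ["nav", ".cookie-banner"],
*   preserve_headings: true,
*   preserve_links: false,
*   preserve_tables: true,
*   force: false,
* });
* if ("success" in result && result.success) {
*   console.log(result.artifacts.markdown_file ?? result.artifacts.text_file);
* }
*/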
export async function extractHtml(input: ExtractHtmlInput): Promise<ExtractHtmlResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const extractedDir = manager.getExtractedDir(input.run_id);
try {
// Resolve input path
const htmlPath = path.join(runDir, input.html_path);
if (!await pathExists(htmlPath)) {
return createToolError("PARSE_ERROR", `HTML file not found: ${input.html_path}`, {
recoverable: false,
});
}
// Read and hash content
const htmlContent = await fs.readFile(htmlPath, "utf-8");
const htmlHash = sha256(htmlContent);
// Check if already extracted
const textPath = path.join(extractedDir, `${htmlHash}.txt`);
if (await pathExists(textPath) && !input.force) {
const report = await readJson<ExtractionReport>(
path.join(extractedDir, `${htmlHash}.report.json`)
).catch(() => ({
extractor_version: `cheerio@${CHEERIO_VERSION}`,
mode_used: "text",
warnings: [],
pages_processed: 1,
pages_empty: 0,
chars_extracted: 0,
}));
return {
success: true,
artifacts: {
text_file: `extracted/${htmlHash}.txt`,
markdown_file: (input.preserve_headings || input.preserve_tables) ? `extracted/${htmlHash}.md` : undefined,
},
stats: {
chars_extracted: report.chars_extracted,
headings_found: 0,
links_found: 0,
tables_found: 0,
},
extraction_report: report,
};
}
// Parse HTML with Cheerio
const $ = cheerio.load(htmlContent);
// Remove unwanted elements
if (input.remove_selectors?.length) {
for (const selector of input.remove_selectors) {
$(selector).remove();
}
}
// Also remove common boilerplate by default
$("script, style, noscript, iframe, svg").remove();
let text = "";
let markdown = "";
let headingsFound = 0;
let linksFound = 0;
let tablesFound = 0;
// Extract headings (collected in a separate pass from tables, so the markdown
// output does not preserve their original interleaving)
if (input.preserve_headings) {
$("h1, h2, h3, h4, h5, h6").each((_, el) => {
const level = parseInt(el.tagName[1], 10);
const heading = $(el).text().trim();
if (heading) {
markdown += `${"#".repeat(level)} ${heading}\n\n`;
headingsFound++;
}
});
}
// Count links (hrefs are not rewritten into the output; anchor text is kept
// by the plain-text extraction below)
if (input.preserve_links) {
linksFound = $("a").length;
}
// Extract tables
if (input.preserve_tables) {
$("table").each((_, table) => {
tablesFound++;
const rows: string[][] = [];
$(table).find("tr").each((_, tr) => {
const row: string[] = [];
$(tr).find("th, td").each((_, cell) => {
row.push($(cell).text().trim());
});
if (row.length) rows.push(row);
});
if (rows.length) {
// Convert to markdown table
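// e.g. rows [["Name","Qty"],["ore","7"]] become:
//   | Name | Qty |
//   | --- | --- |
//   | ore | 7 |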
const header = rows[0];
markdown += "| " + header.join(" | ") + " |\n";
markdown += "| " + header.map(() => "---").join(" | ") + " |\n";
for (let i = 1; i < rows.length; i++) {
markdown += "| " + rows[i].join(" | ") + " |\n";
}
markdown += "\n";
}
});
}
// Get clean text
text = normalizeText($("body").text());
// If no markdown content, use plain text
if (!markdown.trim()) {
markdown = text;
}
// Write outputs
await fs.writeFile(textPath, text, "utf-8");
if (input.preserve_headings || input.preserve_tables) {
const mdPath = path.join(extractedDir, `${htmlHash}.md`);
await fs.writeFile(mdPath, markdown, "utf-8");
}
// Generate report
const report: ExtractionReport = {
extractor_version: `cheerio@${CHEERIO_VERSION}`,
mode_used: (input.preserve_headings || input.preserve_tables) ? "markdown" : "text",
warnings: [],
pages_processed: 1,
pages_empty: text.trim().length === 0 ? 1 : 0,
chars_extracted: text.length,
};
await writeJson(path.join(extractedDir, `${htmlHash}.report.json`), report);
return {
success: true,
artifacts: {
text_file: `extracted/${htmlHash}.txt`,
markdown_file: (input.preserve_headings || input.preserve_tables) ? `extracted/${htmlHash}.md` : undefined,
},
stats: {
chars_extracted: text.length,
headings_found: headingsFound,
links_found: linksFound,
tables_found: tablesFound,
},
extraction_report: report,
};
} catch (err) {
return createToolError("PARSE_ERROR", `Failed to extract HTML: ${err}`, {
recoverable: false,
});
}
}
// ============================================================================
// Extract Document (Generic)
// ============================================================================
export interface ExtractDocumentResult {
success: boolean;
artifacts: {
text_file: string;
};
stats: {
chars_extracted: number;
format_detected: string;
rows_processed?: number;
};
extraction_report: ExtractionReport;
}
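/**
* Extract text from a generic document (markdown, txt, csv, json), detecting
* the format from the file extension when format_hint is "auto".
*
* A minimal usage sketch; values are illustrative, the exact input fields are
* defined by ExtractDocumentInput in ../schemas.js, and the "success" guard
* assumes ToolError carries no such flag:
*
* @example
* const result = await extractDocument({
*   run_id: "run-001",
*   doc_path: "fetched/table.csv",
*   format_hint: "auto",
*   csv_preview_rows: 50, // illustrative; the schema may supply a default
*   force: false,
* });
* if ("success" in result && result.success) {
*   console.log(`${result.stats.rows_processed ?? 0} CSV rows previewed`);
* }
*/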
export async function extractDocument(input: ExtractDocumentInput): Promise<ExtractDocumentResult | ToolError> {
const manager = getRunManager();
const runDir = manager.getRunDir(input.run_id);
// Ensure run exists with full infrastructure
await manager.ensureRun(input.run_id);
const extractedDir = manager.getExtractedDir(input.run_id);
try {
// Resolve input path
const docPath = path.join(runDir, input.doc_path);
if (!await pathExists(docPath)) {
return createToolError("PARSE_ERROR", `Document not found: ${input.doc_path}`, {
recoverable: false,
});
}
// Read content
const content = await fs.readFile(docPath);
const hash = sha256(content);
// Detect format
let format = input.format_hint;
if (format === "auto") {
const ext = path.extname(docPath).toLowerCase();
const mapping: Record<string, string> = {
".md": "markdown",
".txt": "txt",
".csv": "csv",
".json": "json",
".docx": "docx",
};
format = (mapping[ext] ?? "txt") as typeof format;
}
// Check if already extracted
const textPath = path.join(extractedDir, `${hash}.txt`);
if (await pathExists(textPath) && !input.force) {
const report = await readJson<ExtractionReport>(
path.join(extractedDir, `${hash}.report.json`)
).catch(() => ({
extractor_version: "1.0.0",
mode_used: format,
warnings: [],
pages_processed: 1,
pages_empty: 0,
chars_extracted: 0,
}));
return {
success: true,
artifacts: {
text_file: `extracted/${hash}.txt`,
},
stats: {
chars_extracted: report.chars_extracted,
format_detected: format,
},
extraction_report: report,
};
}
let text = "";
let rowsProcessed: number | undefined;
const warnings: string[] = [];
switch (format) {
case "markdown":
case "txt":
text = normalizeText(content.toString("utf-8"));
break;
case "csv":
// Parse CSV and convert to readable text
const csvContent = content.toString("utf-8");
const lines = csvContent.split("\n");
const previewLines = lines.slice(0, input.csv_preview_rows + 1);
// Simple CSV parsing (proper parsing would use a library)
const rows = previewLines.map(line => {
// Basic CSV splitting (doesn't handle quoted fields properly)
return line.split(",").map(cell => cell.trim());
});
if (rows.length > 0) {
// Format as readable table
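// e.g. "name,qty\nore,7" renders as:
//   name | qty
//   --- | ---
//   ore | 7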
const header = rows[0];
text = header.join(" | ") + "\n";
text += header.map(() => "---").join(" | ") + "\n";
for (let i = 1; i < rows.length; i++) {
text += rows[i].join(" | ") + "\n";
}
rowsProcessed = rows.length - 1;
if (lines.length > input.csv_preview_rows + 1) {
warnings.push(`CSV truncated to ${input.csv_preview_rows} rows (total: ${lines.length - 1})`);
}
}
break;
}
case "json":
// Pretty-print JSON for readability
try {
const json = JSON.parse(content.toString("utf-8"));
text = JSON.stringify(json, null, 2);
} catch {
text = content.toString("utf-8");
warnings.push("Invalid JSON, returning raw content");
}
break;
case "docx":
// DOCX extraction requires an additional library; until one is wired in,
// return an error pointing the caller at an alternative route
return createToolError("PARSE_ERROR", "DOCX extraction not yet implemented", {
recoverable: false,
suggestion: "Convert DOCX to PDF or use external tool",
});
default:
text = normalizeText(content.toString("utf-8"));
}
// Write output
await fs.writeFile(textPath, text, "utf-8");
// Generate report
const report: ExtractionReport = {
extractor_version: "1.0.0",
mode_used: format,
warnings,
pages_processed: 1,
pages_empty: text.trim().length === 0 ? 1 : 0,
chars_extracted: text.length,
};
await writeJson(path.join(extractedDir, `${hash}.report.json`), report);
return {
success: true,
artifacts: {
text_file: `extracted/${hash}.txt`,
},
stats: {
chars_extracted: text.length,
format_detected: format,
rows_processed: rowsProcessed,
},
extraction_report: report,
};
} catch (err) {
return createToolError("PARSE_ERROR", `Failed to extract document: ${err}`, {
recoverable: false,
});
}
}