extract.ts

/**
 * IndexFoundry-MCP: Extract Tools (Phase 2)
 *
 * Parsers for extracting text from various document formats.
 * All extractors produce deterministic outputs with detailed reports.
 *
 * Copyright (c) 2024 vario.automation
 * Proprietary and confidential. All rights reserved.
 */

import * as path from "path";
import * as fs from "fs/promises";
import * as cheerio from "cheerio";

import type { PageExtraction, ExtractionReport, ToolError } from "../types.js";
import type { ExtractPdfInput, ExtractHtmlInput, ExtractDocumentInput } from "../schemas.js";
import {
  sha256,
  pathExists,
  ensureDir,
  appendJsonl,
  writeJson,
  readJson,
  normalizeText,
  createToolError,
  now,
} from "../utils.js";
import { getRunManager } from "../run-manager.js";

// ============================================================================
// Constants
// ============================================================================

const PDF_PARSE_VERSION = "1.1.1";
const CHEERIO_VERSION = "1.0.0";

// ============================================================================
// Extract PDF
// ============================================================================

export interface ExtractPdfResult {
  success: boolean;
  artifacts: {
    pages_jsonl: string;
    full_text?: string;
  };
  stats: {
    pages_processed: number;
    pages_empty: number;
    pages_ocr_fallback: number;
    chars_extracted: number;
  };
  extraction_report: ExtractionReport;
}

export async function extractPdf(input: ExtractPdfInput): Promise<ExtractPdfResult | ToolError> {
  const manager = getRunManager();
  const runDir = manager.getRunDir(input.run_id);

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const extractedDir = manager.getExtractedDir(input.run_id);

  try {
    // Resolve input path
    const pdfPath = path.join(runDir, input.pdf_path);
    if (!await pathExists(pdfPath)) {
      return createToolError("PARSE_ERROR", `PDF file not found: ${input.pdf_path}`, {
        recoverable: false,
      });
    }

    // Get file hash for output naming
    const pdfContent = await fs.readFile(pdfPath);
    const pdfHash = sha256(pdfContent);

    // Check if already extracted
    const pagesPath = path.join(extractedDir, `${pdfHash}.pages.jsonl`);
    if (await pathExists(pagesPath) && !input.force) {
      // Return existing extraction
      const report = await readJson<ExtractionReport>(
        path.join(extractedDir, `${pdfHash}.report.json`)
      ).catch(() => ({
        extractor_version: PDF_PARSE_VERSION,
        mode_used: input.mode,
        warnings: [],
        pages_processed: 0,
        pages_empty: 0,
        chars_extracted: 0,
      }));

      return {
        success: true,
        artifacts: {
          pages_jsonl: `extracted/${pdfHash}.pages.jsonl`,
          full_text: `extracted/${pdfHash}.txt`,
        },
        stats: {
          pages_processed: report.pages_processed,
          pages_empty: report.pages_empty,
          pages_ocr_fallback: 0,
          chars_extracted: report.chars_extracted,
        },
        extraction_report: report,
      };
    }

    // Import pdf-parse dynamically
    const pdfParse = (await import("pdf-parse")).default;

    // Parse PDF
    const data = await pdfParse(pdfContent, {
      // Limit page range if specified
      max: input.page_range?.end,
    });

    // Process pages - pdf-parse gives us full text, we need to approximate pages
    const fullText = normalizeText(data.text);
    const pageCount = data.numpages;

    // Split text into approximate pages (heuristic: look for page markers or split evenly)
    const pages: PageExtraction[] = [];
    const avgCharsPerPage = Math.ceil(fullText.length / pageCount);
    let offset = 0;

    for (let i = 1; i <= pageCount; i++) {
      // Skip pages before range
      if (input.page_range && i < input.page_range.start) {
        offset += avgCharsPerPage;
        continue;
      }
      // Stop after range
      if (input.page_range && i > input.page_range.end) {
        break;
      }

      // Extract page text (approximate)
      const pageText = fullText.slice(offset, offset + avgCharsPerPage);
      offset += avgCharsPerPage;

      const page: PageExtraction = {
        page: i,
        text: pageText,
        char_count: pageText.length,
        is_empty: pageText.trim().length === 0,
        ocr_used: false,
      };
      pages.push(page);
    }

    // Write pages JSONL
    await appendJsonl(pagesPath, pages);

    // Write full text
    const fullTextPath = path.join(extractedDir, `${pdfHash}.txt`);
    await fs.writeFile(fullTextPath, fullText, "utf-8");

    // Generate report
    const report: ExtractionReport = {
      extractor_version: `pdf-parse@${PDF_PARSE_VERSION}`,
      mode_used: input.mode,
      warnings: [],
      pages_processed: pages.length,
      pages_empty: pages.filter(p => p.is_empty).length,
      chars_extracted: fullText.length,
    };
    await writeJson(path.join(extractedDir, `${pdfHash}.report.json`), report);

    return {
      success: true,
      artifacts: {
        pages_jsonl: `extracted/${pdfHash}.pages.jsonl`,
        full_text: `extracted/${pdfHash}.txt`,
      },
      stats: {
        pages_processed: pages.length,
        pages_empty: pages.filter(p => p.is_empty).length,
        pages_ocr_fallback: 0,
        chars_extracted: fullText.length,
      },
      extraction_report: report,
    };
  } catch (err) {
    return createToolError("PARSE_ERROR", `Failed to extract PDF: ${err}`, {
      recoverable: false,
    });
  }
}

// ============================================================================
// Extract HTML
// ============================================================================

export interface ExtractHtmlResult {
  success: boolean;
  artifacts: {
    text_file: string;
    markdown_file?: string;
  };
  stats: {
    chars_extracted: number;
    headings_found: number;
    links_found: number;
    tables_found: number;
  };
  extraction_report: ExtractionReport;
}

export async function extractHtml(input: ExtractHtmlInput): Promise<ExtractHtmlResult | ToolError> {
  const manager = getRunManager();
  const runDir = manager.getRunDir(input.run_id);

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const extractedDir = manager.getExtractedDir(input.run_id);

  try {
    // Resolve input path
    const htmlPath = path.join(runDir, input.html_path);
    if (!await pathExists(htmlPath)) {
      return createToolError("PARSE_ERROR", `HTML file not found: ${input.html_path}`, {
        recoverable: false,
      });
    }

    // Read and hash content
    const htmlContent = await fs.readFile(htmlPath, "utf-8");
    const htmlHash = sha256(htmlContent);

    // Check if already extracted
    const textPath = path.join(extractedDir, `${htmlHash}.txt`);
    if (await pathExists(textPath) && !input.force) {
      const report = await readJson<ExtractionReport>(
        path.join(extractedDir, `${htmlHash}.report.json`)
      ).catch(() => ({
        extractor_version: CHEERIO_VERSION,
        mode_used: "text",
        warnings: [],
        pages_processed: 1,
        pages_empty: 0,
        chars_extracted: 0,
      }));

      return {
        success: true,
        artifacts: {
          text_file: `extracted/${htmlHash}.txt`,
          markdown_file: input.preserve_headings ? `extracted/${htmlHash}.md` : undefined,
        },
        stats: {
          chars_extracted: report.chars_extracted,
          headings_found: 0,
          links_found: 0,
          tables_found: 0,
        },
        extraction_report: report,
      };
    }

    // Parse HTML with Cheerio
    const $ = cheerio.load(htmlContent);

    // Remove unwanted elements
    if (input.remove_selectors?.length) {
      for (const selector of input.remove_selectors) {
        $(selector).remove();
      }
    }
    // Also remove common boilerplate by default
    $("script, style, noscript, iframe, svg").remove();

    let text = "";
    let markdown = "";
    let headingsFound = 0;
    let linksFound = 0;
    let tablesFound = 0;

    // Extract headings
    if (input.preserve_headings) {
      $("h1, h2, h3, h4, h5, h6").each((_, el) => {
        const level = parseInt(el.tagName[1]);
        const heading = $(el).text().trim();
        if (heading) {
          markdown += `${"#".repeat(level)} ${heading}\n\n`;
          headingsFound++;
        }
      });
    }

    // Extract links
    if (input.preserve_links) {
      $("a").each((_, el) => {
        linksFound++;
      });
    }

    // Extract tables
    if (input.preserve_tables) {
      $("table").each((_, table) => {
        tablesFound++;
        const rows: string[][] = [];
        $(table).find("tr").each((_, tr) => {
          const row: string[] = [];
          $(tr).find("th, td").each((_, cell) => {
            row.push($(cell).text().trim());
          });
          if (row.length) rows.push(row);
        });

        if (rows.length) {
          // Convert to markdown table
          const header = rows[0];
          markdown += "| " + header.join(" | ") + " |\n";
          markdown += "| " + header.map(() => "---").join(" | ") + " |\n";
          for (let i = 1; i < rows.length; i++) {
            markdown += "| " + rows[i].join(" | ") + " |\n";
          }
          markdown += "\n";
        }
      });
    }

    // Get clean text
    text = normalizeText($("body").text());

    // If no markdown content, use plain text
    if (!markdown.trim()) {
      markdown = text;
    }

    // Write outputs
    await fs.writeFile(textPath, text, "utf-8");
    if (input.preserve_headings || input.preserve_tables) {
      const mdPath = path.join(extractedDir, `${htmlHash}.md`);
      await fs.writeFile(mdPath, markdown, "utf-8");
    }

    // Generate report
    const report: ExtractionReport = {
      extractor_version: `cheerio@${CHEERIO_VERSION}`,
      mode_used: input.preserve_headings ? "markdown" : "text",
      warnings: [],
      pages_processed: 1,
      pages_empty: text.trim().length === 0 ? 1 : 0,
      chars_extracted: text.length,
    };
    await writeJson(path.join(extractedDir, `${htmlHash}.report.json`), report);

    return {
      success: true,
      artifacts: {
        text_file: `extracted/${htmlHash}.txt`,
        markdown_file: input.preserve_headings ? `extracted/${htmlHash}.md` : undefined,
      },
      stats: {
        chars_extracted: text.length,
        headings_found: headingsFound,
        links_found: linksFound,
        tables_found: tablesFound,
      },
      extraction_report: report,
    };
  } catch (err) {
    return createToolError("PARSE_ERROR", `Failed to extract HTML: ${err}`, {
      recoverable: false,
    });
  }
}

// ============================================================================
// Extract Document (Generic)
// ============================================================================

export interface ExtractDocumentResult {
  success: boolean;
  artifacts: {
    text_file: string;
  };
  stats: {
    chars_extracted: number;
    format_detected: string;
    rows_processed?: number;
  };
  extraction_report: ExtractionReport;
}

export async function extractDocument(input: ExtractDocumentInput): Promise<ExtractDocumentResult | ToolError> {
  const manager = getRunManager();
  const runDir = manager.getRunDir(input.run_id);

  // Ensure run exists with full infrastructure
  await manager.ensureRun(input.run_id);
  const extractedDir = manager.getExtractedDir(input.run_id);

  try {
    // Resolve input path
    const docPath = path.join(runDir, input.doc_path);
    if (!await pathExists(docPath)) {
      return createToolError("PARSE_ERROR", `Document not found: ${input.doc_path}`, {
        recoverable: false,
      });
    }

    // Read content
    const content = await fs.readFile(docPath);
    const hash = sha256(content);

    // Detect format
    let format = input.format_hint;
    if (format === "auto") {
      const ext = path.extname(docPath).toLowerCase();
      const mapping: Record<string, string> = {
        ".md": "markdown",
        ".txt": "txt",
        ".csv": "csv",
        ".json": "json",
        ".docx": "docx",
      } as const;
      format = (mapping[ext as keyof typeof mapping] || "txt") as typeof format;
    }

    // Check if already extracted
    const textPath = path.join(extractedDir, `${hash}.txt`);
    if (await pathExists(textPath) && !input.force) {
      const report = await readJson<ExtractionReport>(
        path.join(extractedDir, `${hash}.report.json`)
      ).catch(() => ({
        extractor_version: "1.0.0",
        mode_used: format,
        warnings: [],
        pages_processed: 1,
        pages_empty: 0,
        chars_extracted: 0,
      }));

      return {
        success: true,
        artifacts: {
          text_file: `extracted/${hash}.txt`,
        },
        stats: {
          chars_extracted: report.chars_extracted,
          format_detected: format,
        },
        extraction_report: report,
      };
    }

    let text = "";
    let rowsProcessed: number | undefined;
    const warnings: string[] = [];

    switch (format) {
      case "markdown":
      case "txt":
        text = normalizeText(content.toString("utf-8"));
        break;

      case "csv": {
        // Parse CSV and convert to readable text
        const csvContent = content.toString("utf-8");
        const lines = csvContent.split("\n");
        const previewLines = lines.slice(0, input.csv_preview_rows + 1);

        // Simple CSV parsing (proper parsing would use a library)
        const rows = previewLines.map(line => {
          // Basic CSV splitting (doesn't handle quoted fields properly)
          return line.split(",").map(cell => cell.trim());
        });

        if (rows.length > 0) {
          // Format as readable table
          const header = rows[0];
          text = header.join(" | ") + "\n";
          text += header.map(() => "---").join(" | ") + "\n";
          for (let i = 1; i < rows.length; i++) {
            text += rows[i].join(" | ") + "\n";
          }
          rowsProcessed = rows.length - 1;

          if (lines.length > input.csv_preview_rows + 1) {
            warnings.push(`CSV truncated to ${input.csv_preview_rows} rows (total: ${lines.length - 1})`);
          }
        }
        break;
      }

      case "json":
        // Pretty-print JSON for readability
        try {
          const json = JSON.parse(content.toString("utf-8"));
          text = JSON.stringify(json, null, 2);
        } catch {
          text = content.toString("utf-8");
          warnings.push("Invalid JSON, returning raw content");
        }
        break;

      case "docx":
        // DOCX extraction requires additional library
        // For now, return an error suggesting to use a different extractor
        return createToolError("PARSE_ERROR", "DOCX extraction not yet implemented", {
          recoverable: false,
          suggestion: "Convert DOCX to PDF or use external tool",
        });

      default:
        text = normalizeText(content.toString("utf-8"));
    }

    // Write output
    await fs.writeFile(textPath, text, "utf-8");

    // Generate report
    const report: ExtractionReport = {
      extractor_version: "1.0.0",
      mode_used: format,
      warnings,
      pages_processed: 1,
      pages_empty: text.trim().length === 0 ? 1 : 0,
      chars_extracted: text.length,
    };
    await writeJson(path.join(extractedDir, `${hash}.report.json`), report);

    return {
      success: true,
      artifacts: {
        text_file: `extracted/${hash}.txt`,
      },
      stats: {
        chars_extracted: text.length,
        format_detected: format,
        rows_processed: rowsProcessed,
      },
      extraction_report: report,
    };
  } catch (err) {
    return createToolError("PARSE_ERROR", `Failed to extract document: ${err}`, {
      recoverable: false,
    });
  }
}
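A minimal calling sketch for the PDF extractor, assuming a run has already been created by the run manager. The field names (run_id, pdf_path, mode, page_range, force) are inferred from how this module reads its input, and the import path is hypothetical; the Zod schemas in ../schemas.js remain the source of truth.

import { extractPdf } from "./extract.js"; // hypothetical path within the tools directory

const result = await extractPdf({
  run_id: "run-001",                 // run directory previously set up by the run manager
  pdf_path: "raw/report.pdf",        // resolved relative to the run directory
  mode: "text",                      // inferred field; the actual enum lives in ../schemas.js
  page_range: { start: 1, end: 10 }, // optional; limits the pages processed
  force: false,                      // reuse an existing extraction if one is present
});

if ("success" in result && result.success) {
  // Artifacts are run-relative paths, e.g. extracted/<sha256>.pages.jsonl
  console.log(result.artifacts.pages_jsonl, result.stats.chars_extracted);
} else {
  // ToolError produced by createToolError("PARSE_ERROR", ...)
  console.error(result);
}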
