IndexFoundry MCP

binary-handler.ts•13.5 KiB

/**
 * Unified Binary Handler
 *
 * This module provides text extraction from various content types:
 * - PDF documents
 * - HTML pages (with Jina fallback for JS-rendered content)
 * - Plain text
 *
 * Extracted and unified from src/tools/projects.ts for reuse across the codebase.
 */

import * as cheerio from "cheerio";
import pdfParse from "pdf-parse";

// ============================================================================
// Type Definitions
// ============================================================================

/**
 * PDF parse function signature for dependency injection (testing).
 * Matches the pdf-parse library interface.
 */
export type PdfParseFunction = (buffer: Buffer) => Promise<{ text: string; numpages: number }>;

/**
 * Options for text extraction from HTTP response.
 *
 * @example
 * ```ts
 * const result = await extractTextFromResponse({
 *   url: 'https://example.com/doc.pdf',
 *   response,
 *   maxSizeBytes: 10 * 1024 * 1024,
 * });
 * ```
 */
export interface ExtractTextOptions {
  /** The original URL (used for extension detection and error messages) */
  url: string;
  /** The fetch Response object containing the content to extract */
  response: Response;
  /** Maximum allowed size in bytes (default: 10MB) */
  maxSizeBytes?: number;
  /** @internal Inject a custom PDF parser (for testing) */
  _pdfParser?: PdfParseFunction;
}

/**
 * Result of text extraction, including metadata about the extraction process.
 */
export interface ExtractTextResult {
  /** The extracted text content */
  text: string;
  /** The original Content-Type header from the response */
  contentType: string;
  /** Which extractor was used: 'html', 'pdf', 'plain', or 'jina' (for JS-rendered pages) */
  extractorUsed: 'html' | 'pdf' | 'plain' | 'jina';
}

// ============================================================================
// Configuration Constants
// ============================================================================

/** Default maximum file size (10MB) */
const DEFAULT_MAX_SIZE_BYTES = 10 * 1024 * 1024;

/** Minimum text length threshold for HTML extraction before Jina fallback */
const MIN_HTML_TEXT_LENGTH = 50;

/** Minimum text length a PDF must yield before it is considered image-only (needs OCR) */
const MIN_PDF_TEXT_LENGTH = 50;

/** Default timeout for Jina Reader API requests (30 seconds) */
const JINA_TIMEOUT_MS = 30000;

// ============================================================================
// Content-Type Helpers
// ============================================================================

/**
 * Extract the base content type from a Content-Type header (strips charset, etc.)
 *
 * @param contentType - Raw header value, or null when the header is absent
 * @returns Lower-cased media type without parameters, or null for a missing/empty header
 */
function getBaseContentType(contentType: string | null): string | null {
  if (!contentType) return null;
  const [base = ''] = contentType.split(';');
  return base.trim().toLowerCase();
}

/**
 * Extract the `charset` parameter from a Content-Type header.
 *
 * @param contentType - Raw header value, or null when the header is absent
 * @returns Lower-cased charset label, or null when none is declared
 */
function getCharset(contentType: string | null): string | null {
  if (!contentType) return null;
  const match = /;\s*charset\s*=\s*"?([^\s;"]+)"?/i.exec(contentType);
  if (!match || !match[1]) return null;
  return match[1].toLowerCase();
}

/**
 * Decode a response body to a string, honouring the charset declared in the
 * Content-Type header. UTF-8 is used when no charset is declared or when the
 * declared label is not supported by the runtime.
 *
 * @param buffer - Raw response body
 * @param contentType - Raw Content-Type header value (may carry a charset parameter)
 * @returns Decoded text
 */
function decodeBody(buffer: Buffer, contentType: string | null): string {
  const charset = getCharset(contentType);
  if (charset === null || charset === 'utf-8' || charset === 'utf8') {
    return buffer.toString('utf-8');
  }
  try {
    return new TextDecoder(charset).decode(buffer);
  } catch {
    // TextDecoder throws a RangeError for unknown labels; a best-effort UTF-8
    // decode is more useful to callers than failing the whole extraction.
    return buffer.toString('utf-8');
  }
}

/**
 * Get the file extension of a path, looking only at its final segment so that
 * dots in directory names (e.g. `/v1.2/docs`) are not mistaken for extensions.
 *
 * @param path - URL pathname (no query string or fragment)
 * @returns Lower-cased extension without the dot, or null when there is none
 */
function getPathExtension(path: string): string | null {
  const lastSegment = path.slice(path.lastIndexOf('/') + 1);
  const lastDot = lastSegment.lastIndexOf('.');
  if (lastDot === -1 || lastDot === lastSegment.length - 1) return null;
  return lastSegment.slice(lastDot + 1).toLowerCase();
}

/**
 * Get file extension from URL, handling query params and fragments
 *
 * @param url - Absolute URL; strings that fail URL parsing are handled by
 *   stripping everything from the first `?` or `#`
 * @returns Lower-cased extension without the dot, or null when there is none
 */
function getUrlExtension(url: string): string | null {
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // If URL parsing fails, try simple extraction
    path = url.split(/[?#]/, 1)[0] ?? url;
  }
  return getPathExtension(path);
}

/**
 * Determine which extractor to use based on content-type and URL
 *
 * The Content-Type header takes precedence. The URL extension is consulted only
 * for `application/octet-stream` or when the header is missing.
 *
 * @param contentType - Raw Content-Type header value, or null when absent
 * @param url - Request URL, used for extension-based fallback
 * @returns The extractor to use, or 'unknown' when the content is unsupported
 */
function detectExtractorType(
  contentType: string | null,
  url: string
): 'pdf' | 'html' | 'plain' | 'unknown' {
  const baseType = getBaseContentType(contentType);
  const extension = getUrlExtension(url);
  const isPlainExtension = extension === 'txt' || extension === 'md';

  // Content-type based detection
  if (baseType) {
    if (baseType === 'application/pdf') return 'pdf';
    if (baseType.startsWith('text/html') || baseType === 'application/xhtml+xml') return 'html';
    if (baseType === 'text/plain' || baseType === 'text/markdown') return 'plain';

    // Handle generic binary types - fallback to URL extension
    if (baseType === 'application/octet-stream') {
      if (extension === 'pdf') return 'pdf';
      if (isPlainExtension) return 'plain';
      return 'unknown';
    }

    // Check if it's any text/* type (treat as plain)
    if (baseType.startsWith('text/')) return 'plain';

    // Unknown binary type
    return 'unknown';
  }

  // No content-type - use URL extension fallback
  if (extension === 'pdf') return 'pdf';
  if (isPlainExtension) return 'plain';
  if (extension === 'html' || extension === 'htm') return 'html';

  return 'unknown';
}

// ============================================================================
// HTML Extraction (from projects.ts)
// ============================================================================

/**
 * Strip HTML tags and extract text content using cheerio
 *
 * Boilerplate elements (scripts, navigation, cookie banners, ...) are removed,
 * then text is collected from block-level elements inside the first main
 * content container found (falling back to `<body>`).
 *
 * @param html - Raw HTML markup
 * @returns Whitespace-normalized text
 */
function extractTextFromHtml(html: string): string {
  const $ = cheerio.load(html);

  // Remove elements that don't contain useful content
  $('script, style, noscript, iframe, nav, footer, header, aside').remove();
  $('[role="navigation"], [role="banner"], [role="contentinfo"]').remove();
  $('.nav, .navbar, .sidebar, .footer, .header, .menu, .breadcrumb').remove();
  $('[class*="cookie"], [class*="popup"], [class*="modal"], [class*="advertisement"]').remove();

  // Try to find main content area
  let mainContent = $('main, article, [role="main"], .main-content, #main, #content').first();
  if (mainContent.length === 0) {
    mainContent = $('body');
  }

  // Get text with some structure preservation
  const parts: string[] = [];

  // Process headings and paragraphs
  mainContent
    .find('h1, h2, h3, h4, h5, h6, p, li, td, th, dd, dt, blockquote, pre, code')
    .each((_, el) => {
      const tagName = el.tagName.toLowerCase();
      const content = $(el).text().trim();
      if (!content) return;

      // Of the selected tags only h1-h6 start with 'h'
      if (tagName.startsWith('h')) {
        parts.push('\n\n' + content + '\n');
      } else if (tagName === 'li') {
        parts.push('\n• ' + content);
      } else if (tagName === 'pre' || tagName === 'code') {
        parts.push('\n```\n' + content + '\n```\n');
      } else {
        parts.push('\n' + content);
      }
    });

  let text = parts.join('');

  // If structured extraction yielded little, fall back to full text
  if (text.trim().length < 100) {
    text = mainContent.text();
  }

  // Clean up whitespace
  text = text.replace(/\n{3,}/g, '\n\n');
  text = text.replace(/[ \t]+/g, ' ');
  return text.trim();
}

/**
 * Detect if extracted text is likely from a JS-rendered shell page
 *
 * @param text - Text extracted from the HTML
 * @param originalHtml - The HTML the text was extracted from
 * @returns true when any of the shell-page heuristics matches
 */
function detectShellHtml(text: string, originalHtml: string): boolean {
  // Check for repeated "Loading..." patterns
  const loadingCount = (text.match(/Loading/gi) || []).length;
  if (loadingCount >= 3) {
    return true;
  }

  // Check for "Not Found" at the beginning (common SPA fallback)
  if (text.startsWith('Not Found') || text.startsWith('404')) {
    return true;
  }

  // Check if text is mostly navigation (short fragments with many newlines)
  const lines = text.split('\n').filter(l => l.trim().length > 0);
  const avgLineLength = lines.reduce((a, b) => a + b.length, 0) / (lines.length || 1);
  if (avgLineLength < 30 && lines.length > 20) {
    // Lots of short lines = likely navigation menu
    return true;
  }

  // Check ratio of text to HTML size - shell pages have high HTML with little content
  const textToHtmlRatio = text.length / originalHtml.length;
  if (textToHtmlRatio < 0.05 && originalHtml.length > 5000) {
    // Very low text extraction ratio from large HTML = likely JS-rendered
    return true;
  }

  // Check for common SPA framework indicators in HTML
  const spaIndicators = ['__NEXT_DATA__', '__NUXT__', 'window.__INITIAL_STATE__', 'id="root"', 'id="app"'];
  const hasSpaIndicator = spaIndicators.some(indicator => originalHtml.includes(indicator));
  if (hasSpaIndicator && text.length < 1000) {
    return true;
  }

  return false;
}

// ============================================================================
// Jina Reader API
// ============================================================================

/**
 * Fetch via Jina Reader API for JS-rendered pages
 *
 * @param url - URL of the page to render; appended verbatim to the Jina Reader endpoint
 * @param timeoutMs - Abort the request after this many milliseconds
 * @returns The page text with leading markdown heading markers removed
 * @throws Error on a non-2xx response, on timeout, or on network failure
 */
async function fetchViaJinaReader(url: string, timeoutMs: number = JINA_TIMEOUT_MS): Promise<string> {
  const jinaUrl = `https://r.jina.ai/${url}`;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(jinaUrl, {
      headers: { 'Accept': 'text/plain' },
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`Jina Reader failed: HTTP ${response.status}`);
    }

    const text = await response.text();

    // Jina returns markdown, clean it up slightly
    return text.replace(/^#+\s*/gm, '').trim();
  } catch (error) {
    if (error instanceof Error && error.name === 'AbortError') {
      throw new Error(`Jina Reader timed out after ${timeoutMs / 1000}s`);
    }
    throw error;
  } finally {
    // Always release the timer, whether the request succeeded, failed or timed out
    clearTimeout(timeoutId);
  }
}

// ============================================================================
// Main Extraction Function
// ============================================================================

/** Build the error thrown when no extractor can handle the response. */
function unsupportedContentTypeError(url: string, contentTypeHeader: string | null): Error {
  return new Error(
    `Unsupported content type: cannot extract text from ` +
    `URL: ${url} with content-type: ${contentTypeHeader || 'missing'}`
  );
}

/** Build the error thrown when the content is larger than the configured limit. */
function sizeLimitError(actualBytes: number, maxSizeBytes: number, url: string): Error {
  return new Error(
    `Content exceeds size limit: ${(actualBytes / 1024 / 1024).toFixed(1)}MB ` +
    `exceeds ${(maxSizeBytes / 1024 / 1024).toFixed(1)}MB limit for URL: ${url}`
  );
}

/**
 * Extract text content from a Response object.
 *
 * Automatically detects content type from headers or URL extension,
 * applies the appropriate extractor, and returns normalized text.
 *
 * The size limit is checked twice: first against the declared Content-Length
 * (so oversized bodies are rejected without being buffered), then against the
 * actual number of bytes read.
 *
 * @param options - Extraction options including URL and Response
 * @returns Promise resolving to extracted text with metadata
 * @throws Error for unsupported content types, oversized content or extraction failures
 */
export async function extractTextFromResponse(
  options: ExtractTextOptions
): Promise<ExtractTextResult> {
  const { url, response, maxSizeBytes = DEFAULT_MAX_SIZE_BYTES, _pdfParser } = options;

  // Get content-type from headers
  const contentTypeHeader = response.headers.get('content-type');

  // Determine which extractor to use
  const extractorType = detectExtractorType(contentTypeHeader, url);

  // Handle unknown/unsupported content types
  if (extractorType === 'unknown') {
    throw unsupportedContentTypeError(url, contentTypeHeader);
  }

  // Reject early when the server already declares an oversized body
  const contentLengthHeader = response.headers.get('content-length');
  if (contentLengthHeader !== null) {
    const declaredBytes = Number(contentLengthHeader);
    if (Number.isFinite(declaredBytes) && declaredBytes > maxSizeBytes) {
      throw sizeLimitError(declaredBytes, maxSizeBytes, url);
    }
  }

  // Get response body
  const arrayBuffer = await response.arrayBuffer();
  const buffer = Buffer.from(arrayBuffer);

  // Validate size limit (Content-Length may be absent or wrong)
  if (buffer.length > maxSizeBytes) {
    throw sizeLimitError(buffer.length, maxSizeBytes, url);
  }

  // Route to appropriate extractor
  switch (extractorType) {
    case 'pdf':
      return extractPdf(buffer, contentTypeHeader || 'application/pdf', url, _pdfParser);
    case 'html':
      return extractHtml(buffer, contentTypeHeader || 'text/html', url);
    case 'plain':
      return extractPlainText(buffer, contentTypeHeader || 'text/plain');
    default:
      throw unsupportedContentTypeError(url, contentTypeHeader);
  }
}

// ============================================================================
// Individual Extractors
// ============================================================================

/**
 * Extract text from PDF buffer
 *
 * @param buffer - Raw PDF bytes
 * @param contentType - Content type to report in the result
 * @param url - Source URL, used in error messages
 * @param customParser - Optional parser to use instead of pdf-parse (for testing)
 * @returns The trimmed PDF text
 * @throws Error when parsing fails or the PDF yields fewer than 50 characters
 */
async function extractPdf(
  buffer: Buffer,
  contentType: string,
  url: string,
  customParser?: PdfParseFunction
): Promise<ExtractTextResult> {
  // Use injected parser if provided (for testing), otherwise use pdf-parse
  const parser = customParser || pdfParse;

  let text: string;
  try {
    const pdfData = await parser(buffer);
    text = pdfData.text.trim();
  } catch (error) {
    throw new Error(
      `PDF extraction failed: ${error instanceof Error ? error.message : String(error)}. ` +
      `URL: ${url}, content-type: ${contentType}`
    );
  }

  // Checked outside the try block so this error is never mistaken for (or
  // confused with) a parser failure.
  if (text.length < MIN_PDF_TEXT_LENGTH) {
    throw new Error(
      `PDF has insufficient extractable text (may require OCR). ` +
      `URL: ${url}, extracted ${text.length} characters`
    );
  }

  return {
    text,
    contentType,
    extractorUsed: 'pdf',
  };
}

/**
 * Extract text from HTML buffer, with Jina fallback for JS-rendered content
 *
 * @param buffer - Raw HTML bytes
 * @param contentType - Content type to report in the result; its charset parameter
 *   (if any) selects the decoding
 * @param url - Source URL, used for the Jina fallback and in error messages
 * @returns Text from the local HTML extractor, or from Jina Reader when the page
 *   looks like a JS-rendered shell and the fallback succeeds
 * @throws Error when both the local extraction and the Jina fallback yield too little text
 */
async function extractHtml(
  buffer: Buffer,
  contentType: string,
  url: string
): Promise<ExtractTextResult> {
  const html = decodeBody(buffer, contentType);
  const text = extractTextFromHtml(html);
  const htmlResult: ExtractTextResult = { text, contentType, extractorUsed: 'html' };

  // Insufficient content or shell/skeleton HTML from a JS-rendered SPA
  const needsFallback = text.length < MIN_HTML_TEXT_LENGTH || detectShellHtml(text, html);
  if (!needsFallback) {
    return htmlResult;
  }

  let fallbackFailure = 'Jina Reader returned empty content';
  try {
    const jinaText = await fetchViaJinaReader(url);
    if (jinaText.length > 0) {
      return {
        text: jinaText,
        contentType,
        extractorUsed: 'jina',
      };
    }
  } catch (error) {
    // Best-effort fallback: remember why it failed and keep the local result if usable
    fallbackFailure = error instanceof Error ? error.message : String(error);
  }

  // Keep original text if we have any
  if (text.length < MIN_HTML_TEXT_LENGTH) {
    throw new Error(
      `Page has insufficient text content and Jina Reader fallback failed. ` +
      `URL: ${url} (${fallbackFailure})`
    );
  }

  return htmlResult;
}

/**
 * Extract plain text (return as-is)
 *
 * @param buffer - Raw text bytes
 * @param contentType - Content type to report in the result; its charset parameter
 *   (if any) selects the decoding, UTF-8 otherwise
 * @returns The decoded text, unmodified
 */
async function extractPlainText(
  buffer: Buffer,
  contentType: string
): Promise<ExtractTextResult> {
  const text = decodeBody(buffer, contentType);

  return {
    text,
    contentType,
    extractorUsed: 'plain',
  };
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

binary-handler.ts•13.5 KiB