Skip to main content
Glama
binary-handler.ts13.8 kB
/** * Unified Binary Handler * * This module provides text extraction from various content types: * - PDF documents * - HTML pages (with Jina fallback for JS-rendered content) * - Plain text * * Extracted and unified from src/tools/projects.ts for reuse across the codebase. */ import * as cheerio from "cheerio"; import pdfParse from "pdf-parse"; // ============================================================================ // Type Definitions // ============================================================================ /** * PDF parse function signature for dependency injection (testing). * Matches the pdf-parse library interface. */ export type PdfParseFunction = (buffer: Buffer) => Promise<{ text: string; numpages: number }>; /** * Options for text extraction from HTTP response. * * @example * ```ts * const result = await extractTextFromResponse({ * url: 'https://example.com/doc.pdf', * response, * maxSizeBytes: 10 * 1024 * 1024, * }); * ``` */ export interface ExtractTextOptions { /** The original URL (used for extension detection and error messages) */ url: string; /** The fetch Response object containing the content to extract */ response: Response; /** Maximum allowed size in bytes (default: 10MB) */ maxSizeBytes?: number; /** @internal Inject a custom PDF parser (for testing) */ _pdfParser?: PdfParseFunction; } /** * Result of text extraction, including metadata about the extraction process. */ export interface ExtractTextResult { /** The extracted text content */ text: string; /** The original Content-Type header from the response */ contentType: string; /** Which extractor was used: 'html', 'pdf', 'plain', or 'jina' (for JS-rendered pages) */ extractorUsed: 'html' | 'pdf' | 'plain' | 'jina'; } // ============================================================================ // Configuration Constants // ============================================================================ /** Default maximum file size (10MB) */ const DEFAULT_MAX_SIZE_BYTES = 10 * 1024 * 1024; /** Minimum text length threshold for HTML extraction before Jina fallback */ const MIN_HTML_TEXT_LENGTH = 50; /** Default timeout for Jina Reader API requests (30 seconds) */ const JINA_TIMEOUT_MS = 30000; // ============================================================================ // Content-Type Helpers // ============================================================================ /** * Extract the base content type from a Content-Type header (strips charset, etc.) */ function getBaseContentType(contentType: string | null): string | null { if (!contentType) return null; return contentType.split(';')[0].trim().toLowerCase(); } /** * Get file extension from URL, handling query params and fragments */ function getUrlExtension(url: string): string | null { try { const urlObj = new URL(url); const pathname = urlObj.pathname; const lastDot = pathname.lastIndexOf('.'); if (lastDot === -1 || lastDot === pathname.length - 1) return null; return pathname.slice(lastDot + 1).toLowerCase(); } catch { // If URL parsing fails, try simple extraction const pathPart = url.split('?')[0].split('#')[0]; const lastDot = pathPart.lastIndexOf('.'); if (lastDot === -1 || lastDot === pathPart.length - 1) return null; return pathPart.slice(lastDot + 1).toLowerCase(); } } /** * Determine which extractor to use based on content-type and URL */ function detectExtractorType( contentType: string | null, url: string ): 'pdf' | 'html' | 'plain' | 'unknown' { const baseType = getBaseContentType(contentType); const extension = getUrlExtension(url); // Content-type based detection if (baseType) { if (baseType === 'application/pdf') return 'pdf'; if (baseType.startsWith('text/html')) return 'html'; if (baseType === 'text/plain' || baseType === 'text/markdown') return 'plain'; // Handle generic binary types - fallback to URL extension if (baseType === 'application/octet-stream') { if (extension === 'pdf') return 'pdf'; if (extension === 'txt' || extension === 'md') return 'plain'; return 'unknown'; } // Check if it's any text/* type (treat as plain) if (baseType.startsWith('text/')) return 'plain'; // Unknown binary type return 'unknown'; } // No content-type - use URL extension fallback if (extension === 'pdf') return 'pdf'; if (extension === 'txt' || extension === 'md') return 'plain'; if (extension === 'html' || extension === 'htm') return 'html'; return 'unknown'; } // ============================================================================ // HTML Extraction (from projects.ts) // ============================================================================ /** * Strip HTML tags and extract text content using cheerio */ function extractTextFromHtml(html: string): string { const $ = cheerio.load(html); // Remove elements that don't contain useful content $('script, style, noscript, iframe, nav, footer, header, aside').remove(); $('[role="navigation"], [role="banner"], [role="contentinfo"]').remove(); $('.nav, .navbar, .sidebar, .footer, .header, .menu, .breadcrumb').remove(); $('[class*="cookie"], [class*="popup"], [class*="modal"], [class*="advertisement"]').remove(); // Try to find main content area let mainContent = $('main, article, [role="main"], .main-content, #main, #content').first(); if (mainContent.length === 0) { mainContent = $('body'); } // Get text with some structure preservation let text = ''; // Process headings and paragraphs mainContent.find('h1, h2, h3, h4, h5, h6, p, li, td, th, dd, dt, blockquote, pre, code').each((_, el) => { const $el = $(el); const tagName = el.tagName.toLowerCase(); const content = $el.text().trim(); if (!content) return; if (tagName.startsWith('h')) { text += '\n\n' + content + '\n'; } else if (tagName === 'li') { text += '\n• ' + content; } else if (tagName === 'pre' || tagName === 'code') { text += '\n```\n' + content + '\n```\n'; } else { text += '\n' + content; } }); // If structured extraction yielded little, fall back to full text if (text.trim().length < 100) { text = mainContent.text(); } // Clean up whitespace text = text.replace(/\n{3,}/g, '\n\n'); text = text.replace(/[ \t]+/g, ' '); text = text.trim(); return text; } /** * Detect if extracted text is likely from a JS-rendered shell page */ function detectShellHtml(text: string, originalHtml: string): boolean { // Check for repeated "Loading..." patterns const loadingCount = (text.match(/Loading/gi) || []).length; if (loadingCount >= 3) { return true; } // Check for "Not Found" at the beginning (common SPA fallback) if (text.startsWith('Not Found') || text.startsWith('404')) { return true; } // Check if text is mostly navigation (short fragments with many newlines) const lines = text.split('\n').filter(l => l.trim().length > 0); const avgLineLength = lines.reduce((a, b) => a + b.length, 0) / (lines.length || 1); if (avgLineLength < 30 && lines.length > 20) { // Lots of short lines = likely navigation menu return true; } // Check ratio of text to HTML size - shell pages have high HTML with little content const textToHtmlRatio = text.length / originalHtml.length; if (textToHtmlRatio < 0.05 && originalHtml.length > 5000) { // Very low text extraction ratio from large HTML = likely JS-rendered return true; } // Check for common SPA framework indicators in HTML const spaIndicators = ['__NEXT_DATA__', '__NUXT__', 'window.__INITIAL_STATE__', 'id="root"', 'id="app"']; const hasSpIndicator = spaIndicators.some(indicator => originalHtml.includes(indicator)); if (hasSpIndicator && text.length < 1000) { return true; } return false; } // ============================================================================ // Jina Reader API // ============================================================================ /** * Fetch via Jina Reader API for JS-rendered pages */ async function fetchViaJinaReader(url: string, timeoutMs: number = JINA_TIMEOUT_MS): Promise<string> { const jinaUrl = `https://r.jina.ai/${url}`; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeoutMs); try { const response = await fetch(jinaUrl, { headers: { 'Accept': 'text/plain' }, signal: controller.signal, }); if (!response.ok) { throw new Error(`Jina Reader failed: HTTP ${response.status}`); } const text = await response.text(); // Jina returns markdown, clean it up slightly return text.replace(/^#+\s*/gm, '').trim(); } catch (error) { if (error instanceof Error && error.name === 'AbortError') { throw new Error(`Jina Reader timed out after ${timeoutMs / 1000}s`); } throw error; } finally { clearTimeout(timeoutId); } } // ============================================================================ // Main Extraction Function // ============================================================================ /** * Extract text content from a Response object. * * Automatically detects content type from headers or URL extension, * applies the appropriate extractor, and returns normalized text. * * @param options - Extraction options including URL and Response * @returns Promise resolving to extracted text with metadata * @throws Error for unsupported content types or extraction failures */ export async function extractTextFromResponse( options: ExtractTextOptions ): Promise<ExtractTextResult> { const { url, response, maxSizeBytes = DEFAULT_MAX_SIZE_BYTES, _pdfParser } = options; // Get content-type from headers const contentTypeHeader = response.headers.get('content-type'); const baseContentType = getBaseContentType(contentTypeHeader); // Determine which extractor to use const extractorType = detectExtractorType(contentTypeHeader, url); // Handle unknown/unsupported content types if (extractorType === 'unknown') { throw new Error( `Unsupported content type: cannot extract text from ` + `URL: ${url} with content-type: ${contentTypeHeader || 'missing'}` ); } // Get response body const arrayBuffer = await response.arrayBuffer(); const buffer = Buffer.from(arrayBuffer); // Validate size limit if (buffer.length > maxSizeBytes) { throw new Error( `Content exceeds size limit: ${(buffer.length / 1024 / 1024).toFixed(1)}MB ` + `exceeds ${(maxSizeBytes / 1024 / 1024).toFixed(1)}MB limit for URL: ${url}` ); } // Route to appropriate extractor switch (extractorType) { case 'pdf': return extractPdf(buffer, contentTypeHeader || 'application/pdf', url, _pdfParser); case 'html': return extractHtml(buffer, contentTypeHeader || 'text/html', url); case 'plain': return extractPlainText(buffer, contentTypeHeader || 'text/plain'); default: throw new Error( `Unsupported content type: cannot extract text from ` + `URL: ${url} with content-type: ${contentTypeHeader || 'missing'}` ); } } // ============================================================================ // Individual Extractors // ============================================================================ /** * Extract text from PDF buffer */ async function extractPdf( buffer: Buffer, contentType: string, url: string, customParser?: PdfParseFunction ): Promise<ExtractTextResult> { try { // Use injected parser if provided (for testing), otherwise use pdf-parse const parser = customParser || pdfParse; const pdfData = await parser(buffer); const text = pdfData.text.trim(); if (text.length < 50) { throw new Error( `PDF has insufficient extractable text (may require OCR). ` + `URL: ${url}, extracted ${text.length} characters` ); } return { text, contentType, extractorUsed: 'pdf', }; } catch (error) { if (error instanceof Error && error.message.includes('insufficient')) { throw error; // Re-throw our own errors } throw new Error( `PDF extraction failed: ${error instanceof Error ? error.message : String(error)}. ` + `URL: ${url}, content-type: ${contentType}` ); } } /** * Extract text from HTML buffer, with Jina fallback for JS-rendered content */ async function extractHtml( buffer: Buffer, contentType: string, url: string ): Promise<ExtractTextResult> { const html = buffer.toString('utf-8'); let text = extractTextFromHtml(html); // Detect shell/skeleton HTML from JS-rendered SPAs const isShellHtml = detectShellHtml(text, html); // If insufficient content or shell HTML detected, try Jina Reader if (text.length < MIN_HTML_TEXT_LENGTH || isShellHtml) { try { text = await fetchViaJinaReader(url); return { text, contentType, extractorUsed: 'jina', }; } catch { // Keep original text if we have any if (text.length < 50) { throw new Error( `Page has insufficient text content and Jina Reader fallback failed. ` + `URL: ${url}` ); } } } return { text, contentType, extractorUsed: 'html', }; } /** * Extract plain text (return as-is) */ async function extractPlainText( buffer: Buffer, contentType: string ): Promise<ExtractTextResult> { const text = buffer.toString('utf-8'); return { text, contentType, extractorUsed: 'plain', }; }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server