Skip to main content
Glama
pdf.ts (1.72 kB)
import { SupportedFileType } from '@superglue/shared';
import { PDFParse } from 'pdf-parse';
import { logMessage } from '../../utils/logs.js';
import { DetectionPriority, FileParsingStrategy } from '../strategy.js';

/** Hex encoding of the four ASCII bytes `%PDF` that `canHandle` looks for. */
const PDF_SIGNATURE_HEX = '25504446';

/**
 * Options handed to `PDFParse.getText` by `parsePDF`.
 * Declared ahead of its only consumer so the module reads top-down.
 */
const PDF_TEXT_CONFIG = {
  parseHyperlinks: true,
  lineEnforce: true,
  pageJoiner: '\n\n---\n\n',
  cellSeparator: '\t',
  cellThreshold: 7,
  lineThreshold: 4.6,
};

/**
 * Turns a caught value into a loggable message.
 * Thrown values are `unknown`; only `Error` instances are guaranteed a `message`.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * File parsing strategy for PDF documents, selected by the `%PDF` magic bytes.
 */
export class PDFStrategy implements FileParsingStrategy {
  readonly fileType = SupportedFileType.PDF;
  readonly priority = DetectionPriority.BINARY_SIGNATURE;

  /**
   * Reports whether the buffer begins with the `%PDF` signature.
   *
   * @param buffer - Raw file bytes.
   * @returns `true` when the first four bytes equal `%PDF`; `false` otherwise,
   *   including when the buffer is shorter than four bytes.
   */
  canHandle(buffer: Buffer): boolean {
    if (buffer.length < 4) return false;
    return buffer.subarray(0, 4).toString('hex') === PDF_SIGNATURE_HEX;
  }

  /**
   * Parses the buffer by delegating to {@link parsePDF}.
   *
   * @param buffer - Raw PDF bytes.
   * @returns The object produced by `parsePDF`.
   */
  async parse(buffer: Buffer): Promise<any> {
    return parsePDF(buffer);
  }
}

/**
 * Extracts text and table data from a PDF buffer.
 *
 * Both extraction steps are best-effort: a failure in either one is logged and
 * replaced with an empty value instead of being rethrown. The parser is
 * destroyed in a `finally` block so it is released even if something
 * unexpected throws after construction.
 *
 * @param buffer - Raw PDF bytes.
 * @returns `textContent` (empty string when text extraction fails or yields
 *   nothing) and `structuredContent` (the `pages` of the table result, or an
 *   empty array when table extraction fails or yields nothing).
 */
export async function parsePDF(buffer: Buffer): Promise<{ textContent: string; structuredContent: any }> {
  const parser = new PDFParse({
    data: buffer,
    isEvalSupported: false,
    useSystemFonts: true,
  });

  let textContent = '';
  let structuredContent: any = [];

  try {
    try {
      const textResult = await parser.getText(PDF_TEXT_CONFIG);
      textContent = textResult.text || '';
    } catch (err: unknown) {
      logMessage('warn', `PDF text extraction failed: ${describeError(err)}`, {});
    }

    try {
      const tableResult = await parser.getTable();
      structuredContent = tableResult.pages || [];
    } catch (err: unknown) {
      logMessage('debug', `PDF table extraction not available in Node.js environment: ${describeError(err)}`, {});
    }
  } finally {
    // Always release the parser, whatever happened during extraction.
    await parser.destroy();
  }

  return { textContent, structuredContent };
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/superglue-ai/superglue'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.