// pdf.ts • 1.72 kB
import { SupportedFileType } from '@superglue/shared';
import { PDFParse } from 'pdf-parse';
import { logMessage } from '../../utils/logs.js';
import { DetectionPriority, FileParsingStrategy } from '../strategy.js';
/**
 * File-parsing strategy for PDF documents.
 *
 * Detection is based purely on the leading bytes of the buffer; the actual
 * extraction work is delegated to {@link parsePDF}.
 */
export class PDFStrategy implements FileParsingStrategy {
  readonly fileType = SupportedFileType.PDF;
  readonly priority = DetectionPriority.BINARY_SIGNATURE;

  /**
   * Reports whether the buffer begins with the PDF signature.
   *
   * @param buffer - Raw file contents to inspect.
   * @returns `true` when the first four bytes are `%PDF`; `false` otherwise,
   *          including when the buffer is shorter than four bytes.
   */
  canHandle(buffer: Buffer): boolean {
    // "%PDF" is the byte sequence 0x25 0x50 0x44 0x46; read it as a single
    // big-endian 32-bit word so one numeric comparison covers all four bytes.
    const pdfMagic = 0x25504446;
    return buffer.length >= 4 && buffer.readUInt32BE(0) === pdfMagic;
  }

  /**
   * Parses the buffer as a PDF.
   *
   * @param buffer - Raw PDF file contents.
   * @returns Whatever {@link parsePDF} resolves with for this buffer.
   */
  async parse(buffer: Buffer): Promise<any> {
    const parsed = await parsePDF(buffer);
    return parsed;
  }
}
/**
 * Turns any thrown or rejected value into a printable message.
 *
 * Rejections are not guaranteed to be `Error` instances, so reading
 * `.message` directly can itself throw (e.g. on `null`) or fail to
 * type-check when catch variables are `unknown`.
 */
function describeError(err: unknown): string {
  if (typeof err === 'object' && err !== null && 'message' in err) {
    return String((err as { message: unknown }).message);
  }
  return String(err);
}

/**
 * Extracts text and table data from a PDF buffer.
 *
 * Both extraction steps are best-effort: a failure in either one is logged
 * and replaced by an empty value rather than rejecting the whole parse.
 * The parser is always destroyed before this function settles.
 *
 * @param buffer - Raw PDF file contents.
 * @returns `textContent` — the extracted text, or `''` when text extraction
 *          failed or produced nothing; `structuredContent` — the `pages`
 *          value from table extraction, or `[]` when it failed or was empty.
 */
export async function parsePDF(buffer: Buffer): Promise<{ textContent: string; structuredContent: any }> {
  const parser = new PDFParse({
    data: buffer,
    isEvalSupported: false,
    useSystemFonts: true,
  });

  try {
    let textContent = '';
    try {
      const textResult = await parser.getText(PDF_TEXT_CONFIG);
      textContent = textResult.text || '';
    } catch (err) {
      logMessage('warn', `PDF text extraction failed: ${describeError(err)}`, {});
    }

    // Run after text extraction (not in parallel) to keep the original
    // call order on the shared parser instance.
    let structuredContent: unknown = [];
    try {
      const tableResult = await parser.getTable();
      structuredContent = tableResult.pages || [];
    } catch (err) {
      logMessage('debug', `PDF table extraction not available in Node.js environment: ${describeError(err)}`, {});
    }

    return { textContent, structuredContent };
  } finally {
    // Release the parser on every exit path, including unexpected throws.
    await parser.destroy();
  }
}
/**
 * Options handed to `PDFParse#getText` by `parsePDF`.
 *
 * NOTE(review): the exact effect of each flag and threshold is defined by
 * the `pdf-parse` library — confirm against its documentation before tuning.
 */
const PDF_TEXT_CONFIG = {
  // Feature toggles.
  parseHyperlinks: true,
  lineEnforce: true,

  // Separators: a horizontal-rule style marker between pages, a tab between cells.
  pageJoiner: '\n\n---\n\n',
  cellSeparator: '\t',

  // Numeric thresholds (presumably layout distances; verify units upstream).
  cellThreshold: 7,
  lineThreshold: 4.6,
};