import pdfParse from 'pdf-parse';
import { readFile } from 'fs/promises';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
// ES modules have no CommonJS-style `__filename` / `__dirname` bindings,
// so both are derived here from this module's own URL.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = dirname(fileURLToPath(import.meta.url));
/**
 * Manual smoke test for `pdf-parse` against a Norwegian annual-accounts PDF.
 *
 * Reads the PDF, prints its page count, the length and the first 2000
 * characters of the extracted text, then reports which known financial terms
 * occur in the text and which large numbers the number patterns pick up.
 *
 * Every error raised while reading, parsing or reporting is caught and written
 * to stderr instead of being rethrown.
 *
 * @param {string} [pdfPath] - Path of the PDF to inspect. Defaults to the
 *   machine-specific file this script was originally written against.
 * @returns {Promise<void>}
 */
async function testPdfParser(
  pdfPath = '/Users/josuekongolo/Downloads/aarsregnskap_999059198-2024.pdf',
) {
  console.log('🔍 Testing PDF Parser with Real Norwegian Financial PDF\n');

  try {
    console.log(`📄 Reading PDF: ${pdfPath}\n`);
    const dataBuffer = await readFile(pdfPath);
    const { numpages, text } = await pdfParse(dataBuffer);

    console.log('📊 PDF Info:');
    console.log(`Pages: ${numpages}`);
    console.log(`Text length: ${text.length} characters\n`);

    console.log('📝 First 2000 characters of extracted text:');
    console.log('━'.repeat(50));
    console.log(text.substring(0, 2000));
    console.log('━'.repeat(50));

    reportFinancialTerms(text);
    reportLargeNumbers(text);
  } catch (error) {
    console.error('❌ Error parsing PDF:', error.message);
    // Only suggest checking the path when the file really was not found;
    // for parse failures on an existing file that hint would be misleading.
    if (error.code === 'ENOENT') {
      console.error('\nMake sure the PDF exists at:', pdfPath);
    }
  }
}

/**
 * Prints, for each known Norwegian financial term, either its first
 * occurrence in `text` with some surrounding context or a "not found" line.
 *
 * @param {string} text - Text extracted from the PDF.
 * @returns {void}
 */
function reportFinancialTerms(text) {
  const terms = [
    'Driftsinntekter',
    'Salgsinntekt',
    'Sum inntekter',
    'Årsresultat',
    'Resultat',
    'Sum eiendeler',
    'Egenkapital',
    'Omsetning',
    'Driftsresultat',
  ];

  console.log('\n🔍 Looking for key financial terms...\n');

  for (const term of terms) {
    // The search is case-sensitive and only the first occurrence is shown.
    const index = text.indexOf(term);
    if (index === -1) {
      console.log(`❌ Not found: "${term}"`);
      continue;
    }

    // Show up to 50 characters before the term and up to 100 after it.
    const start = Math.max(0, index - 50);
    const end = Math.min(text.length, index + term.length + 100);
    const context = text.substring(start, end);

    console.log(`✅ Found "${term}":`);
    console.log(` ${context.replace(/\n/g, ' ')}`);
    console.log();
  }
}

/**
 * Runs each number pattern over `text` and prints how many matches exceed
 * 10000, together with up to five examples per pattern.
 *
 * @param {string} text - Text extracted from the PDF.
 * @returns {void}
 */
function reportLargeNumbers(text) {
  // `[^\S\r\n]` means "whitespace other than a line break". A plain `\s`
  // also matches newlines, which glued unrelated numbers on adjacent lines
  // into a single bogus amount.
  const numberPatterns = [
    /\d{1,3}(?:[^\S\r\n]\d{3})*(?:,\d+)?/g, // Norwegian format: 1 234 567,89
    /\d{1,3}(?:\.\d{3})*(?:,\d+)?/g, // Alternative: 1.234.567,89
    /\d+(?:[^\S\r\n]\d{3})+/g, // Simple thousands: 123 456
  ];

  console.log('\n🔍 Looking for number patterns...\n');

  numberPatterns.forEach((pattern, i) => {
    const matches = text.match(pattern) ?? [];

    // Keep only larger values (likely financial amounts). Before parsing,
    // drop whitespace and dot thousands separators and turn the decimal
    // comma into a decimal point.
    const largeNumbers = matches.filter((match) => {
      const normalized = match
        .replace(/\s/g, '')
        .replace(/\./g, '')
        .replace(',', '.');
      return Number.parseFloat(normalized) > 10000;
    });

    if (largeNumbers.length > 0) {
      console.log(`Pattern ${i + 1} found ${largeNumbers.length} large numbers:`);
      console.log(` Examples: ${largeNumbers.slice(0, 5).join(', ')}`);
    }
  });
}
// Run the check as soon as this module is executed. The returned promise is
// deliberately not awaited: testPdfParser catches and logs its own failures.
void testPdfParser();