import { OpenAIVisionParser } from './build/scraper/openai_vision_parser.js';
import { readdirSync } from 'fs';
import { resolve } from 'path';
import { writeFile, mkdir } from 'fs/promises';
import { existsSync } from 'fs';
// OpenAI API key, read from the OPENAI_API_KEY environment variable only.
// A real secret key must never be committed to source control: the key that
// used to be hard-coded here as a fallback has to be treated as leaked and
// revoked. With no key in the environment this is an empty string, which the
// "valid API key" guard in testVisionExtraction() rejects with instructions.
const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
/**
 * Wait for the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
function pauseFor(ms) {
  // The callback is named `done` so it does not shadow the imported path `resolve`.
  return new Promise((done) => setTimeout(done, ms));
}

/**
 * Read the four-digit year that ends a PDF file name
 * (expected format: aarsregnskap_999059198-YYYY.pdf).
 *
 * @param {string} fileName - Bare PDF file name.
 * @returns {number|null} The year, or null when the name does not end in "YYYY.pdf".
 */
function yearFromFileName(fileName) {
  const match = fileName.match(/(\d{4})\.pdf$/);
  return match ? Number.parseInt(match[1], 10) : null;
}

/**
 * Format an amount as millions of NOK with one decimal, e.g. "5.0M NOK".
 *
 * @param {number} amount - Amount in NOK.
 * @returns {string} The formatted amount.
 */
function formatMillionsNok(amount) {
  return `${(amount / 1000000).toFixed(1)}M NOK`;
}

/**
 * Run the Vision API extraction over every `.pdf` file in ./data/pdfs.
 *
 * For each PDF the parser's `parseFinancialPDF` result is read for `revenue`,
 * `profit`, `assets` and `equity`. A result with at least one truthy figure
 * counts as successful and is written to its own JSON file in
 * ./data/extracted. After the loop, the combined results and a summary are
 * written to `all_financial_data.json` and `extraction_summary.json`, and a
 * summary is printed to the console.
 *
 * Processing stops early when a request fails with HTTP status 401; any other
 * per-file error is recorded and the run continues with the next file.
 *
 * @returns {Promise<void>} Resolves when all output files have been written.
 *   Rejects if the PDF directory cannot be read or an output file cannot be
 *   written outside the per-file error handling.
 */
async function testVisionExtraction() {
  // NOTE(review): several non-ASCII glyphs in the log strings below look
  // mis-encoded (e.g. 'Γ₯rsregnskap' is presumably 'årsregnskap'). They are
  // kept exactly as found - confirm the file's intended encoding before
  // changing them.
  console.log('π Testing OpenAI Vision API PDF Extraction');
  console.log('β'.repeat(50));
  console.log('This will use OpenAI GPT-4 Vision to extract financial data');
  console.log('from scanned Γ₯rsregnskap PDFs by converting them to images');
  console.log('β'.repeat(50) + '\n');

  // Check if API key is valid
  if (!OPENAI_API_KEY || OPENAI_API_KEY === 'your-api-key-here') {
    console.error('β Please set a valid OpenAI API key');
    console.error(' Get your key from: https://platform.openai.com/api-keys');
    return;
  }

  const parser = new OpenAIVisionParser(OPENAI_API_KEY);
  const pdfDir = './data/pdfs';
  const outputDir = './data/extracted';
  const requestDelayMs = 3000;

  // Create output directory if it doesn't exist
  if (!existsSync(outputDir)) {
    await mkdir(outputDir, { recursive: true });
    console.log(`π Created output directory: ${outputDir}`);
  }

  // Get all PDF files
  const pdfFiles = readdirSync(pdfDir)
    .filter((file) => file.endsWith('.pdf'))
    .sort();
  console.log(`π Found ${pdfFiles.length} PDF files in ${pdfDir}`);
  console.log('β'.repeat(50) + '\n');

  const allResults = [];
  const successfulExtractions = [];

  for (const [index, pdfFile] of pdfFiles.entries()) {
    const year = yearFromFileName(pdfFile);
    console.log(`\nπ Processing: ${pdfFile} (Year: ${year})`);
    console.log('β'.repeat(40));
    const pdfPath = resolve(pdfDir, pdfFile);

    try {
      const startTime = Date.now();
      console.log('π Converting PDF to images and extracting with Vision API...');
      const result = await parser.parseFinancialPDF(pdfPath);
      const duration = Math.round((Date.now() - startTime) / 1000);

      // Add year to result (a `year` field on the parser result, if any, wins)
      const fullResult = {
        year,
        ...result,
        file: pdfFile,
        processingTime: duration,
        extractedAt: new Date().toISOString(),
      };
      allResults.push(fullResult);

      // Display results for this year
      if (result.revenue || result.profit || result.assets || result.equity) {
        console.log(`β\nData extracted in ${duration}s:`);
        if (result.revenue) console.log(` π° Revenue: ${formatMillionsNok(result.revenue)}`);
        // `!= null` keeps a profit of 0 visible but skips a missing (undefined)
        // profit, which would otherwise be printed as "NaNM NOK".
        if (result.profit != null) console.log(` π Profit: ${formatMillionsNok(result.profit)}`);
        if (result.assets) console.log(` π’ Assets: ${formatMillionsNok(result.assets)}`);
        if (result.equity) console.log(` π Equity: ${formatMillionsNok(result.equity)}`);
        successfulExtractions.push(fullResult);

        // Save individual year's data to JSON. Files without a year in their
        // name are keyed by the PDF's base name so they do not all overwrite
        // a shared "financial_data_null.json".
        const outputKey = year === null ? pdfFile.replace(/\.pdf$/, '') : year;
        const outputFile = resolve(outputDir, `financial_data_${outputKey}.json`);
        await writeFile(outputFile, JSON.stringify(fullResult, null, 2));
        console.log(` πΎ Saved to: ${outputFile}`);
      } else {
        console.log(`β οΈ No data extracted (${duration}s)`);
        console.log(' Trying Vision API with image conversion...');
      }
    } catch (error) {
      console.error(`β Error processing ${pdfFile}:`, error.message);
      if (error.status === 401) {
        console.error('\nπ API Key Authentication Error!');
        console.error(' The OpenAI API key is invalid or expired.');
        console.error(' Please provide a valid API key.');
        break; // Stop processing if API key is invalid
      }
      allResults.push({
        year,
        revenue: null,
        profit: null,
        assets: null,
        equity: null,
        file: pdfFile,
        error: error.message,
        extractedAt: new Date().toISOString(),
      });
    }

    // Rate limiting for API calls. This runs after the try/catch so that a
    // failed request is also followed by a pause instead of an immediate retry
    // against the API with the next file.
    if (index < pdfFiles.length - 1) {
      console.log('β³ Waiting 3 seconds before next PDF (API rate limiting)...');
      await pauseFor(requestDelayMs);
    }
  }

  // Save combined results
  const combinedOutputFile = resolve(outputDir, 'all_financial_data.json');
  await writeFile(combinedOutputFile, JSON.stringify(allResults, null, 2));
  console.log(`\nπΎ Saved all results to: ${combinedOutputFile}`);

  // Save summary
  const summary = {
    totalPdfs: allResults.length,
    successfulExtractions: successfulExtractions.length,
    failedExtractions: allResults.length - successfulExtractions.length,
    extractionDate: new Date().toISOString(),
    yearsCovered: allResults
      .map((entry) => entry.year)
      .filter(Boolean)
      .sort((a, b) => a - b),
    financialSummary: successfulExtractions
      .map((entry) => ({
        year: entry.year,
        revenue: entry.revenue,
        profit: entry.profit,
        assets: entry.assets,
        equity: entry.equity,
      }))
      .sort((a, b) => (a.year || 0) - (b.year || 0)),
  };
  const summaryFile = resolve(outputDir, 'extraction_summary.json');
  await writeFile(summaryFile, JSON.stringify(summary, null, 2));
  console.log(`πΎ Saved summary to: ${summaryFile}`);

  // Summary
  console.log('\n' + 'β'.repeat(50));
  console.log('π EXTRACTION SUMMARY');
  console.log('β'.repeat(50));
  console.log(`\nTotal PDFs processed: ${allResults.length}`);
  console.log(`Successful extractions: ${successfulExtractions.length}`);
  console.log(`Failed extractions: ${allResults.length - successfulExtractions.length}`);

  if (successfulExtractions.length > 0) {
    console.log('\nπ Years with extracted data:');
    successfulExtractions.sort((a, b) => (a.year || 0) - (b.year || 0));
    for (const result of successfulExtractions) {
      console.log(`\n${result.year}:`);
      if (result.revenue) console.log(` Revenue: ${formatMillionsNok(result.revenue)}`);
      if (result.profit != null) console.log(` Profit: ${formatMillionsNok(result.profit)}`);
      if (result.assets) console.log(` Assets: ${formatMillionsNok(result.assets)}`);
      if (result.equity) console.log(` Equity: ${formatMillionsNok(result.equity)}`);
    }

    // Calculate growth if we have multiple years. Entries without a known
    // year are left out: a "null-2021" growth figure would be meaningless.
    const dated = successfulExtractions.filter((entry) => entry.year);
    if (dated.length > 1) {
      const firstYear = dated[0];
      const lastYear = dated[dated.length - 1];
      if (firstYear.revenue && lastYear.revenue) {
        const revenueGrowth = ((lastYear.revenue - firstYear.revenue) / firstYear.revenue * 100).toFixed(1);
        console.log(`\nπ Revenue growth (${firstYear.year}-${lastYear.year}): ${revenueGrowth}%`);
      }
    }
  }

  console.log('\n' + 'β'.repeat(50));
  console.log('π Vision API extraction complete!');
  console.log(`π All extracted data saved to: ${outputDir}`);
  console.log('β'.repeat(50) + '\n');
}
// Script entry point: run the extraction. If it rejects, log the error and
// mark the process as failed; logging alone would still exit with status 0,
// hiding the failure from shells and CI.
testVisionExtraction().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});