/**
* OpenAI Vision Parser - Uses GPT-4 Vision to extract data from image-based PDFs
* Converts PDF pages to images and analyzes them with OpenAI's Vision API
*/
import OpenAI from 'openai';
import { readFile, writeFile } from 'fs/promises';
import { existsSync, mkdirSync } from 'fs';
import { resolve, dirname, basename } from 'path';
import { fileURLToPath } from 'url';
// @ts-ignore
import pdfParse from 'pdf-parse';
// @ts-ignore
import { pdfToPng } from 'pdf-to-png-converter';
/**
 * The four headline figures (plus optional fiscal year) extracted from one
 * financial statement.
 *
 * Each monetary field is `null` when the figure could not be found in the
 * document. Amounts are presumably expressed in NOK, since that is what the
 * extraction prompts ask for — confirm against callers before relying on units.
 */
export interface FinancialData {
  /** Operating revenue, or `null` when not found. */
  revenue: number | null;

  /** Annual result / net profit, or `null` when not found. */
  profit: number | null;

  /** Total assets, or `null` when not found. */
  assets: number | null;

  /** Total equity, or `null` when not found. */
  equity: number | null;

  /** Year the figures refer to, when it could be determined. */
  year?: number;
}
/**
 * Extracts headline financial figures (revenue, profit, assets, equity) from
 * PDF financial statements using the OpenAI chat completions API.
 *
 * PDFs with a usable text layer are analysed as text; otherwise every page is
 * rendered to PNG and analysed page by page with a vision-capable model.
 * All diagnostics are written with `console.error`.
 */
export class OpenAIVisionParser {
  private openai: OpenAI;

  /**
   * @param apiKey OpenAI API key used for every request made by this parser.
   * @throws Error when the key is missing, empty or whitespace-only.
   */
  constructor(apiKey: string) {
    if (!apiKey || apiKey.trim() === '') {
      console.error('❌ CRITICAL: OpenAI API key is empty or not provided!');
      console.error(' The parser was initialized without a valid API key.');
      console.error(' Extraction will fail.');
      throw new Error('OpenAI API key is required for PDF extraction');
    }
    // Never write secret material to logs: show only a short fingerprint.
    const maskedKey = apiKey.length > 8 ? `${apiKey.slice(0, 3)}...${apiKey.slice(-4)}` : '***';
    console.error(`✅ OpenAI Vision Parser initialized with API key: ${maskedKey}`);
    this.openai = new OpenAI({
      apiKey: apiKey
    });
  }

  /** Builds a result in which none of the four metrics has been found. */
  private emptyFinancialData(): FinancialData {
    return {
      revenue: null,
      profit: null,
      assets: null,
      equity: null
    };
  }

  /** Best-effort human-readable message for a caught value of unknown type. */
  private errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Pulls the outermost `{ ... }` span out of a model reply, ignoring markdown
   * code fences around it.
   *
   * @returns The JSON text, or `null` when the reply contains no braces.
   */
  private extractJsonText(content: string): string | null {
    const withoutFences = content.replace(/```json\s*/g, '').replace(/```\s*/g, '');
    const match = withoutFences.match(/\{[\s\S]*\}/);
    return match ? match[0] : null;
  }

  /**
   * Coerces a value taken from a model's JSON reply into an amount.
   *
   * Numbers pass through (non-finite ones become `null`). Strings are read
   * with Norwegian conventions: whitespace is a thousands separator, a comma
   * is the decimal mark, dots are thousands separators when a comma or more
   * than one dot is present, and a leading minus or wrapping parentheses mark
   * a negative amount. Anything else yields `null`.
   */
  private coerceAmount(value: unknown): number | null {
    if (typeof value === 'number') {
      return Number.isFinite(value) ? value : null;
    }
    if (typeof value !== 'string') {
      return null;
    }
    let cleaned = value.replace(/\s+/g, '');
    // "(123)", "-123" and "−123" (U+2212) all denote a negative amount.
    const isNegative = /^[-\u2212(]/.test(cleaned);
    cleaned = cleaned.replace(/^[-\u2212(]+/, '').replace(/\)$/, '');
    // A single dot without a comma is ambiguous and is kept as a decimal point.
    const dotCount = (cleaned.match(/\./g) ?? []).length;
    if (cleaned.includes(',') || dotCount > 1) {
      cleaned = cleaned.replace(/\./g, '');
    }
    cleaned = cleaned.replace(',', '.');
    const parsed = parseFloat(cleaned);
    if (Number.isNaN(parsed)) {
      return null;
    }
    return isNegative && parsed !== 0 ? -parsed : parsed;
  }

  /** Normalises a reported year to an integer, or `undefined` when unusable. */
  private coerceYear(value: unknown): number | undefined {
    const candidate = typeof value === 'string' ? parseInt(value.trim(), 10) : value;
    return typeof candidate === 'number' && Number.isInteger(candidate) ? candidate : undefined;
  }

  /**
   * Parse Norwegian number format.
   *
   * Strips currency/unit words and letters, treats whitespace and dots as
   * thousands separators and a comma as the decimal mark. A minus sign or an
   * opening parenthesis marks the value as negative.
   *
   * The result is multiplied by 1000 when the input mentions "tnok" or
   * contains the substring "1000", or when the magnitude lies strictly between
   * 100 and 100 000 (assumed to be a figure quoted in thousands).
   * NOTE(review): the "1000" substring test and the magnitude heuristic can
   * misfire on ordinary values; they are kept as-is to preserve behaviour.
   *
   * @returns The parsed amount, or `null` for empty or non-numeric input.
   */
  private parseNorwegianNumber(text: string): number | null {
    if (!text) return null;
    let cleaned = text.trim();
    // Remove currency and unit indicators, then any remaining letters.
    cleaned = cleaned.replace(/NOK|TNOK|kr|mill\.|millioner/gi, '');
    cleaned = cleaned.replace(/[A-Za-z]/g, '').trim();
    // Record the sign, then strip every sign marker so parseFloat yields a
    // magnitude; otherwise "-123" would be negated twice and come out positive.
    const isNegative = /[-\u2212(]/.test(cleaned);
    cleaned = cleaned.replace(/[()\-\u2212]/g, '');
    // Remove thousand separators, then convert the decimal comma to a dot.
    cleaned = cleaned.replace(/\s/g, '').replace(/\./g, '');
    cleaned = cleaned.replace(',', '.');
    const magnitude = parseFloat(cleaned);
    if (isNaN(magnitude)) return null;
    const sign = isNegative ? -1 : 1;
    // If the context mentioned TNOK (thousands), multiply by 1000.
    if (text.toLowerCase().includes('tnok') || text.includes('1000')) {
      return sign * magnitude * 1000;
    }
    // Mid-sized figures are assumed to be quoted in thousands.
    if (Math.abs(magnitude) < 100000 && Math.abs(magnitude) > 100) {
      return sign * magnitude * 1000;
    }
    return sign * magnitude;
  }

  /**
   * Extract financial data from a single page image using GPT-4 Vision.
   *
   * @param imageBase64 Base64-encoded PNG of the page.
   * @param pageNum 1-based page number, used only for log output.
   * @returns The metrics found on the page; all `null` when the reply contains
   *   no JSON or the JSON cannot be parsed.
   * @throws Whatever the API call throws, after logging it.
   */
  private async extractWithGPT4Vision(imageBase64: string, pageNum: number): Promise<FinancialData> {
    try {
      console.error(`🤖 Analyzing page ${pageNum} with GPT-4 Vision...`);
      const response = await this.openai.chat.completions.create({
        model: "gpt-4o", // Using full GPT-4o for better Vision capabilities
        messages: [
          {
            role: "system",
            content: "You are an expert Norwegian accountant who reads Norwegian annual financial statements (årsregnskap). Extract key financial metrics accurately."
          },
          {
            role: "user",
            content: [
              {
                type: "text",
                text: `Extract these 4 KEY METRICS from this Norwegian financial statement image (årsregnskap):
1. **Driftsinntekter** (Operating Revenue) - Look for "Sum driftsinntekter" or "Salgsinntekt"
2. **Årsresultat** (Annual Result) - Look for "Årsresultat" or "Ordinært resultat etter skatt"
3. **Sum eiendeler** (Total Assets) - In the balance sheet
4. **Sum egenkapital** (Total Equity) - In the balance sheet under "Egenkapital og gjeld"
IMPORTANT:
- Norwegian årsregnskap typically show FULL values in NOK (not thousands)
- Use values from CURRENT YEAR (rightmost column), NOT previous year
- Extract the EXACT NUMBER as shown in the document
- Examples: "20 623" = 20623 NOK, "48 000" = 48000 NOK
- Negative numbers: (123) or -123
- Norwegian format: "1 234 567" with spaces = 1234567
Return ONLY this JSON:
{
"revenue": <number as shown or null>,
"profit": <number as shown or null>,
"assets": <number as shown or null>,
"equity": <number as shown or null>,
"year": <year or null>,
"notes": "brief description"
}`
              },
              {
                type: "image_url",
                image_url: {
                  url: `data:image/png;base64,${imageBase64}`,
                  detail: "high" // Request high detail analysis
                }
              }
            ]
          }
        ],
        max_tokens: 800, // Enough for simple 4-metric extraction
        temperature: 0
      });
      const content = response.choices[0]?.message?.content || '{}';
      console.error(`🔍 Raw API response (first 500 chars): ${content.substring(0, 500)}`);
      const jsonText = this.extractJsonText(content);
      if (jsonText === null) {
        console.error('❌ No JSON found in response!');
        console.error('Full response:', content);
        return this.emptyFinancialData();
      }
      try {
        const data: Record<string, unknown> = JSON.parse(jsonText);
        console.error(`✅ Successfully parsed JSON response`);
        // The prompt asks for full NOK values, so no unit conversion is applied.
        console.error(`📊 Page ${pageNum}: ${data.notes || 'Data found'}`);
        const convertedRevenue = this.coerceAmount(data.revenue);
        const convertedProfit = this.coerceAmount(data.profit);
        const convertedAssets = this.coerceAmount(data.assets);
        const convertedEquity = this.coerceAmount(data.equity);
        // Explicit null check so that a genuine 0 is not reported as missing.
        const inMillions = (amount: number | null): string =>
          amount !== null ? (amount / 1000000).toFixed(1) + 'M NOK' : 'N/A';
        console.error(` Revenue: ${inMillions(convertedRevenue)}`);
        console.error(` Profit: ${inMillions(convertedProfit)}`);
        console.error(` Assets: ${inMillions(convertedAssets)}`);
        console.error(` Equity: ${inMillions(convertedEquity)}`);
        return {
          revenue: convertedRevenue,
          profit: convertedProfit,
          assets: convertedAssets,
          equity: convertedEquity,
          year: this.coerceYear(data.year)
        };
      } catch (parseError) {
        console.error('❌ Failed to parse JSON:', parseError);
        console.error('Raw response:', content.substring(0, 500));
        return this.emptyFinancialData();
      }
    } catch (error: unknown) {
      console.error(`GPT-4 Vision API error on page ${pageNum}:`, this.errorMessage(error));
      throw error;
    }
  }

  /**
   * Extract financial data from a PDF.
   *
   * When the PDF yields more than 500 characters of text, the text is analysed
   * directly. Otherwise each page is rendered to PNG (also saved under
   * `data/pdfs/png_images/<orgnr>/<pdf name>/` two levels above this module)
   * and analysed in order until all four metrics have been found. For each
   * metric the first non-null value wins; the year is taken from the last page
   * that reported one.
   *
   * @param pdfPath Path to the PDF file.
   * @returns The combined metrics; fields stay `null` when never found.
   * @throws Error wrapping any failure (file read, conversion, API call).
   */
  async parseFinancialPDF(pdfPath: string): Promise<FinancialData> {
    console.error(`\n🚀 Starting OpenAI Vision analysis for: ${pdfPath}`);
    try {
      // First check if PDF has extractable text
      console.error('📄 Checking PDF for text content...');
      const dataBuffer = await readFile(pdfPath);
      const pdfData = await pdfParse(dataBuffer);
      // Normalise once so a missing text layer cannot crash the length checks.
      const extractedText: string = typeof pdfData.text === 'string' ? pdfData.text : '';
      console.error(`📖 PDF has ${pdfData.numpages} pages`);
      console.error(`📝 Text extracted: ${extractedText.length} characters`);
      // If PDF has substantial text, use text-based extraction
      if (extractedText.length > 500) {
        console.error('📄 PDF has text layer, using text-based extraction...');
        return await this.extractFromText(extractedText);
      }
      // PDF is image-based, convert to images and use Vision API
      console.error('📸 PDF is image-based (scanned). Converting to images...');
      console.error(`📄 PDF path: ${pdfPath}`);
      console.error(`📄 PDF exists: ${existsSync(pdfPath)}`);
      // Organize PNG output by company, then by PDF
      const pdfBaseName = basename(pdfPath, '.pdf');
      // A 9-digit run in the file name is used as the company folder
      // (e.g., aarsregnskap_984562861-2021.pdf)
      const orgNrMatch = pdfBaseName.match(/(\d{9})/);
      const companyFolder = orgNrMatch ? orgNrMatch[1] : 'unknown';
      // Use ABSOLUTE path to ensure it works regardless of working directory
      const __filename = fileURLToPath(import.meta.url);
      const __dirname = dirname(__filename);
      const baseDir = resolve(__dirname, '..', '..'); // Go up two levels from src/scraper
      const pngFolder = resolve(baseDir, 'data', 'pdfs', 'png_images', companyFolder, pdfBaseName);
      if (!existsSync(pngFolder)) {
        mkdirSync(pngFolder, { recursive: true });
        console.error(`📁 Created PNG folder: ${pngFolder}`);
      } else {
        console.error(`📁 PNG folder already exists: ${pngFolder}`);
      }
      console.error(`🔄 Starting PDF-to-PNG conversion...`);
      console.error(` Input PDF: ${pdfPath}`);
      console.error(` Output folder: ${pngFolder}`);
      console.error(` Working directory: ${process.cwd()}`);
      let pngPages;
      try {
        // Convert with absolute path for PDF as well
        const absolutePdfPath = resolve(pdfPath);
        console.error(` Absolute PDF path: ${absolutePdfPath}`);
        pngPages = await pdfToPng(absolutePdfPath, {
          disableFontFace: false,
          useSystemFonts: false,
          viewportScale: 2.0, // Higher resolution for better OCR
          outputFolder: pngFolder, // Now using absolute path
          outputFileMaskFunc: (pageNum: number) => `page_${pageNum}`,
          // pagesToProcess not specified = process ALL pages
        });
        console.error(`✅ Converted ${pngPages.length} PDF pages to images`);
        // Save PNG files to disk since they're returned as buffers
        let savedCount = 0;
        for (let i = 0; i < pngPages.length; i++) {
          const page = pngPages[i];
          if (page.content && page.content.length > 0) {
            const pngPath = resolve(pngFolder, `page_${i + 1}.png`);
            try {
              await writeFile(pngPath, page.content);
              savedCount++;
            } catch (saveError) {
              // Saving is best-effort; analysis uses the in-memory buffers.
              console.error(`⚠️ Failed to save page ${i + 1} to disk:`, saveError);
            }
          }
        }
        console.error(`💾 Saved ${savedCount} PNG files to disk`);
        // Verify at least one page has content
        const hasContent = pngPages.some(p => p.content && p.content.length > 0);
        if (!hasContent) {
          console.error(`⚠️ Warning: No page content found after conversion`);
        }
      } catch (conversionError: unknown) {
        const conversionMessage = this.errorMessage(conversionError);
        console.error(`❌ PDF-to-PNG conversion failed:`, conversionMessage);
        console.error(`❌ Full conversion error:`, conversionError);
        console.error(` Stack trace:`, conversionError instanceof Error ? conversionError.stack : undefined);
        throw new Error(`PDF-to-PNG conversion failed: ${conversionMessage}`);
      }
      // Analyze each page with GPT-4 Vision and combine results
      const combinedData: FinancialData = this.emptyFinancialData();
      const metricKeys = ['revenue', 'profit', 'assets', 'equity'] as const;
      for (let i = 0; i < pngPages.length; i++) {
        const page = pngPages[i];
        console.error(`📄 Analyzing page ${i + 1}/${pngPages.length} with OpenAI Vision API...`);
        if (!page.content) {
          console.error(`⚠️ No content for page ${i + 1}, skipping...`);
          continue;
        }
        // Convert buffer to base64
        const imageBase64 = page.content.toString('base64');
        // Extract data from this page
        const pageData = await this.extractWithGPT4Vision(imageBase64, i + 1);
        // First non-null value wins for every metric.
        for (const key of metricKeys) {
          if (pageData[key] !== null && combinedData[key] === null) {
            combinedData[key] = pageData[key];
          }
        }
        if (pageData.year) {
          combinedData.year = pageData.year;
        }
        // If we have all 4 metrics, stop processing
        if (metricKeys.every(key => combinedData[key] !== null)) {
          console.error(`✅ Found all 4 metrics on page ${i + 1}!`);
          break;
        }
      }
      console.error(`📊 Extraction complete for PDF`);
      return combinedData;
    } catch (error: unknown) {
      const message = this.errorMessage(error);
      console.error('❌ OpenAI Vision PDF parsing failed:', message);
      console.error('❌ Full error:', error);
      console.error('❌ Error stack:', error instanceof Error ? error.stack : undefined);
      // Check if it's an API key issue
      const status = typeof error === 'object' && error !== null
        ? (error as { status?: unknown }).status
        : undefined;
      if (status === 401) {
        console.error('🔑 API Key Error: Please provide a valid OpenAI API key');
        console.error(' Get your API key from: https://platform.openai.com/api-keys');
      }
      // Re-throw to make the error visible
      throw new Error(`PDF parsing failed: ${message}. Check if OPENAI_API_KEY is set.`);
    }
  }

  /**
   * Extract financial data from text using GPT-4.
   *
   * Only the first 4000 characters of `text` are sent to the model.
   * This path is best-effort: any failure (API error, unparsable reply) is
   * logged and reported as an all-`null` result rather than thrown.
   *
   * @param text Text layer of the PDF.
   * @returns The four metrics, each coerced to a number or `null`.
   */
  private async extractFromText(text: string): Promise<FinancialData> {
    try {
      console.error('🤖 Analyzing text with GPT-4...');
      const response = await this.openai.chat.completions.create({
        model: "gpt-4o-mini",
        messages: [
          {
            role: "system",
            content: "You are a financial analyst expert at reading Norwegian financial statements. Extract key financial metrics from the document text."
          },
          {
            role: "user",
            content: `Extract financial data from this Norwegian financial statement text:
${text.substring(0, 4000)}
Extract:
1. Driftsinntekter/Revenue (total operating revenue)
2. Årsresultat/Net Profit (profit after tax)
3. Sum eiendeler/Total Assets
4. Egenkapital/Equity
Return as JSON:
{
"revenue": <number or null>,
"profit": <number or null>,
"assets": <number or null>,
"equity": <number or null>,
"notes": "<brief note>"
}
Important:
- Extract current year values only
- Convert TNOK to NOK (multiply by 1000)
- Use null if not found
- Numbers use Norwegian format (space/dot thousands, comma decimal)`
          }
        ],
        max_tokens: 500,
        temperature: 0
      });
      const content = response.choices[0]?.message?.content || '{}';
      const jsonText = this.extractJsonText(content);
      if (jsonText === null) {
        return this.emptyFinancialData();
      }
      const data: Record<string, unknown> = JSON.parse(jsonText);
      console.error(`📊 Extracted: ${data.notes || 'Data found'}`);
      // Model output is untrusted: coerce every field instead of passing
      // strings or undefined through to callers.
      return {
        revenue: this.coerceAmount(data.revenue),
        profit: this.coerceAmount(data.profit),
        assets: this.coerceAmount(data.assets),
        equity: this.coerceAmount(data.equity)
      };
    } catch (error: unknown) {
      console.error('GPT-4 text analysis error:', this.errorMessage(error));
      return this.emptyFinancialData();
    }
  }
}