Skip to main content
Glama
josuekongolo

CompanyIQ MCP Server

by josuekongolo
openai_vision_parser.ts (16.3 kB)
/**
 * OpenAI Vision Parser - Uses GPT-4 Vision to extract data from image-based PDFs
 * Converts PDF pages to images and analyzes them with OpenAI's Vision API
 */

import OpenAI from 'openai';
import { readFile, writeFile } from 'fs/promises';
import { existsSync, mkdirSync } from 'fs';
import { resolve, dirname, basename } from 'path';
import { fileURLToPath } from 'url';
// @ts-ignore
import pdfParse from 'pdf-parse';
// @ts-ignore
import { pdfToPng } from 'pdf-to-png-converter';

/**
 * The four key figures extracted from a financial statement.
 * A metric is `null` when it could not be found; `year` is omitted when unknown.
 */
export interface FinancialData {
  revenue: number | null;
  profit: number | null;
  assets: number | null;
  equity: number | null;
  year?: number;
}

/**
 * User prompt sent together with each page image.
 * The parts are joined with single spaces so the prompt text stays exactly as it was.
 */
const VISION_PROMPT = [
  'Extract these 4 KEY METRICS from this Norwegian financial statement image (årsregnskap):',
  '1. **Driftsinntekter** (Operating Revenue) - Look for "Sum driftsinntekter" or "Salgsinntekt"',
  '2. **Årsresultat** (Annual Result) - Look for "Årsresultat" or "Ordinært resultat etter skatt"',
  '3. **Sum eiendeler** (Total Assets) - In the balance sheet',
  '4. **Sum egenkapital** (Total Equity) - In the balance sheet under "Egenkapital og gjeld"',
  'IMPORTANT:',
  '- Norwegian årsregnskap typically show FULL values in NOK (not thousands)',
  '- Use values from CURRENT YEAR (rightmost column), NOT previous year',
  '- Extract the EXACT NUMBER as shown in the document',
  '- Examples: "20 623" = 20623 NOK, "48 000" = 48000 NOK',
  '- Negative numbers: (123) or -123',
  '- Norwegian format: "1 234 567" with spaces = 1234567',
  'Return ONLY this JSON:',
  '{',
  '"revenue": <number as shown or null>,',
  '"profit": <number as shown or null>,',
  '"assets": <number as shown or null>,',
  '"equity": <number as shown or null>,',
  '"year": <year or null>,',
  '"notes": "brief description"',
  '}',
].join(' ');

/** Builds the user prompt for text-based extraction around the given document excerpt. */
function buildTextPrompt(excerpt: string): string {
  return [
    'Extract financial data from this Norwegian financial statement text:',
    excerpt,
    'Extract:',
    '1. Driftsinntekter/Revenue (total operating revenue)',
    '2. Årsresultat/Net Profit (profit after tax)',
    '3. Sum eiendeler/Total Assets',
    '4. Egenkapital/Equity',
    'Return as JSON:',
    '{',
    '"revenue": <number or null>,',
    '"profit": <number or null>,',
    '"assets": <number or null>,',
    '"equity": <number or null>,',
    '"notes": "<brief note>"',
    '}',
    'Important:',
    '- Extract current year values only',
    '- Convert TNOK to NOK (multiply by 1000)',
    '- Use null if not found',
    '- Numbers use Norwegian format (space/dot thousands, comma decimal)',
  ].join(' ');
}

/** Returns a result object with all four metrics unset. */
function emptyFinancialData(): FinancialData {
  return { revenue: null, profit: null, assets: null, equity: null };
}

/** Extracts a printable message from a caught value of unknown type. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** True when the caught value carries an HTTP-style `status` of 401. */
function isUnauthorized(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { status?: unknown }).status === 401
  );
}

/** Masks an API key for logging: at most the last four characters are shown. */
function maskApiKey(apiKey: string): string {
  const tail = apiKey.length > 8 ? apiKey.slice(-4) : '';
  return `****${tail}`;
}

/**
 * Converts a model-supplied amount into a number.
 *
 * Numbers pass through (non-finite values become null). Strings are read as
 * Norwegian-formatted amounts: whitespace is ignored, "(123)", "-123" and
 * "123-" are negative, dots that group digits in threes are thousands
 * separators, and the first comma is the decimal separator. No unit scaling
 * is applied.
 *
 * @returns the numeric value, or null when the input is not a usable number
 */
function coerceAmount(value: unknown): number | null {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value !== 'string') return null;

  let cleaned = value.replace(/\s+/g, '');
  if (cleaned === '') return null;

  const negative = /^\(.*\)$/.test(cleaned) || /^[-\u2212]/.test(cleaned) || /[-\u2212]$/.test(cleaned);
  cleaned = cleaned.replace(/[()\-\u2212]/g, '');

  // "1.234.567" or "1.234,5": dots group thousands, so drop them before parsing.
  if (/^\d{1,3}(\.\d{3})+(,\d+)?$/.test(cleaned)) {
    cleaned = cleaned.replace(/\./g, '');
  }
  cleaned = cleaned.replace(',', '.');

  const parsed = parseFloat(cleaned);
  if (isNaN(parsed)) return null;
  return negative && parsed !== 0 ? -parsed : parsed;
}

/** Converts a model-supplied year (number or numeric string) to a positive integer, else undefined. */
function coerceYear(value: unknown): number | undefined {
  const year = typeof value === 'string' ? parseInt(value, 10) : value;
  return typeof year === 'number' && Number.isInteger(year) && year > 0 ? year : undefined;
}

/** Formats an amount in NOK as millions for log output; null is shown as "N/A". */
function formatMillions(amount: number | null): string {
  return amount !== null ? (amount / 1000000).toFixed(1) + 'M NOK' : 'N/A';
}

/**
 * Extracts revenue, profit, total assets and equity from Norwegian annual
 * financial statement PDFs using the OpenAI API. PDFs with a text layer are
 * analyzed as text; otherwise every page is rendered to PNG and sent to the
 * vision model.
 */
export class OpenAIVisionParser {
  private openai: OpenAI;

  /**
   * @param apiKey OpenAI API key used for all requests
   * @throws Error when the key is missing, empty or only whitespace
   */
  constructor(apiKey: string) {
    if (!apiKey || apiKey.trim() === '') {
      console.error('❌ CRITICAL: OpenAI API key is empty or not provided!');
      console.error(' The parser was initialized without a valid API key.');
      console.error(' Extraction will fail.');
      throw new Error('OpenAI API key is required for PDF extraction');
    }

    // Never write a usable portion of the secret to the logs.
    console.error(`✅ OpenAI Vision Parser initialized with API key: ${maskApiKey(apiKey)}`);
    this.openai = new OpenAI({
      apiKey: apiKey
    });
  }

  /**
   * Parse Norwegian number format
   *
   * Strips currency/unit words and letters, treats spaces and dots as
   * thousands separators and the first comma as the decimal separator.
   * A '-' or '(' anywhere in the text marks the value as negative.
   *
   * The result is multiplied by 1000 when the text contains "tnok" or "1000",
   * or when the magnitude lies strictly between 100 and 100,000.
   * NOTE(review): this scaling is a heuristic and conflicts with statements
   * that report full NOK values — confirm before wiring this method in.
   *
   * @returns the parsed value, or null for empty or non-numeric text
   */
  private parseNorwegianNumber(text: string): number | null {
    if (!text) return null;

    // Clean the text
    let cleaned = text.trim();

    // Remove currency and unit indicators
    cleaned = cleaned.replace(/NOK|TNOK|kr|mill\.|millioner/gi, '');

    // Remove all letters except numbers and separators
    cleaned = cleaned.replace(/[A-Za-z]/g, '').trim();

    // Handle negative numbers. The sign characters are stripped here so the
    // sign is applied exactly once below ("-123" used to come back positive).
    const isNegative = cleaned.includes('-') || cleaned.includes('(');
    cleaned = cleaned.replace(/[()\-]/g, '');

    // Norwegian format: spaces or dots for thousands, comma for decimal
    // Remove thousand separators
    cleaned = cleaned.replace(/\s/g, '').replace(/\./g, '');
    // Convert decimal comma to dot
    cleaned = cleaned.replace(',', '.');

    // Parse the number
    const num = parseFloat(cleaned);
    if (isNaN(num)) return null;

    const sign = isNegative ? -1 : 1;

    // If the context mentioned TNOK (thousands), multiply by 1000
    if (text.toLowerCase().includes('tnok') || text.includes('1000')) {
      return sign * num * 1000;
    }

    // If number seems to be in thousands (common in Norwegian reports)
    // and is between 100 and 100,000, likely TNOK
    if (Math.abs(num) < 100000 && Math.abs(num) > 100) {
      return sign * num * 1000;
    }

    return sign * num;
  }

  /**
   * Extract financial data using GPT-4 Vision
   *
   * Sends one page image to the "gpt-4o" chat model and parses the JSON
   * object found in its reply.
   *
   * @param imageBase64 base64-encoded PNG of the page
   * @param pageNum 1-based page number, used for logging only
   * @returns the metrics found on the page; all null when the reply contains
   *          no parseable JSON
   * @throws rethrows any error raised by the API call
   */
  private async extractWithGPT4Vision(imageBase64: string, pageNum: number): Promise<FinancialData> {
    try {
      console.error(`🤖 Analyzing page ${pageNum} with GPT-4 Vision...`);

      const response = await this.openai.chat.completions.create({
        model: "gpt-4o", // Using full GPT-4o for better Vision capabilities
        messages: [
          {
            role: "system",
            content: "You are an expert Norwegian accountant who reads Norwegian annual financial statements (årsregnskap). Extract key financial metrics accurately."
          },
          {
            role: "user",
            content: [
              {
                type: "text",
                text: VISION_PROMPT
              },
              {
                type: "image_url",
                image_url: {
                  url: `data:image/png;base64,${imageBase64}`,
                  detail: "high" // Request high detail analysis
                }
              }
            ]
          }
        ],
        max_tokens: 800, // Enough for simple 4-metric extraction
        temperature: 0
      });

      const content = response.choices[0]?.message?.content || '{}';
      console.error(`🔍 Raw API response (first 500 chars): ${content.substring(0, 500)}`);

      // Extract JSON from the response (handle markdown code blocks)
      const jsonText = content.replace(/```json\s*/g, '').replace(/```\s*/g, '');
      const jsonMatch = jsonText.match(/\{[\s\S]*\}/);

      if (!jsonMatch) {
        console.error('❌ No JSON found in response!');
        console.error('Full response:', content);
        return emptyFinancialData();
      }

      try {
        const data = JSON.parse(jsonMatch[0]) as Record<string, unknown>;
        console.error(`✅ Successfully parsed JSON response`);

        // Norwegian årsregnskap typically show FULL values in NOK
        // No conversion needed - use raw extracted values
        console.error(`📊 Page ${pageNum}: ${data.notes || 'Data found'}`);

        // Parse the raw values (handle string numbers from OCR)
        const result: FinancialData = {
          revenue: coerceAmount(data.revenue),
          profit: coerceAmount(data.profit),
          assets: coerceAmount(data.assets),
          equity: coerceAmount(data.equity)
        };

        console.error(` Revenue: ${formatMillions(result.revenue)}`);
        console.error(` Profit: ${formatMillions(result.profit)}`);
        console.error(` Assets: ${formatMillions(result.assets)}`);
        console.error(` Equity: ${formatMillions(result.equity)}`);

        // Only expose a year that is actually a usable number.
        const year = coerceYear(data.year);
        if (year !== undefined) {
          result.year = year;
        }
        return result;
      } catch (parseError) {
        console.error('❌ Failed to parse JSON:', parseError);
        console.error('Raw response:', content.substring(0, 500));
        return emptyFinancialData();
      }
    } catch (error: unknown) {
      console.error(`GPT-4 Vision API error on page ${pageNum}:`, describeError(error));
      throw error;
    }
  }

  /**
   * Convert PDF to images and extract data using OpenAI Vision
   *
   * If the PDF yields more than 500 characters of text, the text is analyzed
   * directly. Otherwise each page is rendered to PNG (also saved under
   * data/pdfs/png_images/<orgnr>/<pdf name>/) and analyzed page by page until
   * all four metrics are found. For each metric the first non-null value wins.
   *
   * @param pdfPath path to the PDF file
   * @returns the combined metrics; fields stay null when not found
   * @throws Error prefixed with "PDF parsing failed:" on any failure
   */
  async parseFinancialPDF(pdfPath: string): Promise<FinancialData> {
    console.error(`\n🚀 Starting OpenAI Vision analysis for: ${pdfPath}`);

    try {
      // First check if PDF has extractable text
      console.error('📄 Checking PDF for text content...');
      const dataBuffer = await readFile(pdfPath);
      const pdfData = await pdfParse(dataBuffer);
      // Guard once so a missing text layer cannot crash the length checks below.
      const pdfText: string = pdfData.text ?? '';

      console.error(`📖 PDF has ${pdfData.numpages} pages`);
      console.error(`📝 Text extracted: ${pdfText.length} characters`);

      // If PDF has substantial text, use text-based extraction
      if (pdfText.length > 500) {
        console.error('📄 PDF has text layer, using text-based extraction...');
        return await this.extractFromText(pdfText);
      }

      // PDF is image-based, convert to images and use Vision API
      console.error('📸 PDF is image-based (scanned). Converting to images...');
      console.error(`📄 PDF path: ${pdfPath}`);
      console.error(`📄 PDF exists: ${existsSync(pdfPath)}`);

      // Convert PDF pages to PNG images - organize by company then by PDF
      const pdfBaseName = basename(pdfPath, '.pdf');
      // Extract company org number from PDF name (e.g., aarsregnskap_984562861-2021.pdf)
      const orgNrMatch = pdfBaseName.match(/(\d{9})/);
      const companyFolder = orgNrMatch ? orgNrMatch[1] : 'unknown';

      // Use ABSOLUTE path to ensure it works regardless of working directory
      const __filename = fileURLToPath(import.meta.url);
      const __dirname = dirname(__filename);
      const baseDir = resolve(__dirname, '..', '..'); // Go up two levels from src/scraper
      const pngFolder = resolve(baseDir, 'data', 'pdfs', 'png_images', companyFolder, pdfBaseName);

      if (!existsSync(pngFolder)) {
        mkdirSync(pngFolder, { recursive: true });
        console.error(`📁 Created PNG folder: ${pngFolder}`);
      } else {
        console.error(`📁 PNG folder already exists: ${pngFolder}`);
      }

      console.error(`🔄 Starting PDF-to-PNG conversion...`);
      console.error(` Input PDF: ${pdfPath}`);
      console.error(` Output folder: ${pngFolder}`);
      console.error(` Working directory: ${process.cwd()}`);

      let pngPages;
      try {
        // Convert with absolute path for PDF as well
        const absolutePdfPath = resolve(pdfPath);
        console.error(` Absolute PDF path: ${absolutePdfPath}`);

        pngPages = await pdfToPng(absolutePdfPath, {
          disableFontFace: false,
          useSystemFonts: false,
          viewportScale: 2.0, // Higher resolution for better OCR
          outputFolder: pngFolder, // Now using absolute path
          outputFileMaskFunc: (pageNum: number) => `page_${pageNum}`,
          // pagesToProcess not specified = process ALL pages
        });

        console.error(`✅ Converted ${pngPages.length} PDF pages to images`);

        // Save PNG files to disk since they're returned as buffers
        let savedCount = 0;
        let pagesWithContent = 0;
        for (let i = 0; i < pngPages.length; i++) {
          const page = pngPages[i];
          if (page.content && page.content.length > 0) {
            pagesWithContent++;
            const pngPath = resolve(pngFolder, `page_${i + 1}.png`);
            try {
              await writeFile(pngPath, page.content);
              savedCount++;
            } catch (saveError) {
              // Saving is best-effort; the in-memory buffer is still analyzed below.
              console.error(`⚠️ Failed to save page ${i + 1} to disk:`, saveError);
            }
          }
        }
        console.error(`💾 Saved ${savedCount} PNG files to disk`);

        // Verify at least one page has content
        if (pagesWithContent === 0) {
          console.error(`⚠️ Warning: No page content found after conversion`);
        }
      } catch (conversionError: unknown) {
        const conversionMessage = describeError(conversionError);
        console.error(`❌ PDF-to-PNG conversion failed:`, conversionMessage);
        console.error(`❌ Full conversion error:`, conversionError);
        console.error(
          ` Stack trace:`,
          conversionError instanceof Error ? conversionError.stack : undefined
        );
        throw new Error(`PDF-to-PNG conversion failed: ${conversionMessage}`);
      }

      // Analyze each page with GPT-4 Vision and combine results
      const combinedData: FinancialData = emptyFinancialData();

      for (let i = 0; i < pngPages.length; i++) {
        const page = pngPages[i];
        console.error(`📄 Analyzing page ${i + 1}/${pngPages.length} with OpenAI Vision API...`);

        // Convert buffer to base64
        if (!page.content) {
          console.error(`⚠️ No content for page ${i + 1}, skipping...`);
          continue;
        }

        const imageBase64 = page.content.toString('base64');

        // Extract data from this page
        const pageData = await this.extractWithGPT4Vision(imageBase64, i + 1);

        // Update combined data (prefer non-null values)
        if (pageData.revenue !== null && combinedData.revenue === null) {
          combinedData.revenue = pageData.revenue;
        }
        if (pageData.profit !== null && combinedData.profit === null) {
          combinedData.profit = pageData.profit;
        }
        if (pageData.assets !== null && combinedData.assets === null) {
          combinedData.assets = pageData.assets;
        }
        if (pageData.equity !== null && combinedData.equity === null) {
          combinedData.equity = pageData.equity;
        }
        // Unlike the metrics, the year from the most recent page wins.
        if (pageData.year) {
          combinedData.year = pageData.year;
        }

        // If we have all 4 metrics, stop processing
        if (
          combinedData.revenue !== null &&
          combinedData.profit !== null &&
          combinedData.assets !== null &&
          combinedData.equity !== null
        ) {
          console.error(`✅ Found all 4 metrics on page ${i + 1}!`);
          break;
        }
      }

      console.error(`📊 Extraction complete for PDF`);
      return combinedData;
    } catch (error: unknown) {
      const message = describeError(error);
      console.error('❌ OpenAI Vision PDF parsing failed:', message);
      console.error('❌ Full error:', error);
      console.error('❌ Error stack:', error instanceof Error ? error.stack : undefined);

      // Check if it's an API key issue
      const unauthorized = isUnauthorized(error);
      if (unauthorized) {
        console.error('🔑 API Key Error: Please provide a valid OpenAI API key');
        console.error(' Get your API key from: https://platform.openai.com/api-keys');
      }

      // Re-throw to make the error visible. The API-key hint is only added
      // for authentication failures so other errors are not misattributed.
      const hint = unauthorized ? ' Check if OPENAI_API_KEY is set.' : '';
      throw new Error(`PDF parsing failed: ${message}.${hint}`);
    }
  }

  /**
   * Extract financial data from text using GPT-4
   *
   * Sends the first 4000 characters of the text to the "gpt-4o-mini" chat
   * model and parses the JSON object found in its reply.
   * NOTE(review): figures located beyond the first 4000 characters are never
   * seen by the model — confirm this limit is intended.
   *
   * @param text text layer extracted from the PDF
   * @returns the metrics found; all null when nothing parseable is returned
   *          or when the request fails (errors are logged, not thrown)
   */
  private async extractFromText(text: string): Promise<FinancialData> {
    try {
      console.error('🤖 Analyzing text with GPT-4...');

      const response = await this.openai.chat.completions.create({
        model: "gpt-4o-mini",
        messages: [
          {
            role: "system",
            content: "You are a financial analyst expert at reading Norwegian financial statements. Extract key financial metrics from the document text."
          },
          {
            role: "user",
            content: buildTextPrompt(text.substring(0, 4000))
          }
        ],
        max_tokens: 500,
        temperature: 0
      });

      const content = response.choices[0]?.message?.content || '{}';
      const jsonMatch = content.match(/\{[\s\S]*\}/);

      if (!jsonMatch) {
        return emptyFinancialData();
      }

      const data = JSON.parse(jsonMatch[0]) as Record<string, unknown>;
      console.error(`📊 Extracted: ${data.notes || 'Data found'}`);

      // The model output is untrusted: coerce every field to number | null.
      return {
        revenue: coerceAmount(data.revenue),
        profit: coerceAmount(data.profit),
        assets: coerceAmount(data.assets),
        equity: coerceAmount(data.equity)
      };
    } catch (error: unknown) {
      console.error('GPT-4 text analysis error:', describeError(error));
      return emptyFinancialData();
    }
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/josuekongolo/companyiq-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.