Skip to main content
Glama
josuekongolo

CompanyIQ MCP Server

by josuekongolo
browser_scraper.ts • 19 kB
/** * Browser Scraper - Uses Puppeteer to automate browser interactions * Downloads all รฅrsregnskap PDFs and extracts financial data */ import puppeteer from 'puppeteer'; import { resolve, dirname } from 'path'; import { fileURLToPath } from 'url'; import { existsSync, mkdirSync, readdirSync, unlinkSync } from 'fs'; import { readdir, readFile, writeFile, appendFile } from 'fs/promises'; import { RegnskapClient } from '../apis/regnskap.js'; import { OpenAIVisionParser } from './openai_vision_parser.js'; // Logging utility async function logToFile(message: string, orgNr: string) { const logPath = resolve(__dirname, '../../data/logs', `scraper_${orgNr}_${new Date().toISOString().split('T')[0]}.log`); const timestamp = new Date().toISOString(); const logMessage = `[${timestamp}] ${message}\n`; try { const logDir = resolve(__dirname, '../../data/logs'); if (!existsSync(logDir)) { mkdirSync(logDir, { recursive: true }); } await appendFile(logPath, logMessage); } catch (e) { console.error('Failed to write to log file:', e); } } const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); export interface YearData { year: number; revenue: number | null; profit: number | null; assets: number | null; equity: number | null; source: string; } export class BrowserScraper { private downloadPath: string; private regnskapClient: RegnskapClient; private pdfParser: OpenAIVisionParser; constructor(openaiApiKey?: string, private orgNr?: string) { // Organize files by org number for better structure const baseDataPath = resolve(__dirname, '../../data'); const orgPath = orgNr ? orgNr : 'temp'; this.downloadPath = resolve(baseDataPath, 'pdfs', orgPath); this.regnskapClient = new RegnskapClient(); const apiKey = openaiApiKey || process.env.OPENAI_API_KEY || ''; if (!apiKey) { throw new Error('OpenAI API key is required. 
Please provide it as constructor parameter or set OPENAI_API_KEY in .env file'); } console.error(`๐Ÿ”‘ Using OpenAI Vision Parser with API key: ${apiKey.substring(0, 15)}...`); this.pdfParser = new OpenAIVisionParser(apiKey); if (!existsSync(this.downloadPath)) { mkdirSync(this.downloadPath, { recursive: true }); } // Ensure extracted data folder exists for this org const extractedPath = resolve(baseDataPath, 'extracted', orgPath); if (!existsSync(extractedPath)) { mkdirSync(extractedPath, { recursive: true }); } console.error(`๐Ÿ“ PDF download path: ${this.downloadPath}`); console.error(`๐Ÿ“ Extracted data path: ${extractedPath}`); } /** * Clean old downloads to ensure we only get new files */ private cleanDownloadFolder(): void { try { const files = readdirSync(this.downloadPath); files.forEach(file => { if (file.endsWith('.pdf') || file.endsWith('.crdownload')) { const filePath = resolve(this.downloadPath, file); try { unlinkSync(filePath); } catch (e) { // Ignore errors } } }); console.error('๐Ÿงน Cleaned download folder'); } catch (e) { // Ignore if folder doesn't exist } } /** * Wait for a download to complete */ private async waitForDownload(filename: string, timeout: number = 30000): Promise<boolean> { const startTime = Date.now(); const userDownloadsPath = resolve(process.env.HOME || '', 'Downloads'); while (Date.now() - startTime < timeout) { // Check both custom path and default Downloads folder const customFiles = existsSync(this.downloadPath) ? readdirSync(this.downloadPath) : []; const downloadFiles = existsSync(userDownloadsPath) ? 
readdirSync(userDownloadsPath) : []; // Check if PDF file exists in either location if (customFiles.some(f => f.includes(filename) && f.endsWith('.pdf')) || downloadFiles.some(f => f.includes(filename) && f.endsWith('.pdf'))) { return true; } // Wait a bit before checking again await new Promise(resolve => setTimeout(resolve, 500)); } return false; } /** * Main method to scrape all financial years using browser automation */ async getAllFinancialYears(orgNr: string): Promise<YearData[]> { console.error(`\n๐Ÿค– Starting browser-based scraper for ${orgNr}...`); await logToFile(`========== SCRAPER STARTED FOR ${orgNr} ==========`, orgNr); await logToFile(`Step 1: Initializing browser scraper`, orgNr); const results: YearData[] = []; // First try to get latest from API (fast) try { const apiData = await this.regnskapClient.getExtractedFinancials(orgNr); if (apiData) { results.push({ year: apiData.year, revenue: apiData.revenue, profit: apiData.profit, assets: apiData.assets, equity: apiData.equity, source: 'regnskapsregisteret_api' }); console.error(`โœ… Got ${apiData.year} from API`); } } catch (e) { console.error('โš ๏ธ API fetch failed'); } // Keep existing PDFs - don't clean the folder // this.cleanDownloadFolder(); // Disabled to preserve downloaded PDFs for user access let browser; try { console.error('\n๐ŸŒ Launching headless browser (invisible)...'); browser = await puppeteer.launch({ executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', headless: 'new', // Use new headless mode (no deprecation warning) args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled' ], defaultViewport: null }); const page = await browser.newPage(); // Configure download behavior to save in project folder const client = await page.target().createCDPSession(); await client.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath: this.downloadPath, }); console.error(`๐Ÿ“ 
Downloads will be saved to: ${this.downloadPath}`); // Navigate to the company page const url = `https://virksomhet.brreg.no/nb/oppslag/enheter/${orgNr}`; console.error(`๐Ÿ“„ Navigating to: ${url}`); await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 }); console.error('โณ Waiting for page to fully load...'); await page.waitForTimeout(5000); // Scroll to load all content console.error('๐Ÿ“œ Scrolling page to load all content...'); await page.evaluate(async () => { for (let i = 0; i < 10; i++) { window.scrollBy(0, 300); await new Promise(r => setTimeout(r, 500)); } window.scrollTo(0, 0); }); // STEP 1: First expand the ร…rsregnskap section console.error('๐Ÿ”˜ Step 1: Expanding ร…rsregnskap section...'); const aarsregnskapExpanded = await page.evaluate(() => { // Find the u-summary element with "ร…rsregnskap" text const summaries = Array.from(document.querySelectorAll('u-summary')); const aarsregnskapSummary = summaries.find(summary => summary.textContent?.includes('ร…rsregnskap') ); if (aarsregnskapSummary && aarsregnskapSummary.getAttribute('aria-expanded') === 'false') { (aarsregnskapSummary as HTMLElement).click(); console.log('Clicked ร…rsregnskap summary to expand'); return true; } return false; }); if (aarsregnskapExpanded) { console.error(' โœ… ร…rsregnskap section expanded'); await page.waitForTimeout(2000); } // STEP 2: Click "Vis alle" button to show all years console.error('๐Ÿ”˜ Step 2: Clicking "Vis alle" to show all years...'); const showAllClicked = await page.evaluate(() => { // Find button with "Vis alle" text const buttons = Array.from(document.querySelectorAll('button')); const visAlleButton = buttons.find(button => { const text = button.textContent || ''; return text.includes('Vis alle') && button.getAttribute('data-transaction-name') === 'Vis flere รฅrsregnskap'; }); if (visAlleButton) { (visAlleButton as HTMLElement).click(); console.log('Clicked "Vis alle" button'); return true; } return false; }); if (showAllClicked) { 
console.error(' โœ… Clicked "Vis alle" - all years should be visible'); await page.waitForTimeout(3000); } else { console.error(' โš ๏ธ Could not find "Vis alle" button'); } console.error('๐Ÿ” Finding all download links...'); // Get all available years const downloadInfo = await page.evaluate((orgNumber) => { const links: { year: number; selector: string }[] = []; // Find all download links using data-testid const elements = document.querySelectorAll('[data-testid^="download-aarsregnskap-"]'); elements.forEach(element => { const testId = element.getAttribute('data-testid'); if (testId) { const match = testId.match(/download-aarsregnskap-\d+-(\d{4})/); if (match) { const year = parseInt(match[1]); links.push({ year, selector: `[data-testid="${testId}"]` }); } } }); // Also check for download buttons with year text const allButtons = document.querySelectorAll('button, a'); allButtons.forEach(button => { const text = button.textContent || ''; if (text.includes('Last ned') || text.includes('PDF')) { // Look for year in nearby text const parent = button.parentElement; if (parent) { const parentText = parent.textContent || ''; const yearMatch = parentText.match(/\b(20\d{2})\b/); if (yearMatch) { const year = parseInt(yearMatch[1]); if (!links.some(l => l.year === year)) { links.push({ year, selector: '' }); // Will use year to find } } } } }); return links.sort((a, b) => b.year - a.year); }, orgNr); console.error(`โœ… Found ${downloadInfo.length} years available for download`); // Download PDFs for each year (except the one we already have from API) for (const info of downloadInfo) { if (results.some(r => r.year === info.year)) { console.error(`โญ๏ธ Skipping ${info.year} (already have from API)`); continue; } console.error(`\n๐Ÿ“ฅ Downloading year ${info.year}...`); await logToFile(`Step: Downloading PDF for year ${info.year}`, orgNr); try { // Click the download link if (info.selector) { await page.click(info.selector); } else { // Try to find and click by year text 
await page.evaluate((year) => { const elements = document.querySelectorAll('button, a'); for (const el of Array.from(elements)) { if (el.textContent?.includes(year.toString()) && (el.textContent.includes('Last ned') || el.textContent.includes('PDF'))) { (el as HTMLElement).click(); return true; } } return false; }, info.year); } // Wait for download to complete const downloaded = await this.waitForDownload(info.year.toString(), 15000); if (downloaded) { console.error(`โœ… Downloaded PDF for ${info.year}`); // Find the downloaded file (check both locations) const userDownloadsPath = resolve(process.env.HOME || '', 'Downloads'); const customFiles = existsSync(this.downloadPath) ? readdirSync(this.downloadPath) : []; const downloadFiles = existsSync(userDownloadsPath) ? readdirSync(userDownloadsPath) : []; // Look for PDF in custom path first, then default Downloads let pdfFile = customFiles.find(f => f.includes(info.year.toString()) && f.includes(orgNr) && f.endsWith('.pdf') ); let pdfPath = pdfFile ? resolve(this.downloadPath, pdfFile) : null; if (!pdfPath) { pdfFile = downloadFiles.find(f => f.includes(info.year.toString()) && f.includes(orgNr) && f.endsWith('.pdf') ); pdfPath = pdfFile ? 
resolve(userDownloadsPath, pdfFile) : null; } if (pdfPath) { console.error(`๐Ÿ“– Parsing PDF from ${pdfPath}...`); await logToFile(`Step: Starting OpenAI extraction for ${info.year} from ${pdfPath}`, orgNr); try { const pdfData = await this.pdfParser.parseFinancialPDF(pdfPath); await logToFile(`SUCCESS: Extracted ${info.year} - Revenue: ${pdfData.revenue}, Profit: ${pdfData.profit}`, orgNr); const yearData = { year: info.year, revenue: pdfData.revenue || null, profit: pdfData.profit || null, assets: pdfData.assets || null, equity: pdfData.equity || null, source: 'openai_vision_extraction', file: pdfFile, extractedAt: new Date().toISOString() }; results.push(yearData); // Save to JSON file in org-specific folder try { const orgPath = this.orgNr || 'temp'; const jsonPath = resolve(__dirname, '../../data/extracted', orgPath, `financial_data_${info.year}.json`); await writeFile(jsonPath, JSON.stringify(yearData, null, 2)); console.error(`๐Ÿ’พ Saved JSON: ${jsonPath}`); } catch (jsonError) { console.error(`โš ๏ธ Failed to save JSON for ${info.year}:`, jsonError); } if (pdfData.revenue || pdfData.profit) { console.error(`โœ… Extracted data for ${info.year}`); } else { console.error(`โš ๏ธ No financial data extracted for ${info.year}`); } } catch (parseError) { console.error(`โŒ Failed to parse PDF for ${info.year}:`, parseError); console.error(` This error will be visible in MCP tool output`); await logToFile(`ERROR: Failed to extract ${info.year} - ${parseError}`, orgNr); // Don't save null data - skip this year } } } else { console.error(`โš ๏ธ Download timeout for ${info.year}`); } // Small delay between downloads await page.waitForTimeout(2000); } catch (error) { console.error(`โŒ Failed to download ${info.year}: ${error}`); } } } catch (error) { console.error(`โŒ Browser automation error:`, error); if (error instanceof Error) { console.error(`Error message: ${error.message}`); console.error(`Stack trace:`, error.stack); } } finally { if (browser) { await 
browser.close(); console.error('๐Ÿ”’ Browser closed'); } } // CRITICAL: Parse ALL downloaded PDFs (not just the ones we tracked during download) console.error(`\n๐Ÿ“– Processing ALL PDFs in download folder...`); const allPdfFiles = existsSync(this.downloadPath) ? readdirSync(this.downloadPath).filter(f => f.endsWith('.pdf')) : []; console.error(`Found ${allPdfFiles.length} PDF files to process`); for (const pdfFile of allPdfFiles) { const yearMatch = pdfFile.match(/(\d{4})\.pdf$/); const year = yearMatch ? parseInt(yearMatch[1]) : null; if (!year) continue; // Skip if already processed if (results.some(r => r.year === year)) { console.error(`โญ๏ธ Skipping ${year} - already processed`); continue; } console.error(`\n๐Ÿ“– Processing ${pdfFile} (Year: ${year})...`); const pdfPath = resolve(this.downloadPath, pdfFile); try { const pdfData = await this.pdfParser.parseFinancialPDF(pdfPath); // Only save if we got SOME data if (!pdfData.revenue && !pdfData.profit && !pdfData.assets && !pdfData.equity) { console.error(`โš ๏ธ No data extracted for ${year} - skipping save`); continue; } const yearData = { year: year, revenue: pdfData.revenue || null, profit: pdfData.profit || null, assets: pdfData.assets || null, equity: pdfData.equity || null, source: 'openai_vision_extraction', file: pdfFile, extractedAt: new Date().toISOString() }; results.push(yearData); // Save to JSON file in org-specific folder try { const orgPath = this.orgNr || 'temp'; const jsonPath = resolve(__dirname, '../../data/extracted', orgPath, `financial_data_${year}.json`); await writeFile(jsonPath, JSON.stringify(yearData, null, 2)); console.error(`๐Ÿ’พ Saved JSON: ${jsonPath}`); } catch (jsonError) { console.error(`โš ๏ธ Failed to save JSON for ${year}:`, jsonError); } console.error(`โœ… Extracted data for ${year}: Rev=${pdfData.revenue ? (pdfData.revenue/1000000).toFixed(1)+'M' : 'N/A'}, Profit=${pdfData.profit ? 
(pdfData.profit/1000000).toFixed(1)+'M' : 'N/A'}`); } catch (error) { console.error(`โŒ Failed to process ${year}:`, error); console.error(` Error will be shown in MCP output`); } } // Sort results by year (newest first) results.sort((a, b) => b.year - a.year); console.error(`\nโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”`); console.error(`๐ŸŽ‰ BROWSER SCRAPING COMPLETE!`); console.error(`โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”`); console.error(`โœ… Total years processed: ${results.length}`); console.error(`๐Ÿ“Š Years with data: ${results.filter(r => r.revenue !== null).length}`); await logToFile(`========== SCRAPER COMPLETED FOR ${orgNr} ==========`, orgNr); await logToFile(`Final result: ${results.length} years processed, ${results.filter(r => r.revenue !== null).length} with data`, orgNr); await logToFile(`Log file location: data/logs/scraper_${orgNr}_${new Date().toISOString().split('T')[0]}.log`, orgNr); return results; } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/josuekongolo/companyiq-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.