/**
* Browser Scraper - Uses Puppeteer to automate browser interactions
* Downloads all årsregnskap (annual report) PDFs and extracts financial data
*/
import puppeteer from 'puppeteer';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { homedir } from 'os';
import { existsSync, mkdirSync, readdirSync, unlinkSync } from 'fs';
import { readdir, readFile, writeFile, appendFile } from 'fs/promises';
import { RegnskapClient } from '../apis/regnskap.js';
import { OpenAIVisionParser } from './openai_vision_parser.js';
/**
 * Appends one timestamped line to the scraper log of an organisation.
 *
 * The log lives in `data/logs` (two levels above this module) and is named
 * `scraper_<orgNr>_<YYYY-MM-DD>.log`. Logging is best-effort: any failure while
 * creating the folder or appending is reported on stderr and never rethrown.
 *
 * @param message Text to record (a newline is appended automatically).
 * @param orgNr   Organisation number used in the log file name.
 */
async function logToFile(message: string, orgNr: string) {
  const day = new Date().toISOString().split('T')[0];
  const targetFile = resolve(__dirname, '../../data/logs', `scraper_${orgNr}_${day}.log`);
  const line = `[${new Date().toISOString()}] ${message}\n`;

  try {
    const logsFolder = resolve(__dirname, '../../data/logs');
    const folderMissing = !existsSync(logsFolder);
    if (folderMissing) {
      mkdirSync(logsFolder, { recursive: true });
    }
    await appendFile(targetFile, line);
  } catch (e) {
    console.error('Failed to write to log file:', e);
  }
}
// Derive this module's file path and containing directory from import.meta.url;
// the rest of the file resolves its data folders relative to __dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Key financial figures for one year, as returned by the scraper.
 * A figure is `null` when no value was obtained for it.
 */
export interface YearData {
/** Year the figures belong to. */
year: number;
/** Revenue, or `null` when not available. */
revenue: number | null;
/** Profit, or `null` when not available. */
profit: number | null;
/** Assets, or `null` when not available. */
assets: number | null;
/** Equity, or `null` when not available. */
equity: number | null;
/** Origin of the figures; this file uses 'regnskapsregisteret_api' and 'openai_vision_extraction'. */
source: string;
}
/**
 * Browser-driven scraper for Norwegian annual accounts ("årsregnskap").
 *
 * Workflow of {@link BrowserScraper.getAllFinancialYears}:
 *  1. ask the Regnskapsregisteret API client for its extracted figures;
 *  2. open the company page on virksomhet.brreg.no in headless Chrome, expand
 *     the "Årsregnskap" section and download the PDF of every remaining year;
 *  3. run each PDF through the OpenAI vision parser and persist the result as
 *     JSON under `data/extracted/<org>/`.
 *
 * All progress output is written with `console.error` (stderr).
 */
export class BrowserScraper {
  private downloadPath: string;
  private regnskapClient: RegnskapClient;
  private pdfParser: OpenAIVisionParser;

  /**
   * @param openaiApiKey OpenAI API key; falls back to `process.env.OPENAI_API_KEY`.
   * @param orgNr        Organisation number used to name the per-organisation
   *                     `pdfs/` and `extracted/` folders; `'temp'` when omitted.
   * @throws Error when no API key is available from either source.
   */
  constructor(openaiApiKey?: string, private orgNr?: string) {
    // Organize files by org number for better structure
    const baseDataPath = resolve(__dirname, '../../data');
    const orgPath = orgNr || 'temp';
    this.downloadPath = resolve(baseDataPath, 'pdfs', orgPath);
    this.regnskapClient = new RegnskapClient();

    const apiKey = openaiApiKey || process.env.OPENAI_API_KEY || '';
    if (!apiKey) {
      throw new Error('OpenAI API key is required. Please provide it as constructor parameter or set OPENAI_API_KEY in .env file');
    }
    // Deliberately do not echo any part of the key: stderr output may end up in logs.
    console.error('🔑 Using OpenAI Vision Parser (API key configured)');
    this.pdfParser = new OpenAIVisionParser(apiKey);

    if (!existsSync(this.downloadPath)) {
      mkdirSync(this.downloadPath, { recursive: true });
    }
    // Ensure extracted data folder exists for this org
    const extractedPath = resolve(baseDataPath, 'extracted', orgPath);
    if (!existsSync(extractedPath)) {
      mkdirSync(extractedPath, { recursive: true });
    }
    console.error(`📁 PDF download path: ${this.downloadPath}`);
    console.error(`📊 Extracted data path: ${extractedPath}`);
  }

  /**
   * Removes `.pdf` and `.crdownload` files from the download folder.
   * Best-effort: files that cannot be deleted, or a missing folder, are ignored.
   * Currently not called by {@link getAllFinancialYears}, which keeps old PDFs.
   */
  private cleanDownloadFolder(): void {
    try {
      for (const file of readdirSync(this.downloadPath)) {
        if (!file.endsWith('.pdf') && !file.endsWith('.crdownload')) {
          continue;
        }
        try {
          unlinkSync(resolve(this.downloadPath, file));
        } catch (e) {
          // Ignore: a file we cannot delete must not abort the clean-up.
        }
      }
      console.error('🧹 Cleaned download folder');
    } catch (e) {
      // Ignore if folder doesn't exist
    }
  }

  /** Resolves after `ms` milliseconds (replacement for the deprecated `page.waitForTimeout`). */
  private static sleep(ms: number): Promise<void> {
    return new Promise<void>(done => setTimeout(done, ms));
  }

  /** Lists the entries of `dir`; returns an empty list when it is missing or unreadable. */
  private static listFiles(dir: string): string[] {
    try {
      return existsSync(dir) ? readdirSync(dir) : [];
    } catch (e) {
      return [];
    }
  }

  /** Default browser download folder of the current user. */
  private static userDownloadsPath(): string {
    return resolve(homedir(), 'Downloads');
  }

  /**
   * Wait for a download to complete.
   *
   * Polls every 500 ms until a `.pdf` file whose name contains `filename`
   * exists in the scraper's download folder or in the user's `Downloads` folder.
   *
   * @param filename Fragment that must occur in the PDF file name.
   * @param timeout  Maximum time to wait in milliseconds.
   * @returns `true` when a matching PDF was seen, `false` on timeout.
   */
  private async waitForDownload(filename: string, timeout: number = 30000): Promise<boolean> {
    const fallbackDir = BrowserScraper.userDownloadsPath();
    const isWanted = (name: string): boolean => name.includes(filename) && name.endsWith('.pdf');
    const startTime = Date.now();

    while (Date.now() - startTime < timeout) {
      // Check both custom path and default Downloads folder
      if (
        BrowserScraper.listFiles(this.downloadPath).some(isWanted) ||
        BrowserScraper.listFiles(fallbackDir).some(isWanted)
      ) {
        return true;
      }
      // Wait a bit before checking again
      await BrowserScraper.sleep(500);
    }
    return false;
  }

  /**
   * Builds the record that is returned to the caller and written to JSON.
   * Every figure that is not a finite number becomes `null`; a genuine `0`
   * is preserved.
   */
  private buildYearRecord(
    year: number,
    pdfData: { revenue?: unknown; profit?: unknown; assets?: unknown; equity?: unknown },
    pdfFile: string | undefined
  ) {
    const figure = (value: unknown): number | null =>
      typeof value === 'number' && Number.isFinite(value) ? value : null;
    return {
      year,
      revenue: figure(pdfData.revenue),
      profit: figure(pdfData.profit),
      assets: figure(pdfData.assets),
      equity: figure(pdfData.equity),
      source: 'openai_vision_extraction',
      file: pdfFile,
      extractedAt: new Date().toISOString()
    };
  }

  /**
   * Writes a year record to `data/extracted/<org>/financial_data_<year>.json`.
   * Failures are logged and swallowed so one bad write does not stop the run.
   */
  private async saveYearRecord(record: { year: number }): Promise<void> {
    try {
      const orgPath = this.orgNr || 'temp';
      const jsonPath = resolve(__dirname, '../../data/extracted', orgPath, `financial_data_${record.year}.json`);
      mkdirSync(dirname(jsonPath), { recursive: true });
      await writeFile(jsonPath, JSON.stringify(record, null, 2));
      console.error(`💾 Saved JSON: ${jsonPath}`);
    } catch (jsonError) {
      console.error(`⚠️ Failed to save JSON for ${record.year}:`, jsonError);
    }
  }

  /**
   * Main method to scrape all financial years using browser automation.
   *
   * @param orgNr Organisation number to look up.
   * @returns One entry per year for which figures were obtained, newest first.
   *          Browser and parsing errors are logged, not thrown.
   */
  async getAllFinancialYears(orgNr: string): Promise<YearData[]> {
    console.error(`\n🤖 Starting browser-based scraper for ${orgNr}...`);
    await logToFile(`========== SCRAPER STARTED FOR ${orgNr} ==========`, orgNr);
    await logToFile(`Step 1: Initializing browser scraper`, orgNr);
    const results: YearData[] = [];

    // First try to get latest from API (fast)
    try {
      const apiData = await this.regnskapClient.getExtractedFinancials(orgNr);
      if (apiData) {
        results.push({
          year: apiData.year,
          revenue: apiData.revenue,
          profit: apiData.profit,
          assets: apiData.assets,
          equity: apiData.equity,
          source: 'regnskapsregisteret_api'
        });
        console.error(`✅ Got ${apiData.year} from API`);
      }
    } catch (e) {
      console.error('⚠️ API fetch failed:', e instanceof Error ? e.message : e);
    }

    // Existing PDFs are kept on purpose (cleanDownloadFolder is not called) so
    // previously downloaded files stay available to the user.
    await this.scrapeWithBrowser(orgNr, results);

    // CRITICAL: Parse ALL downloaded PDFs (not just the ones we tracked during download)
    await this.processDownloadedPdfs(results);

    // Sort results by year (newest first)
    results.sort((a, b) => b.year - a.year);
    const withRevenue = results.filter(r => r.revenue !== null).length;
    console.error(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.error(`🎉 BROWSER SCRAPING COMPLETE!`);
    console.error(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.error(`✅ Total years processed: ${results.length}`);
    console.error(`📊 Years with data: ${withRevenue}`);
    await logToFile(`========== SCRAPER COMPLETED FOR ${orgNr} ==========`, orgNr);
    await logToFile(`Final result: ${results.length} years processed, ${withRevenue} with data`, orgNr);
    await logToFile(`Log file location: data/logs/scraper_${orgNr}_${new Date().toISOString().split('T')[0]}.log`, orgNr);
    return results;
  }

  /**
   * Opens the company page in headless Chrome, downloads the annual-report PDF
   * of every year not already in `results`, parses it and appends the outcome
   * to `results`. Any browser error is logged; the browser is always closed.
   */
  private async scrapeWithBrowser(orgNr: string, results: YearData[]): Promise<void> {
    let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
    try {
      console.error('\n🚀 Launching headless browser (invisible)...');
      browser = await puppeteer.launch({
        // Allow overriding the Chrome binary; defaults to the macOS install path.
        executablePath:
          process.env.PUPPETEER_EXECUTABLE_PATH ||
          '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        headless: 'new', // Use new headless mode (no deprecation warning)
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-blink-features=AutomationControlled'
        ],
        defaultViewport: null
      });
      const page = await browser.newPage();

      // Configure download behavior to save in project folder
      const client = await page.target().createCDPSession();
      await client.send('Page.setDownloadBehavior', {
        behavior: 'allow',
        downloadPath: this.downloadPath,
      });
      console.error(`📁 Downloads will be saved to: ${this.downloadPath}`);

      // Navigate to the company page
      const url = `https://virksomhet.brreg.no/nb/oppslag/enheter/${orgNr}`;
      console.error(`🌐 Navigating to: ${url}`);
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 60000
      });
      console.error('⏳ Waiting for page to fully load...');
      await BrowserScraper.sleep(5000);

      // Scroll to load all content
      console.error('📜 Scrolling page to load all content...');
      await page.evaluate(async () => {
        for (let i = 0; i < 10; i++) {
          window.scrollBy(0, 300);
          await new Promise(r => setTimeout(r, 500));
        }
        window.scrollTo(0, 0);
      });

      // STEP 1: First expand the Årsregnskap section
      console.error('📂 Step 1: Expanding Årsregnskap section...');
      const aarsregnskapExpanded = await page.evaluate(() => {
        // Find the u-summary element with "Årsregnskap" text
        const aarsregnskapSummary = Array.from(document.querySelectorAll('u-summary')).find(summary =>
          summary.textContent?.includes('Årsregnskap')
        );
        if (aarsregnskapSummary && aarsregnskapSummary.getAttribute('aria-expanded') === 'false') {
          (aarsregnskapSummary as HTMLElement).click();
          console.log('Clicked Årsregnskap summary to expand');
          return true;
        }
        return false;
      });
      if (aarsregnskapExpanded) {
        console.error('   ✅ Årsregnskap section expanded');
        await BrowserScraper.sleep(2000);
      }

      // STEP 2: Click "Vis alle" button to show all years
      console.error('📋 Step 2: Clicking "Vis alle" to show all years...');
      const showAllClicked = await page.evaluate(() => {
        // Find button with "Vis alle" text
        const visAlleButton = Array.from(document.querySelectorAll('button')).find(button => {
          const text = button.textContent || '';
          return text.includes('Vis alle') &&
            button.getAttribute('data-transaction-name') === 'Vis flere årsregnskap';
        });
        if (visAlleButton) {
          (visAlleButton as HTMLElement).click();
          console.log('Clicked "Vis alle" button');
          return true;
        }
        return false;
      });
      if (showAllClicked) {
        console.error('   ✅ Clicked "Vis alle" - all years should be visible');
        await BrowserScraper.sleep(3000);
      } else {
        console.error('   ⚠️ Could not find "Vis alle" button');
      }

      console.error('🔍 Finding all download links...');
      // Get all available years
      const downloadInfo = await page.evaluate(() => {
        const links: { year: number; selector: string }[] = [];
        // Find all download links using data-testid
        document.querySelectorAll('[data-testid^="download-aarsregnskap-"]').forEach(element => {
          const testId = element.getAttribute('data-testid');
          const match = testId ? testId.match(/download-aarsregnskap-\d+-(\d{4})/) : null;
          if (testId && match) {
            links.push({ year: parseInt(match[1], 10), selector: `[data-testid="${testId}"]` });
          }
        });
        // Also check for download buttons with year text
        document.querySelectorAll('button, a').forEach(button => {
          const text = button.textContent || '';
          if (!text.includes('Last ned') && !text.includes('PDF')) {
            return;
          }
          // Look for year in nearby text
          const parentText = button.parentElement ? button.parentElement.textContent || '' : '';
          const yearMatch = parentText.match(/\b(20\d{2})\b/);
          if (yearMatch) {
            const year = parseInt(yearMatch[1], 10);
            if (!links.some(l => l.year === year)) {
              links.push({ year, selector: '' }); // Empty selector: click is located by year text later
            }
          }
        });
        return links.sort((a, b) => b.year - a.year);
      });
      console.error(`✅ Found ${downloadInfo.length} years available for download`);

      // Download PDFs for each year (except the one we already have from API)
      for (const info of downloadInfo) {
        if (results.some(r => r.year === info.year)) {
          console.error(`⏭️ Skipping ${info.year} (already have from API)`);
          continue;
        }
        console.error(`\n📥 Downloading year ${info.year}...`);
        await logToFile(`Step: Downloading PDF for year ${info.year}`, orgNr);
        try {
          // Click the download link
          if (info.selector) {
            await page.click(info.selector);
          } else {
            // Try to find and click by year text
            await page.evaluate((year: number) => {
              for (const el of Array.from(document.querySelectorAll('button, a'))) {
                const text = el.textContent ?? '';
                if (text.includes(year.toString()) &&
                  (text.includes('Last ned') || text.includes('PDF'))) {
                  (el as HTMLElement).click();
                  return true;
                }
              }
              return false;
            }, info.year);
          }

          // Wait for download to complete
          const yearText = info.year.toString();
          const downloaded = await this.waitForDownload(yearText, 15000);
          if (downloaded) {
            console.error(`✅ Downloaded PDF for ${info.year}`);
            // Look for PDF in custom path first, then default Downloads
            const isTarget = (f: string): boolean =>
              f.includes(yearText) && f.includes(orgNr) && f.endsWith('.pdf');
            const fallbackDir = BrowserScraper.userDownloadsPath();
            let pdfFile = BrowserScraper.listFiles(this.downloadPath).find(isTarget);
            let pdfPath = pdfFile ? resolve(this.downloadPath, pdfFile) : null;
            if (!pdfPath) {
              pdfFile = BrowserScraper.listFiles(fallbackDir).find(isTarget);
              pdfPath = pdfFile ? resolve(fallbackDir, pdfFile) : null;
            }

            if (pdfPath) {
              console.error(`📄 Parsing PDF from ${pdfPath}...`);
              await logToFile(`Step: Starting OpenAI extraction for ${info.year} from ${pdfPath}`, orgNr);
              try {
                const pdfData = await this.pdfParser.parseFinancialPDF(pdfPath);
                const yearData = this.buildYearRecord(info.year, pdfData, pdfFile);
                await logToFile(`SUCCESS: Extracted ${info.year} - Revenue: ${yearData.revenue}, Profit: ${yearData.profit}`, orgNr);
                results.push(yearData);
                // Save to JSON file in org-specific folder
                await this.saveYearRecord(yearData);
                if (yearData.revenue || yearData.profit) {
                  console.error(`✅ Extracted data for ${info.year}`);
                } else {
                  console.error(`⚠️ No financial data extracted for ${info.year}`);
                }
              } catch (parseError) {
                console.error(`❌ Failed to parse PDF for ${info.year}:`, parseError);
                console.error(`   This error will be visible in MCP tool output`);
                await logToFile(`ERROR: Failed to extract ${info.year} - ${parseError}`, orgNr);
                // Don't save null data - skip this year
              }
            } else {
              console.error(`⚠️ Download for ${info.year} finished but no PDF matching ${orgNr} was found`);
            }
          } else {
            console.error(`⚠️ Download timeout for ${info.year}`);
          }
          // Small delay between downloads
          await BrowserScraper.sleep(2000);
        } catch (error) {
          console.error(`❌ Failed to download ${info.year}: ${error}`);
        }
      }
    } catch (error) {
      console.error(`❌ Browser automation error:`, error);
      if (error instanceof Error) {
        console.error(`Error message: ${error.message}`);
        console.error(`Stack trace:`, error.stack);
      }
    } finally {
      if (browser) {
        await browser.close();
        console.error('🔒 Browser closed');
      }
    }
  }

  /**
   * Parses every `<...YYYY>.pdf` in the download folder whose year is not yet in
   * `results`. A PDF for which all four figures are missing or zero is treated
   * as a failed extraction and is neither returned nor saved.
   */
  private async processDownloadedPdfs(results: YearData[]): Promise<void> {
    console.error(`\n🔄 Processing ALL PDFs in download folder...`);
    const allPdfFiles = BrowserScraper.listFiles(this.downloadPath).filter(f => f.endsWith('.pdf'));
    console.error(`Found ${allPdfFiles.length} PDF files to process`);
    const inMillions = (value: number | null): string =>
      value ? (value / 1000000).toFixed(1) + 'M' : 'N/A';

    for (const pdfFile of allPdfFiles) {
      const yearMatch = pdfFile.match(/(\d{4})\.pdf$/);
      const year = yearMatch ? parseInt(yearMatch[1], 10) : null;
      if (!year) continue;
      // Skip if already processed
      if (results.some(r => r.year === year)) {
        console.error(`⏭️ Skipping ${year} - already processed`);
        continue;
      }
      console.error(`\n📄 Processing ${pdfFile} (Year: ${year})...`);
      const pdfPath = resolve(this.downloadPath, pdfFile);
      try {
        const pdfData = await this.pdfParser.parseFinancialPDF(pdfPath);
        const yearData = this.buildYearRecord(year, pdfData, pdfFile);
        // Only save if we got SOME data
        if (!yearData.revenue && !yearData.profit && !yearData.assets && !yearData.equity) {
          console.error(`⚠️ No data extracted for ${year} - skipping save`);
          continue;
        }
        results.push(yearData);
        // Save to JSON file in org-specific folder
        await this.saveYearRecord(yearData);
        console.error(`✅ Extracted data for ${year}: Rev=${inMillions(yearData.revenue)}, Profit=${inMillions(yearData.profit)}`);
      } catch (error) {
        console.error(`❌ Failed to process ${year}:`, error);
        console.error(`   Error will be shown in MCP output`);
      }
    }
  }
}