import { JSDOM } from 'jsdom';
import puppeteer from 'puppeteer';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
/**
 * Result of fetching a web page and extracting its main article.
 * Produced by `fetchUrl`; text fields are never null (missing values are
 * replaced by a fallback string).
 */
export interface FetchResult {
/** Article title, or the literal 'No Title' when none was extracted. */
title: string;
/** Article body converted from HTML to Markdown. */
content: string;
/** Excerpt reported by the article parser; empty string when absent. */
excerpt: string;
/** Byline reported by the article parser; empty string when absent. */
byline: string;
/** Site name reported by the article parser; empty string when absent. */
siteName: string;
/** The URL that was passed to `fetchUrl`. */
url: string;
}
/**
 * Loads a page in headless Chromium, extracts its main article with
 * Readability, and converts the article body to Markdown.
 *
 * @param url - Address of the page to load.
 * @returns The article's metadata and Markdown content. The `url` field
 *   echoes the requested address, even when the page redirected.
 * @throws Error('Failed to parse article content') when Readability finds no
 *   article. Browser launch, navigation, and conversion errors are logged to
 *   `console.error` and rethrown unchanged.
 */
export async function fetchUrl(url: string): Promise<FetchResult> {
  try {
    const browser = await puppeteer.launch({
      headless: true, // run without a visible browser window
    });
    try {
      const page = await browser.newPage();
      // Wait for the network to go idle so client-rendered content is in the DOM.
      await page.goto(url, { waitUntil: 'networkidle0' });
      const html = await page.content();
      // After a redirect the requested URL is no longer the document's real
      // location; use the final address so relative links resolve correctly.
      const article = extractArticle(html, page.url());
      return { ...article, url };
    } finally {
      // Always shut Chromium down, whether extraction succeeded or failed.
      await browser.close();
    }
  } catch (error) {
    console.error('Error fetching URL:', error);
    throw error;
  }
}

/** Extracts the main article from rendered HTML and converts its body to Markdown. */
function extractArticle(html: string, baseUrl: string): Omit<FetchResult, 'url'> {
  const dom = new JSDOM(html, { url: baseUrl });
  try {
    const article = new Readability(dom.window.document).parse();
    if (!article) {
      throw new Error('Failed to parse article content');
    }

    const turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
    });
    // Drop elements that carry no readable content.
    turndownService.remove(['script', 'style', 'iframe']);

    return {
      // `||` is deliberate: empty strings should also fall back.
      title: article.title || 'No Title',
      content: turndownService.turndown(article.content || ''),
      excerpt: article.excerpt || '',
      byline: article.byline || '',
      siteName: article.siteName || '',
    };
  } finally {
    // The parsed article holds plain strings, so the jsdom window can be
    // released as soon as parsing is done.
    dom.window.close();
  }
}