import * as cheerio from 'cheerio';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import { logger } from '@/utils/logger.js';
import { resolveUrl, isValidUrl } from '@/utils/url.js';
import type { PageMetadata } from './types.js';
/**
 * Result returned by `HtmlParser.parse` for a single HTML document.
 */
export interface ParsedPage {
/** Page title; the parser falls back to `'Untitled'` when none is found. */
title: string;
/** Trimmed plain-text content of the page. */
content: string;
/** Page metadata; `parse` also stores the word count and the link list here. */
metadata: PageMetadata;
/** De-duplicated link URLs, resolved against the parser's base URL, in document order. */
links: string[];
}
/**
 * The subset of a Readability parse result that `HtmlParser` consumes.
 * Declared structurally so the parser does not depend on the exact shape
 * of the library's result type. `null` means no article was extracted.
 */
type ReadabilityArticle = {
  title?: string | null;
  textContent?: string | null;
} | null;

/**
 * Parses an HTML document into a title, plain-text content, metadata and
 * the list of links it contains.
 *
 * Readability is tried first for title and content; when it yields nothing
 * usable, selector-based fallbacks are applied to the cheerio document.
 */
export class HtmlParser {
  /** Selectors tried in order when Readability yields no title. */
  private static readonly TITLE_SELECTORS: readonly string[] = [
    'title',
    'h1',
    '[data-title]',
    '.title',
    '#title',
  ];

  /** Selectors tried in order when Readability yields no content. */
  private static readonly CONTENT_SELECTORS: readonly string[] = [
    'main',
    '[role="main"]',
    '.content',
    '.main-content',
    '#content',
    '#main',
    'article',
    '.article',
    '.post',
    '.entry',
  ];

  /** Elements stripped before the selector-based content fallback. */
  private static readonly NOISE_SELECTOR =
    'script, style, nav, header, footer, aside, .nav, .navigation, .menu, .sidebar, .ads, .advertisement';

  /** Link schemes that are skipped during link extraction. */
  private static readonly SKIPPED_SCHEME = /^(?:javascript|mailto):/i;

  /** Extracts the charset value from a `Content-Type` style string. */
  private static readonly CHARSET_PATTERN = /charset\s*=\s*["']?([^\s;"']+)/i;

  /**
   * @param baseUrl URL of the page being parsed; used as the document URL
   *   for jsdom and as the base when resolving links.
   */
  constructor(private readonly baseUrl: string) {}

  /**
   * Parses `html` and returns its title, content, metadata and links.
   *
   * @param html Raw HTML markup of the page.
   * @returns The parsed page. `metadata.links` and `links` reference the
   *   same array.
   */
  parse(html: string): ParsedPage {
    const $ = cheerio.load(html);
    const article = this.runReadability(html);

    const title = this.extractTitle($, article);
    const content = this.extractContent($, article);
    const metadata = this.extractMetadata($, content);
    const links = this.extractLinks($);

    return {
      title,
      content,
      metadata: {
        ...metadata,
        wordCount: this.countWords(content),
        links,
      },
      links,
    };
  }

  /**
   * Runs Readability over a jsdom document built from `html`.
   * The jsdom window is always closed afterwards so its resources are
   * released even when parsing throws.
   */
  private runReadability(html: string): ReadabilityArticle {
    const dom = new JSDOM(html, { url: this.baseUrl });
    try {
      return new Readability(dom.window.document).parse();
    } finally {
      dom.window.close();
    }
  }

  /**
   * Returns the page title: the Readability title when it is non-blank,
   * otherwise the text of the first matching fallback selector, otherwise
   * `'Untitled'`.
   */
  private extractTitle($: cheerio.CheerioAPI, article: ReadabilityArticle): string {
    // A whitespace-only Readability title is treated as missing.
    const readabilityTitle = article?.title?.trim();
    if (readabilityTitle) {
      return readabilityTitle;
    }

    for (const selector of HtmlParser.TITLE_SELECTORS) {
      const text = $(selector).first().text().trim();
      if (text) {
        return text;
      }
    }

    return 'Untitled';
  }

  /**
   * Returns the trimmed page text: Readability's text content when it is
   * non-blank, otherwise the text of the first non-empty main-content
   * container, otherwise the text of `<body>`.
   */
  private extractContent($: cheerio.CheerioAPI, article: ReadabilityArticle): string {
    const readabilityText = article?.textContent?.trim();
    if (readabilityText) {
      return readabilityText;
    }

    // Prune a private copy: removing nav/header/footer from the shared
    // document would hide their links from extractLinks().
    const $pruned = cheerio.load($.html());
    $pruned(HtmlParser.NOISE_SELECTOR).remove();

    for (const selector of HtmlParser.CONTENT_SELECTORS) {
      // Skip containers that exist but hold no text.
      const text = $pruned(selector).first().text().trim();
      if (text) {
        return text;
      }
    }

    return $pruned('body').text().trim();
  }

  /**
   * Collects metadata from `<title>`, `<meta>` tags and the `<html lang>`
   * attribute. Fields with no source value are left `undefined`.
   *
   * @param _content Unused; kept so existing call sites remain valid.
   */
  private extractMetadata(
    $: cheerio.CheerioAPI,
    _content: string,
  ): Omit<PageMetadata, 'wordCount' | 'links'> {
    const metadata: Omit<PageMetadata, 'wordCount' | 'links'> = {};
    const metaContent = (selector: string): string | undefined =>
      $(selector).attr('content');

    // Only the first <title>: inline SVGs may carry their own <title> elements.
    metadata.title = $('title').first().text().trim() || undefined;

    metadata.description =
      metaContent('meta[name="description"]') ||
      metaContent('meta[property="og:description"]') ||
      undefined;

    const keywordsContent = metaContent('meta[name="keywords"]');
    if (keywordsContent) {
      // Drop empty entries produced by stray or trailing commas.
      metadata.keywords = keywordsContent
        .split(',')
        .map(keyword => keyword.trim())
        .filter(keyword => keyword.length > 0);
    }

    metadata.author =
      metaContent('meta[name="author"]') ||
      metaContent('meta[property="article:author"]') ||
      undefined;

    metadata.publishedTime =
      metaContent('meta[property="article:published_time"]') ||
      metaContent('meta[name="date"]') ||
      undefined;

    metadata.modifiedTime =
      metaContent('meta[property="article:modified_time"]') ||
      metaContent('meta[name="last-modified"]') ||
      undefined;

    metadata.language =
      $('html').attr('lang') ||
      metaContent('meta[http-equiv="content-language"]') ||
      undefined;

    const contentTypeHeader = metaContent('meta[http-equiv="content-type"]');

    // Prefer <meta charset>; otherwise read it from the content-type value,
    // ignoring case, optional quotes and surrounding whitespace.
    metadata.charset =
      $('meta[charset]').attr('charset') ||
      contentTypeHeader?.match(HtmlParser.CHARSET_PATTERN)?.[1] ||
      undefined;

    metadata.contentType = contentTypeHeader || undefined;

    return metadata;
  }

  /**
   * Returns the unique link targets of all `<a href>` elements, resolved
   * against the base URL, in document order. `javascript:` and `mailto:`
   * links and URLs rejected by `isValidUrl` are skipped.
   */
  private extractLinks($: cheerio.CheerioAPI): string[] {
    // A Set keeps insertion order, so it handles both de-duplication and ordering.
    const unique = new Set<string>();

    $('a[href]').each((_, element) => {
      // Trim first so leading whitespace cannot hide a skipped scheme.
      const href = $(element).attr('href')?.trim();
      if (!href || HtmlParser.SKIPPED_SCHEME.test(href)) return;

      const absoluteUrl = resolveUrl(this.baseUrl, href);
      if (isValidUrl(absoluteUrl)) {
        unique.add(absoluteUrl);
      }
    });

    const links = [...unique];
    logger.debug(`Extracted ${links.length} unique links from ${this.baseUrl}`);
    return links;
  }

  /** Counts whitespace-separated words in `text`; blank text yields 0. */
  private countWords(text: string): number {
    return text.split(/\s+/).filter(word => word.length > 0).length;
  }
}