import * as cheerio from 'cheerio';
import { BaseExtractor, ExtractorResult, ExtractorOptions } from './base.js';
import { logger } from '@/utils/logger.js';
/**
 * Extracts article-style data (title, body text, author, publish time and
 * language) from raw HTML by querying a cheerio document.
 *
 * Text clean-up, truncation and excerpt generation are delegated to
 * `sanitizeText`, `truncateContent` and `extractExcerpt`; none of them is
 * defined in this class, so they are expected to come from `BaseExtractor`.
 */
export class CheerioExtractor extends BaseExtractor {
  /** A candidate container is accepted as main content only if its text is longer than this. */
  private static readonly MIN_CONTENT_LENGTH = 100;

  /**
   * @param options Extractor options forwarded unchanged to `BaseExtractor`.
   */
  constructor(options: ExtractorOptions = {}) {
    super(options);
  }

  /**
   * Parses `html` and builds an `ExtractorResult` from it.
   *
   * @param html Raw HTML markup to parse.
   * @param url Source URL; only used in log messages.
   * @returns The extracted title, content, excerpt, author, publish time and language.
   * @throws Re-throws any error raised during parsing/extraction after logging it.
   */
  async extract(html: string, url: string): Promise<ExtractorResult> {
    try {
      const $ = cheerio.load(html);

      // Prune boilerplate first: every lookup below sees only the pruned document.
      // NOTE(review): because <header>/<aside> are removed before the metadata
      // lookups run, an <h1>, byline or <time> nested inside them is not found.
      // Confirm this trade-off is intended.
      this.removeUnwantedElements($);

      const title = this.extractTitle($);
      const content = this.extractContent($);
      const author = this.extractAuthor($);
      const publishedTime = this.extractPublishTime($);
      const language = this.extractLanguage($);

      const sanitizedContent = this.sanitizeText(content);
      // A falsy maxLength (undefined or 0) means "do not truncate".
      const finalContent = this.options.maxLength
        ? this.truncateContent(sanitizedContent, this.options.maxLength)
        : sanitizedContent;

      const result: ExtractorResult = {
        title,
        content: finalContent,
        excerpt: this.extractExcerpt(finalContent),
        author,
        publishedTime,
        language,
      };

      logger.debug(`Cheerio extracted ${finalContent.length} characters from ${url}`);
      return result;
    } catch (error) {
      logger.error(`Cheerio extraction failed for ${url}:`, error);
      throw error;
    }
  }

  /**
   * Removes scripts, styles, embeds, page chrome, ads, social widgets,
   * comments and overlays from the document in place.
   */
  private removeUnwantedElements($: cheerio.CheerioAPI): void {
    $(
      'script, style, noscript, iframe, object, embed, ' +
        'nav, header, footer, aside, ' +
        '.nav, .navigation, .menu, .sidebar, ' +
        '.ads, .advertisement, .ad, ' +
        '.social, .share, .sharing, ' +
        '.comment, .comments, ' +
        '.popup, .modal, .overlay'
    ).remove();
  }

  /**
   * Returns the first non-empty value found by trying `sources` in order.
   *
   * For each source only the first matching element is inspected: its listed
   * attributes are tried in order, then its text content. Values are trimmed,
   * and whitespace-only values are treated as missing.
   *
   * @param $ Loaded cheerio document.
   * @param sources Selectors in order of preference, each with the attributes
   *   that may carry the value on the matched element.
   * @returns The first non-empty trimmed value, or `undefined` if none is found.
   */
  private pickFirstValue(
    $: cheerio.CheerioAPI,
    sources: ReadonlyArray<{ selector: string; attributes?: readonly string[] }>
  ): string | undefined {
    for (const { selector, attributes = [] } of sources) {
      const element = $(selector).first();
      if (!element.length) {
        continue;
      }

      for (const attribute of attributes) {
        const value = element.attr(attribute)?.trim();
        if (value) {
          return value;
        }
      }

      const text = element.text().trim();
      if (text) {
        return text;
      }
    }
    return undefined;
  }

  /**
   * Finds the page title, trying title-like headings first and the document
   * `<title>` last.
   *
   * @returns The trimmed title, or `'Untitled'` when nothing matches.
   */
  private extractTitle($: cheerio.CheerioAPI): string {
    return (
      this.pickFirstValue($, [
        { selector: 'h1.title, h1.post-title, h1.entry-title' },
        { selector: 'h1' },
        // The selector matches on the attribute, so the attribute carries the value.
        { selector: '[data-title]', attributes: ['data-title'] },
        { selector: '.title' },
        { selector: '#title' },
        { selector: 'title' },
      ]) ?? 'Untitled'
    );
  }

  /**
   * Returns the text of the first well-known content container whose trimmed
   * text is longer than `MIN_CONTENT_LENGTH`, falling back to the whole
   * `<body>` text.
   */
  private extractContent($: cheerio.CheerioAPI): string {
    const contentSelectors = [
      'main',
      '[role="main"]',
      '.content, .main-content, #content, #main',
      'article, .article',
      '.post, .entry',
      '.documentation, .docs',
      '.markdown-body',
    ];

    for (const selector of contentSelectors) {
      const element = $(selector).first();
      if (!element.length) {
        continue;
      }
      const text = element.text().trim();
      // Skip near-empty containers so a stub wrapper does not win over real content.
      if (text.length > CheerioExtractor.MIN_CONTENT_LENGTH) {
        return text;
      }
    }

    // Fallback to body content. These selectors are already stripped by
    // removeUnwantedElements() when called from extract(); the removal is
    // repeated so the fallback avoids navigation chrome on its own.
    $('nav, header, footer, aside, .nav, .navigation, .menu, .sidebar').remove();
    return $('body').text().trim();
  }

  /**
   * Finds the author from meta tags, `rel="author"` elements, byline classes
   * or a `data-author` attribute.
   *
   * @returns The trimmed author string, or `undefined` when none is found.
   */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    return this.pickFirstValue($, [
      { selector: 'meta[name="author"]', attributes: ['content'] },
      { selector: 'meta[property="article:author"]', attributes: ['content'] },
      { selector: '[rel="author"]', attributes: ['content'] },
      { selector: '.author, .byline, .post-author', attributes: ['content'] },
      // Read the data attribute itself; the element's text may be empty or
      // may be the text of an entire container.
      { selector: '[data-author]', attributes: ['content', 'data-author'] },
    ]);
  }

  /**
   * Finds the publish time from meta tags, `<time datetime>`, a `data-date`
   * attribute or date-like classes. The value is returned as found (trimmed),
   * without any date parsing.
   *
   * @returns The trimmed date string, or `undefined` when none is found.
   */
  private extractPublishTime($: cheerio.CheerioAPI): string | undefined {
    const dateAttributes = ['content', 'datetime'];
    return this.pickFirstValue($, [
      { selector: 'meta[property="article:published_time"]', attributes: dateAttributes },
      { selector: 'meta[name="date"]', attributes: dateAttributes },
      { selector: 'meta[name="publish_date"]', attributes: dateAttributes },
      { selector: 'time[datetime]', attributes: dateAttributes },
      // Read the data attribute itself rather than relying on element text.
      { selector: '[data-date]', attributes: [...dateAttributes, 'data-date'] },
      { selector: '.date, .publish-date, .post-date', attributes: dateAttributes },
    ]);
  }

  /**
   * Reads the document language from `<html lang>` or, failing that, from a
   * `content-language` meta tag.
   *
   * @returns The trimmed language value, or `undefined` when absent or blank.
   */
  private extractLanguage($: cheerio.CheerioAPI): string | undefined {
    return (
      $('html').attr('lang')?.trim() ||
      $('meta[http-equiv="content-language"]').attr('content')?.trim() ||
      undefined
    );
  }
}