import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import { BaseExtractor, ExtractorResult, ExtractorOptions } from './base.js';
import { logger } from '@/utils/logger.js';
/**
 * Extractor that parses an HTML page with jsdom and runs Mozilla Readability
 * over it to obtain the main article text, then enriches the result with
 * author / publish-time / language metadata read from the full document.
 *
 * `this.options`, `sanitizeText`, `truncateContent` and `extractExcerpt` are
 * not defined in this class; they are presumably provided by `BaseExtractor`
 * (confirm against `./base.js`).
 */
export class ReadabilityExtractor extends BaseExtractor {
  /** Named character references decoded by {@link htmlToText}. */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  constructor(options: ExtractorOptions = {}) {
    super(options);
  }

  /**
   * Extracts the readable article from an HTML string.
   *
   * @param html - Raw HTML markup of the page.
   * @param url - URL of the page; passed to jsdom as the document URL and used in log messages.
   * @returns The extracted title, content, excerpt and document metadata.
   * @throws Error when Readability cannot find an article; any error raised
   *   while parsing is logged and rethrown unchanged.
   */
  async extract(html: string, url: string): Promise<ExtractorResult> {
    let dom: JSDOM | undefined;
    try {
      dom = new JSDOM(html, { url });
      const document = dom.window.document;

      // Readability mutates the document it is given, so it works on a clone
      // and the untouched original stays available for metadata lookups.
      const documentClone = document.cloneNode(true) as Document;
      const article = new Readability(documentClone).parse();
      if (!article) {
        throw new Error('Readability failed to extract content');
      }

      let content = article.textContent || '';
      if (this.options.preserveFormatting && article.content) {
        // Convert HTML to plain text while preserving some structure
        content = this.htmlToText(article.content);
      }
      content = this.sanitizeText(content);
      if (this.options.maxLength) {
        content = this.truncateContent(content, this.options.maxLength);
      }

      const result: ExtractorResult = {
        title: article.title || 'Untitled',
        content,
        excerpt: article.excerpt || this.extractExcerpt(content),
        author: this.extractAuthor(document),
        publishedTime: this.extractPublishTime(document),
        language: this.extractLanguage(document),
      };
      logger.debug(`Readability extracted ${content.length} characters from ${url}`);
      return result;
    } catch (error) {
      logger.error(`Readability extraction failed for ${url}:`, error);
      throw error;
    } finally {
      // Release the jsdom window once all values have been copied out as
      // plain strings; otherwise every call keeps a window alive.
      dom?.window.close();
    }
  }

  /**
   * Converts an HTML fragment to plain text while keeping a little structure:
   * headings become `# ` lines, list items become `- ` lines, inline code is
   * wrapped in backticks and `<pre>` blocks are wrapped in triple-backtick fences.
   * All remaining tags are dropped and character references are decoded.
   *
   * @param html - HTML fragment to convert.
   * @returns The plain-text rendering of the fragment.
   */
  private htmlToText(html: string): string {
    return html
      .replace(/<h[1-6][^>]*>/gi, '\n# ')
      .replace(/<\/h[1-6]>/gi, '\n')
      // The lookahead stops `<p` from also matching `<pre>`, `<picture>`, etc.;
      // without it the `<pre>` rule below never sees an opening tag.
      .replace(/<p(?=[\s/>])[^>]*>/gi, '\n')
      .replace(/<\/p>/gi, '\n')
      .replace(/<br[^>]*>/gi, '\n')
      // Same guard so that `<link>` is not treated as a list item.
      .replace(/<li(?=[\s/>])[^>]*>/gi, '\n- ')
      .replace(/<\/li>/gi, '')
      .replace(/<code[^>]*>/gi, '`')
      .replace(/<\/code>/gi, '`')
      .replace(/<pre[^>]*>/gi, '\n```\n')
      .replace(/<\/pre>/gi, '\n```\n')
      .replace(/<[^>]+>/g, '')
      // Decode character references in a single pass so that already-decoded
      // output is never re-scanned (e.g. `&amp;lt;` yields `&lt;`, not `<`).
      .replace(
        /&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi,
        (entity: string, body: string): string => {
          if (body.startsWith('#')) {
            const isHex = body.charAt(1).toLowerCase() === 'x';
            const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
            const isValid = codePoint > 0 && codePoint <= 0x10ffff;
            return isValid ? String.fromCodePoint(codePoint) : entity;
          }
          // Unknown named references are left untouched.
          return ReadabilityExtractor.NAMED_ENTITIES.get(body) ?? entity;
        },
      );
  }

  /**
   * Returns the first non-blank value found by walking `sources` in order.
   * For each matching element the listed attributes are tried in order and
   * the element's text content is used as the last resort.
   *
   * @param document - Document to query.
   * @param sources - CSS selectors paired with the attributes to read from a match.
   * @returns The trimmed value, or `undefined` when nothing usable is found.
   */
  private firstMetadataValue(
    document: Document,
    sources: ReadonlyArray<{ selector: string; attributes: readonly string[] }>,
  ): string | undefined {
    for (const { selector, attributes } of sources) {
      const element = document.querySelector(selector);
      if (!element) {
        continue;
      }
      for (const attribute of attributes) {
        const value = element.getAttribute(attribute)?.trim();
        if (value) {
          return value;
        }
      }
      const text = element.textContent?.trim();
      if (text) {
        return text;
      }
    }
    return undefined;
  }

  /**
   * Looks for the article author in common meta tags and byline elements.
   *
   * @param document - Document to query.
   * @returns The trimmed author string, or `undefined` when none is found.
   */
  private extractAuthor(document: Document): string | undefined {
    return this.firstMetadataValue(document, [
      { selector: 'meta[name="author"]', attributes: ['content'] },
      { selector: 'meta[property="article:author"]', attributes: ['content'] },
      { selector: '[rel="author"]', attributes: ['content'] },
      { selector: '.author', attributes: ['content'] },
      { selector: '.byline', attributes: ['content'] },
      // Read the attribute the selector matched on; falling straight back to
      // text content could return the text of a whole container element.
      { selector: '[data-author]', attributes: ['content', 'data-author'] },
    ]);
  }

  /**
   * Looks for the publication time in common meta tags, `<time>` elements
   * and `data-date` attributes. The value is returned as found, not parsed.
   *
   * @param document - Document to query.
   * @returns The trimmed date string, or `undefined` when none is found.
   */
  private extractPublishTime(document: Document): string | undefined {
    return this.firstMetadataValue(document, [
      { selector: 'meta[property="article:published_time"]', attributes: ['content', 'datetime'] },
      { selector: 'meta[name="date"]', attributes: ['content', 'datetime'] },
      { selector: 'meta[name="publish_date"]', attributes: ['content', 'datetime'] },
      { selector: 'time[datetime]', attributes: ['content', 'datetime'] },
      // As with `data-author`, prefer the matched attribute over text content.
      { selector: '[data-date]', attributes: ['content', 'datetime', 'data-date'] },
    ]);
  }

  /**
   * Reads the document language from `<html lang>` or, failing that, from a
   * `content-language` meta tag.
   *
   * @param document - Document to query.
   * @returns The language value, or `undefined` when neither source is set.
   */
  private extractLanguage(document: Document): string | undefined {
    return document.documentElement.lang ||
      document.querySelector('meta[http-equiv="content-language"]')?.getAttribute('content') ||
      undefined;
  }
}