/**
* Article Content Extractor
* Extracts clean text content from web articles using Mozilla Readability
*/
import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import * as cheerio from 'cheerio';
import { createLearnLogger } from '../utils/custom-logger.js';
/**
 * Extracts readable article content and page metadata from http(s) URLs.
 *
 * Pipeline used by `extract`: fetch the page, verify the response is HTML,
 * read `<meta>`/`<link>` metadata with cheerio, then run Mozilla Readability
 * over a JSDOM document to isolate the main article text.
 */
export class ArticleExtractor {
  constructor() {
    this.logger = createLearnLogger('ArticleExtractor');
  }

  /**
   * Check if URL is a web article (i.e. a syntactically valid http/https URL).
   *
   * @param {string} url - Candidate URL.
   * @returns {boolean} True when the URL parses and uses http: or https:.
   */
  canHandle(url) {
    try {
      const { protocol } = new globalThis.URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Run an async task under a deadline enforced through an AbortSignal.
   *
   * The signal is handed to the task so it can be forwarded to `fetch`.
   * The timer is always cleared, so a finished task never keeps the event
   * loop alive.
   *
   * @param {number} timeoutMs - Deadline in milliseconds.
   * @param {(signal: AbortSignal) => Promise<*>} task - Work to run.
   * @returns {Promise<*>} Whatever the task resolves to.
   * @throws {Error} A "timed out" error (with the abort error as `cause`)
   *   when the deadline fired; otherwise the task's own error, unchanged.
   */
  async withTimeout(timeoutMs, task) {
    const controller = new globalThis.AbortController();
    const timer = globalThis.setTimeout(() => controller.abort(), timeoutMs);
    try {
      return await task(controller.signal);
    } catch (error) {
      // Only translate aborts caused by our own timer; pass everything else through.
      if (controller.signal.aborted && error && error.name === 'AbortError') {
        throw new Error(`Request timed out after ${timeoutMs}ms`, { cause: error });
      }
      throw error;
    } finally {
      globalThis.clearTimeout(timer);
    }
  }

  /**
   * Decide whether a Content-Type header value denotes an HTML document.
   *
   * @param {string|null|undefined} contentType - Raw header value.
   * @returns {boolean} True for text/html and application/xhtml+xml
   *   (media types are case-insensitive; parameters such as charset are ignored).
   */
  isHtmlContentType(contentType) {
    if (!contentType || typeof contentType !== 'string') return false;
    const normalized = contentType.toLowerCase();
    return normalized.includes('text/html') || normalized.includes('application/xhtml+xml');
  }

  /**
   * Resolve a possibly-relative href against a base URL.
   *
   * @param {string|undefined} href - Value read from the page.
   * @param {string} baseUrl - URL the page was fetched from.
   * @returns {string|undefined} Absolute URL, or `href` unchanged when it is
   *   empty or cannot be resolved.
   */
  resolveUrl(href, baseUrl) {
    if (!href) return href;
    try {
      return new globalThis.URL(href, baseUrl).href;
    } catch {
      return href;
    }
  }

  /**
   * Extract content from web article.
   *
   * @param {string} url - Article URL to fetch.
   * @returns {Promise<object>} Object with `type`, `url`, `metadata`,
   *   `content` (text, html, excerpt, counts, reading time), `extractedAt`
   *   and `extractionMethod`.
   * @throws {Error} "Failed to extract article content: ..." on any failure
   *   (HTTP error, non-HTML response, timeout, unparseable page); the
   *   underlying error is attached as `cause`.
   */
  async extract(url) {
    const fetchTimeoutMs = 30000;
    try {
      this.logger.debug('Starting article extraction', { url });

      // The deadline covers both the request and the body download.
      const html = await this.withTimeout(fetchTimeoutMs, async signal => {
        const response = await fetch(url, {
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; LearnMCP/1.0; +https://github.com/forest-mcp)',
          },
          // `timeout` is honoured by node-fetch v2 only; `signal` works on v2 and v3.
          timeout: fetchTimeoutMs,
          signal,
        });
        if (!response.ok) {
          throw new Error(`Failed to fetch article: ${response.status} ${response.statusText}`);
        }
        // Check the content type BEFORE reading the body so a large non-HTML
        // resource is not buffered into memory just to be rejected.
        const contentType = response.headers.get('content-type') || '';
        if (!this.isHtmlContentType(contentType)) {
          throw new Error(`Content is not HTML: ${contentType}`);
        }
        return response.text();
      });

      // Extract metadata using cheerio for better parsing
      const $ = cheerio.load(html);
      const metadata = this.extractMetadata($, url);

      // Use Readability (on a JSDOM document) to extract clean content.
      const dom = new JSDOM(html, { url });
      let article;
      try {
        article = new Readability(dom.window.document).parse();
      } finally {
        // The parsed result only holds strings, so the window can be released now.
        dom.window.close();
      }
      if (!article) {
        throw new Error('Failed to parse article content with Readability');
      }

      // Guard against a missing textContent so the counters below cannot throw.
      const text = article.textContent || '';
      const extractedContent = {
        type: 'article',
        url,
        metadata: {
          title: article.title || metadata.title || 'Unknown Article',
          author: metadata.author || null,
          publishDate: metadata.publishDate || null,
          description: metadata.description || null,
          siteName: metadata.siteName || this.extractDomain(url),
          language: metadata.language || 'unknown',
          keywords: metadata.keywords || [],
          canonicalUrl: metadata.canonicalUrl || url,
        },
        content: {
          text,
          html: article.content,
          excerpt: article.excerpt || metadata.description || '',
          wordCount: this.countWords(text),
          characterCount: text.length,
          readingTime: this.estimateReadingTime(text),
        },
        extractedAt: new Date().toISOString(),
        extractionMethod: 'mozilla-readability + cheerio',
      };

      this.logger.info('Article extraction completed', {
        url,
        title: extractedContent.metadata.title,
        wordCount: extractedContent.content.wordCount,
        readingTime: extractedContent.content.readingTime,
      });
      return extractedContent;
    } catch (error) {
      this.logger.error('Article extraction failed', {
        url,
        error: error.message,
        stack: error.stack,
      });
      // Keep the original error reachable for callers that want the root cause.
      throw new Error(`Failed to extract article content: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extract metadata from HTML using cheerio.
   *
   * Each field tries several sources in priority order and takes the first
   * non-empty one. Only the first matching element is read and values are
   * trimmed, so whitespace-only values fall through to the next source.
   *
   * @param {Function} $ - Loaded cheerio instance (selector function).
   * @param {string} url - Page URL, used to resolve a relative canonical link.
   * @returns {object} Fields: title, author, description, publishDate,
   *   siteName, language, canonicalUrl, and keywords (only when the page
   *   declares a keywords meta tag). Missing values are undefined or ''.
   */
  extractMetadata($, url) {
    const attr = (selector, name) => {
      const value = $(selector).first().attr(name);
      return typeof value === 'string' ? value.trim() : value;
    };
    const text = selector => $(selector).first().text().trim();

    const metadata = {};

    // Title
    metadata.title =
      attr('meta[property="og:title"]', 'content') ||
      attr('meta[name="twitter:title"]', 'content') ||
      text('title') ||
      text('h1');

    // Author
    metadata.author =
      attr('meta[name="author"]', 'content') ||
      attr('meta[property="article:author"]', 'content') ||
      text('[rel="author"]');

    // Description
    metadata.description =
      attr('meta[property="og:description"]', 'content') ||
      attr('meta[name="twitter:description"]', 'content') ||
      attr('meta[name="description"]', 'content');

    // Publish date
    metadata.publishDate =
      attr('meta[property="article:published_time"]', 'content') ||
      attr('meta[name="date"]', 'content') ||
      attr('time[datetime]', 'datetime');

    // Site name
    metadata.siteName = attr('meta[property="og:site_name"]', 'content');

    // Language
    metadata.language =
      attr('html', 'lang') || attr('meta[http-equiv="content-language"]', 'content');

    // Keywords: drop empty entries produced by stray or trailing commas.
    const keywordsContent = attr('meta[name="keywords"]', 'content');
    if (keywordsContent) {
      metadata.keywords = keywordsContent
        .split(',')
        .map(keyword => keyword.trim())
        .filter(keyword => keyword.length > 0);
    }

    // Canonical URL (may be relative in the markup).
    metadata.canonicalUrl = this.resolveUrl(attr('link[rel="canonical"]', 'href'), url);

    return metadata;
  }

  /**
   * Extract domain from URL.
   *
   * @param {string} url - URL to inspect.
   * @returns {string} The hostname, or 'unknown' when the URL cannot be parsed.
   */
  extractDomain(url) {
    try {
      return new globalThis.URL(url).hostname;
    } catch {
      return 'unknown';
    }
  }

  /**
   * Count words in text.
   *
   * @param {string} text - Text to measure.
   * @returns {number} Number of whitespace-separated tokens; 0 for empty or
   *   non-string input.
   */
  countWords(text) {
    if (!text || typeof text !== 'string') return 0;
    return text
      .trim()
      .split(/\s+/)
      .filter(word => word.length > 0).length;
  }

  /**
   * Estimate reading time in minutes.
   *
   * @param {string} text - Text to measure.
   * @returns {number} Minutes, rounded up, at 200 words per minute.
   */
  estimateReadingTime(text) {
    const wordsPerMinute = 200; // Average reading speed
    return Math.ceil(this.countWords(text) / wordsPerMinute);
  }

  /**
   * Get estimated processing time.
   *
   * @param {string} url - Unused; the estimate is a fixed range.
   * @returns {{min: number, max: number, unit: string}} Fixed 10-45 second range.
   */
  getEstimatedProcessingTime(url) {
    // Article extraction is typically fast (10-45 seconds)
    return {
      min: 10,
      max: 45,
      unit: 'seconds',
    };
  }

  /**
   * Check if URL is accessible by issuing a HEAD request.
   *
   * Never throws: network errors and timeouts are reported in the result.
   *
   * @param {string} url - URL to probe.
   * @returns {Promise<object>} `{ accessible, status, contentType, contentLength }`
   *   on a response, or `{ accessible: false, error }` on failure.
   */
  async checkAccessibility(url) {
    const headTimeoutMs = 10000;
    try {
      const response = await this.withTimeout(headTimeoutMs, signal =>
        fetch(url, {
          method: 'HEAD',
          // `timeout` is honoured by node-fetch v2 only; `signal` works on v2 and v3.
          timeout: headTimeoutMs,
          signal,
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; LearnMCP/1.0)',
          },
        })
      );
      return {
        accessible: response.ok,
        status: response.status,
        contentType: response.headers.get('content-type'),
        contentLength: response.headers.get('content-length'),
      };
    } catch (error) {
      return {
        accessible: false,
        error: error.message,
      };
    }
  }
}