LearnMCP Server

Overview Schema Related Servers Score Discussions

LearnMCP
modules
content-extractors

article-extractor.js•6.29 KiB

/**
 * Article Content Extractor
 * Extracts clean text content from web articles using Mozilla Readability
 */

import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import * as cheerio from 'cheerio';
import { createLearnLogger } from '../utils/custom-logger.js';

/**
 * Return the first candidate that is a non-empty string after trimming.
 *
 * @param {...(string|undefined|null)} candidates - Values in priority order.
 * @returns {string|undefined} The trimmed winner, or undefined if none qualifies.
 */
function firstNonEmpty(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return undefined;
}

/**
 * Resolve a possibly-relative href against a base URL.
 *
 * Absolute hrefs are returned unchanged (only trimmed) so their exact spelling
 * is preserved; relative hrefs are resolved against `base`. If resolution
 * fails, the trimmed href is returned as-is.
 *
 * @param {string|undefined} href - The href attribute value.
 * @param {string} base - The URL of the page the href was found on.
 * @returns {string|undefined} Absolute URL when resolvable, undefined if href is empty.
 */
function resolveHref(href, base) {
  const cleaned = firstNonEmpty(href);
  if (!cleaned) return undefined;
  try {
    // Parses without a base only when the href is already absolute.
    new globalThis.URL(cleaned);
    return cleaned;
  } catch {
    // Not absolute: fall through and resolve against the page URL.
  }
  try {
    return new globalThis.URL(cleaned, base).href;
  } catch {
    return cleaned;
  }
}

/**
 * Best-effort release of a response whose body will not be read.
 *
 * @param {object} response - A fetch response; its `body` is destroyed when
 *   it exposes a `destroy` method.
 */
function discardResponseBody(response) {
  const body = response && response.body;
  if (body && typeof body.destroy === 'function') {
    body.destroy();
  }
}

export class ArticleExtractor {
  constructor() {
    this.logger = createLearnLogger('ArticleExtractor');
  }

  /**
   * Check if URL is a web article.
   *
   * @param {string} url - Candidate URL.
   * @returns {boolean} True when the URL parses and uses http: or https:.
   */
  canHandle(url) {
    try {
      const { protocol } = new globalThis.URL(url);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Extract content from web article.
   *
   * Fetches the page, verifies it is HTML, collects metadata with cheerio and
   * the readable body with Readability.
   *
   * @param {string} url - The article URL.
   * @returns {Promise<object>} Object with `type`, `url`, `metadata`, `content`,
   *   `extractedAt` and `extractionMethod`.
   * @throws {Error} "Failed to extract article content: ..." on any failure;
   *   the underlying error is attached as `cause`.
   */
  async extract(url) {
    let dom = null;
    try {
      this.logger.debug('Starting article extraction', { url });

      // Fetch the webpage
      // NOTE(review): `timeout` is an option of node-fetch v2; v3 dropped it in
      // favour of AbortSignal - confirm against the installed version.
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; LearnMCP/1.0; +https://github.com/forest-mcp)',
        },
        timeout: 30000,
      });

      if (!response.ok) {
        discardResponseBody(response);
        throw new Error(`Failed to fetch article: ${response.status} ${response.statusText}`);
      }

      // Check the declared type BEFORE reading the body so that large non-HTML
      // resources are rejected without being downloaded into memory.
      const contentType = response.headers.get('content-type') || '';
      if (!contentType.toLowerCase().includes('text/html')) {
        discardResponseBody(response);
        throw new Error(`Content is not HTML: ${contentType}`);
      }

      const html = await response.text();

      // Parse with JSDOM for Readability
      dom = new JSDOM(html, { url });
      const document = dom.window.document;

      // Extract metadata using cheerio for better parsing
      const $ = cheerio.load(html);
      const metadata = this.extractMetadata($, url);

      // Use Readability to extract clean content
      const article = new Readability(document).parse();
      if (!article) {
        throw new Error('Failed to parse article content with Readability');
      }

      // Guard against a missing textContent so the counts below cannot throw.
      const text = article.textContent || '';

      const extractedContent = {
        type: 'article',
        url,
        metadata: {
          title: article.title || metadata.title || 'Unknown Article',
          author: metadata.author || null,
          publishDate: metadata.publishDate || null,
          description: metadata.description || null,
          siteName: metadata.siteName || this.extractDomain(url),
          language: metadata.language || 'unknown',
          keywords: metadata.keywords || [],
          canonicalUrl: metadata.canonicalUrl || url,
        },
        content: {
          text,
          html: article.content,
          excerpt: article.excerpt || metadata.description || '',
          wordCount: this.countWords(text),
          characterCount: text.length,
          readingTime: this.estimateReadingTime(text),
        },
        extractedAt: new Date().toISOString(),
        extractionMethod: 'mozilla-readability + cheerio',
      };

      this.logger.info('Article extraction completed', {
        url,
        title: extractedContent.metadata.title,
        wordCount: extractedContent.content.wordCount,
        readingTime: extractedContent.content.readingTime,
      });

      return extractedContent;
    } catch (error) {
      this.logger.error('Article extraction failed', {
        url,
        error: error.message,
        stack: error.stack,
      });
      // Keep the original error reachable for callers that inspect `cause`.
      throw new Error(`Failed to extract article content: ${error.message}`, { cause: error });
    } finally {
      // Release the jsdom window; everything returned above is plain data.
      if (dom) {
        dom.window.close();
      }
    }
  }

  /**
   * Extract metadata from HTML using cheerio.
   *
   * Each field takes the first non-empty (trimmed) candidate; fields with no
   * candidate are left undefined. `keywords` is only set when a keywords meta
   * tag is present.
   *
   * @param {Function} $ - A cheerio instance loaded with the page HTML.
   * @param {string} url - The page URL, used to resolve a relative canonical link.
   * @returns {object} Metadata with title, author, description, publishDate,
   *   siteName, language, canonicalUrl and optionally keywords.
   */
  extractMetadata($, url) {
    const meta = selector => $(selector).attr('content');
    const metadata = {};

    // Title
    metadata.title = firstNonEmpty(
      meta('meta[property="og:title"]'),
      meta('meta[name="twitter:title"]'),
      $('title').text(),
      $('h1').first().text(),
    );

    // Author
    metadata.author = firstNonEmpty(
      meta('meta[name="author"]'),
      meta('meta[property="article:author"]'),
      $('[rel="author"]').text(),
    );

    // Description
    metadata.description = firstNonEmpty(
      meta('meta[property="og:description"]'),
      meta('meta[name="twitter:description"]'),
      meta('meta[name="description"]'),
    );

    // Publish date
    metadata.publishDate = firstNonEmpty(
      meta('meta[property="article:published_time"]'),
      meta('meta[name="date"]'),
      $('time[datetime]').attr('datetime'),
    );

    // Site name
    metadata.siteName = firstNonEmpty(meta('meta[property="og:site_name"]'));

    // Language
    metadata.language = firstNonEmpty(
      $('html').attr('lang'),
      meta('meta[http-equiv="content-language"]'),
    );

    // Keywords: drop empty entries produced by stray or trailing commas.
    const keywordsContent = meta('meta[name="keywords"]');
    if (keywordsContent) {
      metadata.keywords = keywordsContent
        .split(',')
        .map(keyword => keyword.trim())
        .filter(keyword => keyword.length > 0);
    }

    // Canonical URL (resolved against the page URL when relative)
    metadata.canonicalUrl = resolveHref($('link[rel="canonical"]').attr('href'), url);

    return metadata;
  }

  /**
   * Extract domain from URL.
   *
   * @param {string} url - URL to inspect.
   * @returns {string} The hostname, or 'unknown' when the URL cannot be parsed.
   */
  extractDomain(url) {
    try {
      return new globalThis.URL(url).hostname;
    } catch {
      return 'unknown';
    }
  }

  /**
   * Count words in text.
   *
   * @param {string} text - Text to measure.
   * @returns {number} Number of whitespace-separated tokens; 0 for empty or
   *   non-string input.
   */
  countWords(text) {
    if (!text || typeof text !== 'string') return 0;
    return text
      .trim()
      .split(/\s+/)
      .filter(word => word.length > 0).length;
  }

  /**
   * Estimate reading time in minutes.
   *
   * @param {string} text - Text to measure.
   * @returns {number} Word count divided by 200 words per minute, rounded up.
   */
  estimateReadingTime(text) {
    const wordsPerMinute = 200; // Average reading speed
    return Math.ceil(this.countWords(text) / wordsPerMinute);
  }

  /**
   * Get estimated processing time.
   *
   * @param {string} url - Accepted for interface compatibility; not used.
   * @returns {{min: number, max: number, unit: string}} A fixed 10-45 second estimate.
   */
  getEstimatedProcessingTime(url) {
    // Article extraction is typically fast (10-45 seconds)
    return {
      min: 10,
      max: 45,
      unit: 'seconds',
    };
  }

  /**
   * Check if URL is accessible.
   *
   * Issues a HEAD request and never throws: failures are reported in the result.
   *
   * @param {string} url - URL to probe.
   * @returns {Promise<object>} `{ accessible, status, contentType, contentLength }`
   *   on a completed request, or `{ accessible: false, error }` when it fails.
   */
  async checkAccessibility(url) {
    try {
      const response = await fetch(url, {
        method: 'HEAD',
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; LearnMCP/1.0)',
        },
      });

      return {
        accessible: response.ok,
        status: response.status,
        contentType: response.headers.get('content-type'),
        contentLength: response.headers.get('content-length'),
      };
    } catch (error) {
      return {
        accessible: false,
        error: error.message,
      };
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/BretMeraki/LearnMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

article-extractor.js•6.29 KiB