// MCP Deep Web Research Server
// by PedroDnT
import * as cheerio from 'cheerio';
import htmlToMd from 'html-to-md';
import { ExtractedContent, ContentMetadata, ContentSection, ContentExtractionOptions } from '../types/content.js';
/** Type of the root query function returned by `cheerio.load` (conventionally named `$`). */
type CheerioRoot = ReturnType<typeof cheerio.load>;
/**
 * Extracts the readable content of an HTML page and renders it as Markdown.
 *
 * Pipeline (see {@link ContentExtractor.extract}): load the DOM, read metadata / title /
 * JSON-LD, strip boilerplate, pick the best content container, split it into sections,
 * convert the sections to Markdown and tidy the result.
 *
 * NOTE(review): this file reached review with its closing braces/brackets and several whole
 * statements missing. The structure below was restored from context; every statement that had
 * to be inferred is tagged `NOTE(review)` so it can be checked against version control.
 *
 * NOTE(review): progress is reported with `console.log`. If this runs inside a server that
 * speaks a protocol over stdout, confirm that logging there is acceptable.
 */
export class ContentExtractor {
  /** Selector matching every HTML heading level. */
  private static readonly HEADING_SELECTOR = 'h1, h2, h3, h4, h5, h6';

  /** Reading speed used to derive `readingTime` (minutes) from the word count. */
  private static readonly WORDS_PER_MINUTE = 200;

  /**
   * Whole-word match for vocabulary that marks text as technical. Word boundaries keep
   * unrelated words ("rapid", "classic") from matching "api" / "class"; simple plurals are
   * accepted.
   * NOTE(review): the original keyword list ended in a dangling `||`, so at least one further
   * keyword appears to have been lost.
   */
  private static readonly TECHNICAL_KEYWORDS =
    /\b(?:example|implementation|usage|api|method|function|parameter|return|class|interface|object)(?:s|es)?\b/i;

  /**
   * Exact-match texts of UI chrome (buttons, share links, login prompts).
   * NOTE(review): both original condition lists ended in a dangling `||`, so a third pattern
   * appears to have been lost; it is not reconstructed here.
   */
  private static readonly UI_TEXT_PATTERNS: readonly RegExp[] = [
    /^(close|dismiss|accept|cancel|loading|\d+ min read|share|menu|search)$/i,
    /^(follow us|subscribe|sign up|log in|register)$/i
  ];

  /** Elements matching any of these selectors are always treated as technical content. */
  private readonly technicalSelectors = [
    // Code blocks and examples
    'pre', 'code', '.example', '.code-example',
    // API and implementation details
    '.api-details', '.implementation-details',
    '.method-signature', '.function-signature',
    // Parameters and documentation
    '.parameters', '.returns', '.arguments',
    '.technical-docs', '.api-docs'
  ];

  /** Elements matching any of these selectors are removed unless they hold technical content. */
  private readonly boilerplateSelectors = [
    // Navigation elements
    'nav', 'header', 'footer',
    // Social sharing
    '.social-share', '.share-buttons', '[id*="share"]', '[class*="share"]',
    // Navigation menus
    '.menu', '.navigation', '#menu', '#nav',
    // Sidebars
    '.sidebar', '#sidebar', '[class*="sidebar"]',
    // Comments
    '#comments', '.comments', '.comment-section',
    // Advertisements
    '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]',
    // Popups and overlays
    '.popup', '.modal', '.overlay',
    // Common UI elements
    '.header-content', '.footer-content', '.site-header', '.site-footer',
    // Cookie notices and banners
    '.cookie-notice', '.cookie-banner', '.gdpr', '[class*="cookie"]', '[id*="cookie"]',
    // Search and related content
    '.search', '.search-form', '.related-posts', '.related-articles',
    // Common widget areas
    '.widget', '.widgets', '[class*="widget"]',
    // Newsletter and subscription forms
    '.newsletter', '.subscribe', '[class*="newsletter"]', '[class*="subscribe"]',
    // Social media elements
    '.social', '.social-media', '[class*="social"]',
    // Print and utility links
    '.print', '.utility-nav', '[class*="print"]',
    // Common dynamic elements
    '[data-widget]', '[data-module]',
    // Common tracking and analytics
    '[data-analytics]', '[data-tracking]',
    // Additional UI elements
    'button', '[role="button"]', '.button', '.btn',
    // Footer-like elements
    '[class*="footer"]', '[id*="footer"]', 'c4d-footer', 'c4d-footer-container',
    // Navigation-like elements
    '[class*="nav"]', '[id*="nav"]', 'c4d-nav',
    // Legal and policy elements
    '[class*="legal"]', '[id*="legal"]', '[class*="policy"]', '[id*="policy"]',
    // Common web components
    // NOTE(review): the entries of this group appear to be missing from the source.
    // Additional cookie-related elements
    '[class*="cookie-preferences"]', '[id*="cookie-preferences"]',
    '[class*="cookie-settings"]', '[id*="cookie-settings"]',
    '[class*="cookie-consent"]', '[id*="cookie-consent"]',
    // Additional button-related elements
    '[class*="btn-"]', '[id*="btn-"]', '[class*="button-"]', '[id*="button-"]',
    // Additional navigation elements
    '[class*="menu-"]', '[id*="menu-"]', '[class*="navigation-"]', '[id*="navigation-"]',
    // Additional footer elements
    '[class*="bottom-"]', '[id*="bottom-"]', '[class*="foot-"]', '[id*="foot-"]'
  ];

  /**
   * Options handed to `htmlToMd`.
   * NOTE(review): confirm against the html-to-md documentation that these keys (in particular
   * `customRules`) are honoured by the installed version.
   */
  private htmlToMarkdownOptions = {
    skipTags: [], // Don't skip any tags by default
    emDelimiter: '_',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    headingStyle: 'atx',
    keepReplacement: true,
    keepHtml: false,
    listStyle: 'dash',
    codeStyle: 'fenced',
    customRules: [
      {
        // Custom rule for links: only external (http/https) links keep their target.
        selector: 'a',
        replacement: (content: string, node: any) => {
          const href = node.getAttribute('href');
          if (href && href.startsWith('http')) {
            return `[${content}](${href})`;
          }
          return content;
        }
      },
      {
        // Custom rule for images: replaced by their alt text, dropped when there is none.
        selector: 'img',
        replacement: (_content: string, node: any) => {
          const alt = node.getAttribute('alt');
          return alt ? `[Image: ${alt}]` : '';
        }
      },
      {
        // Custom rule for tables
        selector: 'table',
        replacement: (_content: string, node: any) => {
          return this.convertTableToMarkdown(node);
        }
      }
    ]
  };

  /**
   * Renders a table node as a Markdown pipe table.
   *
   * The first row is always the header row. Rows are padded to the widest row, cell whitespace
   * is collapsed and literal `|` is escaped so cell text cannot break the table.
   *
   * @param tableNode table markup/node as handed over by the Markdown converter
   * @returns the Markdown table wrapped in blank lines, or `''` when there are no rows/cells
   */
  private convertTableToMarkdown(tableNode: any): string {
    const $ = cheerio.load(tableNode);

    const rows = $('tr').toArray();
    if (rows.length === 0) return '';

    // Header and data cells are read the same way so a row-header <th> is not lost.
    const table = rows.map(row =>
      $(row)
        .find('th, td')
        .toArray()
        .map(cell => $(cell).text().replace(/\s+/g, ' ').trim().replace(/\|/g, '\\|') || ' ')
    );

    const maxColumns = Math.max(...table.map(cells => cells.length));
    if (maxColumns === 0) return '';

    const renderRow = (cells: string[]): string => {
      const padded = [...cells];
      while (padded.length < maxColumns) {
        padded.push(' ');
      }
      return '| ' + padded.join(' | ') + ' |\n';
    };

    let markdown = '\n';
    table.forEach((cells, index) => {
      markdown += renderRow(cells);
      if (index === 0) {
        // Separator directly after the header. The first row is never repeated as data
        // (it previously was whenever it contained no <th>).
        markdown += '|' + Array(maxColumns).fill(' --- ').join('|') + '|\n';
      }
    });

    return markdown + '\n';
  }

  /**
   * Extracts title, metadata and Markdown content from an HTML document.
   *
   * @param html raw HTML of the page
   * @param url address the HTML was fetched from (logged and echoed in the result)
   * @param options `extractStructuredData`, `includeHtml` and `maxContentLength` are read here
   * @returns the extracted content with an ISO-8601 `timestamp`
   */
  public async extract(html: string, url: string, options: ContentExtractionOptions = {}): Promise<ExtractedContent> {
    console.log('Starting content extraction for URL:', url);
    console.log('Initial HTML length:', html.length);

    const $ = cheerio.load(html);
    console.log('DOM loaded successfully');

    // Metadata, title and JSON-LD live in <meta>/<script>/<header> elements that the cleanup
    // below removes, so they are read first.
    console.log('Extracting metadata...');
    const metadata = this.extractMetadata($);

    const title = this.extractTitle($);
    console.log('Extracted title:', title);

    // Extract structured data
    const structuredData = options.extractStructuredData ?
      this.extractStructuredData($) : undefined;

    // Remove unwanted elements
    console.log('Cleaning up DOM...');
    // NOTE(review): inferred call; the statement between these two log lines was missing.
    this.cleanupDOM($);
    console.log('DOM cleanup complete');

    // Word statistics describe the cleaned body, not scripts and navigation.
    this.applyReadingStats(metadata, $('body').text());
    console.log('Metadata extracted:', metadata);

    // Extract main content sections
    console.log('Extracting content sections...');
    const sections = this.extractContentSections($);
    console.log('Found sections:', sections.length);
    sections.forEach((section, index) => {
      console.log(`Section ${index + 1}:`, {
        type: section.type,
        title: section.title,
        importance: section.importance,
        contentLength: section.content.length
      });
    });

    // Convert content to markdown
    console.log('Converting content to markdown...');
    const mainContent = sections
      .map(section => section.content)
      // NOTE(review): inferred; the joining step was missing from the source.
      .join('\n\n');

    const content = htmlToMd(mainContent, this.htmlToMarkdownOptions);
    console.log('Markdown conversion complete. Length:', content.length);

    // Clean up and format the content
    console.log('Cleaning and formatting content...');
    const cleanedContent = this.cleanContent(this.formatMarkdown(content));
    console.log('Content cleanup complete. Final length:', cleanedContent.length);

    // NOTE(review): only `content`, `html` and `timestamp` survived in the source; the other
    // fields are inferred from the values computed above. Verify against `ExtractedContent`.
    const result = {
      url,
      title,
      content: this.truncateContent(cleanedContent, options.maxContentLength),
      html: options.includeHtml ? html : undefined,
      timestamp: new Date().toISOString(),
      metadata,
      structuredData
    };

    console.log('Content extraction complete');
    return result;
  }

  /**
   * Strips non-content markup from the whole document and reduces `<body>` to the main
   * content area.
   */
  private cleanupDOM($: CheerioRoot): void {
    console.log('Starting DOM cleanup...');

    // First pass: Remove obvious non-content elements
    $('script, style, noscript, iframe, form, link, meta').remove();
    $('[style*="display: none"], [style*="display:none"], [hidden]').remove();

    // Second pass: Identify and preserve main content areas
    // NOTE(review): the original selector list was missing; this one is inferred from the
    // containers scored in extractContentSections. Verify.
    const mainContentSelectors = [
      'article',
      '[role="main"]',
      'main',
      '.main-content',
      '#main-content',
      '.post-content',
      '.article-content',
      '.entry-content',
      '.content',
      '.documentation',
      '.markdown-body'
    ];

    let mainContent = $('body');
    for (const selector of mainContentSelectors) {
      const element = $(selector);
      if (element.length > 0) {
        mainContent = element;
        console.log(`Found main content using selector: ${selector}`);
        // NOTE(review): inferred; the list is ordered by priority, so the first hit wins.
        break;
      }
    }

    // Third pass: Remove boilerplate from main content
    this.removeBoilerplate($, mainContent);

    // Fourth pass: Clean up remaining elements
    mainContent.find('*').each((_, elem) => {
      const $elem = $(elem);
      const text = $elem.text().trim();

      // Skip if element contains technical content
      if (this.containsTechnicalContent($elem)) {
        return;
      }

      // Remove elements that are clearly UI components
      if (this.isUiText(text)) {
        $elem.remove();
        return;
      }

      // Remove empty elements except code blocks and images (an <img> has no text of its own).
      if (!$elem.is('pre, code, img') && text === '' && !$elem.find('img').length) {
        $elem.remove();
      }
    });

    // Fifth pass: Remove duplicate content but preserve code blocks
    this.removeDuplicateBlocks($, mainContent);

    // Replace body content with cleaned main content
    // NOTE(review): inferred from the comment above; the statement itself was missing.
    if (!mainContent.is('body')) {
      $('body').empty().append(mainContent);
    }

    console.log('DOM cleanup complete');
  }

  /** Removes every boilerplate-selector match under `$root` that holds no technical content. */
  private removeBoilerplate($: CheerioRoot, $root: cheerio.Cheerio): void {
    this.boilerplateSelectors.forEach(selector => {
      $root.find(selector).each((_, elem) => {
        const $elem = $(elem);
        if (!this.containsTechnicalContent($elem)) {
          $elem.remove();
        }
      });
    });
  }

  /**
   * Removes repeated text blocks under `$root`, keeping the first occurrence.
   * Only innermost blocks are compared: a wrapper whose text equals that of its single child
   * would otherwise cause the child to be deleted as a "duplicate" of the wrapper.
   */
  private removeDuplicateBlocks($: CheerioRoot, $root: cheerio.Cheerio): void {
    const blockSelector = 'p, li, td, div';
    const seen = new Set<string>();

    $root.find(blockSelector).each((_, elem) => {
      const $elem = $(elem);
      if (this.containsTechnicalContent($elem)) {
        return; // Don't deduplicate technical content
      }
      if ($elem.find(blockSelector).length > 0) {
        return; // Wrapper: its inner blocks are checked individually
      }

      const text = $elem.text().trim();
      if (!text) {
        return;
      }
      if (seen.has(text)) {
        $elem.remove();
      } else {
        seen.add(text);
      }
    });
  }

  /** True when `text` is exactly one of the known UI-chrome labels. */
  private isUiText(text: string): boolean {
    return ContentExtractor.UI_TEXT_PATTERNS.some(pattern => pattern.test(text));
  }

  /**
   * Decides whether an element should be protected from cleanup: it matches a technical
   * selector, contains `pre`/`code`, or its text uses technical vocabulary.
   */
  private containsTechnicalContent($elem: cheerio.Cheerio): boolean {
    // Check if element matches technical selectors
    if (this.technicalSelectors.some(selector => $elem.is(selector))) {
      return true;
    }

    // Check if element contains code blocks
    if ($elem.find('pre, code').length > 0) {
      return true;
    }

    // Check for technical keywords in text
    return ContentExtractor.TECHNICAL_KEYWORDS.test($elem.text());
  }

  /**
   * Line-level cleanup of the final Markdown: drops lines shorter than three characters or made
   * only of `- _ = * #`, then drops repeated lines (first occurrence kept).
   *
   * NOTE(review): the split/join steps were missing and are inferred from the per-line filters.
   * As written this also removes blank lines and short or repeated lines inside fenced code;
   * confirm that is intended.
   */
  private cleanContent(content: string): string {
    const seen = new Set<string>();

    return content
      // Remove duplicate newlines
      .replace(/\n{3,}/g, '\n\n')
      .split('\n')
      // Remove lines that are just special characters or very short
      .filter(line => {
        const trimmed = line.trim();
        if (trimmed.length < 3) return false;
        if (/^[-_=*#]+$/.test(trimmed)) return false;
        return true;
      })
      // Remove duplicate paragraphs
      .filter(line => {
        if (seen.has(line)) return false;
        seen.add(line);
        return true;
      })
      .join('\n');
  }

  /**
   * Picks the page title: OpenGraph title, then the first `article h1`, then the first `h1`,
   * then `<title>`; `'Untitled'` when none has text.
   */
  private extractTitle($: CheerioRoot): string {
    const candidates = [
      $('meta[property="og:title"]').attr('content'),
      $('article h1').first().text(),
      $('h1').first().text(),
      $('title').text()
    ];

    for (const candidate of candidates) {
      // Whitespace-only headings must not win over a later, real title.
      const title = (candidate || '').trim();
      if (title) return title;
    }
    return 'Untitled';
  }

  /**
   * Reads author, dates and language from the document and computes word statistics for
   * `<body>`.
   *
   * NOTE(review): the author and date chains each ended in a dangling `||`; the lost fallbacks
   * are replaced by `undefined`, matching the `language` line.
   */
  private extractMetadata($: CheerioRoot): ContentMetadata {
    const metadata: ContentMetadata = {};

    // Extract author
    metadata.author =
      $('meta[name="author"]').attr('content') ||
      $('meta[property="article:author"]').attr('content') ||
      $('.author').first().text().trim() ||
      undefined;

    // Extract dates
    metadata.datePublished =
      $('meta[property="article:published_time"]').attr('content') ||
      $('meta[name="publication-date"]').attr('content') ||
      undefined;

    metadata.lastModified =
      $('meta[property="article:modified_time"]').attr('content') ||
      undefined;

    // Extract language
    metadata.language = $('html').attr('lang') || undefined;

    // Calculate reading time and word count
    this.applyReadingStats(metadata, $('body').text());

    return metadata;
  }

  /** Stores the word count of `text` and the derived reading time (minutes) on `metadata`. */
  private applyReadingStats(metadata: ContentMetadata, text: string): void {
    const trimmed = text.trim();
    // ''.split(/\s+/) yields [''], which would report one word for an empty document.
    const words = trimmed === '' ? 0 : trimmed.split(/\s+/).length;
    metadata.wordCount = words;
    metadata.readingTime = Math.ceil(words / ContentExtractor.WORDS_PER_MINUTE);
  }

  /**
   * Chooses the best content container and splits it into sections.
   *
   * A new section starts at every heading, at every technical element and at every
   * `section`/`article` that contains a heading. Section `content` is HTML.
   */
  private extractContentSections($: CheerioRoot): ContentSection[] {
    console.log('Starting content section extraction...');
    const sections: ContentSection[] = [];

    // Enhanced main content selectors with scoring
    const mainSelectors = [
      { selector: 'article[class*="content"]', score: 10 },
      { selector: '[role="main"]', score: 9 },
      { selector: 'main', score: 8 },
      { selector: '.main-content', score: 8 },
      { selector: '#main-content', score: 8 },
      { selector: '.post-content', score: 7 },
      { selector: '.article-content', score: 7 },
      { selector: '.entry-content', score: 7 },
      { selector: '.content', score: 6 },
      { selector: '.documentation', score: 8 },
      { selector: '.markdown-body', score: 7 },
      { selector: '[itemprop="articleBody"]', score: 8 },
      { selector: '[data-content-type="article"]', score: 8 }
    ];

    // Find best content container based on scoring; <body> is the fallback.
    let bestScore = 0;
    let mainContent: cheerio.Cheerio = $('body');

    mainSelectors.forEach(({ selector, score }) => {
      $(selector).each((_, element) => {
        const $element = $(element);
        // Base score of the selector, boosted (or penalised) by content quality.
        const elementScore = score + this.evaluateContentQuality($element);

        if (elementScore > bestScore) {
          bestScore = elementScore;
          mainContent = $element;
          console.log(`Found better content container: ${selector} (score: ${elementScore})`);
        }
      });
    });

    // Clean up the selected content container
    this.cleanupContentContainer($, mainContent);

    // Extract sections based on semantic structure
    let currentSection: ContentSection = {
      id: 'main',
      content: '',
      importance: 1,
      type: 'main'
    };

    // Process content hierarchically
    mainContent.find('h1, h2, h3, h4, h5, h6, p, pre, code, .example, .implementation, .method, .function, section, article').each((_, element) => {
      const $element = $(element);
      const text = $element.text().trim();

      if (!text) return;

      // Check for section breaks
      const isHeading = $element.is(ContentExtractor.HEADING_SELECTOR);
      const isTechnical = this.containsTechnicalContent($element);
      const isNewSection = $element.is('section, article') &&
        $element.find(ContentExtractor.HEADING_SELECTOR).length > 0;

      if (isHeading || isTechnical || isNewSection) {
        // Save current section if it has content
        if (currentSection.content.trim()) {
          sections.push(currentSection);
        }

        // Calculate importance
        const importance = this.calculateSectionImportance($element, isHeading, isTechnical);

        // Create new section
        currentSection = {
          id: `section-${sections.length + 1}`,
          title: isHeading ? text : (isTechnical ? 'Technical Content' : 'Content Section'),
          content: '',
          importance,
          type: isTechnical ? 'technical' : 'main'
        };
      }

      // Add content to current section
      if (isTechnical) {
        // Include context for technical content
        const context = this.getContextualContent($, $element);
        currentSection.content += '\n' + (context || $element.html() || '');
      } else {
        currentSection.content += '\n' + ($element.html() || '');
      }
    });

    // Add final section
    if (currentSection.content.trim()) {
      sections.push(currentSection);
    }

    console.log(`Extracted ${sections.length} content sections`);
    return sections;
  }

  /**
   * Heuristic quality score of a container: technical terms, code blocks, headings,
   * paragraphs and lists add to it, boilerplate descendants subtract. May be negative.
   */
  private evaluateContentQuality($element: cheerio.Cheerio): number {
    let score = 0;

    // Check for technical content density
    const text = $element.text();
    const technicalTerms = text.match(/\b(api|function|method|class|interface|example|implementation|code|return|parameter)\b/gi);
    if (technicalTerms) {
      score += technicalTerms.length * 0.5;
    }

    // Check for code blocks
    score += $element.find('pre, code').length * 2;

    // Check for proper content structure
    score += $element.find(ContentExtractor.HEADING_SELECTOR).length;
    score += $element.find('p').length * 0.5;
    score += $element.find('ul, ol').length;

    // Penalize for common boilerplate
    score -= $element.find(this.boilerplateSelectors.join(', ')).length * 2;

    return score;
  }

  /**
   * Importance of a new section in the range [0, 1].
   *
   * @param $element element that opens the section
   * @param isHeading whether the element is `h1`-`h6`; the level lowers the base value
   * @param isTechnical whether the element counts as technical content (+0.3)
   */
  private calculateSectionImportance($element: cheerio.Cheerio, isHeading: boolean, isTechnical: boolean): number {
    let importance = 0.5;

    if (isHeading) {
      // tagName is e.g. "H2"; everything after the first character is the level.
      const tagName = String($element.prop('tagName') || '');
      const level = parseInt(tagName.slice(1), 10);
      if (!Number.isNaN(level)) {
        importance = Math.max(0.5, 1 - (level - 1) * 0.1);
      }
    }

    if (isTechnical) {
      importance += 0.3;
    }

    // Boost importance based on content quality (the quality score can be negative).
    const contentQuality = this.evaluateContentQuality($element);
    importance += Math.min(0.2, contentQuality * 0.05);

    return Math.max(0, Math.min(1, importance));
  }

  /**
   * Walks up at most three ancestors while the parent carries clearly more text (> 1.5x) than
   * the current container, and returns the last container reached.
   *
   * NOTE(review): the original condition ended in a dangling `&&`; a second criterion
   * (possibly an upper bound on the parent's size) appears to have been lost.
   */
  private findContextContainer($: CheerioRoot, $element: cheerio.Cheerio): cheerio.Cheerio {
    let $container = $element;
    const maxDepth = 3; // Prevent going too far up the DOM

    for (let depth = 0; depth < maxDepth; depth++) {
      const $parent = $container.parent();
      if (!$parent.length) break;

      // Check if parent provides good context
      const parentText = $parent.text().trim();
      const hasContext = parentText.length > $container.text().length * 1.5;
      if (!hasContext) break;

      $container = $parent;
    }

    return $container;
  }

  /**
   * HTML of a technical element's context container, with the preceding heading/paragraph and
   * the following paragraph included when they are technical themselves.
   *
   * @returns the combined HTML, or `null` when no container is found
   */
  private getContextualContent($: CheerioRoot, $element: cheerio.Cheerio): string | null {
    const container = this.findContextContainer($, $element);
    if (!container.length) return null;

    let content = '';

    // Get previous sibling if it's a heading or description
    const $prevSibling = container.prev();
    if ($prevSibling.is('h1, h2, h3, h4, p') &&
        this.containsTechnicalContent($prevSibling)) {
      content += ($prevSibling.html() || '') + '\n';
    }

    content += container.html() || '';

    // Get next sibling if it provides additional context
    // NOTE(review): inferred; the right-hand side of this assignment was missing.
    const $nextSibling = container.next();
    if ($nextSibling.is('p') &&
        this.containsTechnicalContent($nextSibling)) {
      content += '\n' + ($nextSibling.html() || '');
    }

    return content;
  }

  /**
   * Importance of a single element from its heading level, vocabulary and code content.
   *
   * NOTE(review): no caller is visible in this file. The keyword list ended in a dangling
   * `||`, so one keyword appears to have been lost.
   */
  private calculateImportance($element: cheerio.Cheerio): number {
    let importance = 0.5;

    // Base importance on heading level
    if ($element.is('h1')) importance = 1;
    else if ($element.is('h2')) importance = 0.8;
    else if ($element.is('h3')) importance = 0.6;

    // Increase importance based on content indicators
    const text = $element.text().toLowerCase();
    const indicators = ['example', 'implementation', 'usage', 'api', 'method', 'function', 'parameter'];
    if (indicators.some(indicator => text.includes(indicator))) {
      importance += 0.2;
    }

    // Increase importance if contains code
    if ($element.find('code').length > 0 || $element.is('pre')) {
      importance += 0.2;
    }

    // Increase importance for technical elements
    if ($element.is(this.technicalSelectors.join(','))) {
      importance += 0.1;
    }

    return Math.min(importance, 1);
  }

  /** Collects every parseable JSON-LD block of the document. */
  private extractStructuredData($: CheerioRoot): any[] {
    const structuredData: any[] = [];

    // Extract JSON-LD
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const data = JSON.parse($(element).html() || '{}');
        // NOTE(review): inferred; the statement that stores the parsed block was missing.
        structuredData.push(data);
      } catch (error) {
        // Ignore invalid JSON: one malformed block must not abort the extraction.
      }
    });

    return structuredData;
  }

  /**
   * Normalises Markdown produced by the converter: list markers, spacing around headings and
   * lists, inline code vs. fenced code, and table header separators.
   */
  private formatMarkdown(content: string): string {
    // First pass: Basic cleanup
    let formatted = content
      // Fix list markers
      .replace(/^\* /gm, '- ')
      // Add spacing around headers
      .replace(/^(#{1,6} .+)$/gm, '\n$1\n')
      // Add spacing around lists
      .replace(/^(- .+)$/gm, '$1\n');

    // Handle code blocks: multi-line spans and spans mentioning `function` become fenced blocks.
    formatted = formatted.replace(/`([^`]+)`/g, (_match: string, code: string) => {
      if (code.includes('\n') || code.includes('function')) {
        return '\n\n```\n' + code.trim() + '\n```\n\n';
      }
      return '`' + code.trim() + '`';
    });

    // Add spacing between sections
    formatted = formatted.replace(/^(#{1,6} .*)/gm, '\n\n$1\n');

    // Handle tables: rewrite every row, drop existing separator rows and emit exactly one
    // separator after the first row of each table. A row belongs to the current table when it
    // starts where the previous row ended. (The header test used to inspect the string as it
    // was before this replace ran, so it gave the same answer for every row.)
    let previousRowEnd = -1;
    let headerEmitted = false;
    formatted = formatted.replace(/\|(.*)\|\n/g, (match: string, row: string, offset: number) => {
      if (offset !== previousRowEnd) {
        headerEmitted = false; // a new table starts here
      }
      previousRowEnd = offset + match.length;

      // Empty cells are kept so columns stay aligned.
      const cells = row.split('|').map((cell: string) => cell.trim());
      const filledCells = cells.filter((cell: string) => cell);
      if (filledCells.length === 0) return '';

      // Detect if this is a separator row
      if (filledCells.every(cell => /^[-\s]+$/.test(cell))) {
        return ''; // Skip separator rows, we'll add our own
      }

      const line = '| ' + cells.map(cell => cell || ' ').join(' | ') + ' |\n';
      if (!headerEmitted) {
        headerEmitted = true;
        const separator = cells.map(() => '---').join(' | ');
        return line + '| ' + separator + ' |\n';
      }
      return line;
    });

    // Final cleanup
    return formatted
      // Fix paragraph spacing
      .replace(/\n{3,}/g, '\n\n')
      // Ensure sections are properly separated
      .replace(/(\w)\n(#{1,6} )/g, '$1\n\n$2')
      // Add proper spacing around code blocks
      .replace(/```/g, '\n```\n')
      .replace(/\n{4,}/g, '\n\n\n')
      // NOTE(review): inferred; the chain's final call was missing from the source.
      .trim();
  }

  /**
   * Cleans the chosen content container in place: boilerplate, (near-)empty elements, UI labels
   * and repeated blocks are removed; technical content and elements with images are kept.
   */
  private cleanupContentContainer($: CheerioRoot, $container: cheerio.Cheerio): void {
    console.log('Cleaning up content container...');

    // Remove nested boilerplate elements (kept when they contain technical content)
    this.removeBoilerplate($, $container);

    // Remove empty elements
    $container.find('*').each((_, elem) => {
      const $elem = $(elem);
      const text = $elem.text().trim();

      // Skip technical content, images and elements with images
      if (this.containsTechnicalContent($elem) || $elem.is('img') || $elem.find('img').length > 0) {
        return;
      }

      // Remove if empty or just whitespace
      if (!text || text.length < 3) {
        $elem.remove();
        return;
      }

      // Remove common UI text patterns
      if (this.isUiText(text)) {
        $elem.remove();
      }
    });

    // Remove duplicate content
    this.removeDuplicateBlocks($, $container);

    console.log('Content container cleanup complete');
  }

  /**
   * Shortens `content` to at most `maxLength` characters plus a trailing `'...'`.
   *
   * @param content text to shorten
   * @param maxLength maximum length; when omitted or 0 the text is returned unchanged
   * @returns the text cut at the last space inside the limit, or at the limit itself when
   *   there is no usable space
   */
  private truncateContent(content: string, maxLength?: number): string {
    if (!maxLength || content.length <= maxLength) {
      return content;
    }

    // Truncate at word boundary
    const truncated = content.slice(0, maxLength);
    const lastSpace = truncated.lastIndexOf(' ');
    // lastSpace is -1 without any space (slice(0, -1) would drop a real character) and 0 when
    // the only space is leading (which would leave nothing).
    const cut = lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated;
    return cut + '...';
  }
}