- Google-Search-MCP-Server
- dist
- services
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import MarkdownIt from 'markdown-it';
import TurndownService from 'turndown';
export class ContentExtractor {
constructor() {
// Cache for webpage content (key: url + format, value: content)
this.contentCache = new Map();
// Cache expiration time in milliseconds (30 minutes)
this.cacheTTL = 30 * 60 * 1000;
this.md = new MarkdownIt();
this.turndownService = new TurndownService({
headingStyle: 'atx',
codeBlockStyle: 'fenced'
});
}
cleanText(text) {
// Remove multiple blank lines
text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
// Remove excessive spaces
text = text.replace(/ +/g, ' ');
return text.trim();
}
cleanMarkdown(text) {
let cleanedText = this.cleanText(text);
// Ensure headers have space after #
cleanedText = cleanedText.replace(/#([A-Za-z0-9])/g, '# $1');
return cleanedText;
}
htmlToMarkdown(html) {
return this.cleanMarkdown(this.turndownService.turndown(html));
}
htmlToPlainText(html) {
const dom = new JSDOM(html);
return this.cleanText(dom.window.document.body.textContent || '');
}
isValidUrl(url) {
try {
new URL(url);
return true;
}
catch {
return false;
}
}
/**
* Generate a cache key from URL and format
*/
generateCacheKey(url, format) {
return `${url}|${format}`;
}
/**
* Check if a cache entry is still valid
*/
isCacheValid(entry) {
const now = Date.now();
return now - entry.timestamp < this.cacheTTL;
}
/**
* Store webpage content in cache
*/
cacheContent(url, format, content) {
const cacheKey = this.generateCacheKey(url, format);
this.contentCache.set(cacheKey, {
timestamp: Date.now(),
content
});
// Limit cache size to prevent memory issues (max 50 entries)
if (this.contentCache.size > 50) {
// Delete oldest entry
const oldestKey = Array.from(this.contentCache.entries())
.sort((a, b) => a[1].timestamp - b[1].timestamp)[0][0];
this.contentCache.delete(oldestKey);
}
}
/**
* Generates a concise summary of the content
* @param content The content to summarize
* @param maxLength Maximum length of the summary
* @returns A summary of the content
*/
generateSummary(content, maxLength = 300) {
// Simple summarization: take first few sentences up to maxLength
const sentences = content.split(/(?<=[.!?])\s+/);
let summary = '';
for (const sentence of sentences) {
if ((summary + sentence).length <= maxLength) {
summary += sentence + ' ';
}
else {
break;
}
}
return summary.trim() + (summary.length < content.length ? '...' : '');
}
async extractContent(url, format = 'markdown') {
if (!this.isValidUrl(url)) {
throw new Error('Invalid URL provided');
}
// Check cache first
const cacheKey = this.generateCacheKey(url, format);
const cachedContent = this.contentCache.get(cacheKey);
if (cachedContent && this.isCacheValid(cachedContent)) {
console.error(`Using cached content for ${url}`);
return cachedContent.content;
}
try {
// Fetch webpage content
const response = await axios.get(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
},
timeout: 10000
});
// Parse with Cheerio for metadata
const $ = cheerio.load(response.data);
const metaTags = {};
// Only extract the most important meta tags to reduce data volume
const importantMetaTags = ['description', 'keywords', 'author', 'og:title', 'og:description', 'twitter:title', 'twitter:description'];
$('meta').each((_, element) => {
const name = $(element).attr('name') || $(element).attr('property') || '';
const content = $(element).attr('content') || '';
if (name && content && importantMetaTags.some(tag => name.includes(tag))) {
metaTags[name] = content;
}
});
// Use Readability for main content extraction
const dom = new JSDOM(response.data);
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Failed to extract content from webpage');
}
// Convert content based on requested format
let contentStr;
switch (format) {
case 'html':
contentStr = article.content || '';
break;
case 'text':
contentStr = this.htmlToPlainText(article.content || '');
break;
case 'markdown':
default:
contentStr = this.htmlToMarkdown(article.content || '');
break;
}
// Calculate content stats
const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;
// Generate a summary of the content
const summary = this.generateSummary(contentStr);
const content = {
url,
title: $('title').text() || article.title || '',
description: metaTags['description'] || '',
content: contentStr,
format: format,
meta_tags: metaTags,
stats: {
word_count: wordCount,
approximate_chars: contentStr.length
},
content_preview: {
first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
},
summary: summary
};
// Cache the content before returning
this.cacheContent(url, format, content);
return content;
}
catch (error) {
if (axios.isAxiosError(error)) {
throw new Error(`Failed to fetch webpage: ${error.message}`);
}
throw error;
}
}
async batchExtractContent(urls, format = 'markdown') {
const results = {};
await Promise.all(urls.map(async (url) => {
try {
results[url] = await this.extractContent(url, format);
}
catch (error) {
results[url] = {
error: error instanceof Error ? error.message : 'Unknown error occurred'
};
}
}));
return results;
}
}