Google Research MCP

content-extractor.service.ts•12.7 KiB

import axios from 'axios';
import * as cheerio from 'cheerio';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import MarkdownIt from 'markdown-it';
import { WebpageContent, OutputFormat } from '../types.js';
import TurndownService from 'turndown';

/** A cached extraction result together with the time (ms since epoch) it was stored. */
interface ContentCacheEntry {
  timestamp: number;
  content: WebpageContent;
}

/**
 * Substrings of `<meta name|property>` values that are kept in `meta_tags`.
 * Matching is by substring, so e.g. `author` also keeps `article:author`.
 */
const IMPORTANT_META_TAGS = [
  'description',
  'keywords',
  'author',
  'og:title',
  'og:description',
  'twitter:title',
  'twitter:description',
  'article:published_time',
  'article:author',
  'citation_title',
  'citation_author',
  'citation_publication_date',
  'citation_journal_title',
  'citation_doi'
] as const;

/**
 * Fetches a web page and converts its main content to markdown, plain text or
 * HTML, together with metadata, headings, word statistics and a short summary.
 *
 * Results are memoised in memory per (url, format) pair for `cacheTTL`
 * milliseconds; the cache holds at most `maxCacheEntries` entries.
 */
export class ContentExtractor {
  private md: MarkdownIt;
  private turndownService: TurndownService;
  // Cache for webpage content (key: url + format, value: content)
  private contentCache: Map<string, ContentCacheEntry> = new Map();
  // Cache expiration time in milliseconds (30 minutes)
  private cacheTTL: number = 30 * 60 * 1000;
  // Upper bound on cached pages, to keep memory use bounded
  private readonly maxCacheEntries = 50;

  constructor() {
    this.md = new MarkdownIt();
    this.turndownService = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });
  }

  /** Collapses runs of blank lines and runs of spaces, then trims. */
  private cleanText(text: string): string {
    // Remove multiple blank lines
    text = text.replace(/\n\s*\n\s*\n/g, '\n\n');
    // Remove excessive spaces
    text = text.replace(/ +/g, ' ');
    return text.trim();
  }

  /**
   * Applies {@link cleanText} and makes sure ATX headings have a space after
   * their `#` markers.
   *
   * Only `#` runs at the start of a line are touched; a `#` elsewhere (URL
   * fragments, `C#`, issue references) is left alone.
   * NOTE(review): lines inside fenced code blocks are not exempted.
   */
  private cleanMarkdown(text: string): string {
    const cleanedText = this.cleanText(text);
    return cleanedText.replace(/^( {0,3}#{1,6})([A-Za-z0-9])/gm, '$1 $2');
  }

  /** Converts an HTML fragment to cleaned markdown via Turndown. */
  private htmlToMarkdown(html: string): string {
    return this.cleanMarkdown(this.turndownService.turndown(html));
  }

  /** Returns the cleaned text content of the HTML's `<body>`. */
  private htmlToPlainText(html: string): string {
    const dom = new JSDOM(html);
    return this.cleanText(dom.window.document.body.textContent || '');
  }

  /** True when `url` can be parsed by the WHATWG `URL` constructor. */
  private isValidUrl(url: string): boolean {
    try {
      new URL(url);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Generate a cache key from URL and format
   */
  private generateCacheKey(url: string, format: OutputFormat): string {
    return `${url}|${format}`;
  }

  /**
   * Check if a cache entry is still valid
   */
  private isCacheValid(entry: ContentCacheEntry): boolean {
    return Date.now() - entry.timestamp < this.cacheTTL;
  }

  /**
   * Store webpage content in cache, evicting the oldest entry when the cache
   * grows beyond `maxCacheEntries`.
   */
  private cacheContent(url: string, format: OutputFormat, content: WebpageContent): void {
    const cacheKey = this.generateCacheKey(url, format);

    // Delete before set so a refreshed key moves to the end of the Map's
    // insertion order; the first key is then always the oldest entry.
    this.contentCache.delete(cacheKey);
    this.contentCache.set(cacheKey, { timestamp: Date.now(), content });

    if (this.contentCache.size > this.maxCacheEntries) {
      const oldestKey = this.contentCache.keys().next().value;
      if (oldestKey !== undefined) {
        this.contentCache.delete(oldestKey);
      }
    }
  }

  /**
   * Generates a concise summary of the content
   * @param content The content to summarize
   * @param maxLength Maximum length of the summary (excluding the ellipsis)
   * @returns Leading whole sentences that fit in `maxLength`, followed by
   *          `...` when part of the content was left out
   */
  private generateSummary(content: string, maxLength: number = 300): string {
    const text = content.trim();
    if (text.length === 0) {
      return '';
    }

    // Simple summarization: take first few sentences up to maxLength
    const sentences = text.split(/(?<=[.!?])\s+/);
    let summary = '';
    let truncated = false;

    for (const sentence of sentences) {
      const candidate = summary ? `${summary} ${sentence}` : sentence;
      if (candidate.length > maxLength) {
        truncated = true;
        break;
      }
      summary = candidate;
    }

    // The first sentence alone is over budget (common for markdown without
    // sentence punctuation): fall back to a hard cut instead of returning
    // nothing but an ellipsis.
    if (!summary) {
      summary = text.slice(0, maxLength).trimEnd();
    }

    return truncated ? `${summary}...` : summary;
  }

  /** Collects the `<meta>` tags whose name/property matches IMPORTANT_META_TAGS. */
  private collectMetaTags($: cheerio.Root): Record<string, string> {
    const metaTags: Record<string, string> = {};
    $('meta').each((_, element) => {
      const name = $(element).attr('name') || $(element).attr('property') || '';
      const content = $(element).attr('content') || '';
      if (name && content && IMPORTANT_META_TAGS.some(tag => name.includes(tag))) {
        metaTags[name] = content;
      }
    });
    return metaTags;
  }

  /** Returns the trimmed, non-empty text of every `h1`–`h6` in document order. */
  private collectHeadings($: cheerio.Root): string[] {
    const headings: string[] = [];
    $('h1, h2, h3, h4, h5, h6').each((_, element) => {
      const headingText = $(element).text().trim();
      if (headingText) {
        headings.push(headingText);
      }
    });
    return headings;
  }

  /** Converts an HTML fragment to the requested output format and post-processes it. */
  private renderHtml(html: string, format: OutputFormat): string {
    switch (format) {
      case 'html':
        return this.enhanceHtmlContent(html);
      case 'text':
        return this.enhanceTextContent(this.htmlToPlainText(html));
      case 'markdown':
      default:
        return this.enhanceMarkdownContent(this.htmlToMarkdown(html));
    }
  }

  /**
   * Fetches `url` and extracts its main content.
   *
   * @param url Absolute URL of the page to fetch
   * @param format Output format of `content` (defaults to markdown)
   * @returns The extracted content with metadata, stats, preview and summary
   * @throws Error `Invalid URL provided` when `url` cannot be parsed
   * @throws Error `Failed to fetch webpage: …` when the HTTP request fails;
   *         any other error is rethrown unchanged
   */
  async extractContent(url: string, format: OutputFormat = 'markdown'): Promise<WebpageContent> {
    if (!this.isValidUrl(url)) {
      throw new Error('Invalid URL provided');
    }

    // Check cache first
    const cacheKey = this.generateCacheKey(url, format);
    const cachedContent = this.contentCache.get(cacheKey);
    if (cachedContent) {
      if (this.isCacheValid(cachedContent)) {
        console.error(`Using cached content for ${url}`);
        return cachedContent.content;
      }
      // Expired: drop it now rather than waiting for size-based eviction.
      this.contentCache.delete(cacheKey);
    }

    try {
      // Fetch webpage content
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1'
        },
        // Keep the body as text: by default axios parses JSON-looking bodies
        // into objects, which the HTML parsers below cannot consume.
        responseType: 'text',
        timeout: 30000, // Increased for complex research content
        maxContentLength: 50 * 1024 * 1024, // 50MB max for research papers
        maxBodyLength: 50 * 1024 * 1024
      });
      const html: string =
        typeof response.data === 'string' ? response.data : String(response.data ?? '');

      // Parse with Cheerio for metadata
      const $ = cheerio.load(html);
      const metaTags = this.collectMetaTags($);

      // Read title and headings before any content extraction: the fallback
      // extractor removes header/nav/footer elements from `$`, which would
      // otherwise make these depend on which extraction path ran.
      const pageTitle = $('title').first().text();
      const headings = this.collectHeadings($);

      // Use enhanced content extraction for complete, coherent content
      const dom = new JSDOM(html);
      const document = dom.window.document;

      // First try Readability with enhanced options
      const reader = new Readability(document.cloneNode(true) as Document, {
        charThreshold: 50, // Lower threshold to capture more content
        classesToPreserve: ['table', 'figure', 'chart', 'formula', 'citation', 'code', 'pre', 'blockquote'],
        keepClasses: true,
        debug: false
      });
      const article = reader.parse();

      let contentStr: string;
      if (article && article.content && article.content.length > 100) {
        // Readability succeeded - use it but enhance it
        contentStr = this.renderHtml(article.content, format);
      } else {
        // Readability failed or returned insufficient content - use fallback extraction.
        // Logged to stderr like the rest of this class so stdout stays clean.
        console.error(`Readability failed for ${url}, using fallback extraction`);
        contentStr = this.fallbackContentExtraction($, format);
      }

      // Calculate content stats
      const wordCount = contentStr.split(/\s+/).filter(word => word.length > 0).length;

      // Longer summaries for research content
      const summary = this.generateSummary(contentStr, 500);

      const content: WebpageContent = {
        url,
        title: pageTitle || article?.title || '',
        description: metaTags['description'] || '',
        content: contentStr,
        format: format,
        meta_tags: metaTags,
        stats: {
          word_count: wordCount,
          approximate_chars: contentStr.length
        },
        content_preview: {
          first_500_chars: contentStr.slice(0, 500) + (contentStr.length > 500 ? '...' : '')
        },
        summary: summary,
        structure: {
          headings: headings.slice(0, 20) // Include up to 20 headings to show document structure
        }
      };

      // Cache the content before returning
      this.cacheContent(url, format, content);
      return content;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        throw new Error(`Failed to fetch webpage: ${error.message}`);
      }
      throw error;
    }
  }

  /**
   * Extracts several pages concurrently.
   *
   * @returns A record keyed by URL; each value is either the extracted content
   *          or `{ error }` with the failure message. Never rejects because of
   *          a single failing URL.
   */
  async batchExtractContent(
    urls: string[],
    format: OutputFormat = 'markdown'
  ): Promise<Record<string, WebpageContent | { error: string }>> {
    const results: Record<string, WebpageContent | { error: string }> = {};

    await Promise.all(
      urls.map(async (url) => {
        try {
          results[url] = await this.extractContent(url, format);
        } catch (error) {
          results[url] = {
            error: error instanceof Error ? error.message : 'Unknown error occurred'
          };
        }
      })
    );

    return results;
  }

  /**
   * Enhanced HTML content processing to preserve complete thoughts:
   * drops paragraphs with fewer than 10 characters of text and appends a
   * period to paragraphs that do not end in `.`, `!` or `?`.
   */
  private enhanceHtmlContent(html: string): string {
    const $ = cheerio.load(html);

    $('p').each((_, elem) => {
      const $p = $(elem);
      const text = $p.text().trim();
      if (text.length < 10) {
        // Covers empty paragraphs as well
        $p.remove();
        return;
      }
      // Ensure paragraph ends with proper punctuation
      if (!/[.!?]$/.test(text)) {
        $p.html($p.html() + '.');
      }
    });

    return $.html();
  }

  /**
   * Enhanced text content processing for coherent reading: splits on blank
   * lines, drops paragraphs of 20 characters or fewer, and makes every
   * remaining paragraph end with sentence punctuation.
   */
  private enhanceTextContent(text: string): string {
    return text
      .split(/\n\s*\n/)
      .map(paragraph => paragraph.trim())
      .filter(paragraph => paragraph.length > 20) // Remove very short paragraphs
      .map(paragraph => (/[.!?]$/.test(paragraph) ? paragraph : paragraph + '.'))
      .join('\n\n');
  }

  /**
   * Enhanced markdown content processing: normalises ATX headings, completes
   * list items with a trailing period, and collapses extra blank lines.
   * NOTE(review): lines inside fenced code blocks are not exempted.
   */
  private enhanceMarkdownContent(markdown: string): string {
    let enhanced = markdown;

    // Normalise ATX headings ("##   Title ##" -> "## Title"). Anchored to the
    // start of a line so '#' inside links or prose is never rewritten, and the
    // level is taken from the captured '#' run itself.
    enhanced = enhanced.replace(
      /^ {0,3}(#{1,6})[ \t]*([^#\s][^\n]*)$/gm,
      (_match: string, hashes: string, rawTitle: string) => {
        // An optional closing '#' run only counts when preceded by whitespace,
        // so titles such as "C#" keep their final character.
        const title = rawTitle.replace(/[ \t]+#+[ \t]*$/, '').trim();
        return `${hashes} ${title}`;
      }
    );

    // Ensure list items are complete
    enhanced = enhanced.replace(
      /^([ \t]*[-*+][ \t]+)(.+)$/gm,
      (match: string, prefix: string, content: string) => {
        const trimmed = content.trim();
        if (!/[.!?]$/.test(trimmed) && trimmed.length > 10) {
          return prefix + trimmed + '.';
        }
        return match;
      }
    );

    // Clean up multiple blank lines
    enhanced = enhanced.replace(/\n{3,}/g, '\n\n');

    return enhanced.trim();
  }

  /**
   * Fallback content extraction when Readability fails.
   *
   * Picks the first well-known "main content" container holding more than 200
   * characters of text, or the cleaned `<body>` otherwise.
   *
   * Side effect: removes script/style/navigation-like elements from the passed
   * `$` document, so callers must read anything they need from it beforehand.
   */
  private fallbackContentExtraction($: cheerio.Root, format: OutputFormat): string {
    // Remove unwanted elements
    $('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar').remove();

    // Try to find main content areas
    const contentSelectors = [
      'main',
      '[role="main"]',
      '.main-content',
      '.content',
      '.post-content',
      '.entry-content',
      '.article-content',
      '.page-content',
      'article',
      '.article-body'
    ];

    const mainSelector = contentSelectors.find(selector => {
      const candidate = $(selector).first();
      return candidate.length > 0 && candidate.text().trim().length > 200;
    });

    const contentElement = mainSelector ? $(mainSelector).first() : $('body');
    if (!mainSelector) {
      // If no main content found, use body but clean it: remove navigation, sidebars, etc.
      contentElement.find('nav, header, footer, aside, .nav, .navigation, .sidebar, .menu').remove();
    }

    return this.renderHtml(contentElement.html() || '', format);
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mixelpixx/Google-Research-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

content-extractor.service.ts•12.7 KiB