Brave Deep Research MCP

content-extractor.ts•7.04 KiB

import { Page } from 'puppeteer'; import { browserManager } from './browser-manager.js'; import { logger } from './utils/logger.js'; /** * Interface for the extracted content from a webpage */ export interface ExtractedContent { url: string; title: string; content: string; links: { url: string; text: string }[]; error?: string; } /** * ContentExtractor handles the webpage content extraction logic */ export class ContentExtractor { /** * Extract content from a URL * @param url The URL to extract content from * @returns The extracted content */ public static async extractContent(url: string): Promise<ExtractedContent> { let page: Page | null = null; try { page = await browserManager.newPage(); logger.debug(`Navigating to: ${url}`); // Navigate to the page await page.goto(url, { waitUntil: 'networkidle2', timeout: parseInt(process.env.PAGE_TIMEOUT || '30000', 10) }); // Extract the page title const title = await page.title(); // Extract the main content using the content extractor algorithm const content = await this.extractMainContent(page); // Extract relevant links const links = await this.extractLinks(page); return { url, title, content, links }; } catch (error) { logger.error(`Error extracting content from ${url}:`, error); return { url, title: 'Error', content: '', links: [], error: error instanceof Error ? error.message : String(error) }; } finally { if (page) { await page.close().catch(err => logger.error('Error closing page:', err)); } } } /** * Extract the main content from a page using readability-like heuristics * @param page The Puppeteer page object * @returns The extracted main content */ private static async extractMainContent(page: Page): Promise<string> { return page.evaluate(() => { // Define potential content selectors in order of preference const contentSelectors = [ 'article', 'main', '.post-content', '.entry-content', '.content', '#content', '.post', '.article', '.blog-post', ]; // Define elements to ignore const elementsToIgnore = [ 'nav', 'header', 'footer', 'aside', '.sidebar', '.menu', '.navigation', '.ads', '.advertisement', '.related', '.comments', ]; // Try to find the main content container let contentElement: Element | null = null; // First try with defined selectors for (const selector of contentSelectors) { const elements = document.querySelectorAll(selector); if (elements.length === 1) { contentElement = elements[0]; break; } else if (elements.length > 1) { // Find the largest element by content length let maxLength = 0; let maxElement: Element | null = null; elements.forEach((el) => { const length = el.textContent?.length || 0; if (length > maxLength) { maxLength = length; maxElement = el; } }); if (maxElement) { contentElement = maxElement; break; } } } // If no content found, use density analysis to find the main content if (!contentElement) { const paragraphs = document.querySelectorAll('p'); if (paragraphs.length > 0) { // Find common ancestor with most paragraphs const ancestors = new Map<Element, number>(); paragraphs.forEach((p) => { let current: Element | null = p; while (current && current !== document.body) { ancestors.set(current, (ancestors.get(current) || 0) + 1); current = current.parentElement; } }); // Find the element with highest paragraph density let maxCount = 0; let bestAncestor: Element | null = null; ancestors.forEach((count, element) => { if (count > maxCount) { maxCount = count; bestAncestor = element; } }); if (bestAncestor) { contentElement = bestAncestor; } } } // If still no content, use the body if (!contentElement) { contentElement = document.body; } // Clone the content to avoid modifying the original const clonedContent = contentElement.cloneNode(true) as Element; // Remove ignored elements for (const selector of elementsToIgnore) { clonedContent.querySelectorAll(selector).forEach(el => el.remove()); } // Remove scripts, styles, and other non-content elements clonedContent.querySelectorAll('script, style, iframe, noscript').forEach(el => el.remove()); // Extract and format the text content const textContent = clonedContent.textContent || ''; // Clean up the content (remove excessive whitespace) return textContent .replace(/\\s+/g, ' ') .replace(/\\n+/g, '\\n') .trim(); }); } /** * Extract relevant links from the page * @param page The Puppeteer page object * @returns Array of links with URL and text */ private static async extractLinks(page: Page): Promise<{ url: string; text: string }[]> { return page.evaluate(() => { const links: { url: string; text: string }[] = []; const seen = new Set<string>(); // Get all links from the document const anchorElements = document.querySelectorAll('a[href]'); anchorElements.forEach(anchor => { const href = anchor.getAttribute('href'); if (!href) return; // Create an absolute URL const url = new URL(href, window.location.href).toString(); // Skip non-http/https links, anchors, and already seen URLs if (!url.startsWith('http') || seen.has(url)) return; // Skip URLs that likely aren't content pages if ( url.includes('/cdn-cgi/') || url.includes('/wp-admin/') || url.endsWith('.jpg') || url.endsWith('.jpeg') || url.endsWith('.png') || url.endsWith('.gif') || url.endsWith('.pdf') || url.endsWith('.zip') ) { return; } // Get the link text and clean it const text = (anchor.textContent || '') .replace(/\\s+/g, ' ') .trim(); // Only include links with meaningful text if (text.length > 1) { links.push({ url, text }); seen.add(url); } }); // Return only the most relevant links (max 10) return links.slice(0, 10); }); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/suthio/brave-deep-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

content-extractor.ts•7.04 KiB