Skip to main content
Glama
index.ts (16 kB)
#!/usr/bin/env node
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
import puppeteer from 'puppeteer-core';
import type { Browser, Page } from 'puppeteer-core';
import { z } from 'zod';
import dotenv from 'dotenv';
import * as cheerio from 'cheerio';

// Load environment variables
dotenv.config();

// Configuration Constants
const BROWSERLESS_API_KEY = process.env.BROWSERLESS_API_KEY;
const BROWSERLESS_WS_ENDPOINT = `wss://production-sfo.browserless.io/stealth?token=${BROWSERLESS_API_KEY}`;

// Timing Constants (in milliseconds)
const TIMEOUTS = {
  /** Timeout passed to `page.goto`. */
  PAGE_LOAD: 30000,
  /** Pause right after a successful navigation. */
  PAGE_STABILIZATION: 1000,
  /** Pause after each scroll-to-bottom before re-measuring the page height. */
  SCROLL_COMPLETION: 500,
  /** Delay between two page-height polls while waiting for new content. */
  CONTENT_CHECK_INTERVAL: 500,
  /** Extra pause once the page height has been seen to grow. */
  ADDITIONAL_CONTENT_WAIT: 1000,
  /** Timeout passed to `page.waitForNetworkIdle`. */
  NETWORK_IDLE: 5000,
  /** `idleTime` passed to `page.waitForNetworkIdle`. */
  NETWORK_IDLE_TIME: 500,
  /** Last pause before the DOM is extracted. */
  FINAL_RENDER_WAIT: 1000,
} as const;

// Default Values
const DEFAULTS = {
  INITIAL_WAIT: 3000,
  SCROLL_COUNT: 5,
  SCROLL_WAIT: 1000,
  CLEANUP_HTML: false,
} as const;

// Error Keywords for Page Detachment Detection
const PAGE_DETACHMENT_KEYWORDS = ['detached', 'closed', 'Target closed'] as const;

// Error message fragments that make createPage() retry on a fresh connection
const CONNECTION_ERROR_KEYWORDS = ['Connection closed', 'Target closed', 'Session closed'] as const;

// Elements dropped by cleanupHtml() after <head> and comments have been removed
const REMOVED_ELEMENT_SELECTORS = [
  'script',
  'style',
  'noscript',
  'frame',
  'iframe',
  'form',
  'img',
  'picture',
  'source',
  'svg',
  'path',
  'circle',
  'rect',
  'polygon',
  'polyline',
  'line',
  'ellipse',
  'g',
  'defs',
  'clipPath',
  'mask',
] as const;

// Attributes stripped by name in cleanupHtml() (data-*, aria-* and on* are handled by prefix)
const REMOVED_ATTRIBUTES = ['style', 'class', 'rel', 'tabindex'] as const;

// Validate required environment variables
if (!BROWSERLESS_API_KEY) {
  console.error('Error: BROWSERLESS_API_KEY environment variable is not set');
  process.exit(1);
}

/** Arguments accepted by the `web_content` tool. */
interface FetchWebContentArgs {
  url: string;
  initialWaitTime?: number;
  scrolls?: number;
  scrollWaitTime?: number;
  cleanup?: boolean;
}

// Simple logger utility.
// Both levels write through console.error (stderr); the server itself talks over a stdio transport.
const log = {
  info: (message: string, ...args: unknown[]) => console.error(`[Browserless] ${message}`, ...args),
  error: (message: string, ...args: unknown[]) => console.error(`[Browserless] ${message}`, ...args),
};

/**
 * MCP server exposing a single `web_content` tool that loads a URL in a remote
 * browserless.io browser, scrolls the page, and returns the rendered HTML.
 *
 * One browser connection is kept in `this.browser` and reused across requests;
 * every request gets its own page, which is closed when the request finishes.
 */
class BrowserlessServer {
  private readonly server: McpServer;
  private browser: Browser | null = null;

  constructor() {
    this.server = new McpServer({
      name: 'digest-mcp',
      version: '0.1.0',
    });

    this.setupToolHandlers();

    // Cleanup on exit. SIGTERM is handled like SIGINT so the remote browser
    // connection is released whichever way the process is asked to stop.
    for (const signal of ['SIGINT', 'SIGTERM'] as const) {
      process.on(signal, () => {
        void this.cleanup().finally(() => process.exit(0));
      });
    }
  }

  /** Registers the `web_content` tool together with its input and output schemas. */
  private setupToolHandlers() {
    this.server.registerTool(
      'web_content',
      {
        title: 'Fetch Web Content',
        description: 'Fetch fully rendered DOM content using browserless.io. Handles AJAX/JavaScript dynamic loading. Optimized for SPAs and infinite scroll pages. Returns the complete rendered HTML after all JavaScript execution, including dynamically loaded content. Each scroll waits for page height changes and network activity to settle.',
        inputSchema: {
          url: z.string().describe('The URL to fetch'),
          initialWaitTime: z.number().optional().default(DEFAULTS.INITIAL_WAIT).describe('Time to wait (in milliseconds) after loading the page before scrolling'),
          scrolls: z.number().optional().default(DEFAULTS.SCROLL_COUNT).describe('Number of times to scroll down the page'),
          scrollWaitTime: z.number().optional().default(DEFAULTS.SCROLL_WAIT).describe('Time to wait (in milliseconds) between each scroll action'),
          cleanup: z.boolean().optional().default(DEFAULTS.CLEANUP_HTML).describe('Whether to clean up HTML (remove scripts, styles, SVG, forms, etc.) and keep only meaningful text content'),
        },
        outputSchema: {
          size: z.number().describe('Size of the content in bytes'),
          content: z.string().describe('The fetched HTML content'),
        },
      },
      async (args) => this.handleWebContentRequest(args)
    );
  }

  /**
   * Tool entry point: fetches the page and wraps the HTML in an MCP tool result.
   *
   * @param args - Tool arguments; `url` is mandatory.
   * @returns The HTML as text content plus `{ size, content }` structured content,
   *   where `size` is the UTF-8 byte length of the HTML.
   * @throws McpError `InvalidParams` when `url` is empty, `InternalError` when
   *   fetching fails for any reason.
   */
  private async handleWebContentRequest(args: FetchWebContentArgs) {
    if (!args.url) {
      throw new McpError(ErrorCode.InvalidParams, 'URL is required');
    }

    try {
      const content = await this.fetchWebContent(args);
      const size = Buffer.byteLength(content, 'utf8');

      return {
        content: [{ type: 'text' as const, text: content }],
        structuredContent: {
          size,
          content,
        },
      };
    } catch (error) {
      const errorMessage = this.formatError(error);
      log.error('Tool Error:', errorMessage);
      throw new McpError(ErrorCode.InternalError, `Failed to fetch web content: ${errorMessage}`);
    }
  }

  /**
   * Renders any thrown value as a string: message plus stack for `Error`
   * instances, pretty-printed JSON for other objects, `String(value)` otherwise.
   * Never throws, so it is safe to call from inside error handlers.
   */
  private formatError(error: unknown): string {
    if (error instanceof Error) {
      return `${error.message}${error.stack ? '\n' + error.stack : ''}`;
    }
    if (typeof error === 'object' && error !== null) {
      try {
        return JSON.stringify(error, null, 2);
      } catch {
        // JSON.stringify throws on circular references and BigInt values.
        return String(error);
      }
    }
    return String(error);
  }

  /**
   * Runs the whole pipeline for one request: connect, open a page, navigate,
   * wait, scroll, wait for the network, extract the DOM and optionally clean it.
   *
   * @returns The rendered HTML, cleaned when `args.cleanup` is true.
   * @throws Whatever a pipeline step throws; the page is closed in every case.
   */
  private async fetchWebContent(args: FetchWebContentArgs): Promise<string> {
    const {
      url,
      initialWaitTime = DEFAULTS.INITIAL_WAIT,
      scrolls = DEFAULTS.SCROLL_COUNT,
      scrollWaitTime = DEFAULTS.SCROLL_WAIT,
      cleanup = DEFAULTS.CLEANUP_HTML,
    } = args;

    log.info(`Fetching: ${url}, initialWait: ${initialWaitTime}ms, scrolls: ${scrolls}, scrollWait: ${scrollWaitTime}ms, cleanup: ${cleanup}`);

    let page: Page | null = null;

    try {
      await this.ensureBrowserConnection();
      page = await this.createPage();

      await this.navigateToUrl(page, url);
      await this.waitForInitialLoad(initialWaitTime);
      await this.performScrolling(page, scrolls, scrollWaitTime);
      await this.waitForNetworkAndRendering(page, scrolls, scrollWaitTime);

      const rawContent = await this.extractPageContent(page);
      const finalContent = cleanup ? this.cleanupHtml(rawContent) : rawContent;

      log.info('Content fetched successfully');
      return finalContent;
    } finally {
      // Single exit point for page cleanup; closePage() swallows its own errors.
      await this.closePage(page);
    }
  }

  /**
   * Returns the cached browser when it is still connected, otherwise opens a
   * new connection to the browserless.io WebSocket endpoint and caches it.
   *
   * @throws Error when the connection cannot be established.
   */
  private async ensureBrowserConnection(): Promise<Browser> {
    // Check if browser exists and is still connected
    if (this.browser) {
      try {
        if (this.browser.connected) {
          return this.browser;
        }
        log.info('Browser connection was closed, reconnecting...');
      } catch (error) {
        log.info('Browser connection check failed, reconnecting...');
      }

      // Clear the stale connection
      this.browser = null;
    }

    log.info('Connecting to browserless.io...');

    try {
      const browser = await puppeteer.connect({
        browserWSEndpoint: BROWSERLESS_WS_ENDPOINT,
      });

      // Handle unexpected disconnections. Only drop the cached reference when it
      // still points at THIS browser: a late event from an older connection must
      // not discard a newer, healthy one.
      browser.on('disconnected', () => {
        log.info('Browser disconnected by remote');
        if (this.browser === browser) {
          this.browser = null;
        }
      });

      this.browser = browser;
      log.info('Connected successfully');
      return browser;
    } catch (error) {
      log.error('Connection failed:', error);
      throw new Error(`Failed to connect to browserless.io: ${this.formatError(error)}`);
    }
  }

  /**
   * Opens a new page on the cached browser. When page creation fails with a
   * connection-related message, reconnects once and retries.
   *
   * @throws Error when no browser is cached, or when page creation fails for a
   *   reason other than a closed connection (or fails again after reconnecting).
   */
  private async createPage(): Promise<Page> {
    if (!this.browser) {
      throw new Error('Browser not connected');
    }

    log.info('Creating new page...');

    try {
      const page = await this.browser.newPage();
      log.info('Page created');
      return page;
    } catch (error) {
      const isConnectionIssue =
        error instanceof Error &&
        CONNECTION_ERROR_KEYWORDS.some(keyword => error.message.includes(keyword));

      if (!isConnectionIssue) {
        throw error;
      }

      // If page creation fails due to connection issues, try reconnecting once
      log.info('Page creation failed due to connection issue, reconnecting...');
      this.browser = null;
      const browser = await this.ensureBrowserConnection();

      // Create page with the new browser connection
      const newPage = await browser.newPage();
      log.info('Page created after reconnection');
      return newPage;
    }
  }

  /**
   * Navigates to `url`, waiting for `domcontentloaded`, then pauses briefly.
   *
   * @throws Error wrapping the navigation failure, including the URL.
   */
  private async navigateToUrl(page: Page, url: string): Promise<void> {
    log.info(`Loading page: ${url}`);

    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: TIMEOUTS.PAGE_LOAD,
      });
      log.info('Page loaded successfully');
      await this.sleep(TIMEOUTS.PAGE_STABILIZATION);
    } catch (error) {
      log.error('Failed to load page:', error);
      throw new Error(`Failed to load URL ${url}: ${this.formatError(error)}`);
    }
  }

  /** Sleeps for `waitTime` milliseconds when it is positive; otherwise does nothing. */
  private async waitForInitialLoad(waitTime: number): Promise<void> {
    if (waitTime > 0) {
      log.info(`Waiting ${waitTime}ms after page load`);
      await this.sleep(waitTime);
    }
  }

  /**
   * Scrolls to the bottom of the page up to `scrolls` times, waiting after each
   * scroll for the page height to grow.
   *
   * Stops early when the page is closed or a detachment error is seen; any
   * other scroll error is logged and the loop moves on to the next iteration.
   */
  private async performScrolling(page: Page, scrolls: number, scrollWaitTime: number): Promise<void> {
    if (scrolls <= 0) {
      log.info(`Scrolling skipped (scrolls = ${scrolls})`);
      return;
    }

    log.info(`Starting scrolling: ${scrolls} scroll(s)`);

    // Number of iterations that ran to the end without throwing.
    let completed = 0;

    for (let i = 0; i < scrolls; i++) {
      if (page.isClosed()) {
        log.info('Page was closed, stopping scrolling');
        break;
      }

      log.info(`Scrolling down (${i + 1}/${scrolls})`);

      try {
        const previousHeight = await this.getScrollHeight(page);
        log.info(`Current page height: ${previousHeight}px`);

        await this.scrollToBottom(page);
        await this.sleep(TIMEOUTS.SCROLL_COMPLETION);

        const newHeight = await this.getScrollHeight(page);
        log.info(`After scroll height: ${newHeight}px`);

        await this.waitForNewContent(page, previousHeight, scrollWaitTime);
        completed++;
      } catch (error) {
        if (this.isPageDetachmentError(error)) {
          log.error('Page/Frame issue detected, stopping scrolling early');
          break;
        }
        log.error(`Scroll error (${i + 1}/${scrolls}):`, error);
      }
    }

    log.info(`Completed ${completed} of ${scrolls} scroll(s)`);
  }

  /** Returns `document.documentElement.scrollHeight`, or 0 when it cannot be read. */
  private async getScrollHeight(page: Page): Promise<number> {
    try {
      return await page.evaluate(() => document.documentElement.scrollHeight);
    } catch (error) {
      log.info('Could not get scroll height, using fallback');
      return 0;
    }
  }

  /**
   * Scrolls the window to the bottom of the document; when the in-page script
   * fails, presses the End key instead.
   */
  private async scrollToBottom(page: Page): Promise<void> {
    try {
      await page.evaluate(() => {
        // Use instant scroll for better reliability in headless mode
        window.scrollTo(0, document.documentElement.scrollHeight);
      });
    } catch (error) {
      log.info('Evaluate failed, using keyboard scroll');
      await page.keyboard.press('End');
    }
  }

  /**
   * Polls the page height until it exceeds `previousHeight` or `scrollWaitTime`
   * milliseconds have elapsed. Once growth is seen, pauses a little longer and
   * returns. Returns immediately when `scrollWaitTime` is not positive.
   */
  private async waitForNewContent(page: Page, previousHeight: number, scrollWaitTime: number): Promise<void> {
    if (scrollWaitTime <= 0) return;

    const startTime = Date.now();
    let contentLoaded = false;

    while (Date.now() - startTime < scrollWaitTime) {
      try {
        const currentHeight = await page.evaluate(() => document.documentElement.scrollHeight);

        if (currentHeight > previousHeight) {
          log.info(`New content detected (height: ${previousHeight} -> ${currentHeight})`);
          contentLoaded = true;
          await this.sleep(TIMEOUTS.ADDITIONAL_CONTENT_WAIT);
          break;
        }
      } catch (error) {
        break; // Page might be detached
      }

      await this.sleep(TIMEOUTS.CONTENT_CHECK_INTERVAL);
    }

    if (!contentLoaded) {
      log.info('No new content detected after scroll');
    }
  }

  /**
   * Settling phase before extraction: an extra `scrollWaitTime` pause when
   * scrolling took place, a best-effort wait for network idle (a timeout is
   * logged and ignored), then a fixed final pause.
   */
  private async waitForNetworkAndRendering(page: Page, scrolls: number, scrollWaitTime: number): Promise<void> {
    // Final wait after scrolling
    if (scrolls > 0 && scrollWaitTime > 0) {
      log.info('Final wait after scrolling');
      await this.sleep(scrollWaitTime);
    }

    // Wait for network to idle
    log.info('Waiting for network to idle...');
    try {
      await page.waitForNetworkIdle({
        timeout: TIMEOUTS.NETWORK_IDLE,
        idleTime: TIMEOUTS.NETWORK_IDLE_TIME,
      });
      log.info('Network idle');
    } catch (error) {
      log.info('Network idle timeout (continuing anyway)');
    }

    // Additional wait for JavaScript rendering
    await this.sleep(TIMEOUTS.FINAL_RENDER_WAIT);
  }

  /**
   * Returns the rendered document as HTML, reading `outerHTML` of the root
   * element and falling back to `page.content()` when that fails.
   *
   * @throws Error when the page is already closed, or when both extraction
   *   paths fail (the message then carries both underlying errors).
   */
  private async extractPageContent(page: Page): Promise<string> {
    log.info('Extracting rendered DOM content');

    if (page.isClosed()) {
      throw new Error('Page was closed before content extraction');
    }

    try {
      const content = await page.evaluate(() => document.documentElement.outerHTML);
      log.info(`Extracted ${content.length} characters of rendered content`);
      return content;
    } catch (error) {
      log.error('Error getting rendered content, trying fallback:', error);
      try {
        return await page.content();
      } catch (fallbackError) {
        throw new Error(
          `Failed to extract page content: ${this.formatError(error)}\nFallback also failed: ${this.formatError(fallbackError)}`
        );
      }
    }
  }

  /**
   * Strips non-content markup from `html` and returns the serialized result.
   *
   * Removes `<head>`, all comments, the elements in REMOVED_ELEMENT_SELECTORS,
   * the attributes in REMOVED_ATTRIBUTES, and every attribute whose name starts
   * with `data-`, `aria-` or `on`.
   */
  private cleanupHtml(html: string): string {
    log.info('Cleaning up HTML content');

    const $ = cheerio.load(html);

    // Remove <head> entirely
    $('head').remove();

    // Remove comments: `$('*')` only matches elements, so the children of the
    // document root are visited separately to catch comments outside <html>.
    for (const nodes of [$.root().contents(), $('*').contents()]) {
      nodes.each((_, node) => {
        if (node.type === 'comment') {
          $(node).remove();
        }
      });
    }

    // Remove scripts, styles, embedded frames, forms, images and SVG shapes
    for (const selector of REMOVED_ELEMENT_SELECTORS) {
      $(selector).remove();
    }

    // Remove presentational / navigational attributes
    for (const attribute of REMOVED_ATTRIBUTES) {
      $(`[${attribute}]`).removeAttr(attribute);
    }

    // Remove data-*, aria-*, and on* attributes
    $('*').each((_, elem) => {
      const $elem = $(elem);
      if (elem.type === 'tag' && 'attribs' in elem) {
        // Object.keys() snapshots the names, so removing while iterating is safe.
        for (const attr of Object.keys(elem.attribs)) {
          if (attr.startsWith('data-') || attr.startsWith('aria-') || attr.startsWith('on')) {
            $elem.removeAttr(attr);
          }
        }
      }
    });

    const cleanedHtml = $.html();
    log.info(`Cleaned HTML: ${html.length} -> ${cleanedHtml.length} characters`);

    return cleanedHtml;
  }

  /** Closes `page` when one was created; a failure to close is logged, never thrown. */
  private async closePage(page: Page | null): Promise<void> {
    if (!page) return;

    try {
      await page.close();
      log.info('Page closed successfully');
    } catch (error) {
      log.error('Error closing page (non-fatal):', error);
    }
  }

  /** True when `error` is an `Error` whose message contains a detachment keyword. */
  private isPageDetachmentError(error: unknown): boolean {
    if (!(error instanceof Error)) return false;
    return PAGE_DETACHMENT_KEYWORDS.some(keyword => error.message.includes(keyword));
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /** Disconnects from the browser when connected and clears the cached reference. */
  private async cleanup(): Promise<void> {
    log.info('Cleaning up...');
    if (this.browser) {
      try {
        if (this.browser.connected) {
          await this.browser.disconnect();
          log.info('Browser disconnected');
        } else {
          log.info('Browser already disconnected');
        }
      } catch (error) {
        log.error('Error disconnecting browser:', error);
      }
      this.browser = null;
    }
  }

  /** Connects the MCP server to a stdio transport and starts serving. */
  async run(): Promise<void> {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    log.info('Server running on stdio');
  }
}

const server = new BrowserlessServer();
server.run().catch((error) => {
  console.error('Fatal error running server:', error);
  process.exit(1);
});

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/bakhtiyork/digest-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.