docmcp

docmcp
src
services
crawler
implementations

PuppeteerExtractor.ts•13.6 KiB

import puppeteer, { Browser, Page } from 'puppeteer';
import { IContentExtractor } from '../interfaces/IContentExtractor';
import { ExtractedContent, ExtractionOptions, PageType } from '../interfaces/types';
import { LoggingUtils } from '../utils/LoggingUtils';
import { UrlUtils } from '../utils/UrlUtils';

/**
 * Content extractor implementation using Puppeteer for SPAs (Single Page Applications).
 * This extractor uses a headless browser to render JavaScript and capture the fully
 * rendered DOM, making it suitable for modern web applications.
 *
 * A single browser instance is launched lazily on first use and shared by all
 * extractions; every extraction gets its own page, which is closed when it finishes.
 * Call {@link cleanup} when the extractor is no longer needed.
 */
export class PuppeteerExtractor implements IContentExtractor {
  /** Timeout (ms) used for navigation and selector waits when the caller gives none. */
  private static readonly DEFAULT_TIMEOUT_MS = 30000;

  /** Resource types that are aborted during page load because only the DOM is needed. */
  private static readonly BLOCKED_RESOURCE_TYPES: ReadonlySet<string> = new Set([
    'image',
    'font',
    'media',
  ]);

  private readonly logger = LoggingUtils.createTaggedLogger('puppeteer-extractor');
  private browser: Browser | null = null;
  private browserInitPromise: Promise<Browser> | null = null;
  private pages: Set<Page> = new Set();

  /**
   * Render an unknown thrown value as a log-friendly message.
   * @param error The caught value
   * @returns The error's message, or its string conversion when it is not an Error
   */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Initialize the browser instance lazily.
   * Concurrent callers share one in-flight launch instead of starting several browsers.
   * @returns A promise that resolves to the browser instance
   * @throws Whatever `puppeteer.launch` rejects with; the next call retries the launch
   */
  private async initBrowser(): Promise<Browser> {
    if (this.browser) {
      return this.browser;
    }

    if (this.browserInitPromise) {
      return this.browserInitPromise;
    }

    this.logger.debug('Initializing Puppeteer browser instance');

    const launching = puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--window-size=1920,1080',
      ],
    });
    this.browserInitPromise = launching;

    try {
      const browser = await launching;

      // If the browser crashes or is closed externally, forget the handle so the
      // next extraction launches a fresh one instead of reusing a dead connection.
      browser.on('disconnected', () => {
        if (this.browser === browser) {
          this.logger.debug('Puppeteer browser disconnected; it will be relaunched on next use');
          this.browser = null;
          this.browserInitPromise = null;
        }
      });

      this.browser = browser;
      this.logger.debug('Puppeteer browser instance initialized successfully');
      return browser;
    } catch (error) {
      // Only clear the promise we created, so a newer launch attempt is not discarded.
      if (this.browserInitPromise === launching) {
        this.browserInitPromise = null;
      }
      this.logger.error(`Failed to initialize browser: ${this.describeError(error)}`);
      throw error;
    }
  }

  /**
   * Extract content from a URL using Puppeteer (headless browser)
   * @param url The URL to extract content from
   * @param options Configuration options for the extraction. `timeout` defaults to
   *   30000 ms when omitted; an explicit `0` is passed through to Puppeteer, which
   *   treats it as "no timeout".
   * @returns The extracted content
   * @throws If the browser cannot be launched, navigation fails or times out, or the
   *   server answers with an HTTP status of 400 or above
   */
  async extract(url: string, options: ExtractionOptions = {}): Promise<ExtractedContent> {
    const startTime = Date.now();
    this.logger.debug(`Starting extraction for ${url}`);

    // `??` rather than `||` so that an explicit 0 is not replaced by the default.
    const timeout = options.timeout ?? PuppeteerExtractor.DEFAULT_TIMEOUT_MS;

    let page: Page | null = null;

    try {
      // Initialize browser if not already done
      const browser = await this.initBrowser();

      // Create a new page and track it so cleanup() can close it if needed
      page = await browser.newPage();
      this.pages.add(page);

      // Configure page
      await this.configurePage(page, options);

      // Navigate to the URL
      this.logger.debug(`Navigating to ${url}`);
      const response = await page.goto(url, {
        waitUntil: 'networkidle2', // Wait until there are at most 2 network connections for at least 500ms
        timeout,
      });

      if (!response) {
        throw new Error(`Failed to load page: ${url}`);
      }

      if (response.status() >= 400) {
        throw new Error(`Failed to load page: HTTP ${response.status()} - ${url}`);
      }

      // Wait for specified selector if provided
      if (options.waitForSelector) {
        await page.waitForSelector(options.waitForSelector, { timeout });
      }

      // Wait for additional time if specified
      if (options.waitForTimeout) {
        // Use setTimeout instead of waitForTimeout which may not be available in all Puppeteer versions
        await new Promise((resolve) => setTimeout(resolve, options.waitForTimeout));
      }

      // Order matters: the HTML snapshot is taken BEFORE extractText(), which removes
      // script/style/noscript/iframe elements from the live DOM.
      const title = await page.title();
      const content = await page.content();
      const text = await this.extractText(page);
      const metadata = await this.extractMetadata(page, url);

      // Extract links if requested
      let links: string[] = [];
      if (options.extractLinks) {
        links = await this.extractLinks(page, url);
      }

      const extractedContent: ExtractedContent = {
        url: url,
        title: title || null,
        content: content,
        text: text,
        metadata: metadata,
        links: links,
      };

      const elapsedTime = Date.now() - startTime;
      this.logger.debug(`Extraction completed for ${url} in ${elapsedTime}ms`);

      return extractedContent;
    } catch (error) {
      this.logger.error(`Error extracting content from ${url}: ${this.describeError(error)}`);
      throw error;
    } finally {
      // Clean up page resources
      if (page) {
        this.pages.delete(page);
        try {
          await page.close();
        } catch {
          // Best effort: the page may already be gone (browser closed or crashed).
        }
      }
    }
  }

  /**
   * Configure the Puppeteer page with appropriate settings
   * @param page Puppeteer Page object
   * @param options Extraction options
   */
  private async configurePage(page: Page, options: ExtractionOptions): Promise<void> {
    // Set user agent
    await page.setUserAgent(options.userAgent ? options.userAgent : 'DocMCP Crawler/1.0 (Puppeteer)');

    // Set viewport
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1,
    });

    // Skip images, fonts and media: they are not needed to read the rendered DOM.
    // Stylesheets are still loaded.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const blocked = PuppeteerExtractor.BLOCKED_RESOURCE_TYPES.has(req.resourceType());
      const handled = blocked ? req.abort() : req.continue();
      // abort()/continue() return promises that reject when the request can no longer
      // be intercepted (e.g. the page was closed); handle that instead of leaving an
      // unhandled rejection.
      handled.catch((error: unknown) => {
        this.logger.debug(`Request interception failed: ${this.describeError(error)}`);
      });
    });

    // Handle console messages for debugging
    page.on('console', (msg) => {
      if (msg.type() === 'error') {
        this.logger.debug(`Console error on page: ${msg.text()}`);
      }
    });
  }

  /**
   * Extract text content from the page.
   *
   * Side effect: script, style, noscript and iframe elements are removed from the
   * live page DOM before the text is read.
   * @param page Puppeteer Page object
   * @returns The page's visible text with every run of whitespace (including line
   *   breaks) collapsed to a single space, and leading/trailing whitespace trimmed
   */
  private async extractText(page: Page): Promise<string> {
    return page.evaluate(() => {
      document
        .querySelectorAll('script, style, noscript, iframe')
        .forEach((element) => element.remove());

      // Get text from body or whole document if no body
      const root = document.body || document.documentElement;

      return root.innerText.replace(/\s+/g, ' ').trim();
    });
  }

  /**
   * Extract metadata from the page
   * @param page Puppeteer Page object
   * @param url Source URL; used for the `domain` field and to resolve a relative canonical link
   * @returns Metadata object with `domain`, `lastModified`, `author`, `description`,
   *   `keywords`, `language`, `canonicalUrl`, `renderedWith`, one `og_<name>` entry per
   *   Open Graph tag, and `frameworks` when an SPA framework is detected
   */
  private async extractMetadata(page: Page, url: string): Promise<Record<string, any>> {
    // NOTE: the callback runs inside the browser page, so it cannot reference anything
    // from this module's scope; every helper it needs is declared inside it.
    return page.evaluate((pageUrl) => {
      /** Trimmed `content` attribute of the first element matching any selector, or null. */
      const metaContent = (...selectors: string[]): string | null => {
        for (let i = 0; i < selectors.length; i++) {
          const element = document.querySelector(selectors[i]);
          if (element) {
            // Mirrors first-match-wins lookup: only the first matching element is consulted.
            const value = element.getAttribute('content');
            return value ? value : null;
          }
        }
        return null;
      };

      /** True when some element has a class token (not just a substring) starting with `prefix`. */
      const hasClassWithPrefix = (prefix: string): boolean =>
        Array.from(document.querySelectorAll(`[class*="${prefix}"]`)).some((element) =>
          Array.from(element.classList).some((token) => token.startsWith(prefix))
        );

      /** True when some element has an attribute whose name starts with `prefix`. */
      const hasAttributeWithPrefix = (prefix: string): boolean =>
        Array.from(document.querySelectorAll('*')).some((element) =>
          element.getAttributeNames().some((name) => name.startsWith(prefix))
        );

      const metadata: Record<string, unknown> = {
        domain: new URL(pageUrl).hostname,
        lastModified: null,
        author: null,
        description: null,
        keywords: [],
        language: null,
        canonicalUrl: null,
        renderedWith: 'puppeteer',
      };

      // Extract meta description
      const description = metaContent('meta[name="description"]', 'meta[property="og:description"]');
      if (description) {
        metadata.description = description.trim();
      }

      // Extract meta keywords
      const keywords = metaContent('meta[name="keywords"]');
      if (keywords) {
        metadata.keywords = keywords
          .split(',')
          .map((keyword) => keyword.trim())
          .filter(Boolean);
      }

      // Extract language
      const htmlLang = document.documentElement ? document.documentElement.getAttribute('lang') : null;
      metadata.language = htmlLang || metaContent('meta[http-equiv="content-language"]') || null;

      // Extract canonical URL
      const canonicalLink = document.querySelector('link[rel="canonical"]');
      const canonicalHref = canonicalLink ? canonicalLink.getAttribute('href') : null;
      if (canonicalHref) {
        try {
          metadata.canonicalUrl = canonicalHref.startsWith('http')
            ? canonicalHref
            : new URL(canonicalHref, pageUrl).href;
        } catch {
          // A malformed canonical href must not abort the whole metadata extraction;
          // canonicalUrl simply stays null.
        }
      }

      // Extract author
      const author = metaContent('meta[name="author"]', 'meta[property="article:author"]');
      if (author) {
        metadata.author = author.trim();
      }

      // Extract last modified
      const lastModified = metaContent('meta[http-equiv="last-modified"]');
      if (lastModified) {
        metadata.lastModified = lastModified;
      }

      // Extract Open Graph metadata
      document.querySelectorAll('meta[property^="og:"]').forEach((element) => {
        const property = element.getAttribute('property');
        const content = element.getAttribute('content');

        if (property && content) {
          const key = property.replace('og:', '');
          metadata[`og_${key}`] = content.trim();
        }
      });

      // Detect SPA frameworks. Globals are read through a typed view of `window`
      // because they only exist on pages that actually run the framework.
      const globals = window as Window & {
        __REACT_DEVTOOLS_GLOBAL_HOOK__?: unknown;
        getAllAngularRootElements?: unknown;
        __VUE__?: unknown;
      };
      const frameworks: string[] = [];

      // React detection
      if (
        document.querySelector('[data-reactroot], [data-reactid]') ||
        globals.__REACT_DEVTOOLS_GLOBAL_HOOK__
      ) {
        frameworks.push('react');
      }

      // Angular detection. Class names are compared per token: a substring match on
      // "ng-" would also hit unrelated classes such as "padding-x" or "heading-1".
      if (
        document.querySelector('[ng-app], [ng-controller], [ng-model], [ng-version]') ||
        globals.getAllAngularRootElements ||
        hasClassWithPrefix('ng-')
      ) {
        frameworks.push('angular');
      }

      // Vue detection. Scoped-style attributes look like `data-v-<hash>`, which no
      // plain attribute selector can match, so attribute names are scanned instead.
      // Class names are compared per token so that e.g. "nav-bar" does not count as "v-".
      if (
        globals.__VUE__ ||
        document.querySelector('[data-v-app]') ||
        hasClassWithPrefix('v-') ||
        hasAttributeWithPrefix('data-v-')
      ) {
        frameworks.push('vue');
      }

      if (frameworks.length > 0) {
        metadata.frameworks = frameworks;
      }

      return metadata;
    }, url);
  }

  /**
   * Extract links from the page
   * @param page Puppeteer Page object
   * @param baseUrl Base URL for resolving relative links
   * @returns De-duplicated absolute http(s) URLs taken from `a[href]` elements,
   *   excluding links whose path ends in a known non-document file extension
   */
  private async extractLinks(page: Page, baseUrl: string): Promise<string[]> {
    // The callback runs inside the browser page and must be self-contained.
    return page.evaluate((pageUrl) => {
      const excludedExtensions = [
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg',
        '.css', '.js', '.json', '.xml', '.csv', '.rss',
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.tar', '.gz', '.rar',
        '.mp3', '.mp4', '.avi', '.mkv', '.mov', '.wav', '.ogg',
        '.exe', '.bin', '.iso', '.dmg',
      ];

      const found = new Set<string>();

      document.querySelectorAll('a[href]').forEach((anchor) => {
        const href = anchor.getAttribute('href');
        if (!href) {
          return;
        }

        let resolved: URL;
        try {
          resolved = new URL(href, pageUrl);
        } catch {
          // Invalid or malformed URL, skip it
          return;
        }

        // Only web pages are crawlable; drop mailto:, tel:, javascript:, data:, etc.
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
          return;
        }

        // Compare the extension against the path only, so a query string or fragment
        // (e.g. "/report.pdf?download=1") cannot hide it.
        const path = resolved.pathname.toLowerCase();
        if (excludedExtensions.some((extension) => path.endsWith(extension))) {
          return;
        }

        found.add(resolved.href);
      });

      return Array.from(found);
    }, baseUrl);
  }

  /**
   * Check if this extractor supports the given page type
   * @param pageType The type of page (static or SPA)
   * @returns True if this extractor supports the page type
   */
  supportsPageType(pageType: PageType): boolean {
    return pageType === PageType.SPA;
  }

  /**
   * Clean up Puppeteer resources.
   * This is important to prevent memory leaks: it closes every tracked page and the
   * shared browser. The extractor stays usable afterwards; the next extraction
   * launches a new browser.
   *
   * Errors while closing are logged (browser) or ignored (pages) and never thrown.
   */
  async cleanup(): Promise<void> {
    this.logger.debug('Cleaning up Puppeteer resources');

    // If a launch is still in flight, wait for it; otherwise the browser it produces
    // would never be closed.
    let browser = this.browser;
    if (!browser && this.browserInitPromise) {
      try {
        browser = await this.browserInitPromise;
      } catch {
        // The launch failed (initBrowser already logged it); there is nothing to close.
        browser = null;
      }
    }

    // Close all open pages; failures are ignored because the pages may already be gone.
    const openPages = Array.from(this.pages);
    this.pages.clear();
    await Promise.allSettled(openPages.map((page) => page.close()));

    // Drop the handles before closing so that a failed close() cannot leave a
    // half-closed browser cached for later extractions.
    this.browser = null;
    this.browserInitPromise = null;

    if (!browser) {
      return;
    }

    try {
      await browser.close();
      this.logger.debug('Puppeteer browser instance closed');
    } catch (error) {
      this.logger.error(`Error closing browser: ${this.describeError(error)}`);
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

PuppeteerExtractor.ts•13.6 KiB