Skip to main content
Glama
PuppeteerExtractor.ts13.9 kB
import puppeteer, { Browser, Page } from 'puppeteer'; import { IContentExtractor } from '../interfaces/IContentExtractor'; import { ExtractedContent, ExtractionOptions, PageType } from '../interfaces/types'; import { LoggingUtils } from '../utils/LoggingUtils'; import { UrlUtils } from '../utils/UrlUtils'; /** * Content extractor implementation using Puppeteer for SPAs (Single Page Applications). * This extractor uses a headless browser to render JavaScript and capture the fully * rendered DOM, making it suitable for modern web applications. */ export class PuppeteerExtractor implements IContentExtractor { private readonly logger = LoggingUtils.createTaggedLogger('puppeteer-extractor'); private browser: Browser | null = null; private browserInitPromise: Promise<Browser> | null = null; private pages: Set<Page> = new Set(); /** * Initialize the browser instance lazily * @returns A promise that resolves to the browser instance */ private async initBrowser(): Promise<Browser> { if (this.browser) { return this.browser; } if (this.browserInitPromise) { return this.browserInitPromise; } this.logger.debug('Initializing Puppeteer browser instance'); this.browserInitPromise = puppeteer.launch({ headless: true, args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--disable-gpu', '--window-size=1920,1080', ] }); try { this.browser = await this.browserInitPromise; this.logger.debug('Puppeteer browser instance initialized successfully'); return this.browser; } catch (error) { this.browserInitPromise = null; this.logger.error(`Failed to initialize browser: ${error instanceof Error ? error.message : String(error)}`); throw error; } } /** * Extract content from a URL using Puppeteer (headless browser) * @param url The URL to extract content from * @param options Configuration options for the extraction * @returns The extracted content */ async extract(url: string, options: ExtractionOptions = {}): Promise<ExtractedContent> { const startTime = Date.now(); this.logger.debug(`Starting extraction for ${url}`); let page: Page | null = null; try { // Initialize browser if not already done const browser = await this.initBrowser(); // Create a new page page = await browser.newPage(); this.pages.add(page); // Configure page await this.configurePage(page, options); // Navigate to the URL this.logger.debug(`Navigating to ${url}`); const response = await page.goto(url, { waitUntil: 'networkidle2', // Wait until there are no more network connections for at least 500ms timeout: options.timeout || 30000, }); if (!response) { throw new Error(`Failed to load page: ${url}`); } if (response.status() >= 400) { throw new Error(`Failed to load page: HTTP ${response.status()} - ${url}`); } // Wait for specified selector if provided if (options.waitForSelector) { await page.waitForSelector(options.waitForSelector, { timeout: options.timeout || 30000 }); } // Wait for additional time if specified if (options.waitForTimeout) { // Use setTimeout instead of waitForTimeout which may not be available in all Puppeteer versions await new Promise(resolve => setTimeout(resolve, options.waitForTimeout)); } // Extract the page title const title = await page.title(); // Get the fully rendered HTML const content = await page.content(); // Extract text content const text = await this.extractText(page); // Extract metadata const metadata = await this.extractMetadata(page, url); // Extract links if requested let links: string[] = []; if (options.extractLinks) { links = await this.extractLinks(page, url); } const extractedContent: ExtractedContent = { url: url, title: title || null, content: content, text: text, metadata: metadata, links: links }; const elapsedTime = Date.now() - startTime; this.logger.debug(`Extraction completed for ${url} in ${elapsedTime}ms`); return extractedContent; } catch (error) { this.logger.error(`Error extracting content from ${url}: ${error instanceof Error ? error.message : String(error)}`); throw error; } finally { // Clean up page resources if (page) { this.pages.delete(page); try { await page.close(); } catch (error) { // Ignore errors during page closure } } } } /** * Configure the Puppeteer page with appropriate settings * @param page Puppeteer Page object * @param options Extraction options */ private async configurePage(page: Page, options: ExtractionOptions): Promise<void> { // Set user agent if (options.userAgent) { await page.setUserAgent(options.userAgent); } else { await page.setUserAgent('DocMCP Crawler/1.0 (Puppeteer)'); } // Set viewport await page.setViewport({ width: 1920, height: 1080, deviceScaleFactor: 1, }); // Disable images, fonts, and CSS if we're only interested in content await page.setRequestInterception(true); page.on('request', (req) => { const resourceType = req.resourceType(); if (['image', 'font', 'media'].includes(resourceType)) { req.abort(); } else { req.continue(); } }); // Handle console messages for debugging page.on('console', (msg) => { if (msg.type() === 'error') { this.logger.debug(`Console error on page: ${msg.text()}`); } }); } /** * Extract text content from the page * @param page Puppeteer Page object * @returns Extracted text content */ private async extractText(page: Page): Promise<string> { // Extract all text content from the page return page.evaluate(() => { const scripts = document.querySelectorAll('script, style, noscript, iframe'); scripts.forEach(s => s.remove()); // Get text from body or whole document if no body const body = document.body || document.documentElement; return body.innerText .replace(/\s+/g, ' ') .replace(/\n+/g, '\n') .trim(); }); } /** * Extract metadata from the page * @param page Puppeteer Page object * @param url Source URL * @returns Metadata object */ private async extractMetadata(page: Page, url: string): Promise<Record<string, any>> { return page.evaluate((baseUrl) => { const metadata: Record<string, any> = { domain: new URL(baseUrl).hostname, lastModified: null, author: null, description: null, keywords: [], language: null, canonicalUrl: null, renderedWith: 'puppeteer' }; // Extract meta description const descriptionMeta = document.querySelector('meta[name="description"]') || document.querySelector('meta[property="og:description"]'); if (descriptionMeta && descriptionMeta.getAttribute('content')) { metadata.description = descriptionMeta.getAttribute('content')?.trim(); } // Extract meta keywords const keywordsMeta = document.querySelector('meta[name="keywords"]'); if (keywordsMeta && keywordsMeta.getAttribute('content')) { metadata.keywords = keywordsMeta.getAttribute('content') ?.split(',') .map(k => k.trim()) .filter(Boolean) || []; } // Extract language const html = document.querySelector('html'); const langMeta = document.querySelector('meta[http-equiv="content-language"]'); metadata.language = (html && html.getAttribute('lang')) || (langMeta && langMeta.getAttribute('content')) || null; // Extract canonical URL const canonicalLink = document.querySelector('link[rel="canonical"]'); if (canonicalLink && canonicalLink.getAttribute('href')) { const href = canonicalLink.getAttribute('href'); metadata.canonicalUrl = href?.startsWith('http') ? href : new URL(href || '', baseUrl).href; } // Extract author const authorMeta = document.querySelector('meta[name="author"]') || document.querySelector('meta[property="article:author"]'); if (authorMeta && authorMeta.getAttribute('content')) { metadata.author = authorMeta.getAttribute('content')?.trim(); } // Extract last modified const lastModifiedMeta = document.querySelector('meta[http-equiv="last-modified"]'); if (lastModifiedMeta && lastModifiedMeta.getAttribute('content')) { metadata.lastModified = lastModifiedMeta.getAttribute('content'); } // Extract Open Graph metadata document.querySelectorAll('meta[property^="og:"]').forEach(element => { const property = element.getAttribute('property'); const content = element.getAttribute('content'); if (property && content) { const key = property.replace('og:', ''); metadata[`og_${key}`] = content.trim(); } }); // Detect SPA frameworks const frameworks = []; // React detection if ( document.querySelector('[data-reactroot]') || document.querySelector('[data-reactid]') || // @ts-ignore: React devtools global hook may not exist in all windows window.__REACT_DEVTOOLS_GLOBAL_HOOK__ ) { frameworks.push('react'); } // Angular detection if ( document.querySelector('[ng-app]') || document.querySelector('[ng-controller]') || document.querySelector('[ng-model]') || document.querySelectorAll('*[class*="ng-"]').length > 0 || // @ts-ignore: Angular globals may not exist in all windows window.getAllAngularRootElements ) { frameworks.push('angular'); } // Vue detection if ( document.querySelector('[data-v-]') || document.querySelectorAll('*[class*="v-"]').length > 0 || // @ts-ignore: Vue globals may not exist in all windows window.__VUE__ ) { frameworks.push('vue'); } if (frameworks.length > 0) { metadata.frameworks = frameworks; } return metadata; }, url); } /** * Extract links from the page * @param page Puppeteer Page object * @param baseUrl Base URL for resolving relative links * @returns Array of normalized absolute URLs */ private async extractLinks(page: Page, baseUrl: string): Promise<string[]> { return page.evaluate((baseUrl) => { const links = new Set<string>(); const excludedExtensions = [ '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.css', '.js', '.json', '.xml', '.csv', '.rss', '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip', '.tar', '.gz', '.rar', '.mp3', '.mp4', '.avi', '.mkv', '.mov', '.wav', '.ogg', '.exe', '.bin', '.iso', '.dmg', ]; const isExcludedUrl = (url: string): boolean => { const lowercaseUrl = url.toLowerCase(); return excludedExtensions.some(ext => lowercaseUrl.endsWith(ext)); }; const resolveUrl = (url: string, base: string): string => { try { return new URL(url, base).href; } catch (e) { return ''; } }; const isValidUrl = (url: string): boolean => { try { new URL(url); return true; } catch (e) { return false; } }; // Extract regular links document.querySelectorAll('a[href]').forEach(element => { const href = element.getAttribute('href'); if (href) { try { const absoluteUrl = resolveUrl(href, baseUrl); if (isValidUrl(absoluteUrl) && !isExcludedUrl(absoluteUrl)) { links.add(absoluteUrl); } } catch (error) { // Invalid or malformed URL, skip it } } }); return Array.from(links); }, baseUrl); } /** * Check if this extractor supports the given page type * @param pageType The type of page (static or SPA) * @returns True if this extractor supports the page type */ supportsPageType(pageType: PageType): boolean { return pageType === PageType.SPA; } /** * Clean up Puppeteer resources * This is important to prevent memory leaks */ async cleanup(): Promise<void> { this.logger.debug('Cleaning up Puppeteer resources'); // Close all open pages for (const page of this.pages) { try { await page.close(); } catch (error) { // Ignore errors during cleanup } } this.pages.clear(); // Close browser instance if it exists if (this.browser) { try { await this.browser.close(); this.browser = null; this.browserInitPromise = null; this.logger.debug('Puppeteer browser instance closed'); } catch (error) { this.logger.error(`Error closing browser: ${error instanceof Error ? error.message : String(error)}`); } } } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server