Skip to main content
Glama

Low Cost Browsing MCP Server

by nightweb
extraction.ts•13.7 kB
import { Logger } from 'pino';
import { BrowserManager } from '../core/browser-manager.js';
import {
  ExtractContentInput,
  ExtractContentOutput,
  ExtractTableInput,
  ExtractTableOutput,
  ExtractAttributesInput,
  ExtractAttributesOutput,
  ExtractScreenshotInput,
  ExtractScreenshotOutput,
  Table
} from '../types/extraction.js';
import { ErrorResponse } from '../types/index.js';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';

/**
 * Page handle type as exposed by `BrowserManager.getPage(...)`. Derived from
 * the manager so this file does not need a direct browser-library import.
 */
type ManagedPage = NonNullable<ReturnType<BrowserManager['getPage']>>['page'];

/** Locator type returned by `page.locator(...)`. */
type ElementLocator = ReturnType<ManagedPage['locator']>;

/** Upper bound on the raw screenshot size that `screenshot()` will return (8 MB). */
const MAX_SCREENSHOT_BYTES = 8 * 1024 * 1024;

/** Renders any thrown value as a loggable message. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Returns at most `limit` leading items. A non-positive or NaN limit yields an
 * empty array, matching an `i < Math.min(length, limit)` index loop.
 */
function takeFirst<T>(items: readonly T[], limit: number): T[] {
  const count = Math.min(items.length, limit);
  return count > 0 ? items.slice(0, count) : [];
}

/**
 * Builds a matcher for `<tag ...>inner</tag>` that captures `inner`.
 * The tag name must be followed by whitespace or `>`, so `b` does not match
 * `<br>`/`<body>` and `i` does not match `<img>`; the inner text may span lines.
 */
function pairedTag(tag: string): RegExp {
  return new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}\\s*>`, 'gi');
}

/** `<script>` / `<style>` elements including their bodies, which are never page text. */
const NON_CONTENT_BLOCKS = /<(script|style)(?:\s[^>]*)?>[\s\S]*?<\/\1\s*>/gi;

/** `<h1>`..`<h6>`; group 1 is the level digit, group 2 the heading text. */
const HEADING = /<h([1-6])(?:\s[^>]*)?>([\s\S]*?)<\/h\1\s*>/gi;

/** `<a ... href="url">text</a>` (single or double quotes); group 2 is the URL, group 3 the text. */
const ANCHOR = /<a(?=\s)[^>]*?\shref\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a\s*>/gi;

/** Simple element-to-Markdown substitutions, applied in order. */
const INLINE_MARKDOWN_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [pairedTag('p'), '$1\n\n'],
  [pairedTag('strong'), '**$1**'],
  [pairedTag('b'), '**$1**'],
  [pairedTag('em'), '*$1*'],
  [pairedTag('i'), '*$1*']
];

/**
 * Extraction tools operating on pages owned by a {@link BrowserManager}.
 *
 * Every public method resolves to either an `ok` payload or an
 * {@link ErrorResponse}; failures are logged and returned, never thrown.
 */
export class ExtractionTools {
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly logger: Logger
  ) {}

  /**
   * Extracts page content as text, HTML, Markdown or Readability article text.
   *
   * `scopeCss`, when set, restricts `text`/`html`/`markdown` to the first
   * matching element; `readability` always works on the full document.
   * The result is truncated to `params.maxChars` characters.
   *
   * @param params - Page id, extraction mode, optional scope selector and size limit.
   * @returns The content plus url/title/timestamp metadata, or an error response.
   */
  async content(params: ExtractContentInput): Promise<ExtractContentOutput | ErrorResponse> {
    const startTime = Date.now();
    this.logger.info({ pageId: params.pageId, mode: params.mode }, 'Extracting content');

    try {
      const pageInfo = this.browserManager.getPage(params.pageId);
      if (!pageInfo) {
        return this.pageNotFound(params.pageId);
      }
      const page = pageInfo.page;

      let content: string;
      switch (params.mode) {
        case 'text':
          content = params.scopeCss
            ? (await page.locator(params.scopeCss).first().textContent()) ?? ''
            : (await page.textContent('body')) ?? '';
          break;
        case 'html':
          content = await this.readHtml(page, params.scopeCss);
          break;
        case 'markdown':
          content = this.htmlToMarkdown(await this.readHtml(page, params.scopeCss));
          break;
        case 'readability':
          content = this.extractWithReadability(await page.content(), page.url());
          break;
        default:
          return {
            status: 'error',
            code: 'invalid_params',
            message: `Unsupported mode: ${params.mode}`,
            details: { mode: params.mode }
          };
      }

      // Capture the length BEFORE truncating so the warning reports the real original size.
      const originalLength = content.length;
      if (originalLength > params.maxChars) {
        content = content.substring(0, params.maxChars);
        this.logger.warn(
          { pageId: params.pageId, originalLength, truncatedTo: params.maxChars },
          'Content truncated due to size limit'
        );
      }

      const meta = {
        url: page.url(),
        title: await page.title(),
        ts: new Date().toISOString()
      };

      const duration = Date.now() - startTime;
      this.logger.info(
        { pageId: params.pageId, mode: params.mode, contentLength: content.length, duration },
        'Content extracted'
      );

      return { status: 'ok', content, meta };
    } catch (error) {
      return this.internalError(
        error,
        startTime,
        'Content extraction failed',
        'Unknown error during content extraction',
        { pageId: params.pageId, mode: params.mode }
      );
    }
  }

  /**
   * Extracts up to `params.limit` tables matching `params.tableCss`.
   *
   * A table that fails to extract is logged and skipped; tables with neither
   * headers nor rows are omitted from the result.
   *
   * @param params - Page id, table selector, header strategy and table limit.
   * @returns The extracted tables, or an error response.
   */
  async table(params: ExtractTableInput): Promise<ExtractTableOutput | ErrorResponse> {
    const startTime = Date.now();
    this.logger.info({ pageId: params.pageId, tableCss: params.tableCss }, 'Extracting tables');

    try {
      const pageInfo = this.browserManager.getPage(params.pageId);
      if (!pageInfo) {
        return this.pageNotFound(params.pageId);
      }

      const tableElements = await pageInfo.page.locator(params.tableCss).all();
      const tables: Table[] = [];

      for (const [tableIndex, tableElement] of takeFirst(tableElements, params.limit).entries()) {
        try {
          const table = await this.extractTableData(tableElement, params.headerStrategy);
          if (table.headers.length > 0 || table.rows.length > 0) {
            tables.push(table);
          }
        } catch (tableError) {
          // Best effort: one broken table must not fail the whole request.
          this.logger.warn(
            { pageId: params.pageId, tableIndex, error: errorMessage(tableError) },
            'Failed to extract table data'
          );
        }
      }

      const duration = Date.now() - startTime;
      this.logger.info(
        { pageId: params.pageId, tablesFound: tables.length, duration },
        'Tables extracted'
      );

      return { status: 'ok', tables };
    } catch (error) {
      return this.internalError(
        error,
        startTime,
        'Table extraction failed',
        'Unknown error during table extraction',
        { pageId: params.pageId }
      );
    }
  }

  /**
   * Reads one attribute from the elements matched by each query.
   *
   * `results[i]` holds the values for `params.queries[i]`, capped at
   * `params.limitPerQuery` elements; a missing attribute is `null`. A query
   * that throws is logged and contributes an empty array.
   *
   * @param params - Page id, list of `{ css, attr }` queries and per-query limit.
   * @returns One value array per query, or an error response.
   */
  async attributes(params: ExtractAttributesInput): Promise<ExtractAttributesOutput | ErrorResponse> {
    const startTime = Date.now();
    this.logger.info(
      { pageId: params.pageId, queries: params.queries.length },
      'Extracting attributes'
    );

    try {
      const pageInfo = this.browserManager.getPage(params.pageId);
      if (!pageInfo) {
        return this.pageNotFound(params.pageId);
      }
      const page = pageInfo.page;

      const results: Array<Array<string | null>> = [];
      for (const query of params.queries) {
        try {
          const elements = takeFirst(await page.locator(query.css).all(), params.limitPerQuery);
          // The reads are independent, so issue them together; Promise.all keeps element order.
          const values = await Promise.all(
            elements.map((element) => element.getAttribute(query.attr))
          );
          results.push(values);
        } catch (queryError) {
          this.logger.warn(
            { pageId: params.pageId, query, error: errorMessage(queryError) },
            'Failed to extract attribute'
          );
          results.push([]);
        }
      }

      const duration = Date.now() - startTime;
      this.logger.info(
        { pageId: params.pageId, queriesProcessed: params.queries.length, duration },
        'Attributes extracted'
      );

      return { status: 'ok', results };
    } catch (error) {
      return this.internalError(
        error,
        startTime,
        'Attribute extraction failed',
        'Unknown error during attribute extraction',
        { pageId: params.pageId }
      );
    }
  }

  /**
   * Captures a screenshot and returns it base64-encoded.
   *
   * `quality` is only forwarded for JPEG. Supplying `clip` disables `fullPage`.
   * Captures larger than 8 MB are rejected with a `dom_too_large` error.
   * `meta.width` / `meta.height` are taken from `clip` and are therefore
   * undefined when no clip is given.
   *
   * @param params - Page id, image format/quality, full-page flag and optional clip.
   * @returns The encoded image with metadata, or an error response.
   */
  async screenshot(params: ExtractScreenshotInput): Promise<ExtractScreenshotOutput | ErrorResponse> {
    const startTime = Date.now();
    this.logger.info({ pageId: params.pageId, fullPage: params.fullPage }, 'Taking screenshot');

    try {
      const pageInfo = this.browserManager.getPage(params.pageId);
      if (!pageInfo) {
        return this.pageNotFound(params.pageId);
      }

      // Left loosely typed: the input type is declared in another module and may
      // not line up exactly with the browser library's screenshot options.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const screenshotOptions: any = {
        type: params.format,
        fullPage: params.fullPage
      };
      if (params.format === 'jpeg') {
        screenshotOptions.quality = params.quality;
      }
      if (params.clip) {
        screenshotOptions.clip = params.clip;
        screenshotOptions.fullPage = false;
      }

      const buffer = await pageInfo.page.screenshot(screenshotOptions);

      // Check the size before encoding so an oversized capture is not base64-encoded for nothing.
      if (buffer.length > MAX_SCREENSHOT_BYTES) {
        return {
          status: 'error',
          code: 'dom_too_large',
          message: `Screenshot size (${buffer.length} bytes) exceeds limit (${MAX_SCREENSHOT_BYTES} bytes)`,
          details: { pageId: params.pageId, size: buffer.length, limit: MAX_SCREENSHOT_BYTES }
        };
      }

      const imageBase64 = buffer.toString('base64');
      const meta = {
        bytes: buffer.length,
        format: params.format,
        width: params.clip?.width,
        height: params.clip?.height
      };

      const duration = Date.now() - startTime;
      this.logger.info(
        { pageId: params.pageId, size: buffer.length, format: params.format, duration },
        'Screenshot taken'
      );

      return { status: 'ok', imageBase64, meta };
    } catch (error) {
      return this.internalError(
        error,
        startTime,
        'Screenshot failed',
        'Unknown error during screenshot',
        { pageId: params.pageId }
      );
    }
  }

  /** Builds the response returned when `pageId` is not known to the browser manager. */
  private pageNotFound(pageId: string): ErrorResponse {
    return {
      status: 'error',
      code: 'page_not_found',
      message: 'Page not found',
      details: { pageId }
    };
  }

  /**
   * Logs an unexpected failure (with elapsed time) and converts it into an
   * `internal_error` response. `fallbackMessage` is used when the thrown value
   * is not an `Error`.
   */
  private internalError(
    error: unknown,
    startTime: number,
    logMessage: string,
    fallbackMessage: string,
    details: Record<string, unknown>
  ): ErrorResponse {
    const duration = Date.now() - startTime;
    this.logger.error({ ...details, error: errorMessage(error), duration }, logMessage);
    return {
      status: 'error',
      code: 'internal_error',
      message: error instanceof Error ? error.message : fallbackMessage,
      details
    };
  }

  /** Returns the inner HTML of the first `scopeCss` match, or of `<body>` when no scope is given. */
  private async readHtml(page: ManagedPage, scopeCss?: string | null): Promise<string> {
    return scopeCss
      ? page.locator(scopeCss).first().innerHTML()
      : page.innerHTML('body');
  }

  /** Returns the trimmed text of every element matched by `cells`, in document order. */
  private async cellTexts(cells: ElementLocator): Promise<string[]> {
    const texts = await cells.allTextContents();
    return texts.map((text) => text.trim());
  }

  /**
   * Reads the headers and data rows of a single table.
   *
   * Header strategies:
   * - `th`: every `<th>` in the table.
   * - `firstRow`: the cells (`td` or `th`) of the first `<tr>`.
   * - `auto`: `<th>` cells when any exist, otherwise the `<td>` cells of the first `<tr>`.
   *
   * Any other strategy yields no headers. Whenever the first row supplies the
   * headers it is excluded from the data rows. Data rows contain `<td>` text
   * only; rows without any `<td>` are skipped.
   *
   * @param tableElement - Locator for the table element.
   * @param headerStrategy - One of `th`, `firstRow`, `auto`.
   */
  private async extractTableData(tableElement: ElementLocator, headerStrategy: string): Promise<Table> {
    const allRows = tableElement.locator('tr');
    const firstRow = allRows.first();

    let headers: string[] = [];
    let headerIsFirstRow = false;

    switch (headerStrategy) {
      case 'th':
        headers = await this.cellTexts(tableElement.locator('th'));
        break;
      case 'firstRow':
        headers = await this.cellTexts(firstRow.locator('td, th'));
        headerIsFirstRow = true;
        break;
      case 'auto': {
        const thHeaders = await this.cellTexts(tableElement.locator('th'));
        if (thHeaders.length > 0) {
          headers = thHeaders;
        } else {
          headers = await this.cellTexts(firstRow.locator('td'));
          headerIsFirstRow = true;
        }
        break;
      }
    }

    // Collect rows table-wide, not via sibling traversal, so a header row in
    // <thead> and data rows in <tbody> are handled; then drop only the header row.
    const rowLocators = await allRows.all();
    const dataRows = headerIsFirstRow ? rowLocators.slice(1) : rowLocators;

    const rows: string[][] = [];
    for (const row of dataRows) {
      const rowData = await this.cellTexts(row.locator('td'));
      if (rowData.length > 0) {
        rows.push(rowData);
      }
    }

    return { headers, rows };
  }

  /**
   * Converts an HTML fragment to Markdown using regular expressions.
   *
   * Handles headings (h1-h6), paragraphs, bold, italic, links and line breaks.
   * `<script>`/`<style>` elements are dropped with their bodies and every other
   * tag is stripped. HTML entities are not decoded and nested markup is only
   * approximated; this is not a full HTML parser.
   */
  private htmlToMarkdown(html: string): string {
    let markdown = html
      .replace(NON_CONTENT_BLOCKS, '')
      .replace(
        HEADING,
        (_match: string, level: string, text: string) => `${'#'.repeat(Number(level))} ${text}\n\n`
      );

    for (const [pattern, replacement] of INLINE_MARKDOWN_RULES) {
      markdown = markdown.replace(pattern, replacement);
    }

    return markdown
      .replace(ANCHOR, '[$3]($2)')
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<[^>]*>/g, '') // Remove remaining HTML tags
      .replace(/\n\s*\n\s*\n/g, '\n\n') // Collapse runs of blank lines
      .trim();
  }

  /**
   * Extracts article text with Mozilla Readability.
   *
   * Returns an empty string when Readability finds no article. If parsing
   * throws, the failure is logged at debug level and the document body's plain
   * text is returned instead.
   *
   * @param html - Full document HTML.
   * @param url - Document URL, passed to JSDOM as the document's URL.
   */
  private extractWithReadability(html: string, url: string): string {
    try {
      const dom = new JSDOM(html, { url });
      const article = new Readability(dom.window.document).parse();
      return article?.textContent ?? '';
    } catch (error) {
      this.logger.debug(
        { url, error: errorMessage(error) },
        'Readability extraction failed; falling back to plain body text'
      );
      // Retry without the URL option in case the URL itself was what JSDOM rejected.
      const dom = new JSDOM(html);
      return dom.window.document.body?.textContent ?? '';
    }
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nightweb/lc-browser-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.