Skip to main content
Glama

docs-mcp-server

HtmlJsExecutorMiddleware.ts (6.56 kB)
import { logger } from "../../utils/logger";
import type { FetchOptions, RawContent } from "../fetcher/types";
import { executeJsInSandbox } from "../utils/sandbox";
import type { ContentProcessorMiddleware, MiddlewareContext } from "./types";

/** MIME types that positively identify a JavaScript payload. */
const JAVASCRIPT_MIME_TYPES: readonly string[] = [
  "application/javascript",
  "text/javascript",
  "application/x-javascript",
];

/**
 * Generic or unknown MIME types that are tolerated: servers frequently
 * mislabel scripts, so these are executed rather than skipped.
 * The empty string covers a blank (or missing) MIME type.
 */
const LENIENT_MIME_TYPES: readonly string[] = [
  "application/octet-stream",
  "unknown/unknown",
  "",
];

/**
 * Decides whether a fetched resource may be executed as a script, based on its MIME type.
 *
 * Parameters after `;` (e.g. `; charset=utf-8`) and letter case are ignored.
 *
 * @param mimeType - The MIME type reported for the fetched resource; a missing value
 *   is treated like an empty one.
 * @returns `true` for known JavaScript types and for the lenient generic/unknown types.
 */
function isExecutableScriptMimeType(mimeType: string | undefined): boolean {
  // Defaulted destructure keeps this safe even under `noUncheckedIndexedAccess`.
  const [essence = ""] = (mimeType ?? "").toLowerCase().split(";");
  const normalized = essence.trim();
  return (
    JAVASCRIPT_MIME_TYPES.includes(normalized) || LENIENT_MIME_TYPES.includes(normalized)
  );
}

/**
 * Converts fetched script content to a string.
 *
 * Uses the encoding reported on `rawContent` when the running Node.js version
 * recognises it (checked with `Buffer.isEncoding`, which also covers aliases such
 * as `utf-16le`); otherwise falls back to UTF-8.
 *
 * @param rawContent - The fetched script as returned by the context fetcher.
 * @returns The decoded script source.
 */
function decodeScriptContent(rawContent: RawContent): string {
  const contentBuffer = Buffer.isBuffer(rawContent.content)
    ? rawContent.content
    : Buffer.from(rawContent.content);

  const declaredEncoding = rawContent.encoding?.toLowerCase();
  const encoding: BufferEncoding =
    declaredEncoding !== undefined && Buffer.isEncoding(declaredEncoding)
      ? declaredEncoding
      : "utf-8";

  return contentBuffer.toString(encoding);
}

/**
 * Middleware to parse HTML content and execute embedded JavaScript within a secure sandbox.
 * It uses the `executeJsInSandbox` utility (Node.js `vm` + JSDOM) to run scripts,
 * including fetching external scripts.
 *
 * This middleware updates `context.content` with the HTML *after* script execution.
 * Subsequent middleware (e.g., HtmlCheerioParserMiddleware) should handle parsing this content.
 *
 * @remarks
 * **WARNING:** This middleware provides a basic sandboxed JavaScript execution
 * environment but is **not suitable for general production use** on arbitrary
 * web pages. The JSDOM + Node VM environment lacks many Web APIs found in
 * real browsers (e.g., `MutationObserver`, `IntersectionObserver`, layout-dependent APIs)
 * and does not fully replicate browser script execution order (e.g., `async`, `defer`,
 * dynamic script loading). Use with caution and primarily for pages with
 * simple, known JavaScript dependencies. For robust rendering of complex pages,
 * consider using a headless browser solution.
 */
export class HtmlJsExecutorMiddleware implements ContentProcessorMiddleware {
  /**
   * Runs the page's scripts in the sandbox and replaces `context.content` with
   * the resulting HTML, then invokes the next middleware.
   *
   * Errors are never thrown to the caller: script-level errors reported by the
   * sandbox, external-script fetch failures, and any exception raised inside this
   * method are appended to `context.errors`.
   *
   * @param context - Pipeline context; `content`, `source`, `fetcher`, `options`
   *   and `errors` are read and/or updated.
   * @param next - Continuation that runs the remaining middleware.
   */
  async process(context: MiddlewareContext, next: () => Promise<void>): Promise<void> {
    try {
      logger.debug(
        `Executing JavaScript in sandbox for HTML content from ${context.source}`,
      );

      // Callback handed to the sandbox for resolving external <script src="..."> content.
      // Returns null whenever the script cannot or should not be executed.
      const fetchScriptContentCallback = async (
        scriptUrl: string,
      ): Promise<string | null> => {
        if (!context.fetcher) {
          logger.warn(
            `⚠️ No fetcher available in context to fetch external script: ${scriptUrl}`,
          );
          return null;
        }

        try {
          logger.debug(`Fetching external script via context fetcher: ${scriptUrl}`);

          // Pass relevant options, especially the signal for cancellation
          const fetchOptions: FetchOptions = {
            signal: context.options?.signal, // Pass signal from context if available
            followRedirects: true, // Generally want to follow redirects for scripts
            // timeout: context.options?.fetchTimeout // Add if timeout is configurable at context level
          };
          const rawContent: RawContent = await context.fetcher.fetch(
            scriptUrl,
            fetchOptions,
          );

          // Only execute content that is plausibly JavaScript; generic/unknown types are allowed.
          if (!isExecutableScriptMimeType(rawContent.mimeType)) {
            logger.warn(
              `⏭️ Skipping execution of external script ${scriptUrl} due to unexpected MIME type: ${rawContent.mimeType}`,
            );
            context.errors.push(
              new Error(
                `Skipping execution of external script ${scriptUrl} due to unexpected MIME type: ${rawContent.mimeType}`,
              ),
            );
            return null;
          }

          return decodeScriptContent(rawContent);
        } catch (fetchError) {
          // fetcher.fetch is expected to throw on error (e.g., 404, network error)
          const message =
            fetchError instanceof Error ? fetchError.message : String(fetchError);
          logger.warn(`⚠️ Failed to fetch external script ${scriptUrl}: ${message}`); // Use warn for fetch failures like 404
          context.errors.push(
            new Error(`Failed to fetch external script ${scriptUrl}: ${message}`, {
              cause: fetchError,
            }),
          );
          return null; // Indicate failure to the sandbox runner
        }
      };

      // TODO: Plumb timeout options from context.options if available
      const sandboxOptions = {
        html: context.content,
        url: context.source,
        fetchScriptContent: fetchScriptContentCallback,
      };

      const result = await executeJsInSandbox(sandboxOptions);

      // Update context content with the HTML after script execution
      context.content = result.finalHtml;
      // DO NOT update context.dom here. The subsequent HtmlCheerioParserMiddleware will handle parsing.

      // Add any errors encountered during script execution to the context
      if (result.errors.length > 0) {
        context.errors.push(...result.errors);
        logger.warn(
          `⚠️ Encountered ${result.errors.length} error(s) during sandbox execution for ${context.source}`,
        );
      }

      logger.debug(
        `Sandbox execution completed for ${context.source}. Proceeding with updated content.`,
      );

      // Proceed to the next middleware with the modified context.
      // NOTE: next() runs inside this try block, so an exception thrown by a later
      // middleware is also recorded below under this middleware's name.
      await next();
    } catch (error) {
      const baseMessage = `HtmlJsExecutorMiddleware failed for ${context.source}`;
      const errorMessage = error instanceof Error ? error.message : String(error);
      const processingError = new Error(`${baseMessage}: ${errorMessage}`, {
        cause: error,
      });
      logger.error(`❌ ${processingError.message}`);
      context.errors.push(processingError);
      // On failure the pipeline stops here: next() is not (re-)invoked.
      return;
    }
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.