even-better-playwright-mcp

Overview Schema Related Servers Score Discussions

clean-html.ts•5.06 KiB

/**
 * Clean HTML utility for getting LLM-friendly HTML with search/diff
 * Simplified version inspired by playwriter's formatHtmlForPrompt
 */
import type { Page, Locator } from 'playwright';
import { createPatch } from 'diff';

/** Options accepted by {@link getCleanHTML}. */
export interface GetCleanHTMLOptions {
  /** Page or locator whose HTML is read (`content()` for a page, `innerHTML()` for a locator). */
  locator: Locator | Page;
  /** Substring or regular expression matched against each line of the cleaned HTML. */
  search?: string | RegExp;
  /** When true, return a unified diff against the previously stored snapshot instead of the HTML. */
  showDiffSinceLastCall?: boolean;
  /** Number of characters kept from a long text run; values <= 0 disable truncation. Defaults to 500. */
  maxContentLen?: number;
}

// Store last HTML snapshots per locator/page for diffing
const lastHtmlSnapshots: WeakMap<Page, Map<string, string>> = new WeakMap();

/** Lines of context emitted above and below each search match. */
const SEARCH_CONTEXT_LINES = 5;

/** Maximum number of matching lines collected by a search. */
const MAX_SEARCH_MATCHES = 10;

/** Duck-type check: treats any object exposing `content()` and `goto()` as a Page. */
function isPage(obj: any): obj is Page {
  return Boolean(obj) && typeof obj.content === 'function' && typeof obj.goto === 'function';
}

/** Duck-type check for RegExp-like values (anything exposing `test()` and `exec()`). */
function isRegExp(value: any): value is RegExp {
  return (
    typeof value === 'object' &&
    value !== null &&
    typeof value.test === 'function' &&
    typeof value.exec === 'function'
  );
}

/**
 * Returns the key under which a snapshot for `locator` is stored.
 *
 * Pages always use `'__page__'`. Locators use their `_selector` property when it is
 * truthy and fall back to the shared key `'__locator__'` otherwise.
 * NOTE(review): `_selector` is not part of the typed Locator interface (hence the cast);
 * if it is absent, all locators on a page share one snapshot slot.
 */
function getSnapshotKey(locator: Locator | Page): string {
  if (isPage(locator)) {
    return '__page__';
  }
  // For locators, use a string representation
  return (locator as any)._selector || '__locator__';
}

/** Returns the snapshot map for `page`, creating and registering it on first use. */
function getPageSnapshots(page: Page): Map<string, string> {
  let snapshots = lastHtmlSnapshots.get(page);
  if (!snapshots) {
    snapshots = new Map();
    lastHtmlSnapshots.set(page, snapshots);
  }
  return snapshots;
}

/**
 * Basic HTML cleaning - removes scripts, styles, and comments, truncates long text runs,
 * and normalizes whitespace so that adjacent tags end up on separate lines.
 *
 * @param html - Raw HTML markup.
 * @param maxContentLen - Characters kept from any text run of 500+ characters; <= 0 disables truncation.
 * @returns The cleaned, trimmed HTML.
 */
function cleanHtml(html: string, maxContentLen: number): string {
  // Remove script and style tags with their content
  let cleaned = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
  cleaned = cleaned.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');

  // Remove comments
  cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');

  // Truncate very long text content (but keep tags intact)
  if (maxContentLen > 0) {
    cleaned = cleaned.replace(/>([^<]{500,})</g, (match: string, content: string) => {
      // Nothing to cut when the limit is at least as long as the run; without this
      // guard the suffix would report zero or a negative number of remaining chars.
      if (content.length <= maxContentLen) {
        return match;
      }
      const truncated = content.slice(0, maxContentLen);
      const remaining = content.length - maxContentLen;
      return `>${truncated}...(${remaining} more chars)<`;
    });
  }

  // Clean up excessive whitespace: collapse every run to one space, then put
  // whitespace-separated tags on their own lines.
  cleaned = cleaned.replace(/\s+/g, ' ');
  cleaned = cleaned.replace(/>\s+</g, '>\n<');

  return cleaned.trim();
}

/**
 * Get cleaned HTML from page or locator with search and diff capabilities.
 *
 * Behavior by mode:
 * - `showDiffSinceLastCall`: returns a unified diff against the stored snapshot for the
 *   same page/locator key, or a fixed message when there is no earlier snapshot or no change.
 *   The stored snapshot is replaced by the current HTML. `search` is ignored in this mode.
 * - `search`: returns up to 10 matching lines, each with 5 lines of context; non-contiguous
 *   sections are separated by a `---` line. Returns `'No matches found'` when nothing matches.
 * - otherwise: returns the full cleaned HTML.
 *
 * In every mode the current cleaned HTML is stored for future diffs.
 *
 * @param options - See {@link GetCleanHTMLOptions}.
 * @returns Cleaned HTML, a search excerpt, a diff, or a status message.
 */
export async function getCleanHTML(options: GetCleanHTMLOptions): Promise<string> {
  const { locator, search, showDiffSinceLastCall = false, maxContentLen = 500 } = options;

  // Get raw HTML
  let rawHtml: string;
  let page: Page;
  if (isPage(locator)) {
    page = locator;
    rawHtml = await locator.content();
  } else {
    page = locator.page();
    rawHtml = await locator.innerHTML();
  }

  // Clean the HTML
  const cleanedHtml = cleanHtml(rawHtml, maxContentLen);

  // Sanitize to remove unpaired surrogates that break JSON encoding
  // (no-op on runtimes without String.prototype.toWellFormed)
  const htmlStr = cleanedHtml.toWellFormed?.() ?? cleanedHtml;

  const pageSnapshots = getPageSnapshots(page);
  const snapshotKey = getSnapshotKey(locator);

  // Handle diffing
  if (showDiffSinceLastCall) {
    const previousSnapshot = pageSnapshots.get(snapshotKey);
    pageSnapshots.set(snapshotKey, htmlStr);

    // Compare against undefined so that a stored empty snapshot still counts as a baseline.
    if (previousSnapshot === undefined) {
      return 'No previous snapshot available. This is the first call for this locator. Full snapshot stored for next diff.';
    }

    // Identical content: skip patch generation entirely.
    if (previousSnapshot === htmlStr) {
      return 'No changes detected since last snapshot';
    }

    const patch = createPatch('html', previousSnapshot, htmlStr, 'previous', 'current', {
      context: 3,
    });

    // A patch consisting only of header lines carries no hunks.
    if (patch.split('\n').length <= 4) {
      return 'No changes detected since last snapshot';
    }
    return patch;
  }

  // Store snapshot for future diffs
  pageSnapshots.set(snapshotKey, htmlStr);

  // Handle search
  if (search) {
    const lines = htmlStr.split('\n');
    const matchIndices: number[] = [];

    for (const [i, line] of lines.entries()) {
      let isMatch: boolean;
      if (isRegExp(search)) {
        // Global/sticky regexes resume from lastIndex; reset it so every line is
        // tested from its start and matches are not skipped.
        if (search.global || search.sticky) {
          search.lastIndex = 0;
        }
        isMatch = search.test(line);
      } else {
        isMatch = line.includes(search);
      }

      if (isMatch) {
        matchIndices.push(i);
        if (matchIndices.length >= MAX_SEARCH_MATCHES) break;
      }
    }

    if (matchIndices.length === 0) {
      return 'No matches found';
    }

    // Collect lines with 5 lines of context above and below each match
    const includedLines = new Set<number>();
    for (const idx of matchIndices) {
      const start = Math.max(0, idx - SEARCH_CONTEXT_LINES);
      const end = Math.min(lines.length - 1, idx + SEARCH_CONTEXT_LINES);
      for (let i = start; i <= end; i++) {
        includedLines.add(i);
      }
    }

    // Build result with separators between non-contiguous sections
    const sortedIndices = [...includedLines].sort((a, b) => a - b);
    const result: string[] = [];
    let previousIdx: number | undefined;
    for (const lineIdx of sortedIndices) {
      if (previousIdx !== undefined && previousIdx !== lineIdx - 1) {
        result.push('---');
      }
      result.push(lines[lineIdx] ?? '');
      previousIdx = lineIdx;
    }

    return result.join('\n');
  }

  return htmlStr;
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/SJMakin/even-better-playwright-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

clean-html.ts•5.06 KiB