Skip to main content
Glama

Playwright MCP Server

by showfive
HTMLparser.ts (17.4 kB)
import { JSDOM } from 'jsdom';
import { ElementProcessorManager } from './ElementProcessorManager.js';
import { TextFormatter } from './TextFormatter.js';
import { Node, NodeFilter, SKIP_TAGS, MAIN_CONTENT_SELECTORS } from './constants.js';
import type {
  ParsedHTML,
  ViewportDimensions,
  VisibleContentOptions,
  VisibleContentResult
} from './types.js';

/**
 * Main HTML parser class.
 *
 * Wraps a JSDOM document built from the given HTML and exposes two entry
 * points: {@link HTMLParser.parse} (title, description and main text) and
 * {@link HTMLParser.getVisibleContent} (text that falls inside a viewport
 * range, estimated from source line numbers).
 */
export class HTMLParser {
  private readonly dom: JSDOM;
  private readonly elementProcessor: ElementProcessorManager;

  /**
   * @param html Raw HTML markup. `<style>` elements are stripped before the
   *             document is built.
   */
  constructor(html: string) {
    // Build the DOM from HTML that contains no <style> elements.
    const cleanHtml = html.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
    this.dom = new JSDOM(cleanHtml, {
      runScripts: "outside-only",
      // Source locations are required by nodeLocation() in the
      // visible-content code below.
      includeNodeLocations: true,
      pretendToBeVisual: true
    });
    this.elementProcessor = new ElementProcessorManager(this.processChildren.bind(this));
  }

  /**
   * Safely reads the trimmed text content of an element.
   *
   * @returns The trimmed text, or an empty string when the element is null,
   *          has no text, or reading it throws.
   */
  private safeGetTextContent(element: Element | null): string {
    if (!element) return '';
    try {
      return element.textContent?.trim() || '';
    } catch {
      return '';
    }
  }

  /**
   * Processes every child node of `element` and concatenates the results.
   */
  private processChildren(element: Element): string {
    let result = '';
    for (const child of Array.from(element.childNodes)) {
      // childNodes also contains text and comment nodes; processElement
      // handles those explicitly.
      result += this.processElement(child as Element);
    }
    return result;
  }

  /**
   * Processes a single node: text nodes are formatted, elements with a
   * registered handler are delegated to it, skipped tags yield an empty
   * string, and everything else recurses into its children.
   */
  private processElement(element: Element | null): string {
    if (!element) return '';

    // Text nodes.
    if (element.nodeType === Node.TEXT_NODE && element.textContent) {
      return TextFormatter.formatTextNode(element.textContent);
    }

    // Non-element nodes (comments, empty text nodes, ...) carry no tag name,
    // so there is no handler or skip rule to look up for them.
    if (typeof element.tagName !== 'string') {
      return this.processChildren(element);
    }

    const handler = this.elementProcessor.getHandler(element.tagName);
    if (handler) {
      return handler.processElement(element);
    }

    // Elements that must be skipped entirely.
    if (SKIP_TAGS.includes(element.tagName as any)) {
      return '';
    }

    return this.processChildren(element);
  }

  /**
   * Locates and extracts the main content of the document.
   *
   * Tries each selector in MAIN_CONTENT_SELECTORS in order and returns the
   * first result longer than 100 characters; otherwise falls back to the
   * whole body with `<script>` and `<style>` elements removed.
   */
  private extractMainContent(): string {
    const document = this.dom.window.document;

    // Search for a main-content container.
    for (const selector of MAIN_CONTENT_SELECTORS) {
      try {
        const element = document.querySelector(selector);
        if (element) {
          const content = this.processElement(element);
          if (content.length > 100) {
            return content;
          }
        }
      } catch {
        // Best effort: a selector that fails is skipped and the next one is tried.
        continue;
      }
    }

    // No main content found: fall back to the body.
    try {
      const bodyClone = document.body.cloneNode(true) as HTMLElement;
      Array.from(bodyClone.getElementsByTagName('script')).forEach(el => el.remove());
      Array.from(bodyClone.getElementsByTagName('style')).forEach(el => el.remove());
      return this.processElement(bodyClone);
    } catch (error) {
      console.error('Error extracting body text:', error);
      return 'Failed to extract content';
    }
  }

  /**
   * Computes the relative position (in percent) of a line within the content.
   *
   * @param line           Line number to locate.
   * @param totalHeight    Total number of lines the content spans.
   * @param viewportHeight Currently unused; kept so existing call sites stay valid.
   * @returns `line / totalHeight` as a percentage, or 0 when `totalHeight`
   *          is not positive (instead of NaN / Infinity).
   */
  private calculateRelativePosition(line: number, totalHeight: number, viewportHeight: number): number {
    // Without this guard an empty document yields NaN or Infinity, which
    // silently breaks every range comparison made by the callers.
    if (totalHeight <= 0) return 0;

    const lineHeight = 20; // Estimated line height in pixels.
    const pixelPosition = line * lineHeight;
    const totalPixelHeight = totalHeight * lineHeight;
    return (pixelPosition / totalPixelHeight) * 100;
  }

  /**
   * Returns the nesting depth of an element, counting `<body>` as depth 0.
   *
   * The result is clamped at 0 so that a root or detached element never
   * produces a negative indent level.
   */
  private getElementDepth(element: Element): number {
    let depth = 0;
    let current = element;
    while (current.parentElement) {
      depth++;
      current = current.parentElement;
    }
    return Math.max(0, depth - 1); // The body element is treated as depth 0.
  }

  /**
   * Converts a table into structured text: a `Table:` header followed by one
   * `| cell | cell |` line per row.
   */
  private formatTable(table: Element): string {
    const rows = Array.from(table.querySelectorAll('tr'));
    const result: string[] = ['Table:'];

    TextFormatter.setIndentLevel(this.getElementDepth(table) + 1);
    for (const row of rows) {
      const cells = Array.from(row.querySelectorAll('th, td'))
        .map(cell => cell.textContent?.trim() || '')
        .join(' | ');
      result.push(TextFormatter.getCurrentIndent() + `| ${cells} |`);
    }

    return result.join('\n');
  }

  /**
   * Converts a form into structured text: a `Form:` header followed by one
   * line per input, textarea, select or button.
   */
  private formatForm(form: Element): string {
    const result: string[] = ['Form:'];

    TextFormatter.setIndentLevel(this.getElementDepth(form) + 1);
    const indent = TextFormatter.getCurrentIndent();

    // Walk the form controls.
    const inputs = form.querySelectorAll('input, textarea, select, button');
    for (const input of Array.from(inputs)) {
      const type = input.getAttribute('type') || input.tagName.toLowerCase();
      const name = input.getAttribute('name') || '';
      const label = this.findInputLabel(input);

      result.push(`${indent}${type}${name ? ` (${name})` : ''}${label ? `: ${label}` : ''}`);
    }

    return result.join('\n');
  }

  /**
   * Finds the label text associated with a form control.
   *
   * Looks first for a `<label for="...">` matching the control's id, then for
   * an enclosing `<label>` ancestor. Returns an empty string when neither exists.
   */
  private findInputLabel(input: Element): string {
    const id = input.getAttribute('id');
    if (id) {
      // Compare the attribute value directly instead of interpolating the id
      // into a selector string: ids containing quotes, backslashes or
      // brackets would otherwise make querySelector throw or mis-match.
      const labels = Array.from(this.dom.window.document.querySelectorAll('label'));
      const label = labels.find(candidate => candidate.getAttribute('for') === id);
      if (label) {
        return label.textContent?.trim() || '';
      }
    }

    // The control may be nested inside a <label>.
    let parent = input.parentElement;
    while (parent) {
      if (parent.tagName === 'LABEL') {
        const labelText = parent.textContent?.trim() || '';
        return labelText.replace(input.textContent?.trim() || '', '').trim();
      }
      parent = parent.parentElement;
    }

    return '';
  }

  /**
   * Decides whether a text node should be skipped: it has no parent element,
   * its parent is one of SKIP_TAGS, or it contains only whitespace.
   */
  private shouldSkipNode(node: Node): boolean {
    const parent = node.parentElement;
    if (!parent) return true;

    if (SKIP_TAGS.includes(parent.tagName as any)) {
      return true;
    }

    return !node.textContent?.trim();
  }

  /**
   * Returns the content inside the viewport together with flags telling
   * whether more content exists above or below it.
   *
   * NOTE(review): `options.viewport.top` and `height` are passed where
   * calculateRelativePosition expects a line number — confirm the units
   * callers supply.
   */
  public getVisibleContent(options: VisibleContentOptions): VisibleContentResult {
    const document = this.dom.window.document;
    const mainElement = document.querySelector('main, article, [role="main"]') || document.body;

    // Total height of the document: the largest end line of any descendant.
    const totalHeight = Array.from(mainElement.getElementsByTagName('*')).reduce((max, el) => {
      const loc = this.dom.nodeLocation(el);
      return loc ? Math.max(max, loc.endLine) : max;
    }, 0);

    // Viewport range expressed as 0-100%.
    const viewportStart = Math.max(0, this.calculateRelativePosition(
      options.viewport.top,
      totalHeight,
      options.viewport.height
    ));
    const viewportEnd = Math.min(100, this.calculateRelativePosition(
      options.viewport.top + options.viewport.height,
      totalHeight,
      options.viewport.height
    ));

    // Collect the text nodes.
    const textNodes = this.collectTextNodes(mainElement);
    if (textNodes.length === 0) {
      return { content: '', hasAbove: false, hasBelow: false };
    }

    // Extract the nodes inside the visible range.
    const visibleContent = this.extractVisibleContent(textNodes, {
      start: viewportStart,
      end: viewportEnd,
      minVisiblePercentage: options.minVisiblePercentage,
      viewportHeight: options.viewport.height
    });

    return visibleContent;
  }

  /**
   * Collects the non-skipped text nodes under `rootElement`, each paired with
   * the start line of its parent element, sorted by ascending line.
   */
  private collectTextNodes(rootElement: Element): Array<{ node: Node; line: number }> {
    const textNodes: Array<{ node: Node; line: number }> = [];
    const walker = this.dom.window.document.createTreeWalker(
      rootElement,
      NodeFilter.SHOW_TEXT,
      {
        acceptNode: (node) => {
          return this.shouldSkipNode(node)
            ? NodeFilter.FILTER_REJECT
            : NodeFilter.FILTER_ACCEPT;
        }
      }
    );

    while (walker.nextNode()) {
      const node = walker.currentNode;
      const parent = node.parentElement;
      if (!parent) continue;

      // Nodes whose parent has no recorded source location cannot be placed.
      const location = this.dom.nodeLocation(parent);
      if (!location) continue;

      textNodes.push({
        node,
        line: location.startLine
      });
    }

    return textNodes.sort((a, b) => a.line - b.line);
  }

  /**
   * Extracts the content that lies inside the visible range.
   *
   * @param nodes    Text nodes sorted by line (as returned by collectTextNodes).
   * @param viewport Visible range in percent plus the optional minimum
   *                 visibility threshold.
   * @returns The formatted visible text and whether any node lies more than
   *          5 percentage points above or below the range. An empty `nodes`
   *          list yields empty content with both flags false.
   */
  private extractVisibleContent(
    nodes: Array<{ node: Node; line: number }>,
    viewport: {
      start: number;
      end: number;
      minVisiblePercentage?: number;
      viewportHeight: number;
    }
  ): VisibleContentResult {
    const lastNode = nodes[nodes.length - 1];
    if (!lastNode) {
      return { content: '', hasAbove: false, hasBelow: false };
    }
    const maxLine = lastNode.line;

    // Compute each node's relative position once; it is needed for the
    // visibility filter and for both overflow flags.
    const positioned = nodes.map(({ node, line }) => ({
      node,
      position: this.calculateRelativePosition(line, maxLine, viewport.viewportHeight)
    }));

    const visibleNodes = positioned.filter(({ position }) => {
      if (position >= viewport.start && position <= viewport.end) {
        return true;
      }

      if (viewport.minVisiblePercentage) {
        // Each node is treated as a band of +/-5 percentage points around its
        // position; it counts as visible when enough of that band overlaps
        // the viewport range.
        const visibleRange = position >= viewport.start - 5 && position <= viewport.end + 5;
        if (!visibleRange) return false;

        const visiblePart = Math.min(viewport.end, position + 5) - Math.max(viewport.start, position - 5);
        const visiblePercent = (visiblePart / 10) * 100;
        return visiblePercent >= viewport.minVisiblePercentage;
      }

      return false;
    });

    const content = this.formatVisibleContent(visibleNodes.map(entry => entry.node));
    const hasAbove = positioned.some(({ position }) => position < viewport.start - 5);
    const hasBelow = positioned.some(({ position }) => position > viewport.end + 5);

    return { content, hasAbove, hasBelow };
  }

  /**
   * Formats the visible text nodes into structured text, based on the tag of
   * each node's parent element.
   */
  private formatVisibleContent(nodes: Node[]): string {
    let lastParent: Element | null = null;
    const contentParts: string[] = [];
    let currentBlock = '';

    // Pushes the inline text accumulated so far (if any) and starts a new block.
    const flushBlock = (): void => {
      if (currentBlock) contentParts.push(currentBlock);
      currentBlock = '';
    };

    for (const node of nodes) {
      const parent = node.parentElement;
      if (!parent) continue;
      const text = node.textContent?.trim() || '';

      // A change of parent element ends the current inline block.
      if (lastParent && parent !== lastParent) {
        flushBlock();
      }

      switch (parent.tagName) {
        case 'H1':
        case 'H2':
        case 'H3':
        case 'H4':
        case 'H5':
        case 'H6': {
          flushBlock();
          const level = parseInt(parent.tagName.charAt(1), 10);
          TextFormatter.setIndentLevel(0);
          contentParts.push(TextFormatter.formatHeading(level, text));
          break;
        }

        case 'A': {
          const href = parent.getAttribute('href');
          currentBlock += href ? TextFormatter.formatLink(text, href) : `${text} `;
          break;
        }

        case 'LI':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push(TextFormatter.formatListItem(text));
          break;

        case 'P':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push(text);
          break;

        case 'TABLE':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push(this.formatTable(parent));
          break;

        case 'FORM':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push(this.formatForm(parent));
          break;

        case 'NAV':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push('Navigation:');
          break;

        case 'ASIDE':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push('Sidebar:');
          break;

        case 'FOOTER':
          flushBlock();
          TextFormatter.setIndentLevel(this.getElementDepth(parent));
          contentParts.push('Footer:');
          break;

        default:
          currentBlock += `${text} `;
      }

      lastParent = parent;
    }

    if (currentBlock) {
      contentParts.push(currentBlock);
    }

    return TextFormatter.format(contentParts.filter(part => part.trim().length > 0).join('\n'));
  }

  /**
   * Runs the HTML analysis.
   *
   * @returns The page title, meta description and formatted main content. On
   *          failure the error is logged and a placeholder result is returned
   *          instead of throwing.
   */
  public parse(): ParsedHTML {
    try {
      const document = this.dom.window.document;
      const title = this.safeGetTextContent(document.querySelector('title'));
      const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || '';
      const mainText = this.extractMainContent();
      const cleanText = TextFormatter.format(mainText);

      return {
        title,
        description,
        content: cleanText
      };
    } catch (error) {
      console.error('Error parsing HTML:', error);
      return {
        title: '',
        description: '',
        content: 'Failed to process webpage content'
      };
    }
  }
}

/**
 * Extracts text from HTML and returns it as structured content.
 *
 * @param html Raw HTML markup.
 * @returns The structured text, or a failure message when processing throws.
 */
export function extractTextFromHtml(html: string): string {
  try {
    const parser = new HTMLParser(html);
    const { title, description, content } = parser.parse();
    return TextFormatter.generateStructuredContent(title, description, content);
  } catch (error) {
    console.error('Error in extractTextFromHtml:', error);
    return 'Failed to process webpage content';
  }
}

/**
 * Extracts the text that lies inside the currently displayed viewport.
 *
 * @param html                 Raw HTML markup.
 * @param viewport             Viewport dimensions used to select the visible range.
 * @param minVisiblePercentage Optional threshold for partially visible nodes.
 * @returns The visible text, with a `...` line before and/or after it when
 *          content exists above or below the viewport, or a failure message
 *          when processing throws.
 */
export function extractVisibleText(
  html: string,
  viewport: ViewportDimensions,
  minVisiblePercentage?: number
): string {
  try {
    const parser = new HTMLParser(html);
    const { content, hasAbove, hasBelow } = parser.getVisibleContent({
      viewport,
      minVisiblePercentage
    });

    const parts: string[] = [];
    if (hasAbove) parts.push('...');
    if (content) parts.push(content);
    if (hasBelow) parts.push('...');

    return parts.join('\n');
  } catch (error) {
    console.error('Error in extractVisibleText:', error);
    return 'Failed to extract visible content';
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/showfive/playwright-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.