Skip to main content
Glama
extractDocs.ts (9.27 kB)
import { type DefaultTreeAdapterMap, parse } from "parse5";

type Element = DefaultTreeAdapterMap["element"];
type ChildNode = DefaultTreeAdapterMap["childNode"];
type TextNode = DefaultTreeAdapterMap["textNode"];
type CommentNode = DefaultTreeAdapterMap["commentNode"];

/**
 * Extracts Revit API documentation from HTML and converts it to markdown.
 *
 * The page at `url` is fetched and parsed, the main content is located via
 * marker comments embedded in the page, and the following parts are appended
 * to the result in order: left column (namespace, title, type, description,
 * remarks), hierarchy, syntax code blocks, and every table.
 *
 * @param url - Address of the documentation page to download.
 * @returns The assembled markdown, trimmed, with runs of three or more
 *   newlines reduced to a single blank line.
 * @throws Error when the HTTP response is not OK, when the document has no
 *   `html` element, or when the " Main content and footer " marker comment
 *   is not followed by an element.
 */
export async function extractRvtDocsText(url: string): Promise<string> {
  const response = await fetch(url);
  // Fail loudly on HTTP errors instead of trying to parse an error page.
  if (!response.ok) {
    throw new Error(
      `Failed to fetch ${url}: HTTP ${response.status} ${response.statusText}`,
    );
  }
  const html = await response.text();
  const doc = parse(html);

  const htmlElement = findElement(
    doc.childNodes,
    (node) => node.nodeName === "html",
  );
  if (!htmlElement) throw new Error("HTML element not found");

  const mainContent = findElementAfterComment(
    htmlElement,
    " Main content and footer ",
  );
  if (!mainContent) throw new Error("Main content section not found");

  let markdown = "";

  // Extract left column content
  const leftColumn = findElementAfterComment(
    mainContent,
    " Left Column: Namespace, Title, Description, Remarks ",
  );
  if (leftColumn) {
    markdown += extractLeftColumn(leftColumn);
  }

  // Extract hierarchy
  const rightColumn = findElementAfterComment(
    mainContent,
    " Right Column: Hierarchy - Only show div if hierarchy exists ",
  );
  if (rightColumn) {
    const hierarchyHtml = extractHtmlContent(rightColumn);
    if (hierarchyHtml.trim()) {
      markdown += `## Hierarchy\n\n${htmlToMarkdown(hierarchyHtml)}\n\n`;
    }
  }

  // Extract syntax sections
  const syntaxSections = findAll(
    mainContent,
    (el) => hasClass(el, "card-title") && getText(el).includes("Syntax"),
  );
  for (const section of syntaxSections) {
    markdown += extractSyntax(section);
  }

  // Extract tables
  const tables = findAll(mainContent, (el) => el.nodeName === "table");
  for (const table of tables) {
    const tableMarkdown = extractTable(table);
    if (tableMarkdown.trim()) {
      markdown += `\n${tableMarkdown}`;
    }
  }

  return markdown.replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Type guard separating element nodes (which carry `attrs`) from text,
 * comment and other non-element nodes.
 */
function isElement(node: ChildNode): node is Element {
  return "attrs" in node;
}

/**
 * Depth-first search for a comment whose data contains `commentText`, and
 * returns the first element sibling that follows that comment.
 *
 * If a matching comment has no following element sibling, the search
 * continues through the rest of the tree.
 *
 * @param node - Root of the subtree to search.
 * @param commentText - Substring to look for inside comment nodes.
 * @returns The element following the first suitable comment, or `null`.
 */
function findElementAfterComment(
  node: ChildNode,
  commentText: string,
): Element | null {
  if (
    node.nodeName === "#comment" &&
    (node as CommentNode).data.includes(commentText) &&
    node.parentNode
  ) {
    const siblings = node.parentNode.childNodes;
    const index = siblings.indexOf(node);
    for (const sibling of siblings.slice(index + 1)) {
      if (isElement(sibling)) return sibling;
    }
  }

  if ("childNodes" in node) {
    for (const child of node.childNodes) {
      const result = findElementAfterComment(child, commentText);
      if (result) return result;
    }
  }

  return null;
}

/**
 * Builds the markdown for the left column: namespace, title, type badge,
 * description and remarks. Parts that are missing or empty are skipped.
 *
 * @param element - Root element of the left column.
 * @returns Markdown for the parts that were found (possibly empty).
 */
function extractLeftColumn(element: Element): string {
  let markdown = "";

  // Namespace
  const namespace = find(element, (el) => hasClass(el, "card-namespace"));
  if (namespace) {
    const text = cleanText(getText(namespace));
    if (text.includes("Namespace:")) {
      markdown += `**Namespace:** ${text.replace("Namespace:", "").trim()}\n\n`;
    }
  }

  // Title
  const titleCard = find(element, (el) => hasClass(el, "card-title"));
  if (titleCard) {
    const h1 = find(titleCard, (el) => el.nodeName === "h1");
    if (h1) {
      markdown += `# ${cleanText(getText(h1))}\n\n`;
    }
    const typeBadge = find(titleCard, (el) => hasClass(el, "bg-gray-200"));
    if (typeBadge) {
      markdown += `**Type:** ${cleanText(getText(typeBadge))}\n\n`;
    }
  }

  // Description
  const description = find(element, (el) => hasClass(el, "card-description"));
  if (description) {
    // Drop the in-page label; the markdown heading below replaces it.
    const html = extractHtmlContent(description)
      .replace("<strong>Description:</strong>", "")
      .trim();
    if (html) {
      markdown += `## Description\n\n${htmlToMarkdown(html)}\n\n`;
    }
  }

  // Remarks
  const remarks = find(element, (el) => hasClass(el, "card-remarks"));
  if (remarks) {
    const html = extractHtmlContent(remarks)
      .replace("<strong>Remarks:</strong>", "")
      .trim();
    if (html) {
      markdown += `## Remarks\n\n${htmlToMarkdown(html)}\n\n`;
    }
  }

  return markdown;
}

/**
 * Renders the code snippets that belong to a "Syntax" title as fenced
 * markdown code blocks.
 *
 * The snippets are looked up inside the nearest ancestor matching
 * `hasClass(el, "card")`. The fence language is `vbnet` or `cpp` when the
 * `<code>` element's class attribute contains that word, otherwise `csharp`.
 *
 * @param syntaxTitle - The title element whose enclosing card is searched.
 * @returns A "## Syntax" section, or an empty string when no non-empty code
 *   snippet exists (so no dangling heading is produced).
 */
function extractSyntax(syntaxTitle: Element): string {
  const parentCard = findParent(syntaxTitle, (el) => hasClass(el, "card"));
  if (!parentCard) return "";

  const fences: string[] = [];
  const codeSnippets = findAll(
    parentCard,
    (el) => hasClass(el, "code-snippet"),
  );
  for (const snippet of codeSnippets) {
    const codeElement = find(snippet, (el) => el.nodeName === "code");
    if (!codeElement) continue;

    const code = cleanText(getText(codeElement));
    if (!code) continue;

    const codeClass = getAttr(codeElement, "class") ?? "";
    const language = codeClass.includes("vbnet")
      ? "vbnet"
      : codeClass.includes("cpp")
      ? "cpp"
      : "csharp";
    fences.push(`\`\`\`${language}\n${code}\n\`\`\`\n\n`);
  }

  return fences.length > 0 ? `## Syntax\n\n${fences.join("")}` : "";
}

/**
 * Converts an HTML table to a markdown table.
 *
 * The header comes from the `th` cells of the first row in `thead`; body
 * rows come from the `td` cells of every `tr` in `tbody`. Rows without `td`
 * cells are skipped.
 *
 * @param table - The `table` element to convert.
 * @returns The markdown table followed by a newline; only "\n" when the
 *   table yields no header and no rows.
 */
function extractTable(table: Element): string {
  // A literal "|" in cell text would be read as a column separator.
  const toCell = (el: Element): string =>
    cleanText(getText(el)).replace(/\|/g, "\\|");

  let markdown = "";
  const thead = find(table, (el) => el.nodeName === "thead");
  const tbody = find(table, (el) => el.nodeName === "tbody");

  if (thead) {
    const headerRow = find(thead, (el) => el.nodeName === "tr");
    if (headerRow) {
      const headers = findAll(headerRow, (el) => el.nodeName === "th").map(
        toCell,
      );
      markdown += `| ${headers.join(" | ")} |\n`;
      markdown += `|${headers.map(() => "---").join("|")}|\n`;
    }
  }

  if (tbody) {
    const rows = findAll(tbody, (el) => el.nodeName === "tr");
    for (const row of rows) {
      const cells = findAll(row, (el) => el.nodeName === "td").map(toCell);
      if (cells.length > 0) {
        markdown += `| ${cells.join(" | ")} |\n`;
      }
    }
  }

  return `${markdown}\n`;
}

// Consolidated helper functions

/**
 * Depth-first, pre-order search for the first element (the start node
 * included) that satisfies `predicate`.
 *
 * @returns The first match, or `null` when nothing matches.
 */
function find(
  element: Element | ChildNode,
  predicate: (el: Element) => boolean,
): Element | null {
  if (isElement(element) && predicate(element)) return element;

  if ("childNodes" in element) {
    for (const child of element.childNodes) {
      const result = find(child, predicate);
      if (result) return result;
    }
  }

  return null;
}

/**
 * Runs {@link find} over a single node or over each node of a list, in
 * order, and returns the first match.
 */
function findElement(
  node: ChildNode | ChildNode[],
  predicate: (el: Element) => boolean,
): Element | null {
  const nodes = Array.isArray(node) ? node : [node];
  for (const n of nodes) {
    const result = find(n, predicate);
    if (result) return result;
  }
  return null;
}

/**
 * Collects every element in the subtree (the root included) that satisfies
 * `predicate`, in document order.
 */
function findAll(
  element: Element,
  predicate: (el: Element) => boolean,
): Element[] {
  const results: Element[] = [];
  if (predicate(element)) {
    results.push(element);
  }
  for (const child of element.childNodes ?? []) {
    if (isElement(child)) {
      results.push(...findAll(child, predicate));
    }
  }
  return results;
}

/**
 * Walks up the ancestor chain and returns the nearest ancestor satisfying
 * `predicate`. The walk stops at the first ancestor that is not an element.
 */
function findParent(
  element: Element,
  predicate: (el: Element) => boolean,
): Element | null {
  let current = element.parentNode;
  while (current && "attrs" in current) {
    const el = current as Element;
    if (predicate(el)) return el;
    current = el.parentNode;
  }
  return null;
}

/**
 * Reports whether the element's `class` attribute contains `className` as a
 * substring.
 *
 * NOTE(review): this is a substring test, not a class-token test, so
 * `hasClass(el, "card")` is also true for `class="card-title"`. Callers such
 * as `extractSyntax` may depend on that; confirm against the real page
 * markup before tightening it to whole-token matching.
 */
function hasClass(element: Element, className: string): boolean {
  return element.attrs?.some((attr) =>
    attr.name === "class" && attr.value.includes(className)
  ) ?? false;
}

/** Returns the value of the named attribute, or `undefined` if absent. */
function getAttr(element: Element, name: string): string | undefined {
  return element.attrs?.find((attr) => attr.name === name)?.value;
}

/**
 * Concatenates the values of all text nodes in the subtree, in document
 * order. Nodes that are neither text nor containers contribute nothing.
 */
function getText(node: ChildNode): string {
  if (node.nodeName === "#text") {
    return (node as TextNode).value;
  }
  if ("childNodes" in node) {
    return node.childNodes.map(getText).join("");
  }
  return "";
}

/** Collapses every whitespace run to one space and trims both ends. */
function cleanText(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Serializes an element's children into a reduced HTML string that
 * {@link htmlToMarkdown} understands.
 *
 * - Text nodes are kept, with whitespace runs collapsed to one space so that
 *   source formatting never turns into line breaks.
 * - `br` becomes a newline.
 * - `strong` keeps its tag around its (whitespace-collapsed) text.
 * - `ul`, `ol`, `li` and `p` keep their tags (attributes dropped) around
 *   their recursively serialized content.
 * - Any other element is unwrapped: only its serialized content is kept.
 * - Nodes without children (e.g. comments) are dropped.
 */
function extractHtmlContent(element: Element): string {
  let html = "";
  for (const child of element.childNodes ?? []) {
    if (child.nodeName === "#text") {
      html += (child as TextNode).value.replace(/\s+/g, " ");
    } else if (child.nodeName === "br") {
      html += "\n";
    } else if (child.nodeName === "strong") {
      html += `<strong>${getText(child).replace(/\s+/g, " ")}</strong>`;
    } else if (!isElement(child)) {
      continue;
    } else if (["ul", "ol", "li", "p"].includes(child.nodeName)) {
      html += `<${child.nodeName}>${
        extractHtmlContent(child)
      }</${child.nodeName}>`;
    } else {
      html += extractHtmlContent(child);
    }
  }
  return html;
}

/**
 * Converts the reduced HTML produced by {@link extractHtmlContent} into
 * markdown.
 *
 * `br` becomes a line break, `p` ends with a blank line, `strong` becomes
 * `**bold**`, and every `li` (in `ul` or `ol` alike) becomes a `- ` bullet
 * on its own line. Only horizontal whitespace is collapsed, so the line
 * structure created by those tags is preserved.
 *
 * @param html - Reduced HTML containing at most the tags listed above.
 * @returns Trimmed markdown with at most one consecutive blank line.
 */
function htmlToMarkdown(html: string): string {
  return html
    .replace(/<br\s*\/?>/g, "\n")
    .replace(/<p>/g, "")
    .replace(/<\/p>/g, "\n\n")
    // [\s\S] rather than "." so bold text may span a line break.
    .replace(/<strong>([\s\S]*?)<\/strong>/g, "**$1**")
    // Lists are set off from surrounding text by blank lines.
    .replace(/<\/?(?:ul|ol)>/g, "\n\n")
    // Open/close handled separately so nested lists cannot mispair tags.
    .replace(/<li>/g, "- ")
    .replace(/<\/li>/g, "\n")
    // Collapse spaces and tabs only; newlines carry the structure.
    .replace(/[^\S\n]+/g, " ")
    .replace(/ ?\n ?/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/kaitpw/Rvt_Docs_MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.