Skip to main content
Glama

docs-mcp-server

HtmlToMarkdownMiddleware.ts (4.9 kB)
// @ts-expect-error NOTE(review): presumably this package ships no type declarations — confirm.
import { gfm } from "@joplin/turndown-plugin-gfm";
import TurndownService from "turndown";
import { logger } from "../../utils/logger";
import type { ContentProcessorMiddleware, MiddlewareContext } from "./types";

/**
 * Selector for elements whose class name carries a language hint, as commonly
 * emitted by syntax highlighters.
 */
const LANGUAGE_HINT_SELECTOR =
  '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]';

/** Captures the language identifier that follows a known highlighter class prefix. */
const LANGUAGE_CLASS_PATTERN = /(?:highlight-source-|highlight-|language-)(\w+)/;

/**
 * Builds a backtick fence that cannot be terminated by the code it wraps.
 *
 * A fenced code block is closed by a fence at least as long as the opening
 * one, so the fence must be strictly longer than the longest backtick run
 * found in the content. The minimum (and usual) fence length is three.
 *
 * @param code The raw text that will be placed between the fences.
 * @returns A string of three or more backticks.
 */
function buildCodeFence(code: string): string {
  let fenceSize = 3;
  for (const run of code.match(/`{3,}/g) ?? []) {
    if (run.length >= fenceSize) {
      fenceSize = run.length + 1;
    }
  }
  return "`".repeat(fenceSize);
}

/**
 * Middleware to convert the final processed HTML content (from the Cheerio object in context.dom)
 * into Markdown using Turndown, applying custom rules.
 */
export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware {
  private readonly turndownService: TurndownService;

  /** Configures Turndown, enables the GFM plugin and registers the custom rules. */
  constructor() {
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined",
    });

    this.turndownService.use(gfm);

    this.addCustomRules();
  }

  /** Registers the `pre` and `anchor` rules on the Turndown instance. */
  private addCustomRules(): void {
    // Preserve code blocks and syntax (replicated from HtmlProcessor)
    this.turndownService.addRule("pre", {
      filter: ["pre"],
      replacement: (_content, node) => {
        const element = node as HTMLElement;

        // An explicit data-language attribute wins over any class-based hint.
        let language = element.getAttribute("data-language") || "";
        if (!language) {
          // Try to infer the language from the class name of an enclosing
          // element first, then of a descendant.
          const highlightElement =
            element.closest(LANGUAGE_HINT_SELECTOR) ||
            element.querySelector(LANGUAGE_HINT_SELECTOR);
          if (highlightElement) {
            const detected = LANGUAGE_CLASS_PATTERN.exec(highlightElement.className)?.[1];
            if (detected) {
              language = detected;
            }
          }
        }

        // textContent ignores <br>, so turn each one into a real newline first.
        for (const br of Array.from(element.querySelectorAll("br"))) {
          br.replaceWith("\n");
        }

        // Strip leading/trailing blank lines only; inner whitespace is significant.
        const code = (element.textContent ?? "").replace(/^\n+|\n+$/g, "");

        // Use a fence longer than any backtick run inside the code so that
        // embedded ``` sequences cannot close the block prematurely.
        const fence = buildCodeFence(code);

        return `\n${fence}${language}\n${code}\n${fence}\n`;
      },
    });

    this.turndownService.addRule("anchor", {
      filter: ["a"],
      replacement: (content, node) => {
        const href = (node as HTMLElement).getAttribute("href");
        if (!content || content === "#") {
          return ""; // Remove if content is # or empty
        }
        if (!href) {
          return content; // Preserve content if href is missing or empty
        }
        return `[${content}](${href})`; // Standard link conversion
      },
    });
  }

  /**
   * Processes the context to convert the HTML held in `context.dom` to Markdown.
   *
   * On success `context.content` is replaced with the trimmed Markdown (possibly
   * the empty string). On failure the error is logged and appended to
   * `context.errors`. In every case `next` is awaited exactly once, so the
   * pipeline continues.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context: MiddlewareContext, next: () => Promise<void>): Promise<void> {
    // Check if we have a Cheerio object from a previous step
    const $ = context.dom;
    if (!$) {
      logger.warn(
        `⏭️ Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware ran correctly.`,
      );
      await next();
      return;
    }

    // Only process if we have a Cheerio object (implicitly means it's HTML)
    try {
      logger.debug(`Converting HTML content to Markdown for ${context.source}`);

      // Provide Turndown with the HTML string content from the Cheerio object's body,
      // or the whole document if body is empty/unavailable.
      const htmlToConvert = $("body").html() || $.html();
      const markdown = this.turndownService.turndown(htmlToConvert).trim();

      if (!markdown) {
        // If conversion results in empty markdown, log a warning but treat as valid empty markdown
        const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
        logger.warn(`⚠️ ${warnMsg}`);
        context.content = "";
      } else {
        // Conversion successful and produced non-empty markdown
        context.content = markdown;
        logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
      }
    } catch (error) {
      logger.error(
        `❌ Error converting HTML to Markdown for ${context.source}: ${error}`,
      );
      context.errors.push(
        new Error(
          `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`,
        ),
      );
      // Decide if pipeline should stop? For now, continue.
    }

    // Call the next middleware in the chain regardless of whether conversion happened
    await next();

    // No need to close/free Cheerio object explicitly
    // context.dom = undefined; // Optionally clear the dom property if no longer needed downstream
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.