Skip to main content
Glama

docs-mcp-server

HtmlLinkExtractorMiddleware.ts4.32 kB
import { logger } from "../../utils/logger"; import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; /** * Middleware to extract links (href attributes from <a> tags) from HTML content using Cheerio. * It expects the Cheerio API object to be available in `context.dom`. * This should run *after* parsing but *before* conversion to Markdown. */ export class HtmlLinkExtractorMiddleware implements ContentProcessorMiddleware { /** * Processes the context to extract links from the sanitized HTML body. * @param context The current middleware context. * @param next Function to call the next middleware. */ async process(context: MiddlewareContext, next: () => Promise<void>): Promise<void> { // Check if we have a Cheerio object from a previous step const $ = context.dom; if (!$) { logger.warn( `⏭️ Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`, ); await next(); return; } try { // Determine effective document base (first <base href> if present) let docBase = context.source; try { const baseEl = $("base[href]").first(); const rawBase = baseEl.attr("href"); if (rawBase && rawBase.trim() !== "") { try { const trimmed = rawBase.trim(); // Attempt to resolve candidate base against the document source. const candidate = new URL(trimmed, context.source); // Heuristic validation: // Accept if: // - Starts with a valid scheme (e.g. https:, data:, etc.) OR // - Protocol-relative (//...) OR // - Root-relative (/...) OR // - Dot-relative (./ or ../...) OR // - Otherwise a plain relative path segment WITHOUT a suspicious leading colon // // Reject (fallback to original page URL) if the first character is a colon or // there is a colon before any slash that does NOT form a valid scheme. const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed); const protocolRelative = trimmed.startsWith("//"); const firstSlash = trimmed.indexOf("/"); const firstColon = trimmed.indexOf(":"); const colonBeforeSlash = firstColon !== -1 && (firstSlash === -1 || firstColon < firstSlash); const suspiciousColon = colonBeforeSlash && !hasScheme && !protocolRelative; if (suspiciousColon || trimmed.startsWith(":")) { logger.debug( `Ignoring suspicious <base href> value (colon misuse): ${rawBase}`, ); } else { // Accept candidate (valid scheme, protocol/root/dot relative, or plain relative segment) docBase = candidate.href; } } catch { logger.debug(`Ignoring invalid <base href> value: ${rawBase}`); } } } catch { // Ignore base parsing errors } const linkElements = $("a[href]"); logger.debug( `Found ${linkElements.length} potential links in ${context.source} (base=${docBase})`, ); const extractedLinks: string[] = []; linkElements.each((_index, element) => { const href = $(element).attr("href"); if (href && href.trim() !== "") { try { const urlObj = new URL(href, docBase); if (!["http:", "https:", "file:"].includes(urlObj.protocol)) { logger.debug(`Ignoring link with invalid protocol: ${href}`); return; } extractedLinks.push(urlObj.href); } catch (_e) { logger.debug(`Ignoring invalid URL syntax: ${href}`); } } }); context.links = [...new Set(extractedLinks)]; logger.debug( `Extracted ${context.links.length} unique, valid links from ${context.source}`, ); } catch (error) { logger.error(`❌ Error extracting links from ${context.source}: ${error}`); context.errors.push( new Error( `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`, ), ); } await next(); } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server