docs-mcp-server

Overview Schema Related Servers Score Discussions

HtmlNormalizationMiddleware.ts•5.68 KiB

import type * as cheerio from "cheerio"; import type { AnyNode } from "domhandler"; import { logger } from "../../utils/logger"; import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; /** * Middleware that normalizes URLs and links in HTML content after DOM parsing. * * This middleware performs the following transformations: * - Converts relative image URLs to absolute URLs * - Removes tracking/ad images (1x1 pixels, analytics beacons) * - Converts relative link URLs to absolute URLs * - Removes anchor links (#...) but preserves their text content * - Removes non-HTTP links (javascript:, mailto:, etc.) but preserves their text content * * This ensures that indexed documents contain functional absolute URLs and removes * non-functional links while preserving contextually valuable text content. */ export class HtmlNormalizationMiddleware implements ContentProcessorMiddleware { // Known tracking/analytics domains and patterns to filter out private readonly trackingPatterns = [ "adroll.com", "doubleclick.net", "google-analytics.com", "googletagmanager.com", "analytics.twitter.com", "twitter.com/1/i/adsct", "t.co/1/i/adsct", "bat.bing.com", "pixel.rubiconproject.com", "casalemedia.com", "tremorhub.com", "rlcdn.com", "facebook.com/tr", "linkedin.com/px", "quantserve.com", "scorecardresearch.com", "hotjar.com", "mouseflow.com", "crazyegg.com", "clarity.ms", ]; async process(context: MiddlewareContext, next: () => Promise<void>): Promise<void> { if (!context.dom) { logger.debug( `Skipping HTML normalization for ${context.source} - no DOM available`, ); await next(); return; } try { logger.debug(`Normalizing HTML URLs and links for ${context.source}`); const $ = context.dom; const baseUrl = context.source; // Normalize image URLs this.normalizeImageUrls($, baseUrl); // Normalize and clean links this.normalizeLinks($, baseUrl); logger.debug(`Successfully normalized HTML content for ${context.source}`); } catch (error) { logger.error(`❌ Failed to normalize HTML for ${context.source}: ${error}`); context.errors.push( error instanceof Error ? error : new Error(`HTML normalization failed: ${String(error)}`), ); } await next(); } /** * Checks if an image should be kept based on its source URL. * Filters out tracking pixels and analytics beacons. */ private shouldKeepImage(src: string): boolean { const srcLower = src.toLowerCase(); return !this.trackingPatterns.some((pattern) => srcLower.includes(pattern)); } /** * Normalizes image URLs by converting relative URLs to absolute URLs. * Removes tracking/analytics images. * Preserves data URIs (inline images). */ private normalizeImageUrls($: cheerio.CheerioAPI, baseUrl: string): void { $("img").each((_index, element) => { const $img = $(element); const src = $img.attr("src"); if (!src) { // Remove images without src $img.remove(); return; } // Keep data URIs (inline images) as-is if (src.startsWith("data:")) { return; } // Check if this is a tracking image if (!this.shouldKeepImage(src)) { $img.remove(); return; } try { // If it's already an absolute URL, leave it unchanged new URL(src); } catch { // It's a relative URL, convert to absolute try { const absoluteUrl = new URL(src, baseUrl).href; $img.attr("src", absoluteUrl); } catch (error) { logger.debug(`Failed to resolve relative image URL: ${src} - ${error}`); // Remove images we can't resolve $img.remove(); } } }); } /** * Normalizes links by: * - Converting relative URLs to absolute URLs * - Unwrapping anchor links (preserving text content) * - Unwrapping non-HTTP links (preserving text content) */ private normalizeLinks($: cheerio.CheerioAPI, baseUrl: string): void { $("a").each((_index, element) => { const $link = $(element); const href = $link.attr("href"); if (!href) { // Links without href - unwrap them (preserve content, remove tag) this.unwrapElement($, $link); return; } // Handle anchor links (starting with #) if (href.startsWith("#")) { this.unwrapElement($, $link); return; } // Check if it's already an absolute URL try { const url = new URL(href); // Handle non-HTTP protocols (javascript:, mailto:, etc.) if (url.protocol !== "http:" && url.protocol !== "https:") { this.unwrapElement($, $link); return; } // It's already a valid HTTP/HTTPS absolute URL, leave it unchanged } catch { // It's a relative URL, convert to absolute try { const absoluteUrl = new URL(href, baseUrl).href; $link.attr("href", absoluteUrl); } catch (error) { logger.debug(`Failed to resolve relative link URL: ${href} - ${error}`); // If we can't resolve it, unwrap it to preserve the text content this.unwrapElement($, $link); } } }); } /** * Unwraps an element by replacing it with its HTML content. * This preserves the inner HTML (including nested elements) while removing the wrapping tag. */ private unwrapElement( _$: cheerio.CheerioAPI, $element: cheerio.Cheerio<AnyNode>, ): void { const htmlContent = $element.html() || $element.text(); $element.replaceWith(htmlContent); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

HtmlNormalizationMiddleware.ts•5.68 KiB