Skip to main content
Glama

docs-mcp-server

HtmlPipeline.ts4.41 kB
import { GreedySplitter, SemanticMarkdownSplitter } from "../../splitter"; import { SPLITTER_MAX_CHUNK_SIZE, SPLITTER_MIN_CHUNK_SIZE, SPLITTER_PREFERRED_CHUNK_SIZE, } from "../../utils/config"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import type { ContentFetcher, RawContent } from "../fetcher/types"; import { HtmlSanitizerMiddleware } from "../middleware"; import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware"; import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware"; import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware"; import { HtmlNormalizationMiddleware } from "../middleware/HtmlNormalizationMiddleware"; import { HtmlPlaywrightMiddleware } from "../middleware/HtmlPlaywrightMiddleware"; import { HtmlToMarkdownMiddleware } from "../middleware/HtmlToMarkdownMiddleware"; import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { resolveCharset } from "../utils/charset"; import { BasePipeline } from "./BasePipeline"; import type { ProcessedContent } from "./types"; /** * Pipeline for processing HTML content using middleware and semantic splitting with size optimization. * Converts HTML to clean markdown format then uses SemanticMarkdownSplitter for semantic chunking, * followed by GreedySplitter for universal size optimization. */ export class HtmlPipeline extends BasePipeline { private readonly playwrightMiddleware: HtmlPlaywrightMiddleware; private readonly standardMiddleware: ContentProcessorMiddleware[]; private readonly greedySplitter: GreedySplitter; constructor( preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE, ) { super(); this.playwrightMiddleware = new HtmlPlaywrightMiddleware(); this.standardMiddleware = [ new HtmlCheerioParserMiddleware(), new HtmlMetadataExtractorMiddleware(), new HtmlLinkExtractorMiddleware(), new HtmlSanitizerMiddleware(), new HtmlNormalizationMiddleware(), new HtmlToMarkdownMiddleware(), ]; // Create the two-phase splitting: semantic + size optimization const semanticSplitter = new SemanticMarkdownSplitter( preferredChunkSize, maxChunkSize, ); this.greedySplitter = new GreedySplitter( semanticSplitter, SPLITTER_MIN_CHUNK_SIZE, preferredChunkSize, ); } canProcess(rawContent: RawContent): boolean { return MimeTypeUtils.isHtml(rawContent.mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, ): Promise<ProcessedContent> { // Use enhanced charset detection that considers HTML meta tags const resolvedCharset = resolveCharset( rawContent.charset, rawContent.content, rawContent.mimeType, ); const contentString = convertToString(rawContent.content, resolvedCharset); const context: MiddlewareContext = { content: contentString, source: rawContent.source, metadata: {}, links: [], errors: [], options, fetcher, }; // Build middleware stack dynamically based on scrapeMode let middleware: ContentProcessorMiddleware[] = [...this.standardMiddleware]; if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") { middleware = [this.playwrightMiddleware, ...middleware]; } // Execute the middleware stack using the base class method await this.executeMiddlewareStack(middleware, context); // Split the content using SemanticMarkdownSplitter (HTML is converted to markdown by middleware) const chunks = await this.greedySplitter.splitText( typeof context.content === "string" ? context.content : "", ); return { textContent: typeof context.content === "string" ? context.content : "", metadata: context.metadata, links: context.links, errors: context.errors, chunks, }; } /** * Cleanup resources used by this pipeline, specifically the Playwright browser instance. */ public async close(): Promise<void> { await super.close(); // Call base class close (no-op by default) await this.playwrightMiddleware.closeBrowser(); } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server