
docs-mcp-server

HtmlPipeline.test.ts
// Copyright (c) 2025

import { beforeEach, describe, expect, it, vi } from "vitest";
import type { RawContent } from "../fetcher/types";
import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware";
import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware";
import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware";
import { HtmlSanitizerMiddleware } from "../middleware/HtmlSanitizerMiddleware";
import { HtmlToMarkdownMiddleware } from "../middleware/HtmlToMarkdownMiddleware";
import { ScrapeMode, type ScraperOptions } from "../types";
import { HtmlPipeline } from "./HtmlPipeline";

describe("HtmlPipeline", () => {
  beforeEach(() => {
    // Set up spies without mock implementations to use real middleware
    vi.spyOn(HtmlCheerioParserMiddleware.prototype, "process");
    vi.spyOn(HtmlMetadataExtractorMiddleware.prototype, "process");
    vi.spyOn(HtmlLinkExtractorMiddleware.prototype, "process");
    vi.spyOn(HtmlSanitizerMiddleware.prototype, "process");
    vi.spyOn(HtmlToMarkdownMiddleware.prototype, "process");
  });

  it("canProcess returns true for text/html", () => {
    const pipeline = new HtmlPipeline();
    expect(pipeline.canProcess({ mimeType: "text/html" } as RawContent)).toBe(true);
    expect(pipeline.canProcess({ mimeType: "application/xhtml+xml" } as RawContent)).toBe(
      true,
    );
  });

  it("canProcess returns false for non-html", () => {
    const pipeline = new HtmlPipeline();
    expect(pipeline.canProcess({ mimeType: "text/markdown" } as RawContent)).toBe(false);
    // @ts-expect-error
    expect(pipeline.canProcess({ mimeType: undefined } as RawContent)).toBe(false);
  });

  it("process decodes Buffer content with UTF-8 charset", async () => {
    const pipeline = new HtmlPipeline();
    const raw: RawContent = {
      content: Buffer.from("<html><body>abc</body></html>", "utf-8"),
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    // Check that we got some markdown content (exact format depends on the actual middleware)
    expect(result.textContent).toBeTruthy();
    expect(result.textContent).toContain("abc");
  });

  it("process decodes Buffer content with ISO-8859-1 charset", async () => {
    // Create a spy to capture the content before it's processed
    let capturedContent = "";
    const originalProcess = HtmlCheerioParserMiddleware.prototype.process;
    vi.spyOn(HtmlCheerioParserMiddleware.prototype, "process").mockImplementationOnce(
      async function (this: HtmlCheerioParserMiddleware, ctx, next) {
        capturedContent = ctx.content;
        // Call the original implementation after capturing
        return originalProcess.call(this, ctx, next);
      },
    );

    const pipeline = new HtmlPipeline();
    // Create a buffer with ISO-8859-1 encoding (Latin-1)
    // This contains characters that would be encoded differently in UTF-8
    const raw: RawContent = {
      content: Buffer.from("<html><body>Café</body></html>", "latin1"),
      mimeType: "text/html",
      charset: "iso-8859-1", // Explicitly set charset to ISO-8859-1
      source: "http://test",
    };

    const result = await pipeline.process(raw, {} as ScraperOptions);

    // Verify the content was properly decoded
    expect(capturedContent).toBe("<html><body>Café</body></html>");

    // Check that we got some markdown content (exact format depends on the actual middleware)
    expect(result.textContent).toBeTruthy();
    expect(result.textContent).toContain("Café");
  });

  it("process defaults to UTF-8 when charset is not specified", async () => {
    const pipeline = new HtmlPipeline();
    const raw: RawContent = {
      content: Buffer.from("<html><body>abc</body></html>", "utf-8"),
      mimeType: "text/html",
      // No charset specified
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    // Check that we got some markdown content (exact format depends on the actual middleware)
    expect(result.textContent).toBeTruthy();
    expect(result.textContent).toContain("abc");
  });

  it("process uses string content directly", async () => {
    const pipeline = new HtmlPipeline();
    const raw: RawContent = {
      content: "<html><body>abc</body></html>",
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    // Check that we got some markdown content (exact format depends on the actual middleware)
    expect(result.textContent).toBeTruthy();
    expect(result.textContent).toContain("abc");
  });

  it("process decodes Buffer content with UTF-16LE BOM", async () => {
    const pipeline = new HtmlPipeline();
    // UTF-16LE BOM: 0xFF 0xFE, then 'abc' as UTF-16LE
    const buf = Buffer.from([0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00]);
    const raw: RawContent = {
      content: buf,
      mimeType: "text/html",
      charset: "utf-16le",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    expect(result.textContent).toContain("abc");
  });

  it("process decodes Buffer content with UTF-8 BOM", async () => {
    const pipeline = new HtmlPipeline();
    // UTF-8 BOM: 0xEF 0xBB 0xBF, then 'abc'
    const buf = Buffer.from([0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63]);
    const raw: RawContent = {
      content: buf,
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    expect(result.textContent).toContain("abc");
  });

  it("process decodes Buffer content with Japanese UTF-8 text", async () => {
    const pipeline = new HtmlPipeline();
    const japanese = "<html><body>こんにちは世界</body></html>"; // "Hello, world" in Japanese
    const raw: RawContent = {
      content: Buffer.from(japanese, "utf-8"),
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    expect(result.textContent).toContain("こんにちは世界");
  });

  it("process decodes Buffer content with Russian UTF-8 text", async () => {
    const pipeline = new HtmlPipeline();
    const russian = "<html><body>Привет, мир</body></html>"; // "Hello, world" in Russian
    const raw: RawContent = {
      content: Buffer.from(russian, "utf-8"),
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    expect(result.textContent).toContain("Привет, мир");
  });

  it("process calls middleware in order and aggregates results", async () => {
    const pipeline = new HtmlPipeline();
    const html = `
      <html>
        <head>
          <title>Test Title</title>
        </head>
        <body>
          <p>This is a <a href="https://test.link/">test link</a>.</p>
        </body>
      </html>
    `;
    const raw: RawContent = {
      content: html,
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };

    const result = await pipeline.process(raw, {} as ScraperOptions);

    // Verify all middleware was called
    expect(HtmlCheerioParserMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlMetadataExtractorMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlLinkExtractorMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlSanitizerMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlToMarkdownMiddleware.prototype.process).toHaveBeenCalledTimes(1);

    // Verify the result contains expected data from the actual middleware
    expect(result.metadata.title).toBe("Test Title");
    expect(result.links).toContain("https://test.link/");
    expect(result.textContent).toBeTruthy();
    expect(result.textContent).toEqual("This is a [test link](https://test.link/).");
  });

  it("process collects errors from middleware", async () => {
    // Override with error-generating implementation just for this test
    vi.spyOn(HtmlMetadataExtractorMiddleware.prototype, "process").mockImplementationOnce(
      async (ctx, next) => {
        ctx.errors.push(new Error("fail"));
        await next();
      },
    );

    const pipeline = new HtmlPipeline();
    const raw: RawContent = {
      content: "<html><body>abc</body></html>",
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test",
    };
    const result = await pipeline.process(raw, {} as ScraperOptions);
    expect(result.errors.some((e) => e.message === "fail")).toBe(true);
  });

  it("should correctly process HTML through the full standard middleware stack (E2E with spies)", async () => {
    // Reset call counts for all spies
    vi.clearAllMocks();

    const pipeline = new HtmlPipeline();

    // Sample HTML with elements for each middleware to process
    const html = `
      <html>
        <head>
          <title>Test Page</title>
          <meta name="description" content="A test page for E2E testing">
        </head>
        <body>
          <h1>Hello World</h1>
          <p>This is a <a href="https://example.com/test/link">test link</a>.</p>
          <script>alert('This should be sanitized');</script>
          <img src="image.jpg" onerror="alert('This attribute should be sanitized');">
        </body>
      </html>
    `;

    const raw: RawContent = {
      content: html,
      mimeType: "text/html",
      charset: "utf-8",
      source: "http://test.example.com",
    };

    const result = await pipeline.process(raw, {
      url: "http://example.com",
      library: "example",
      version: "",
      scrapeMode: ScrapeMode.Fetch,
    });

    // Verify all middleware was called
    expect(HtmlCheerioParserMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlMetadataExtractorMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlLinkExtractorMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlSanitizerMiddleware.prototype.process).toHaveBeenCalledTimes(1);
    expect(HtmlToMarkdownMiddleware.prototype.process).toHaveBeenCalledTimes(1);

    // Verify the result contains expected data
    // The exact values will depend on the actual middleware implementations
    expect(result.metadata.title).toBe("Test Page");
    expect(result.links).toContain("https://example.com/test/link");

    // Verify the content was sanitized (no script tags) and converted to markdown
    expect(result.textContent).not.toContain("alert");
    expect(result.textContent).toContain("Hello World");
    expect(result.textContent).toContain("test link");

    // Verify no errors occurred
    expect(result.errors).toHaveLength(0);
  });

  describe("cleanup", () => {
    it("should call closeBrowser on Playwright middleware when close() is called", async () => {
      const pipeline = new HtmlPipeline();

      // Spy on the closeBrowser method
      const closeBrowserSpy = vi.spyOn(
        (pipeline as any).playwrightMiddleware,
        "closeBrowser",
      );

      await pipeline.close();

      expect(closeBrowserSpy).toHaveBeenCalledOnce();
    });

    it("should be idempotent - multiple close() calls should not error", async () => {
      const pipeline = new HtmlPipeline();

      // Multiple calls should not throw
      await expect(pipeline.close()).resolves.not.toThrow();
      await expect(pipeline.close()).resolves.not.toThrow();
      await expect(pipeline.close()).resolves.not.toThrow();
    });

    it("should call close() even if closeBrowser throws an error", async () => {
      const pipeline = new HtmlPipeline();

      // Mock closeBrowser to throw an error
      vi.spyOn((pipeline as any).playwrightMiddleware, "closeBrowser").mockRejectedValue(
        new Error("Browser cleanup failed"),
      );

      // close() should still complete (error should be handled internally or thrown)
      await expect(pipeline.close()).rejects.toThrow("Browser cleanup failed");
    });
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'
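The same request can be made programmatically. Below is a minimal TypeScript sketch using Node 18+'s built-in fetch; the response is treated as untyped JSON because the exact schema is not documented here, and the endpoint URL is the one shown in the curl example above.

// Minimal sketch: query the Glama MCP directory API for this server.
// Assumes Node 18+ (global fetch); the response shape is not typed here.
async function getServerInfo(): Promise<unknown> {
  const res = await fetch(
    "https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server",
  );
  if (!res.ok) {
    throw new Error(`Request failed: ${res.status} ${res.statusText}`);
  }
  return res.json();
}

getServerInfo()
  .then((info) => console.log(info))
  .catch((err) => console.error(err));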

If you have feedback or need assistance with the MCP directory API, please join our Discord server.