docs-mcp-server

WebScraperStrategy.test.ts
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { Document } from "../../types";
import type { ScraperOptions } from "../types";
import { ScrapeMode } from "../types"; // Import ScrapeMode
import { WebScraperStrategy } from "./WebScraperStrategy";

// Mock dependencies
vi.mock("../../utils/logger");

// Mock HttpFetcher module with a factory
vi.mock("../fetcher/HttpFetcher", async (importActual) => {
  return {
    ...(await importActual()),
  };
});

// Import the mocked HttpFetcher AFTER vi.mock
import { HttpFetcher } from "../fetcher/HttpFetcher";

// Hold the mock function reference outside the factory scope
const mockFetchFn = vi.spyOn(HttpFetcher.prototype, "fetch");

describe("WebScraperStrategy", () => {
  let strategy: WebScraperStrategy;
  let options: ScraperOptions;

  beforeEach(() => {
    vi.resetAllMocks(); // Resets calls and implementations on ALL mocks

    // Set default mock behavior for the fetch function for the suite
    mockFetchFn.mockResolvedValue({
      content: "<html><body><h1>Default Mock Content</h1></body></html>",
      mimeType: "text/html",
      source: "https://example.com", // Default source
    });

    // Create a fresh instance of the strategy for each test
    // It will receive the mocked HttpFetcher via dependency injection (if applicable)
    // or internal instantiation (which will use the mocked module)
    strategy = new WebScraperStrategy();

    // Setup default options for tests
    options = {
      url: "https://example.com",
      library: "test",
      version: "1.0",
      maxPages: 99,
      maxDepth: 3,
      scope: "subpages",
      // Ensure followRedirects has a default for tests if needed by fetch mock checks
      followRedirects: true,
      scrapeMode: ScrapeMode.Fetch, // Use enum member
    };

    // No need to mock prototype anymore
    // No need to mock pipeline directly
  });

  // No need for afterEach vi.restoreAllMocks() as resetAllMocks() is in beforeEach

  it("should only accept http/https URLs", () => {
    expect(strategy.canHandle("https://example.com")).toBe(true);
    expect(strategy.canHandle("http://example.com")).toBe(true);
    expect(strategy.canHandle("file:///path/to/file.txt")).toBe(false);
    expect(strategy.canHandle("invalid://example.com")).toBe(false);
    expect(strategy.canHandle("any_string")).toBe(false);
  }, 10000);

  it("should use HttpFetcher to fetch content and process result", async () => {
    const progressCallback = vi.fn();
    const testUrl = "https://example.com";
    options.url = testUrl; // Ensure options match

    // Configure mock response for this specific test
    const expectedTitle = "Test Page Title";
    mockFetchFn.mockResolvedValue({
      content: `<html><head><title>${expectedTitle}</title></head><body><h1>Fetched Content</h1></body></html>`,
      mimeType: "text/html",
      source: testUrl,
    });

    await strategy.scrape(options, progressCallback);

    // Verify HttpFetcher mock was called
    expect(mockFetchFn).toHaveBeenCalledWith(testUrl, {
      signal: undefined, // scrape doesn't pass signal in this basic call
      followRedirects: options.followRedirects, // Check default from options
    });

    // Verify that the pipeline processed and called the callback with a document
    expect(progressCallback).toHaveBeenCalled();
    const documentProcessingCall = progressCallback.mock.calls.find(
      (call) => call[0].document,
    );
    expect(documentProcessingCall).toBeDefined();
    // Use non-null assertion operator (!) since we've asserted it's defined
    expect(documentProcessingCall![0].document.content).toBe("# Fetched Content"); // Check processed markdown (from H1)
    expect(documentProcessingCall![0].document.metadata.title).toBe(expectedTitle); // Check extracted title (from <title>)
  }, 10000);

  it("should respect the followRedirects option", async () => {
    options.followRedirects = false;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify followRedirects option was passed to the fetcher mock
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", {
      signal: undefined,
      followRedirects: false, // Explicitly false from options
    });

    // Also check that processing still happened
    expect(progressCallback).toHaveBeenCalled();
    const documentProcessingCall = progressCallback.mock.calls.find(
      (call) => call[0].document,
    );
    expect(documentProcessingCall).toBeDefined();
  }, 10000);

  // --- Scope Tests ---
  // These tests now rely on the actual pipeline running,
  // verifying behavior by checking mockFetchFn calls and progressCallback results.

  it("should follow links based on scope=subpages", async () => {
    const baseHtml = `
      <html><head><title>Test Site</title></head><body>
        <h1>Test Page</h1>
        <a href="https://example.com/subpage1">Subpage 1</a>
        <a href="https://example.com/subpage2/">Subpage 2</a>
        <a href="https://otherdomain.com/page">External Link</a>
        <a href="https://api.example.com/endpoint">Different Subdomain</a>
        <a href="/relative-path">Relative Path</a>
      </body></html>`;

    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com")
        return { content: baseHtml, mimeType: "text/html", source: url };
      // Return simple content for subpages, title reflects URL
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.scope = "subpages";
    options.maxDepth = 1; // Limit depth for simplicity
    options.maxPages = 5; // Allow enough pages
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify fetcher calls
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/subpage1", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/subpage2/", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/relative-path", expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://otherdomain.com/page", expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://api.example.com/endpoint", expect.anything());

    // Verify documents via callback
    const receivedDocs = progressCallback.mock.calls
      .map((call) => call[0].document)
      .filter((doc): doc is Document => doc !== undefined); // Type guard
    expect(receivedDocs).toHaveLength(4);
    expect(receivedDocs.some((doc) => doc.metadata.title === "Test Site")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage1")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage2/")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://example.com/relative-path")).toBe(true);
  }, 10000);

  it("should follow links based on scope=hostname", async () => {
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com") {
        return {
          content:
            '<html><head><title>Base</title></head><body><a href="/subpage">Sub</a><a href="https://api.example.com/ep">API</a><a href="https://other.com">Other</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.scope = "hostname";
    options.maxDepth = 1;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify fetcher calls
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/subpage", expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://api.example.com/ep", expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything());

    // Verify documents via callback
    const receivedDocs = progressCallback.mock.calls
      .map((call) => call[0].document)
      .filter((doc): doc is Document => doc !== undefined);
    expect(receivedDocs).toHaveLength(2);
    expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage")).toBe(true);
  }, 10000);

  it("should follow links based on scope=domain", async () => {
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com") {
        return {
          content:
            '<html><head><title>Base</title></head><body><a href="/subpage">Sub</a><a href="https://api.example.com/ep">API</a><a href="https://other.com">Other</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.scope = "domain";
    options.maxDepth = 1;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify fetcher calls
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/subpage", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://api.example.com/ep", expect.anything()); // Same domain
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything());

    // Verify documents via callback
    const receivedDocs = progressCallback.mock.calls
      .map((call) => call[0].document)
      .filter((doc): doc is Document => doc !== undefined);
    expect(receivedDocs).toHaveLength(3);
    expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "https://api.example.com/ep")).toBe(true);
  }, 10000);

  // --- Limit Tests ---

  it("should respect maxDepth option", async () => {
    // Configure mock fetcher for depth testing
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com") {
        // Depth 0
        return {
          content:
            '<html><head><title>L0</title></head><body><a href="/level1">L1</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      if (url === "https://example.com/level1") {
        // Depth 1
        return {
          content:
            '<html><head><title>L1</title></head><body><a href="/level2">L2</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      if (url === "https://example.com/level2") {
        // Depth 2
        return {
          content:
            '<html><head><title>L2</title></head><body><a href="/level3">L3</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      // Default for unexpected calls
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.maxDepth = 1; // Limit depth
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify fetcher calls
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/level1", expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith("https://example.com/level2", expect.anything()); // Exceeds depth

    // Verify documents via callback
    const receivedDocs = progressCallback.mock.calls
      .map((call) => call[0].document)
      .filter((doc): doc is Document => doc !== undefined);
    expect(receivedDocs).toHaveLength(2); // Base (L0) + L1
    expect(receivedDocs.some((doc) => doc.metadata.title === "L0")).toBe(true);
    expect(receivedDocs.some((doc) => doc.metadata.title === "L1")).toBe(true);
  }, 10000);

  it("should respect maxPages option", async () => {
    // Configure mock fetcher
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com") {
        return {
          content:
            '<html><head><title>Base</title></head><body><a href="/page1">1</a><a href="/page2">2</a><a href="/page3">3</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.maxPages = 2; // Limit pages
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Verify fetcher calls (should be exactly maxPages)
    expect(mockFetchFn).toHaveBeenCalledTimes(2);
    expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());

    // Check which subpage was called (only one should be)
    const page1Called = mockFetchFn.mock.calls.some(
      (call) => call[0] === "https://example.com/page1",
    );
    const page2Called = mockFetchFn.mock.calls.some(
      (call) => call[0] === "https://example.com/page2",
    );
    const page3Called = mockFetchFn.mock.calls.some(
      (call) => call[0] === "https://example.com/page3",
    );
    const subpagesFetchedCount = [page1Called, page2Called, page3Called].filter(Boolean).length;
    expect(subpagesFetchedCount).toBe(1); // Exactly one subpage fetched

    // Verify documents via callback
    const receivedDocs = progressCallback.mock.calls
      .map((call) => call[0].document)
      .filter((doc): doc is Document => doc !== undefined);
    expect(receivedDocs).toHaveLength(2); // Base + 1 subpage
  }, 10000);

  // --- Progress Test ---

  it("should report progress via callback", async () => {
    // Configure mock fetcher
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === "https://example.com") {
        return {
          content:
            '<html><head><title>Base</title></head><body><a href="/page1">1</a><a href="/page2">2</a></body></html>',
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    const progressCallback = vi.fn();
    options.maxPages = 3; // Allow all pages
    options.maxDepth = 1;

    await strategy.scrape(options, progressCallback);

    // Verify callback calls
    const callsWithDocs = progressCallback.mock.calls.filter((call) => call[0].document);
    expect(callsWithDocs).toHaveLength(3); // Base + page1 + page2

    // Check structure of a progress call with a document
    expect(callsWithDocs[0][0]).toMatchObject({
      pagesScraped: expect.any(Number),
      totalPages: expect.any(Number),
      currentUrl: expect.any(String),
      depth: expect.any(Number),
      maxDepth: options.maxDepth,
      document: expect.objectContaining({
        content: expect.any(String),
        metadata: expect.objectContaining({
          url: expect.any(String),
          title: expect.any(String), // Title comes from pipeline now
          library: options.library,
          version: options.version,
        }),
      }),
    });

    // Check specific URLs reported
    const reportedUrls = callsWithDocs.map((call) => call[0].document.metadata.url);
    expect(reportedUrls).toEqual(
      expect.arrayContaining([
        "https://example.com",
        "https://example.com/page1",
        "https://example.com/page2",
      ]),
    );
  }, 10000);

  it("should support scraping for URLs with embedded credentials (user:password@host)", async () => {
    // Test that the strategy can handle URLs with embedded credentials
    // Note: Actual credential extraction and browser auth is tested in HtmlPlaywrightMiddleware.test.ts
    // This test focuses on the strategy's ability to process such URLs through the pipeline
    const urlWithCreds = "https://user:password@example.com/";
    options.url = urlWithCreds;
    options.scrapeMode = ScrapeMode.Fetch; // Use fetch mode to avoid Playwright browser operations

    const expectedMarkdown = "# Processed Content";
    const expectedTitle = "Test Page";

    // Mock fetch to simulate content processing
    // We'll mock the fetch to simulate processed output
    mockFetchFn.mockResolvedValue({
      content: `<html><head><title>${expectedTitle}</title></head><body><h1>Processed Content</h1></body></html>`,
      mimeType: "text/html",
      source: urlWithCreds,
    });

    const progressCallback = vi.fn();
    await strategy.scrape(options, progressCallback);

    // Ensure fetch was called with the credentialed URL
    expect(mockFetchFn).toHaveBeenCalledWith(
      urlWithCreds,
      expect.objectContaining({ followRedirects: true }),
    );

    // Ensure a document was produced with the expected markdown and title
    const docCall = progressCallback.mock.calls.find((call) => call[0].document);
    expect(docCall).toBeDefined();
    expect(docCall![0].document.content).toContain(expectedMarkdown);
    expect(docCall![0].document.metadata.title).toBe(expectedTitle);
  }, 10000); // Keep timeout for consistency but test should run quickly with fetch mode

  it("should forward custom headers to HttpFetcher", async () => {
    const progressCallback = vi.fn();
    const testUrl = "https://example.com";
    options.url = testUrl;
    options.headers = {
      Authorization: "Bearer test-token",
      "X-Test-Header": "test-value",
    };

    mockFetchFn.mockResolvedValue({
      content: "<html><body>Header Test</body></html>",
      mimeType: "text/html",
      source: testUrl,
    });

    await strategy.scrape(options, progressCallback);

    expect(mockFetchFn).toHaveBeenCalledWith(
      testUrl,
      expect.objectContaining({
        headers: {
          Authorization: "Bearer test-token",
          "X-Test-Header": "test-value",
        },
      }),
    );
  });

  describe("pipeline selection", () => {
    it("should process HTML content through HtmlPipeline", async () => {
      const progressCallback = vi.fn();
      const testUrl = "https://example.com";
      options.url = testUrl;

      mockFetchFn.mockResolvedValue({
        content:
          "<html><head><title>HTML Test</title></head><body><h1>HTML Content</h1></body></html>",
        mimeType: "text/html",
        source: testUrl,
      });

      await strategy.scrape(options, progressCallback);

      // Verify HTML content was processed (converted to markdown)
      const docCall = progressCallback.mock.calls.find((call) => call[0].document);
      expect(docCall).toBeDefined();
      expect(docCall![0].document.content).toContain("# HTML Content");
      expect(docCall![0].document.metadata.title).toBe("HTML Test");
    });

    it("should process markdown content through MarkdownPipeline", async () => {
      const progressCallback = vi.fn();
      const testUrl = "https://example.com/readme.md";
      options.url = testUrl;

      const markdownContent = "# Markdown Title\n\nThis is already markdown content.";
      mockFetchFn.mockResolvedValue({
        content: markdownContent,
        mimeType: "text/markdown",
        source: testUrl,
      });

      await strategy.scrape(options, progressCallback);

      // Verify markdown content was processed
      const docCall = progressCallback.mock.calls.find((call) => call[0].document);
      expect(docCall).toBeDefined();
      expect(docCall![0].document.content).toContain("# Markdown Title");
      expect(docCall![0].document.content).toContain("This is already markdown content.");
    });

    it("should skip unsupported content types", async () => {
      const progressCallback = vi.fn();
      const testUrl = "https://example.com/image.png";
      options.url = testUrl;

      mockFetchFn.mockResolvedValue({
        content: Buffer.from([0x89, 0x50, 0x4e, 0x47]), // PNG header
        mimeType: "image/png",
        source: testUrl,
      });

      await strategy.scrape(options, progressCallback);

      // Verify no document was produced for unsupported content
      const docCall = progressCallback.mock.calls.find((call) => call[0].document);
      expect(docCall).toBeUndefined();
    });
  });

  describe("error handling", () => {
    it("should handle fetch failures gracefully", async () => {
      const progressCallback = vi.fn();
      const testUrl = "https://example.com/error";
      options.url = testUrl;

      mockFetchFn.mockRejectedValue(new Error("Network error"));

      // Should throw the error (not swallow it)
      await expect(strategy.scrape(options, progressCallback)).rejects.toThrow("Network error");

      // Verify no documents were processed
      const docCalls = progressCallback.mock.calls.filter((call) => call[0].document);
      expect(docCalls).toHaveLength(0);
    });

    it("should handle empty content gracefully", async () => {
      const progressCallback = vi.fn();
      const testUrl = "https://example.com/empty";
      options.url = testUrl;

      mockFetchFn.mockResolvedValue({
        content: "<html><body></body></html>", // Empty content
        mimeType: "text/html",
        source: testUrl,
      });

      await strategy.scrape(options, progressCallback);

      // Should complete without error but may not produce useful content
      // The behavior here depends on the pipeline implementation
      expect(mockFetchFn).toHaveBeenCalledWith(testUrl, expect.anything());
    });
  });

  describe("custom link filtering", () => {
    it("should use custom shouldFollowLink function when provided", async () => {
      const customFilter = vi.fn().mockImplementation((_baseUrl: URL, targetUrl: URL) => {
        // Only follow links containing 'allowed'
        return targetUrl.pathname.includes("allowed");
      });

      const customStrategy = new WebScraperStrategy({
        shouldFollowLink: customFilter,
      });

      mockFetchFn.mockImplementation(async (url: string) => {
        if (url === "https://example.com") {
          return {
            content: `
              <html><head><title>Base</title></head><body>
                <a href="/allowed-page">Allowed Page</a>
                <a href="/blocked-page">Blocked Page</a>
                <a href="/also-allowed">Also Allowed</a>
              </body></html>`,
            mimeType: "text/html",
            source: url,
          };
        }
        return {
          content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
          mimeType: "text/html",
          source: url,
        };
      });

      options.maxDepth = 1;
      const progressCallback = vi.fn();

      await customStrategy.scrape(options, progressCallback);

      // Verify custom filter was called
      expect(customFilter).toHaveBeenCalled();

      // Verify only allowed pages were fetched
      expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything());
      expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/allowed-page", expect.anything());
      expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/also-allowed", expect.anything());
      expect(mockFetchFn).not.toHaveBeenCalledWith("https://example.com/blocked-page", expect.anything());

      // Verify documents were produced for allowed pages
      const receivedDocs = progressCallback.mock.calls
        .map((call) => call[0].document)
        .filter((doc): doc is Document => doc !== undefined);
      expect(receivedDocs).toHaveLength(3); // Base + 2 allowed pages
    });
  });

  // Canonical redirect test: relative links resolve against canonical final URL (directory form)
  it("should resolve relative links against canonical final URL with trailing slash + query", async () => {
    const original = "https://learn.microsoft.com/en-us/azure/bot-service";
    const canonical = `${original}/?view=azure-bot-service-4.0`; // What the server redirects to
    const relHref = "bot-overview?view=azure-bot-service-4.0";
    const expectedCanonicalFollow =
      "https://learn.microsoft.com/en-us/azure/bot-service/bot-overview?view=azure-bot-service-4.0";

    // Mock fetch: initial fetch returns HTML with relative link and final canonical source (post-redirect)
    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === original) {
        return {
          content: `<html><body><a href="${relHref}">Link</a></body></html>`,
          mimeType: "text/html",
          source: canonical, // Final URL after redirect
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.url = original;
    options.maxDepth = 1;
    options.maxPages = 5;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    expect(mockFetchFn).toHaveBeenCalledWith(original, expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith(expectedCanonicalFollow, expect.anything());
  });

  // Integration: relative resolution from index.html with subpages scope
  it("should follow nested descendant from index.html (subpages scope) but not upward sibling", async () => {
    const start = "https://example.com/api/index.html";
    const nestedRel = "aiq/agent/index.html";
    const upwardRel = "../shared/index.html";
    const expectedNested = "https://example.com/api/aiq/agent/index.html";
    const expectedUpward = "https://example.com/shared/index.html";

    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === start) {
        return {
          content: `<html><body>
            <a href="${nestedRel}">Nested</a>
            <a href="${upwardRel}">UpOne</a>
          </body></html>`,
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.url = start;
    options.scope = "subpages";
    options.maxDepth = 1;
    options.maxPages = 5;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith(expectedNested, expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith(expectedUpward, expect.anything());
  });

  // Integration: upward relative allowed with hostname scope
  it("should follow upward relative link when scope=hostname", async () => {
    const start = "https://example.com/api/index.html";
    const nestedRel = "aiq/agent/index.html";
    const upwardRel = "../shared/index.html";
    const expectedNested = "https://example.com/api/aiq/agent/index.html";
    const expectedUpward = "https://example.com/shared/index.html";

    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === start) {
        return {
          content: `<html><body>
            <a href="${nestedRel}">Nested</a>
            <a href="${upwardRel}">UpOne</a>
          </body></html>`,
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.url = start;
    options.scope = "hostname";
    options.maxDepth = 1;
    options.maxPages = 10;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith(expectedNested, expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith(expectedUpward, expect.anything());
  });

  // Integration: directory base parity
  it("should treat directory base and index.html base equivalently for nested descendant", async () => {
    const startDir = "https://example.com/api/";
    const nestedRel = "aiq/agent/index.html";
    const expectedNested = "https://example.com/api/aiq/agent/index.html";

    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === startDir) {
        return {
          content: `<html><body><a href="${nestedRel}">Nested</a></body></html>`,
          mimeType: "text/html",
          source: url,
        };
      }
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.url = startDir;
    options.scope = "subpages";
    options.maxDepth = 1;
    options.maxPages = 5;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    expect(mockFetchFn).toHaveBeenCalledWith(startDir, expect.anything());
    expect(mockFetchFn).toHaveBeenCalledWith(expectedNested, expect.anything());
  });

  it("should not enqueue cross-origin links introduced via <base href> when scope=subpages", async () => {
    const start = "https://example.com/app/index.html";
    const cdnBase = "https://cdn.example.com/lib/";
    const relLink = "script.js";
    const resolved = `${cdnBase}${relLink}`;

    mockFetchFn.mockImplementation(async (url: string) => {
      if (url === start) {
        return {
          content: `<html><head><base href="${cdnBase}"></head><body><a href="${relLink}">Script</a></body></html>`,
          mimeType: "text/html",
          source: url,
        };
      }
      // Any unexpected fetches return generic content
      return {
        content: `<html><head><title>${url}</title></head><body>${url}</body></html>`,
        mimeType: "text/html",
        source: url,
      };
    });

    options.url = start;
    options.scope = "subpages";
    options.maxDepth = 1;
    options.maxPages = 5;
    const progressCallback = vi.fn();

    await strategy.scrape(options, progressCallback);

    // Should fetch only the start page; the cross-origin (different hostname) base-derived link is filtered out
    expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything());
    expect(mockFetchFn).not.toHaveBeenCalledWith(resolved, expect.anything());
  });

  describe("cleanup", () => {
    it("should call close() on all pipelines when cleanup() is called", async () => {
      const strategy = new WebScraperStrategy();

      // Spy on the close method of all pipelines
      (strategy as any).pipelines.forEach((pipeline: any) => {
        vi.spyOn(pipeline, "close");
      });

      await strategy.cleanup();

      // Verify close was called on all pipelines
      (strategy as any).pipelines.forEach((pipeline: any) => {
        expect(pipeline.close).toHaveBeenCalledOnce();
      });
    });

    it("should handle cleanup errors gracefully", async () => {
      const strategy = new WebScraperStrategy();

      // Mock one pipeline to throw an error during cleanup
      vi.spyOn((strategy as any).pipelines[0], "close").mockRejectedValue(
        new Error("Pipeline cleanup failed"),
      );

      // cleanup() should still complete and not throw
      await expect(strategy.cleanup()).resolves.not.toThrow();
    });

    it("should be idempotent - multiple cleanup() calls should not error", async () => {
      const strategy = new WebScraperStrategy();

      // Multiple calls should not throw
      await expect(strategy.cleanup()).resolves.not.toThrow();
      await expect(strategy.cleanup()).resolves.not.toThrow();
    });
  });
});
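For orientation, below is a minimal usage sketch based only on the API surface these tests exercise (canHandle, scrape with a progress callback, and cleanup). The option values are illustrative, the relative import paths assume code living next to WebScraperStrategy inside this package, and the progress object's shape is inferred from the assertions above; treat it as a sketch, not the definitive integration.

import { ScrapeMode } from "../types";
import { WebScraperStrategy } from "./WebScraperStrategy";

// Minimal sketch, mirroring how the tests drive the strategy
async function crawlDocs(): Promise<void> {
  const strategy = new WebScraperStrategy();
  const url = "https://example.com/docs/"; // illustrative start URL

  if (strategy.canHandle(url)) {
    await strategy.scrape(
      {
        url,
        library: "example",
        version: "1.0",
        maxPages: 50,
        maxDepth: 2,
        scope: "subpages",
        followRedirects: true,
        scrapeMode: ScrapeMode.Fetch,
      },
      (progress) => {
        // Progress calls may carry a processed document, as the tests assert
        if (progress.document) {
          console.log(progress.document.metadata.url, progress.document.metadata.title);
        }
      },
    );
  }

  // Release pipeline resources; cleanup() is idempotent per the tests above
  await strategy.cleanup();
}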

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'
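For a programmatic client, a minimal TypeScript sketch of the same request is shown below (using the global fetch available in Node 18+). The exact shape of the JSON payload is an assumption; inspect the actual response to see which fields the directory API returns.

// Fetch the directory entry for this server (sketch; run inside an async context or ES module)
const res = await fetch("https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server");
if (!res.ok) throw new Error(`Request failed with status ${res.status}`);
const server: unknown = await res.json(); // Assumed: a JSON object describing the server
console.log(server);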

If you have feedback or need assistance with the MCP directory API, please join our Discord server.