Skip to main content
Glama

docs-mcp-server

GitHubWikiScraperStrategy.test.ts22.4 kB
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { HttpFetcher } from "../fetcher"; import type { RawContent } from "../fetcher/types"; import { HtmlPipeline } from "../pipelines/HtmlPipeline"; import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; import { ScrapeMode, type ScraperOptions } from "../types"; import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; // Mock the fetcher and pipelines vi.mock("../fetcher"); vi.mock("../pipelines/HtmlPipeline"); vi.mock("../pipelines/MarkdownPipeline"); const mockHttpFetcher = vi.mocked(HttpFetcher); const mockHtmlPipeline = vi.mocked(HtmlPipeline); const mockMarkdownPipeline = vi.mocked(MarkdownPipeline); describe("GitHubWikiScraperStrategy", () => { let strategy: GitHubWikiScraperStrategy; let httpFetcherInstance: any; let htmlPipelineInstance: any; let markdownPipelineInstance: any; beforeEach(() => { // Reset all mocks vi.clearAllMocks(); // Setup fetcher mock httpFetcherInstance = { fetch: vi.fn(), }; mockHttpFetcher.mockImplementation(() => httpFetcherInstance); // Setup pipeline mocks htmlPipelineInstance = { canProcess: vi.fn(), process: vi.fn(), }; markdownPipelineInstance = { canProcess: vi.fn(), process: vi.fn(), }; mockHtmlPipeline.mockImplementation(() => htmlPipelineInstance); mockMarkdownPipeline.mockImplementation(() => markdownPipelineInstance); strategy = new GitHubWikiScraperStrategy(); }); afterEach(() => { vi.restoreAllMocks(); }); describe("canHandle", () => { it("should handle GitHub wiki URLs", () => { expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(true); expect(strategy.canHandle("https://github.com/owner/repo/wiki/")).toBe(true); expect(strategy.canHandle("https://github.com/owner/repo/wiki/Home")).toBe(true); expect( strategy.canHandle("https://github.com/owner/repo/wiki/Getting-Started"), ).toBe(true); expect(strategy.canHandle("https://www.github.com/owner/repo/wiki/API")).toBe(true); }); it("should not handle non-wiki GitHub URLs", () => { expect(strategy.canHandle("https://github.com/owner/repo")).toBe(false); expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(false); expect( strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), ).toBe(false); expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); }); it("should not handle non-GitHub URLs", () => { expect(strategy.canHandle("https://example.com/wiki")).toBe(false); expect(strategy.canHandle("https://gitlab.com/owner/repo/wiki")).toBe(false); expect(strategy.canHandle("https://bitbucket.org/owner/repo/wiki")).toBe(false); }); it("should handle malformed URLs gracefully", () => { expect(strategy.canHandle("invalid-url")).toBe(false); expect(strategy.canHandle("")).toBe(false); expect(strategy.canHandle("not-a-url-at-all")).toBe(false); }); }); describe("parseGitHubWikiUrl", () => { it("should parse basic wiki URL", () => { const result = (strategy as any).parseGitHubWikiUrl( "https://github.com/owner/repo/wiki", ); expect(result).toEqual({ owner: "owner", repo: "repo", }); }); it("should parse wiki URL with trailing slash", () => { const result = (strategy as any).parseGitHubWikiUrl( "https://github.com/owner/repo/wiki/", ); expect(result).toEqual({ owner: "owner", repo: "repo", }); }); it("should parse wiki URL with specific page", () => { const result = (strategy as any).parseGitHubWikiUrl( "https://github.com/owner/repo/wiki/Home", ); expect(result).toEqual({ owner: "owner", repo: "repo", }); }); it("should parse wiki URL with complex page name", () => { const result = (strategy as any).parseGitHubWikiUrl( "https://github.com/owner/repo/wiki/Getting-Started-Guide", ); expect(result).toEqual({ owner: "owner", repo: "repo", }); }); it("should handle www subdomain", () => { const result = (strategy as any).parseGitHubWikiUrl( "https://www.github.com/owner/repo/wiki", ); expect(result).toEqual({ owner: "owner", repo: "repo", }); }); it("should throw error for invalid wiki URL", () => { expect(() => { (strategy as any).parseGitHubWikiUrl("https://github.com/invalid"); }).toThrow("Invalid GitHub wiki URL"); expect(() => { (strategy as any).parseGitHubWikiUrl("https://github.com/owner/repo"); }).toThrow("Invalid GitHub wiki URL"); }); }); describe("shouldProcessUrl", () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki", library: "test-lib", version: "1.0.0", }; it("should process URLs within the same wiki", () => { expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/Home", options, ), ).toBe(true); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/API", options, ), ).toBe(true); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/Getting-Started", options, ), ).toBe(true); }); it("should not process URLs outside the wiki", () => { expect( (strategy as any).shouldProcessUrl("https://github.com/owner/repo", options), ).toBe(false); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/tree/main", options, ), ).toBe(false); expect( (strategy as any).shouldProcessUrl( "https://github.com/other/repo/wiki/Home", options, ), ).toBe(false); }); it("should respect include patterns", () => { const optionsWithInclude = { ...options, includePatterns: ["API*", "Getting*"], }; expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/API-Reference", optionsWithInclude, ), ).toBe(true); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/Getting-Started", optionsWithInclude, ), ).toBe(true); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/Home", optionsWithInclude, ), ).toBe(false); }); it("should respect exclude patterns", () => { const optionsWithExclude = { ...options, excludePatterns: ["*deprecated*", "old-*"], }; expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/deprecated-api", optionsWithExclude, ), ).toBe(false); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/old-guide", optionsWithExclude, ), ).toBe(false); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/current-guide", optionsWithExclude, ), ).toBe(true); }); it("should handle Home page as default", () => { expect( (strategy as any).shouldProcessUrl("https://github.com/owner/repo/wiki", options), ).toBe(true); expect( (strategy as any).shouldProcessUrl( "https://github.com/owner/repo/wiki/", options, ), ).toBe(true); }); it("should handle malformed URLs gracefully", () => { expect((strategy as any).shouldProcessUrl("invalid-url", options)).toBe(false); expect((strategy as any).shouldProcessUrl("", options)).toBe(false); }); }); describe("processItem", () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki", library: "test-lib", version: "1.0.0", }; it("should process wiki page and return document with links", async () => { const rawContent: RawContent = { content: ` <!DOCTYPE html> <html> <head><title>Wiki Home</title></head> <body> <h1>Welcome to the Wiki</h1> <p>This is the home page of our documentation.</p> <ul> <li><a href="/owner/repo/wiki/API">API Documentation</a></li> <li><a href="/owner/repo/wiki/Getting-Started">Getting Started</a></li> <li><a href="https://external.com">External Link</a></li> </ul> </body> </html> `, mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", }; const processedContent = { textContent: "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", metadata: { title: "Wiki Home" }, errors: [], links: [ "/owner/repo/wiki/API", "/owner/repo/wiki/Getting-Started", "https://external.com", ], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.document).toEqual({ content: "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", contentType: "text/html", metadata: { url: "https://github.com/owner/repo/wiki/Home", title: "Wiki Home", library: "test-lib", version: "1.0.0", }, }); // Should only include wiki links, not external links expect(result.links).toEqual([ "https://github.com/owner/repo/wiki/API", "https://github.com/owner/repo/wiki/Getting-Started", ]); }); it("should use page name as title fallback when no title found", async () => { const rawContent: RawContent = { content: "<html><body><p>Content without title</p></body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Getting-Started", charset: "utf-8", }; const processedContent = { textContent: "Content without title", metadata: { title: "" }, errors: [], links: [], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Getting-Started", depth: 1, }; const result = await (strategy as any).processItem(item, options); expect(result.document?.metadata.title).toBe("Getting-Started"); }); it("should handle Home page title fallback", async () => { const rawContent: RawContent = { content: "<html><body><p>Home page content</p></body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki", charset: "utf-8", }; const processedContent = { textContent: "Home page content", metadata: { title: "" }, errors: [], links: [], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.document?.metadata.title).toBe("Home"); }); it("should force ScrapeMode.Fetch for consistent behavior", async () => { const rawContent: RawContent = { content: "<html><body><h1>Test</h1></body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Test", charset: "utf-8", }; const processedContent = { textContent: "Test", metadata: { title: "Test" }, errors: [], links: [], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockImplementation( async (_content: any, opts: any) => { expect(opts.scrapeMode).toBe("fetch"); return processedContent; }, ); const optionsWithPlaywright = { ...options, scrapeMode: ScrapeMode.Playwright, }; const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; await (strategy as any).processItem(item, optionsWithPlaywright); expect(htmlPipelineInstance.process).toHaveBeenCalledWith( rawContent, expect.objectContaining({ scrapeMode: "fetch" }), expect.any(Object), ); }); it("should handle unsupported content types", async () => { const rawContent: RawContent = { content: "binary content", mimeType: "application/octet-stream", source: "https://github.com/owner/repo/wiki/Binary", charset: "utf-8", }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(false); markdownPipelineInstance.canProcess.mockReturnValue(false); const item = { url: "https://github.com/owner/repo/wiki/Binary", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.document).toBeUndefined(); expect(result.links).toEqual([]); }); it("should handle fetch errors gracefully", async () => { httpFetcherInstance.fetch.mockRejectedValue(new Error("Network error")); const item = { url: "https://github.com/owner/repo/wiki/Unreachable", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.document).toBeUndefined(); expect(result.links).toEqual([]); }); it("should handle processing errors from pipelines", async () => { const rawContent: RawContent = { content: "<html><body><h1>Test</h1></body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Test", charset: "utf-8", }; const processedContentWithErrors = { textContent: "Test", metadata: { title: "Test" }, errors: [new Error("Processing warning")], links: [], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContentWithErrors); const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.document).toBeDefined(); expect(result.document?.content).toBe("Test"); }); }); describe("scrape", () => { it("should validate GitHub wiki URL", async () => { const invalidOptions: ScraperOptions = { url: "https://example.com/wiki", library: "test-lib", version: "1.0.0", }; await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( "URL must be a GitHub wiki URL", ); }); it("should validate GitHub URL without wiki path", async () => { const invalidOptions: ScraperOptions = { url: "https://github.com/owner/repo", library: "test-lib", version: "1.0.0", }; await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( "URL must be a GitHub wiki URL", ); }); it("should append /Home to bare wiki URLs", async () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki", library: "test-lib", version: "1.0.0", }; // Mock super.scrape to capture the options passed to it const superScrapeSpy = vi.spyOn( Object.getPrototypeOf(Object.getPrototypeOf(strategy)), "scrape", ); superScrapeSpy.mockResolvedValue(undefined); await strategy.scrape(options, vi.fn()); expect(superScrapeSpy).toHaveBeenCalledWith( expect.objectContaining({ url: "https://github.com/owner/repo/wiki/Home", }), expect.any(Function), undefined, ); superScrapeSpy.mockRestore(); }); it("should append /Home to wiki URLs with trailing slash", async () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki/", library: "test-lib", version: "1.0.0", }; const superScrapeSpy = vi.spyOn( Object.getPrototypeOf(Object.getPrototypeOf(strategy)), "scrape", ); superScrapeSpy.mockResolvedValue(undefined); await strategy.scrape(options, vi.fn()); expect(superScrapeSpy).toHaveBeenCalledWith( expect.objectContaining({ url: "https://github.com/owner/repo/wiki/Home", }), expect.any(Function), undefined, ); superScrapeSpy.mockRestore(); }); it("should not modify URLs that already point to specific pages", async () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki/Getting-Started", library: "test-lib", version: "1.0.0", }; const superScrapeSpy = vi.spyOn( Object.getPrototypeOf(Object.getPrototypeOf(strategy)), "scrape", ); superScrapeSpy.mockResolvedValue(undefined); await strategy.scrape(options, vi.fn()); expect(superScrapeSpy).toHaveBeenCalledWith( expect.objectContaining({ url: "https://github.com/owner/repo/wiki/Getting-Started", }), expect.any(Function), undefined, ); superScrapeSpy.mockRestore(); }); }); describe("Link filtering and URL normalization", () => { const options: ScraperOptions = { url: "https://github.com/owner/repo/wiki", library: "test-lib", version: "1.0.0", }; it("should convert relative links to absolute URLs", async () => { const rawContent: RawContent = { content: ` <html><body> <a href="/owner/repo/wiki/API">API Docs</a> <a href="Getting-Started">Getting Started</a> <a href="./Advanced-Topics">Advanced</a> </body></html> `, mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", }; const processedContent = { textContent: "Content", metadata: { title: "Test" }, errors: [], links: ["/owner/repo/wiki/API", "Getting-Started", "./Advanced-Topics"], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.links).toEqual([ "https://github.com/owner/repo/wiki/API", "https://github.com/owner/repo/wiki/Getting-Started", "https://github.com/owner/repo/wiki/Advanced-Topics", ]); }); it("should filter out non-wiki links", async () => { const rawContent: RawContent = { content: "<html><body>Content</body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", }; const processedContent = { textContent: "Content", metadata: { title: "Test" }, errors: [], links: [ "https://github.com/owner/repo/wiki/API", // Should include "https://github.com/owner/repo", // Should exclude (not wiki) "https://github.com/other/repo/wiki/Home", // Should exclude (different repo) "https://external.com/wiki", // Should exclude (external domain) "mailto:test@example.com", // Should exclude (different protocol) ], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; const result = await (strategy as any).processItem(item, options); expect(result.links).toEqual(["https://github.com/owner/repo/wiki/API"]); }); it("should handle malformed URLs in links gracefully", async () => { const rawContent: RawContent = { content: "<html><body>Content</body></html>", mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", }; const processedContent = { textContent: "Content", metadata: { title: "Test" }, errors: [], links: [ "invalid-url", "https://github.com/owner/repo/wiki/Valid", "", "not-a-url-at-all", ], }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); htmlPipelineInstance.canProcess.mockReturnValue(true); htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; const result = await (strategy as any).processItem(item, options); // Should only include the valid wiki link expect(result.links).toEqual(["https://github.com/owner/repo/wiki/Valid"]); }); }); });

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server