Skip to main content
Glama

MCP Server for Crawl4AI

by omgwtfwow
index.server.test.ts (63.5 kB)
// Test bootstrap for the Crawl4AIServer tool-handler suite (ESM Jest).
//
// What the statements on the following line do, in order:
//   1. Import `jest`, `describe`, `it`, `expect` and `beforeEach` from '@jest/globals'.
//   2. Create one `jest.fn()` double per Crawl4AIService method (getMarkdown,
//      captureScreenshot, generatePDF, executeJS, getHTML, batchCrawl, extractWithLLM,
//      crawl, parseSitemap) so tests can stub return values and inspect calls.
//   3. Register module mocks with `jest.unstable_mockModule`:
//        - '../crawl4ai-service.js': `Crawl4AIService` is a mock constructor whose
//          implementation returns an object wired to the doubles from step 2.
//        - '@modelcontextprotocol/sdk/server/index.js': `Server` is a mock constructor
//          returning `setRequestHandler`, `tool` and `connect` doubles.
//        - '@modelcontextprotocol/sdk/types.js': exports plain schema stand-ins,
//          `{ method: 'tools/call' }` and `{ method: 'tools/list' }`.
//        - '@modelcontextprotocol/sdk/server/stdio.js': `StdioServerTransport` is a bare mock.
//        - 'axios': the default export has `create()` returning `{ post, get, head }`
//          doubles, and exposes the same `get` double directly on the default export.
//   4. Dynamically import '../server.js' only after every mock is registered, so the
//      module under test is loaded against the mocked dependencies.
//
// NOTE(review): this line appears to be whitespace-collapsed. The inline `//` comments
// presumably ended at line breaks in the original file; restore those breaks before
// running, otherwise everything after the first `//` is treated as a comment.
// The trailing `const {` opens a destructuring that is completed on the next line.
/* eslint-env jest */ import { jest } from '@jest/globals'; import { describe, it, expect, beforeEach } from '@jest/globals'; // Create mock functions const mockGetMarkdown = jest.fn(); const mockCaptureScreenshot = jest.fn(); const mockGeneratePDF = jest.fn(); const mockExecuteJS = jest.fn(); const mockGetHTML = jest.fn(); const mockBatchCrawl = jest.fn(); const mockExtractWithLLM = jest.fn(); const mockCrawl = jest.fn(); const mockParseSitemap = jest.fn(); // Mock the Crawl4AIService module jest.unstable_mockModule('../crawl4ai-service.js', () => ({ Crawl4AIService: jest.fn().mockImplementation(() => ({ getMarkdown: mockGetMarkdown, captureScreenshot: mockCaptureScreenshot, generatePDF: mockGeneratePDF, executeJS: mockExecuteJS, getHTML: mockGetHTML, batchCrawl: mockBatchCrawl, extractWithLLM: mockExtractWithLLM, crawl: mockCrawl, parseSitemap: mockParseSitemap, })), })); // Mock MCP SDK const mockSetRequestHandler = jest.fn(); const mockTool = jest.fn(); const mockConnect = jest.fn(); jest.unstable_mockModule('@modelcontextprotocol/sdk/server/index.js', () => ({ Server: jest.fn().mockImplementation(() => ({ setRequestHandler: mockSetRequestHandler, tool: mockTool, connect: mockConnect, })), })); // Mock the types module that exports the schemas const CallToolRequestSchema = { method: 'tools/call' }; const ListToolsRequestSchema = { method: 'tools/list' }; jest.unstable_mockModule('@modelcontextprotocol/sdk/types.js', () => ({ CallToolRequestSchema, ListToolsRequestSchema, })); jest.unstable_mockModule('@modelcontextprotocol/sdk/server/stdio.js', () => ({ StdioServerTransport: jest.fn(), })); // Mock axios const mockPost = jest.fn(); const mockGet = jest.fn(); const mockHead = jest.fn(); jest.unstable_mockModule('axios', () => ({ default: { create: jest.fn(() => ({ post: mockPost, get: mockGet, head: mockHead, })), get: mockGet, }, })); // Now dynamically import the modules after mocks are set up const { Crawl4AIServer } = await import('../server.js'); const { 
GetMarkdownSchema, CrawlSchema, BatchCrawlSchema, CaptureScreenshotSchema: _CaptureScreenshotSchema, GeneratePdfSchema: _GeneratePdfSchema, ExecuteJsSchema: _ExecuteJsSchema, ExtractWithLlmSchema: _ExtractWithLlmSchema, SmartCrawlSchema: _SmartCrawlSchema, CrawlRecursiveSchema: _CrawlRecursiveSchema, } = await import('../schemas/validation-schemas.js'); const { Crawl4AIService } = await import('../crawl4ai-service.js'); // Import types statically (these are removed at compile time) import type { MarkdownEndpointResponse, ScreenshotEndpointResponse, PDFEndpointResponse, HTMLEndpointResponse, CrawlEndpointResponse, } from '../types.js'; // Define types for test results interface ContentItem { type: string; text?: string; data?: string; resource?: { uri: string; mimeType: string; blob: string; }; } interface ToolResult { content: ContentItem[]; } type RequestHandler = (request: { method: string; params: unknown }) => Promise<ToolResult>; // Removed TestServerMethods interface - no longer needed since we use 'any' type describe('Crawl4AIServer Tool Handlers', () => { let server: any; // eslint-disable-line @typescript-eslint/no-explicit-any let requestHandler: RequestHandler; beforeEach(async () => { jest.clearAllMocks(); // Reset all mock functions mockGetMarkdown.mockReset(); mockCaptureScreenshot.mockReset(); mockGeneratePDF.mockReset(); mockExecuteJS.mockReset(); mockGetHTML.mockReset(); mockBatchCrawl.mockReset(); mockExtractWithLLM.mockReset(); mockCrawl.mockReset(); mockParseSitemap.mockReset(); mockPost.mockReset(); mockGet.mockReset(); mockHead.mockReset(); // Create server instance - the mock will be used automatically server = new Crawl4AIServer( process.env.CRAWL4AI_BASE_URL || 'http://test.example.com', process.env.CRAWL4AI_API_KEY || 'test-api-key', 'test-server', '1.0.0', ); // Start the server to register handlers await server.start(); // Get the request handler for CallToolRequestSchema const handlerCalls = mockSetRequestHandler.mock.calls; // Find the 
handler for CallToolRequestSchema (tools/call) for (const call of handlerCalls) { const [schema, handler] = call; if (schema && schema.method === 'tools/call') { requestHandler = handler; break; } } // Debug: Check if we found the handler if (!requestHandler) { console.log('Handler calls:', handlerCalls.length); handlerCalls.forEach((call, i) => { console.log(`Call ${i}:`, call[0], typeof call[1]); }); } }); // Add a simple test to verify mocking works it('should use the mocked service', () => { const MockedService = Crawl4AIService as jest.MockedClass<typeof Crawl4AIService>; expect(MockedService).toHaveBeenCalledTimes(1); expect(MockedService).toHaveBeenCalledWith('http://localhost:11235', 'test-api-key'); }); describe('Constructor and setup', () => { it('should initialize with correct configuration', () => { expect(server).toBeDefined(); expect(server.service).toBeDefined(); expect(server.sessions).toBeDefined(); }); it('should set up handlers on construction', () => { expect(mockSetRequestHandler).toHaveBeenCalled(); expect(mockSetRequestHandler.mock.calls.length).toBeGreaterThan(0); }); }); describe('Tool Handler Success Cases', () => { describe('get_markdown', () => { it('should handle successful markdown extraction', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', filter: 'fit', query: null, cache: 'false', markdown: '# Example Page\n\nThis is example content.', success: true, }; mockGetMarkdown.mockResolvedValue(mockResponse); const result: ToolResult = await server.getMarkdown({ url: 'https://example.com', }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('# Example Page'); expect(result.content[0].text).toContain('URL: https://example.com'); expect(result.content[0].text).toContain('Filter: fit'); }); it('should handle markdown with query', async () => { const mockResponse: MarkdownEndpointResponse = { url: 'https://example.com', 
filter: 'bm25', query: 'test query', cache: 'false', markdown: 'Filtered content', success: true, }; mockGetMarkdown.mockResolvedValue(mockResponse); const result: ToolResult = await server.getMarkdown({ url: 'https://example.com', filter: 'bm25', query: 'test query', }); expect(mockGetMarkdown).toHaveBeenCalledWith({ url: 'https://example.com', f: 'bm25', q: 'test query', }); expect(result.content[0].text).toContain('Query: test query'); }); }); describe('capture_screenshot', () => { it('should handle successful screenshot capture', async () => { const mockResponse: ScreenshotEndpointResponse = { success: true, screenshot: 'base64-encoded-screenshot-data', }; mockCaptureScreenshot.mockResolvedValue(mockResponse); const result: ToolResult = await server.captureScreenshot({ url: 'https://example.com', }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('image'); expect(result.content[0].data).toBe('base64-encoded-screenshot-data'); expect(result.content[1].type).toBe('text'); expect(result.content[1].text).toBe('Screenshot captured for: https://example.com'); }); }); describe('generate_pdf', () => { it('should handle successful PDF generation', async () => { const mockResponse: PDFEndpointResponse = { success: true, pdf: 'base64-encoded-pdf-data', }; mockGeneratePDF.mockResolvedValue(mockResponse); const result: ToolResult = await server.generatePDF({ url: 'https://example.com', }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('resource'); expect(result.content[0].resource.blob).toBeDefined(); expect(result.content[1].type).toBe('text'); expect(result.content[1].text).toContain('PDF generated for: https://example.com'); }); }); describe('execute_js', () => { it('should handle successful JS execution', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: ['Title: Example', 'Link count: 5'], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const 
result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: ['return document.title', 'return document.links.length'], }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Title: Example'); expect(result.content[0].text).toContain('Link count: 5'); }); it('should handle JS execution without results', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: null, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'console.log("test")', }); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('No results returned'); }); it('should handle JS execution with error status', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: [ { success: false, error: 'Error: Test error', stack: 'Error: Test error\n at eval (eval at evaluate (:291:30), <anonymous>:4:43)', }, ], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'throw new Error("Test error")', }); expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Script: throw new Error("Test error")'); expect(result.content[0].text).toContain('Returned: Error: Error: Test error'); }); it('should handle JS execution with no return value', async () => { const mockResponse = { markdown: 'Page content', js_execution_result: { success: true, results: [{ success: true }], }, }; mockExecuteJS.mockResolvedValue(mockResponse); const result: ToolResult = await server.executeJS({ url: 'https://example.com', scripts: 'console.log("hello")', }); 
expect(result.content[0].text).toContain('JavaScript executed on: https://example.com'); expect(result.content[0].text).toContain('Returned: Executed successfully (no return value)'); }); }); describe('get_html', () => { it('should handle successful HTML retrieval', async () => { const mockResponse: HTMLEndpointResponse = { html: '<html><body><h1>Example</h1></body></html>', url: 'https://example.com', success: true, }; mockGetHTML.mockResolvedValue(mockResponse); const result: ToolResult = await server.getHTML({ url: 'https://example.com', }); expect(result.content).toHaveLength(1); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('<html><body><h1>Example</h1></body></html>'); }); }); describe('batch_crawl', () => { it('should handle successful batch crawl', async () => { const mockResponse = { results: [ { url: 'https://example1.com', markdown: { raw_markdown: 'Content 1' }, success: true }, { url: 'https://example2.com', markdown: { raw_markdown: 'Content 2' }, success: true }, ], success: true, }; // Mock axios response since batchCrawl uses axiosClient directly mockPost.mockResolvedValue({ data: mockResponse }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example1.com', 'https://example2.com'], }); expect(result.content).toHaveLength(1); expect(result.content[0].text).toContain('Batch crawl completed'); expect(result.content[0].text).toContain('Processed 2 URLs'); }); it('should handle batch crawl with remove_images', async () => { // Mock axios response since batchCrawl uses axiosClient directly mockPost.mockResolvedValue({ data: { results: [] } }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example.com'], remove_images: true, }); expect(mockPost).toHaveBeenCalledWith('/crawl', { urls: ['https://example.com'], crawler_config: { exclude_tags: ['img', 'picture', 'svg'], }, }); expect(result.content[0].text).toContain('Batch crawl completed'); }); }); describe('crawl', () => { 
it('should handle successful crawl with all options', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', html: '<html>...</html>', cleaned_html: '<html>clean</html>', fit_html: '<html>fit</html>', success: true, status_code: 200, response_headers: {}, session_id: 'test-session', metadata: { title: 'Example' }, links: { internal: [], external: [] }, media: { images: [], videos: [], audios: [] }, markdown: { raw_markdown: '# Example', markdown_with_citations: '# Example [1]', references_markdown: '[1]: https://example.com', fit_markdown: '# Example', fit_html: '<h1>Example</h1>', }, tables: [], extracted_content: null, screenshot: 'screenshot-data', pdf: 'pdf-data', mhtml: null, js_execution_result: { success: true, results: ['JS result'] }, downloaded_files: null, network_requests: null, console_messages: ['Console log'], ssl_certificate: null, dispatch_result: null, }, ], server_processing_time_s: 1.5, server_memory_delta_mb: 10, server_peak_memory_mb: 100, }; mockCrawl.mockResolvedValue(mockResponse); const result: ToolResult = await server.crawl({ url: 'https://example.com', screenshot: true, pdf: true, js_code: 'return document.title', session_id: 'test-session', }); expect(result.content.length).toBeGreaterThan(0); // Multiple content types // Check text content const textContent = result.content.find((c) => c.type === 'text' && c.text?.includes('# Example')); expect(textContent).toBeDefined(); // Check screenshot const screenshotContent = result.content.find((c) => c.type === 'image'); expect(screenshotContent?.data).toBe('screenshot-data'); }); it('should handle crawl with proxy configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Proxied content' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', 
proxy_server: 'http://proxy.example.com:8080', proxy_username: 'user', proxy_password: 'pass', }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ browser_config: expect.objectContaining({ proxy_config: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass', }, }), }), ); }); it('should handle crawl with cookies and headers', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content with auth' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', cookies: [{ name: 'session', value: 'abc123' }], headers: { Authorization: 'Bearer token123' }, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ browser_config: expect.objectContaining({ cookies: [{ name: 'session', value: 'abc123' }], headers: { Authorization: 'Bearer token123' }, }), }), ); }); it('should handle virtual scroll configuration', async () => { const mockResponse: CrawlEndpointResponse = { success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Scrolled content' }, success: true, status_code: 200, }, ], }; mockCrawl.mockResolvedValue(mockResponse); await server.crawl({ url: 'https://example.com', virtual_scroll_config: { enabled: true, scroll_step: 100, max_scrolls: 10, }, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ virtual_scroll_config: { enabled: true, scroll_step: 100, max_scrolls: 10, }, }), }), ); }); it('should handle js_code as null error', async () => { await expect( server.crawl({ url: 'https://example.com', js_code: null, }), ).rejects.toThrow('js_code parameter is null'); }); }); describe('extract_with_llm', () => { it('should handle successful LLM extraction', async () => { mockExtractWithLLM.mockResolvedValue({ answer: 'The main topic is JavaScript testing.', }); const 
result: ToolResult = await server.extractWithLLM({ url: 'https://example.com', query: 'What is the main topic?', }); expect(result.content).toHaveLength(1); expect(result.content[0].text).toBe('The main topic is JavaScript testing.'); }); }); describe('extract_links', () => { it('should extract and categorize links', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [ { href: '/page1', text: 'Page 1' }, { href: '/page2', text: 'Page 2' }, ], external: [{ href: 'https://external.com', text: 'External' }], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', categorize: true, }); expect(result.content[0].text).toContain('Link analysis for https://example.com:'); expect(result.content[0].text).toContain('internal (2)'); expect(result.content[0].text).toContain('/page1'); expect(result.content[0].text).toContain('external (1)'); }); it('should categorize external links (social, images, scripts)', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [], external: [ 'https://facebook.com/profile', 'https://example.com/image.jpg', 'https://cdn.com/script.js', ], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', categorize: true, }); expect(result.content[0].text).toContain('social (1)'); expect(result.content[0].text).toContain('images (1)'); expect(result.content[0].text).toContain('scripts (1)'); expect(result.content[0].text).toContain('external (0)'); }); }); describe('crawl_recursive', () => { it('should crawl recursively with depth limit', async () => { // Ensure mock is clean before setting up mockPost.mockReset(); mockPost .mockResolvedValueOnce({ data: { results: [ { url: 'https://example.com', links: { internal: [{ href: 'https://example.com/page1', text: 'Page 1' }], }, markdown: { raw_markdown: 'Home page' }, success: true, }, ], }, }) .mockResolvedValueOnce({ data: { results: [ { url: 
'https://example.com/page1', links: { internal: [] }, markdown: { raw_markdown: 'Page 1 content' }, success: true, }, ], }, }); const result: ToolResult = await server.crawlRecursive({ url: 'https://example.com', max_depth: 2, }); expect(result.content[0].text).toContain('Recursive crawl completed:'); expect(result.content[0].text).toContain('Pages crawled: 2'); expect(result.content[0].text).toContain('https://example.com'); expect(result.content[0].text).toContain('https://example.com/page1'); }); }); describe('parse_sitemap', () => { it('should parse sitemap successfully', async () => { mockGet.mockResolvedValue({ data: `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>https://example.com/</loc></url> <url><loc>https://example.com/page1</loc></url> <url><loc>https://example.com/page2</loc></url> </urlset>`, }); const result: ToolResult = await server.parseSitemap({ url: 'https://example.com/sitemap.xml', }); expect(result.content[0].text).toContain('Sitemap parsed successfully:'); expect(result.content[0].text).toContain('Total URLs found: 3'); expect(result.content[0].text).toContain('https://example.com/'); expect(result.content[0].text).toContain('https://example.com/page1'); }); }); describe('smart_crawl', () => { it('should handle smart crawl for HTML content', async () => { mockHead.mockResolvedValue({ headers: { 'content-type': 'text/html' }, }); mockPost.mockResolvedValue({ data: { results: [ { markdown: { raw_markdown: 'HTML content' }, links: { internal: [], external: [] }, }, ], }, }); const result: ToolResult = await server.smartCrawl({ url: 'https://example.com', }); expect(result.content[0].text).toContain('Smart crawl detected content type'); // Already contains 'Smart crawl detected content type' }); it('should handle smart crawl for PDF content', async () => { mockHead.mockResolvedValue({ headers: { 'content-type': 'application/pdf' }, }); // Mock the crawl response for PDF 
mockPost.mockResolvedValue({ data: { results: [ { markdown: { raw_markdown: 'PDF content extracted' }, links: { internal: [], external: [] }, }, ], }, }); const result: ToolResult = await server.smartCrawl({ url: 'https://example.com/doc.pdf', }); expect(result.content[0].text).toContain('Smart crawl detected content type'); expect(result.content[0].text).toContain('PDF content extracted'); }); }); }); describe('Tool Handler Error Cases', () => { describe('Service errors', () => { it('should handle service error for get_markdown', async () => { mockGetMarkdown.mockRejectedValue(new Error('Network error')); await expect(server.getMarkdown({ url: 'https://example.com' })).rejects.toThrow( 'Failed to get markdown: Network error', ); }); it('should handle axios error with response detail', async () => { const axiosError = { response: { data: { detail: 'Invalid API key', }, }, }; mockCaptureScreenshot.mockRejectedValue(axiosError); await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 'Failed to capture screenshot: Invalid API key', ); }); it('should handle missing screenshot data', async () => { mockCaptureScreenshot.mockResolvedValue({ success: false, screenshot: '', }); await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 'Screenshot capture failed - no screenshot data in response', ); }); it('should handle missing PDF data', async () => { mockGeneratePDF.mockResolvedValue({ success: true, pdf: '', }); await expect(server.generatePDF({ url: 'https://example.com' })).rejects.toThrow( 'PDF generation failed - no PDF data in response', ); }); }); describe('Validation errors', () => { it('should handle missing scripts for execute_js', async () => { await expect( server.executeJS({ url: 'https://example.com', scripts: null as unknown as string }), ).rejects.toThrow('scripts is required'); }); it('should handle empty crawl options', async () => { await expect(server.crawl(null as unknown as 
Parameters<typeof server.crawl>[0])).rejects.toThrow( 'crawl requires options object with at least a url parameter', ); }); it('should handle crawl_recursive errors', async () => { // Setup the mock to fail - crawlRecursive catches the error internally mockPost.mockRejectedValue(new Error('API error')); const result: ToolResult = await server.crawlRecursive({ url: 'https://example.com' }); // The method catches errors and returns a message about no pages crawled expect(result.content[0].text).toContain('Pages crawled: 0'); expect(result.content[0].text).toContain('No pages could be crawled'); }); it('should handle parse_sitemap errors', async () => { mockGet.mockRejectedValue(new Error('Failed to fetch sitemap')); await expect(server.parseSitemap({ url: 'https://example.com/sitemap.xml' })).rejects.toThrow( 'Failed to parse sitemap: Failed to fetch sitemap', ); }); }); describe('Edge cases', () => { it('should handle batch crawl with no results', async () => { mockPost.mockResolvedValue({ data: { results: [], }, }); const result: ToolResult = await server.batchCrawl({ urls: ['https://example.com'], }); expect(result.content[0].text).toContain('Batch crawl completed'); expect(result.content[0].text).toContain('Processed 0 URLs'); }); it('should handle extract_links with no links', async () => { mockPost.mockResolvedValue({ data: { results: [ { links: { internal: [], external: [], }, }, ], }, }); const result: ToolResult = await server.extractLinks({ url: 'https://example.com', }); expect(result.content[0].text).toContain('All links from https://example.com:'); expect(result.content[0].text).toMatch(/\n\s*$/); }); it('should handle smart crawl with HEAD request failure', async () => { mockHead.mockRejectedValue(new Error('HEAD failed')); // Fallback to HTML crawl mockPost.mockResolvedValue({ data: { results: [ { markdown: { raw_markdown: 'Fallback content' }, links: { internal: [], external: [] }, }, ], }, }); const result: ToolResult = await server.smartCrawl({ url: 
'https://example.com', }); expect(result.content[0].text).toContain('Smart crawl detected content type'); }); }); describe('ZodError validation tests', () => { it('should validate get_markdown parameters', () => { // Valid case expect(() => { GetMarkdownSchema.parse({ url: 'https://example.com' }); }).not.toThrow(); // Invalid - missing url expect(() => { GetMarkdownSchema.parse({ filter: 'fit' }); }).toThrow(); // Invalid - bm25 without query expect(() => { GetMarkdownSchema.parse({ url: 'https://example.com', filter: 'bm25' }); }).toThrow('Query parameter is required when using bm25 or llm filter'); }); it('should validate crawl parameters', () => { // Valid case expect(() => { CrawlSchema.parse({ url: 'https://example.com' }); }).not.toThrow(); // Invalid - js_only without session_id expect(() => { CrawlSchema.parse({ url: 'https://example.com', js_only: true }); }).toThrow('js_only requires session_id'); // Invalid - empty js_code array expect(() => { CrawlSchema.parse({ url: 'https://example.com', js_code: [] }); }).toThrow('js_code array cannot be empty'); }); it('should validate batch_crawl parameters', () => { // Valid case expect(() => { BatchCrawlSchema.parse({ urls: ['https://example.com'] }); }).not.toThrow(); // Invalid - not an array expect(() => { BatchCrawlSchema.parse({ urls: 'not-an-array' }); }).toThrow(); }); }); describe('Parameter validation edge cases', () => { // These tests require proper schema validation which happens at the handler level // Skipping direct method calls as they bypass validation }); describe('Additional coverage tests', () => { it('should handle crawl with media extraction', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, media: { images: [ { src: 'https://example.com/img1.jpg', alt: 'Image 1' }, { src: 'https://example.com/img2.jpg', alt: 'Image 2' }, ], videos: [{ src: 'https://example.com/video.mp4', type: 'video/mp4' }], audios: 
[], }, success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', media_handling: { images: true, videos: true }, }); expect(result.content.length).toBeGreaterThan(0); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with tables extraction', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, tables: [ { headers: ['Name', 'Age'], rows: [ ['John', '30'], ['Jane', '25'], ], markdown: '| Name | Age |\n|------|-----|\n| John | 30 |\n| Jane | 25 |', }, ], success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', }); expect(result.content.length).toBeGreaterThan(0); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with network_requests', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, network_requests: [ { url: 'https://api.example.com/data', method: 'GET', status: 200 }, { url: 'https://api.example.com/post', method: 'POST', status: 201 }, ], success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', network_requests: true, }); expect(result.content.length).toBeGreaterThan(0); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with mhtml output', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, mhtml: 'MHTML content here', success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', mhtml: true, }); expect(result.content.length).toBeGreaterThan(0); 
expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with downloaded_files', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, downloaded_files: { 'file1.pdf': 'base64content1', 'file2.doc': 'base64content2', }, success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', download_files: true, }); expect(result.content.length).toBeGreaterThan(0); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with ssl_certificate', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Content' }, ssl_certificate: { issuer: "Let's Encrypt", subject: '*.example.com', validFrom: '2024-01-01', validTo: '2024-12-31', protocol: 'TLSv1.3', }, success: true, status_code: 200, }, ], }); const result: ToolResult = await server.crawl({ url: 'https://example.com', ssl_certificate: true, }); expect(result.content.length).toBeGreaterThan(0); expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toBe('Content'); }); it('should handle crawl with wait_for conditions', async () => { mockCrawl.mockResolvedValue({ success: true, results: [ { url: 'https://example.com', markdown: { raw_markdown: 'Dynamic content loaded' }, success: true, status_code: 200, }, ], }); await server.crawl({ url: 'https://example.com', wait_for: { selector: '.dynamic-content', timeout: 5000, }, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ wait_for: { selector: '.dynamic-content', timeout: 5000, }, }), }), ); }); it('should handle crawl error scenarios', async () => { mockCrawl.mockResolvedValue({ success: false, results: [ { url: 'https://example.com', success: false, error: 'Page 
load timeout',
              status_code: 0,
            },
          ],
        });

        const result: ToolResult = await server.crawl({
          url: 'https://example.com',
        });
        expect(result.content[0].text).toBe('No content extracted');
      });

      // extract_links with categorize: true — only internal/external buckets are counted from the
      // response; the extra social/documents/images arrays in the mock are expected to be ignored.
      it('should handle extract_links with categorized output', async () => {
        mockPost.mockResolvedValue({
          data: {
            results: [
              {
                links: {
                  internal: [
                    { href: '/page1', text: 'Page 1' },
                    { href: '/page2', text: 'Page 2' },
                  ],
                  external: [{ href: 'https://external.com', text: 'External' }],
                  social: [{ href: 'https://twitter.com/example', text: 'Twitter' }],
                  documents: [{ href: '/file.pdf', text: 'PDF Document' }],
                  images: [{ href: '/image.jpg', text: 'Image' }],
                },
              },
            ],
          },
        });

        const result: ToolResult = await server.extractLinks({
          url: 'https://example.com',
          categorize: true,
        });
        expect(result.content[0].text).toContain('internal (2)');
        expect(result.content[0].text).toContain('external (1)');
        expect(result.content[0].text).toContain('social (0)'); // No social links in internal/external
        expect(result.content[0].text).toContain('documents (0)'); // No documents in internal/external
        expect(result.content[0].text).toContain('images (0)'); // No images in internal/external
      });

      // smart_crawl: a HEAD response with an XML content-type is reported as "sitemap".
      it('should handle smart_crawl for sitemap', async () => {
        // Set up axios client mock for the server instance
        const axiosClientMock = {
          head: jest.fn().mockResolvedValue({
            headers: { 'content-type': 'application/xml' },
          }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com/sitemap.xml',
                  markdown: { raw_markdown: 'Sitemap content' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com/sitemap.xml',
        });
        expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
        expect(result.content[0].text).toContain('Sitemap content');
        expect(axiosClientMock.post).toHaveBeenCalledWith(
          '/crawl',
          expect.objectContaining({
            urls: ['https://example.com/sitemap.xml'],
            crawler_config: expect.objectContaining({
              cache_mode: 'ENABLED',
            }),
            browser_config: expect.objectContaining({
              headless: true,
              browser_type: 'chromium',
            }),
          }),
        );
      });

      // smart_crawl: an RSS content-type is reported as "rss".
      it('should handle smart_crawl for RSS feed', async () => {
        const axiosClientMock = {
          head: jest.fn().mockResolvedValue({
            headers: { 'content-type': 'application/rss+xml' },
          }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com/feed.rss',
                  markdown: { raw_markdown: 'RSS feed content' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com/feed.rss',
        });
        expect(result.content[0].text).toContain('Smart crawl detected content type: rss');
        expect(result.content[0].text).toContain('RSS feed content');
        expect(axiosClientMock.post).toHaveBeenCalledWith(
          '/crawl',
          expect.objectContaining({
            urls: ['https://example.com/feed.rss'],
            crawler_config: expect.objectContaining({
              cache_mode: 'ENABLED',
            }),
            browser_config: expect.objectContaining({
              headless: true,
              browser_type: 'chromium',
            }),
          }),
        );
      });

      // smart_crawl: a JSON content-type is reported as "json".
      it('should handle smart_crawl for JSON content', async () => {
        const axiosClientMock = {
          head: jest.fn().mockResolvedValue({
            headers: { 'content-type': 'application/json' },
          }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com/data.json',
                  markdown: { raw_markdown: 'JSON content' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com/data.json',
        });
        expect(result.content[0].text).toContain('Smart crawl detected content type: json');
        expect(result.content[0].text).toContain('JSON content');
        expect(axiosClientMock.post).toHaveBeenCalledWith(
          '/crawl',
          expect.objectContaining({
            urls: ['https://example.com/data.json'],
            crawler_config: expect.objectContaining({
              cache_mode: 'ENABLED',
            }),
            browser_config: expect.objectContaining({
              headless: true,
              browser_type: 'chromium',
            }),
          }),
        );
      });

      // extract_links: links are re-bucketed by file type, regardless of internal/external origin.
      it('should correctly categorize internal documents and images', async () => {
        mockPost.mockResolvedValue({
          data: {
            results: [
              {
                links: {
                  internal: [
                    { href: '/page1', text: 'Page 1' },
                    { href: '/docs/manual.pdf', text: 'Manual' },
                    { href: '/images/logo.png', text: 'Logo' },
                    { href: '/assets/style.css', text: 'Styles' },
                  ],
                  external: [{ href: 'https://example.com/report.pdf', text: 'External Report' }],
                },
              },
            ],
          },
        });

        const result: ToolResult = await server.extractLinks({
          url: 'https://example.com',
          categorize: true,
        });
        expect(result.content[0].text).toContain('internal (1)'); // Only /page1 remains as internal
        expect(result.content[0].text).toContain('external (0)'); // External PDF moved to documents
        expect(result.content[0].text).toContain('documents (2)'); // Both PDFs
        expect(result.content[0].text).toContain('images (1)'); // The PNG
        expect(result.content[0].text).toContain('scripts (1)'); // The CSS
      });

      // smart_crawl: a text/plain content-type is reported as "text".
      it('should handle smart_crawl for plain text', async () => {
        const axiosClientMock = {
          head: jest.fn().mockResolvedValue({
            headers: { 'content-type': 'text/plain' },
          }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com/file.txt',
                  markdown: { raw_markdown: 'This is plain text content' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com/file.txt',
        });
        expect(result.content[0].text).toContain('Smart crawl detected content type: text');
        expect(result.content[0].text).toContain('This is plain text content');
        expect(axiosClientMock.post).toHaveBeenCalledWith(
          '/crawl',
          expect.objectContaining({
            urls: ['https://example.com/file.txt'],
            crawler_config: expect.objectContaining({
              cache_mode: 'ENABLED',
            }),
            browser_config: expect.objectContaining({
              headless: true,
              browser_type: 'chromium',
            }),
          }),
        );
      });
    });

    describe('Additional Method Tests', () => {
      it('should handle parse_sitemap', async () => {
        // Mock axios.get to return sitemap XML
        mockGet.mockResolvedValue({
          data: `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>https://example.com/page1</loc></url> <url><loc>https://example.com/page2</loc></url> <url><loc>https://example.com/page3</loc></url> </urlset>`,
        });

        const result: ToolResult = await server.parseSitemap({
          url: 'https://example.com/sitemap.xml',
        });
        expect(result.content[0].text).toContain('Sitemap parsed successfully');
        expect(result.content[0].text).toContain('Total URLs found: 3');
      });

      it('should handle parse_sitemap with filter', async () => {
        // Mock axios.get to return sitemap XML with blog URLs
        mockGet.mockResolvedValue({
          data: `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>https://example.com/page1</loc></url> <url><loc>https://example.com/blog/post1</loc></url> <url><loc>https://example.com/blog/post2</loc></url> <url><loc>https://example.com/page2</loc></url> </urlset>`,
        });

        const result: ToolResult = await server.parseSitemap({
          url: 'https://example.com/sitemap.xml',
          filter_pattern: '.*blog.*',
        });
        expect(result.content[0].text).toContain('Total URLs found: 4');
        expect(result.content[0].text).toContain('Filtered URLs: 2');
      });

      it('should handle crawl_recursive', async () => {
        mockCrawl.mockResolvedValue({
          success: true,
          results: [
            {
              url: 'https://example.com',
              markdown: { raw_markdown: 'Content' },
              links: { internal: [], external: [] },
              success: true,
              status_code: 200,
            },
          ],
        });

        const result: ToolResult = await server.crawlRecursive({
          url: 'https://example.com',
        });
        expect(result.content[0].text).toContain('Recursive crawl completed');
      });

      it('should handle parse_sitemap error', async () => {
        // The sibling parse_sitemap tests feed the sitemap through the mocked axios.get, so the
        // network failure has to be injected there for this test to exercise the error path.
        // NOTE(review): presumably server.parseSitemap calls axios.get exactly once — confirm.
        mockGet.mockRejectedValueOnce(new Error('Network error'));
        // Kept from the original test in case the service-level mock is consulted as well.
        mockParseSitemap.mockRejectedValue(new Error('Network error'));

        await expect(
          server.parseSitemap({
            url: 'https://example.com/sitemap.xml',
          }),
        ).rejects.toThrow('Failed to parse sitemap');
      });

      it('should handle crawl with error result', async () => {
        mockCrawl.mockResolvedValue({
          success: false,
          results: [],
        });

        await expect(
          server.crawl({
            url: 'https://example.com',
          }),
        ).rejects.toThrow('Invalid response from server');
      });

      it('should handle crawl with metadata and links', async () => {
        mockCrawl.mockResolvedValue({
          success: true,
          results: [
            {
              url: 'https://example.com',
              markdown: { raw_markdown: 'Content' },
              metadata: { title: 'Test Page', description: 'Test' },
              links: { internal: ['/page1'], external: ['https://external.com'] },
              js_execution_result: { results: [42, 'test'] },
              success: true,
              status_code: 200,
            },
          ],
        });

        const result: ToolResult = await server.crawl({
          url: 'https://example.com',
        });
        // Each optional section is expected as its own content entry.
        expect(result.content.length).toBeGreaterThan(1);
        expect(result.content.some((c) => c.text?.includes('Metadata'))).toBe(true);
        expect(result.content.some((c) => c.text?.includes('Links'))).toBe(true);
        expect(result.content.some((c) => c.text?.includes('JavaScript Execution Results'))).toBe(true);
      });

      it('should handle executeJS with no scripts', async () => {
        await expect(
          server.executeJS({
            url: 'https://example.com',
            scripts: null,
          }),
        ).rejects.toThrow('scripts is required');
      });

      it('should handle executeJS with array of scripts', async () => {
        mockExecuteJS.mockResolvedValue({
          content: [{ type: 'text', text: 'JS executed' }],
        });

        const result: ToolResult = await server.executeJS({
          url: 'https://example.com',
          scripts: ['return 1', 'return 2'],
        });
        expect(result.content[0].text).toContain('JavaScript executed on:');
      });

      it('should handle batchCrawl with cache bypass', async () => {
        mockPost.mockResolvedValue({
          data: {
            results: [{ success: true }, { success: false }],
          },
        });

        const result: ToolResult = await server.batchCrawl({
          urls: ['https://example.com/1', 'https://example.com/2'],
          bypass_cache: true,
          remove_images: true,
        });
        expect(result.content[0].text).toContain('Batch crawl completed');
        // bypass_cache and remove_images must be translated into the crawler_config payload.
        expect(mockPost).toHaveBeenCalledWith(
          '/crawl',
          expect.objectContaining({
            crawler_config: expect.objectContaining({
              cache_mode: 'BYPASS',
              exclude_tags: ['img', 'picture', 'svg'],
            }),
          }),
        );
      });

      it('should handle smart_crawl with follow_links', async () => {
        const axiosClientMock = {
          head: jest.fn().mockResolvedValue({
            headers: { 'content-type': 'application/xml' },
          }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com/sitemap.xml',
                  markdown: { raw_markdown: '<url><loc>https://example.com/page1</loc></url>' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com/sitemap.xml',
          follow_links: true,
        });
        expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
      });

      it('should handle smart_crawl with HEAD request failure', async () => {
        const axiosClientMock = {
          head: jest.fn().mockRejectedValue({ response: { status: 500 } }),
          post: jest.fn().mockResolvedValue({
            data: {
              results: [
                {
                  url: 'https://example.com',
                  markdown: { raw_markdown: 'Content from crawl' },
                  success: true,
                  status_code: 200,
                },
              ],
            },
          }),
        };
        server.axiosClientForTesting = axiosClientMock;

        const result: ToolResult = await server.smartCrawl({
          url: 'https://example.com',
        });

        // Should continue despite HEAD failure
        expect(result.content[0].text).toContain('Smart crawl detected content type: html');
        expect(result.content[0].text).toContain('Content from crawl');
      });

      it('should handle extractLinks with no links', async () => {
        mockPost.mockResolvedValue({
          data: {
            results: [
              {
                markdown: 'Content without links',
              },
            ],
          },
        });

        const result: ToolResult = await server.extractLinks({
          url: 'https://example.com',
          categorize: false,
        });
        expect(result.content[0].text).toContain('All links from');
      });

      it('should handle extractLinks with manually extracted links', async () => {
        mockPost.mockResolvedValue({
          data: {
            results: [
              {
                markdown: 'Check out <a href="/page1">Page 1</a>',
              },
            ],
          },
        });

        const result: ToolResult = await server.extractLinks({
          url: 'https://example.com',
        });
        expect(result.content[0].text).toContain('All links from');
      });

      it('should handle MCP request handler for all tools', async () => {
        // Request handler should be available from beforeEach
        expect(requestHandler).toBeDefined();

        // Test various tools through the request handler
        const tools = [
          { name: 'get_markdown', args: { url: 'https://example.com' } },
          { name: 'capture_screenshot', args: { url: 'https://example.com' } },
          { name: 'generate_pdf', args: { url: 'https://example.com' } },
          { name: 'execute_js', args: { url: 'https://example.com', scripts: 'return 1' } },
          { name: 'batch_crawl', args: { urls: ['https://example.com'] } },
          { name: 'smart_crawl', args: { url: 'https://example.com' } },
          { name: 'get_html', args: { url: 'https://example.com' } },
          { name: 'extract_links', args: { url: 'https://example.com' } },
          { name: 'crawl_recursive', args: { url: 'https://example.com' } },
          { name: 'parse_sitemap', args: { url: 'https://example.com/sitemap.xml' } },
          { name: 'crawl', args: { url: 'https://example.com' } },
          { name: 'manage_session', args: { action: 'create' } },
          { name: 'manage_session', args: { action: 'clear', session_id: 'test' } },
          { name: 'manage_session', args: { action: 'list' } },
          { name: 'extract_with_llm', args: { url: 'https://example.com', prompt: 'test' } },
        ];

        // Mock all service methods to return success
        mockGetMarkdown.mockResolvedValue({ content: [{ type: 'text', text: 'markdown' }] });
        mockCaptureScreenshot.mockResolvedValue({ content: [{ type: 'text', text: 'screenshot' }] });
        mockGeneratePDF.mockResolvedValue({ content: [{ type: 'text', text: 'pdf' }] });
        mockExecuteJS.mockResolvedValue({ content: [{ type: 'text', text: 'js' }] });
        mockBatchCrawl.mockResolvedValue({ content: [{ type: 'text', text: 'batch' }] });
        mockGetHTML.mockResolvedValue({ content: [{ type: 'text', text: 'html' }] });
        mockExtractWithLLM.mockResolvedValue({ content: [{ type: 'text', text: 'llm' }] });
        mockCrawl.mockResolvedValue({
          success: true,
          results: [
            {
              url: 'https://example.com',
              markdown: { raw_markdown: 'content' },
              success: true,
              status_code: 200,
            },
          ],
        });
        mockPost.mockResolvedValue({
          data: {
            results: [
              {
                links: { internal: [], external: [] },
              },
            ],
          },
        });
        mockParseSitemap.mockResolvedValue(['https://example.com/page1']);

        // Test each tool
        for (const tool of tools) {
          const result = await requestHandler({
            method: 'tools/call',
            params: {
              name: tool.name,
              arguments: tool.args,
            },
          });
          expect(result).toBeDefined();
          expect(result.content).toBeDefined();
        }

        // Test unknown tool
        const unknownResult = await requestHandler({
          method: 'tools/call',
          params: {
            name: 'unknown_tool',
            arguments: {},
          },
        });
        expect(unknownResult.content[0].text).toContain('Error: Unknown tool');

        // The handler only handles tools/call requests,
        // so we don't test other methods here
      });

      it('should handle MCP request handler validation errors', async () => {
        expect(requestHandler).toBeDefined();

        // Test validation errors for various tools
        const invalidRequests = [
          { name: 'get_markdown', args: {} }, // missing url
          { name: 'capture_screenshot', args: {} }, // missing url
          { name: 'generate_pdf', args: {} }, // missing url
          { name: 'execute_js', args: { url: 'https://example.com' } }, // missing scripts
          { name: 'batch_crawl', args: {} }, // missing urls
          { name: 'smart_crawl', args: {} }, // missing url
          { name: 'get_html', args: {} }, // missing url
          { name: 'extract_links', args: {} }, // missing url
          { name: 'crawl_recursive', args: {} }, // missing url
          { name: 'parse_sitemap', args: {} }, // missing url
          { name: 'crawl', args: {} }, // missing url
          { name: 'manage_session', args: {} }, // missing action
          { name: 'manage_session', args: { action: 'clear' } }, // missing session_id for clear
          { name: 'manage_session', args: { action: 'invalid' } }, // invalid action
          { name: 'extract_with_llm', args: { url: 'https://example.com' } }, // missing prompt
        ];

        for (const req of invalidRequests) {
          const result = await requestHandler({
            method: 'tools/call',
            params: {
              name: req.name,
              arguments: req.args,
            },
          });
          expect(result.content[0].text).toContain(`Error: Invalid parameters for ${req.name}`);
        }
      });

      it('should handle crawl with all output types', async () => {
        mockCrawl.mockResolvedValue({
          success: true,
          results: [
            {
              url: 'https://example.com',
              extracted_content: { data: 'extracted' },
              screenshot: 'base64screenshot',
              pdf: 'base64pdf',
              success: true,
              status_code: 200,
            },
          ],
        });

        const result: ToolResult = await server.crawl({
          url: 'https://example.com',
          screenshot: true,
          pdf: true,
        });

        // One entry per output kind: text, screenshot image, and PDF resource.
        expect(result.content.some((c) => c.type === 'text')).toBe(true);
        expect(result.content.some((c) => c.type === 'image')).toBe(true);
        expect(result.content.some((c) => c.type === 'resource' && c.resource?.mimeType === 'application/pdf')).toBe(
          true,
        );
      });
    });

    describe('MCP Protocol Handler Tests', () => {
      it('should handle tools/list request', async () => {
        // Find the tools/list handler among the handlers registered on the mocked Server.
        // Narrow structural assertions are used instead of `any` so the property accesses stay type-checked.
        const toolsListHandler = mockSetRequestHandler.mock.calls.find(
          (call) => (call[0] as { method?: string }).method === 'tools/list',
        )?.[1];

        expect(toolsListHandler).toBeDefined();

        const listTools = toolsListHandler as (request: {
          method: string;
          params: Record<string, unknown>;
        }) => Promise<{ tools: unknown[] }>;
        const result = await listTools({ method: 'tools/list', params: {} });
        expect(result).toBeDefined();
        expect(result.tools).toBeDefined();
        expect(result.tools.length).toBe(13); // Should have 13 tools
      });

      it('should handle get_markdown query functionality', async () => {
        mockGetMarkdown.mockResolvedValue({
          url: 'https://example.com',
          filter: 'fit',
          query: 'What products are listed?',
          cache: 'false',
          markdown: 'Page content about products',
          success: true,
        });

        const result: ToolResult = await server.getMarkdown({
          url: 'https://example.com',
          query: 'What products are listed?',
        });

        expect(result.content[0].text).toContain('Query: What products are listed?');
        expect(result.content[0].text).toContain('Page content about products');
      });
    });
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/omgwtfwow/mcp-crawl4ai-ts'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.