MCP Server for Crawl4AI

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

crawl-handlers.test.ts•17.2 kB

/* eslint-env jest */ import { jest } from '@jest/globals'; import { AxiosError } from 'axios'; import type { CrawlHandlers as CrawlHandlersType } from '../../handlers/crawl-handlers.js'; import type { Crawl4AIService } from '../../crawl4ai-service.js'; // Mock the service const mockCrawl = jest.fn(); const mockService = { crawl: mockCrawl, } as unknown as Crawl4AIService; // Mock axios client const mockPost = jest.fn() as jest.Mock; const mockHead = jest.fn() as jest.Mock; const mockAxiosClient = { post: mockPost, head: mockHead, } as any; // eslint-disable-line @typescript-eslint/no-explicit-any // Mock axios for parseSitemap const mockAxiosGet = jest.fn(); jest.unstable_mockModule('axios', () => ({ default: { get: mockAxiosGet, }, AxiosError, })); // Import after setting up mocks const { CrawlHandlers: CrawlHandlersClass } = await import('../../handlers/crawl-handlers.js'); describe('CrawlHandlers', () => { let handler: CrawlHandlersType; let sessions: Map<string, any>; // eslint-disable-line @typescript-eslint/no-explicit-any beforeEach(() => { jest.clearAllMocks(); sessions = new Map(); handler = new CrawlHandlersClass(mockService, mockAxiosClient, sessions); }); describe('batchCrawl', () => { it('should handle API errors gracefully', async () => { // Mock API error response (mockPost as jest.Mock).mockRejectedValue( new AxiosError('Request failed with status code 500', 'ERR_BAD_RESPONSE', undefined, undefined, { status: 500, statusText: 'Internal Server Error', data: 'Internal Server Error', headers: {}, config: {} as any, // eslint-disable-line @typescript-eslint/no-explicit-any } as any), // eslint-disable-line @typescript-eslint/no-explicit-any ); await expect( handler.batchCrawl({ urls: ['not-a-valid-url', 'https://invalid-domain.com'], max_concurrent: 2, }), ).rejects.toThrow('Failed to batch crawl: Internal Server Error'); }); it('should support per-URL configs array', async () => { (mockPost as jest.Mock).mockResolvedValue({ data: { results: [ { url: 'https://example1.com', success: true, markdown: { raw_markdown: 'Test 1' } }, { url: 'https://example2.com', success: true, markdown: { raw_markdown: 'Test 2' } }, ], }, }); const result = await handler.batchCrawl({ urls: ['https://example1.com', 'https://example2.com'], configs: [ { url: 'https://example1.com', browser_config: { browser_type: 'chromium' }, crawler_config: { screenshot: true }, }, { url: 'https://example2.com', browser_config: { browser_type: 'undetected' }, crawler_config: { pdf: true }, extraction_strategy: { provider: 'openai' }, }, ], max_concurrent: 2, }); // Verify the configs array was passed through expect(mockPost).toHaveBeenCalledWith( '/crawl', expect.objectContaining({ configs: expect.arrayContaining([ expect.objectContaining({ url: 'https://example1.com', browser_config: { browser_type: 'chromium' }, crawler_config: { screenshot: true }, }), expect.objectContaining({ url: 'https://example2.com', browser_config: { browser_type: 'undetected' }, crawler_config: { pdf: true }, extraction_strategy: { provider: 'openai' }, }), ]), max_concurrent: 2, }), ); expect(result.content[0].text).toContain('Batch crawl completed'); }); }); describe('smartCrawl', () => { it('should detect XML content type from HEAD request', async () => { // Mock HEAD response with XML content type (mockHead as jest.Mock).mockResolvedValue({ headers: { 'content-type': 'application/xml', }, }); // Mock crawl response (mockPost as jest.Mock).mockResolvedValue({ data: { results: [ { success: true, markdown: { raw_markdown: '<xml>Test content</xml>', }, }, ], }, }); const result = await handler.smartCrawl({ url: 'https://example.com/data.xml', }); expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap'); expect(result.content[0].text).toContain('<xml>Test content</xml>'); }); it('should handle HEAD request failure gracefully', async () => { // Mock HEAD request failure (mockHead as jest.Mock).mockRejectedValue(new Error('HEAD request failed')); // Mock successful crawl (mockPost as jest.Mock).mockResolvedValue({ data: { results: [ { success: true, markdown: { raw_markdown: 'Test content', }, }, ], }, }); const result = await handler.smartCrawl({ url: 'https://example.com', }); expect(result.content[0].text).toContain('Smart crawl detected content type: html'); }); it('should follow links from sitemap when follow_links is true', async () => { // Mock successful HEAD request (mockHead as jest.Mock).mockResolvedValue({ headers: { 'content-type': 'application/xml', }, }); // Mock initial crawl with sitemap content (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [ { success: true, markdown: `<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>https://example.com/page1</loc> </url> <url> <loc>https://example.com/page2</loc> </url> </urlset>`, }, ], }, }); // Mock follow-up crawl (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [{ success: true }, { success: true }], }, }); const result = await handler.smartCrawl({ url: 'https://example.com/sitemap.xml', follow_links: true, max_depth: 2, }); expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap'); expect(result.content[0].text).toContain('Followed 2 links:'); expect(result.content[0].text).toContain('https://example.com/page1'); expect(result.content[0].text).toContain('https://example.com/page2'); }); it('should handle smartCrawl API errors', async () => { (mockHead as jest.Mock).mockResolvedValue({ headers: {} }); // Mock crawl to get empty results first, then error on follow-up (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [], }, }); const result = await handler.smartCrawl({ url: 'https://example.com', }); // With empty results, it should still return a response expect(result.content[0].text).toContain('Smart crawl detected content type: html'); expect(result.content[0].text).toContain('No content extracted'); }); }); describe('crawlRecursive', () => { it('should handle max_depth limit correctly', async () => { // Mock successful crawl with links (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [ { success: true, markdown: { raw_markdown: 'Test content', }, links: { internal: [{ href: 'https://example.com/page1' }, { href: 'https://example.com/page2' }], external: [], }, }, ], }, }); // Mock second crawl for page1 (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [ { success: true, markdown: { raw_markdown: 'Page 1 content', }, links: { internal: [], external: [], }, }, ], }, }); // Mock third crawl for page2 (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [ { success: true, markdown: { raw_markdown: 'Page 2 content', }, links: { internal: [], external: [], }, }, ], }, }); const result = await handler.crawlRecursive({ url: 'https://example.com', max_depth: 1, // Should crawl initial URL and one level deep }); expect(result.content[0].text).toContain('Pages crawled: 3'); // Initial + 2 pages at depth 1 expect(result.content[0].text).toContain('Max depth reached: 1'); expect(mockPost).toHaveBeenCalledTimes(3); // Initial crawl + two more }); it('should handle invalid URLs in discovered links', async () => { // Mock crawl with invalid link (mockPost as jest.Mock).mockResolvedValue({ data: { results: [ { success: true, markdown: { raw_markdown: 'Test content', }, links: { internal: [ { href: 'javascript:void(0)' }, // Invalid URL { href: 'https://example.com/valid' }, // Valid URL ], external: [], }, }, ], }, }); const result = await handler.crawlRecursive({ url: 'https://example.com', max_depth: 1, }); // Should continue despite invalid URL expect(result.content[0].text).toContain('Pages crawled:'); }); it('should handle crawl failures during recursion', async () => { // First crawl succeeds (mockPost as jest.Mock).mockResolvedValueOnce({ data: { results: [ { success: true, markdown: { raw_markdown: 'Test content', }, links: { internal: [{ href: 'https://example.com/page1' }], external: [], }, }, ], }, }); // Second crawl fails (mockPost as jest.Mock).mockRejectedValueOnce(new Error('Crawl failed')); const result = await handler.crawlRecursive({ url: 'https://example.com', max_depth: 1, }); // Should continue despite failure expect(result.content[0].text).toContain('Pages crawled: 1'); }); it('should handle crawlRecursive API errors', async () => { (mockPost as jest.Mock).mockRejectedValue(new Error('API Error')); const result = await handler.crawlRecursive({ url: 'https://example.com', }); // When the initial crawl fails, it should return a result with no pages crawled expect(result.content[0].text).toContain('Pages crawled: 0'); expect(result.content[0].text).toContain('No pages could be crawled'); }); }); describe('parseSitemap', () => { it('should handle network errors gracefully', async () => { // Mock ENOTFOUND error const error = new Error('getaddrinfo ENOTFOUND not-a-real-domain-12345.com'); (error as { code?: string }).code = 'ENOTFOUND'; mockAxiosGet.mockRejectedValue(error); await expect( handler.parseSitemap({ url: 'https://not-a-real-domain-12345.com/sitemap.xml', }), ).rejects.toThrow('Failed to parse sitemap: getaddrinfo ENOTFOUND not-a-real-domain-12345.com'); }); }); describe('crawl', () => { it('should handle word_count_threshold parameter', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, markdown: { raw_markdown: 'Test content', }, }, ], }); const result = await handler.crawl({ url: 'https://example.com', word_count_threshold: 100, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ word_count_threshold: 100, }), }), ); expect(result.content[0].text).toBe('Test content'); }); it('should update session last_used time when using session_id', async () => { const sessionId = 'test-session'; const session = { id: sessionId, created_at: new Date(), last_used: new Date('2025-08-01'), }; sessions.set(sessionId, session); (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, markdown: { raw_markdown: 'Test content', }, }, ], }); await handler.crawl({ url: 'https://example.com', session_id: sessionId, }); const updatedSession = sessions.get(sessionId) as { last_used: Date }; expect(updatedSession.last_used.getTime()).toBeGreaterThan(new Date('2025-08-01').getTime()); }); it('should handle image description parameters', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, markdown: { raw_markdown: 'Test content', }, }, ], }); await handler.crawl({ url: 'https://example.com', image_description_min_word_threshold: 10, image_score_threshold: 0.5, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ image_description_min_word_threshold: 10, image_score_threshold: 0.5, }), }), ); }); it('should handle exclude_social_media_links parameter', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, markdown: { raw_markdown: 'Test content', }, }, ], }); await handler.crawl({ url: 'https://example.com', exclude_social_media_links: true, }); expect(mockCrawl).toHaveBeenCalledWith( expect.objectContaining({ crawler_config: expect.objectContaining({ exclude_social_media_links: true, }), }), ); }); it('should use extracted_content when available as string', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, extracted_content: 'Extracted text content', }, ], }); const result = await handler.crawl({ url: 'https://example.com', }); expect(result.content[0].text).toBe('Extracted text content'); }); it('should handle extracted_content as object', async () => { const extractedObj = { title: 'Test', body: 'Content' }; (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, extracted_content: extractedObj, }, ], }); const result = await handler.crawl({ url: 'https://example.com', }); expect(result.content[0].text).toBe(JSON.stringify(extractedObj, null, 2)); }); it('should fallback to html when markdown is not available', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, html: '<html><body>HTML content</body></html>', }, ], }); const result = await handler.crawl({ url: 'https://example.com', }); expect(result.content[0].text).toBe('<html><body>HTML content</body></html>'); }); it('should fallback to fit_html when neither markdown nor html is available', async () => { (mockCrawl as jest.Mock).mockResolvedValue({ results: [ { success: true, fit_html: '<div>Fit HTML content</div>', }, ], }); const result = await handler.crawl({ url: 'https://example.com', }); expect(result.content[0].text).toBe('<div>Fit HTML content</div>'); }); it('should handle js_code as null error', async () => { await expect( handler.crawl({ url: 'https://example.com', js_code: null, }), ).rejects.toThrow( 'Failed to crawl: js_code parameter is null. Please provide JavaScript code as a string or array of strings.', ); }); }); });

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/omgwtfwow/mcp-crawl4ai-ts'

If you have feedback or need assistance with the MCP directory API, please join our Discord server