// crawl-handlers.integration.test.ts
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js';
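
// './test-utils.js' is not shown in this file; a minimal sketch of the shape this
// suite assumes (inferred from usage here, not the actual implementation):
//
//   export async function createTestClient(): Promise<Client>;        // spawn the MCP server, connect a Client
//   export async function cleanupTestClient(c: Client): Promise<void>; // close the transport, tear down the server
//   export const TEST_TIMEOUTS: { short: number; medium: number; long: number }; // per-test timeouts in ms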

interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
  }>;
}
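
// client.callTool() results come back loosely typed from the SDK, so each
// result is cast to the ToolResult shape above before asserting on
// content[0].text.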

describe('Crawl Handlers Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('batch_crawl error handling', () => {
    it(
      'should handle batch crawl with invalid URLs',
      async () => {
        const result = await client.callTool({
          name: 'batch_crawl',
          arguments: {
            urls: ['not-a-valid-url', 'https://this-domain-does-not-exist-12345.com'],
            max_concurrent: 2,
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].type).toBe('text');
        // Zod validation will catch the invalid URL format
        expect(content[0].text).toContain('Invalid parameters');
      },
      TEST_TIMEOUTS.medium,
    );
  });
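
  // smart_crawl is expected to sniff the target's content type (HTML vs.
  // XML/sitemap) and report it as "Smart crawl detected content type: <type>".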
  describe('smart_crawl edge cases', () => {
    it(
      'should detect XML content type for XML URLs',
      async () => {
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://httpbin.org/xml',
            bypass_cache: true,
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Smart crawl detected content type:');
        // httpbin.org/xml serves Content-Type: application/xml, but accept JSON
        // as well in case the detector reports this endpoint differently
        expect(content[0].text?.toLowerCase()).toMatch(/xml|json/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle follow_links with sitemap URLs',
      async () => {
        // Note: Most sites don't have accessible sitemaps, so this tests the logic
        const result = await client.callTool({
          name: 'smart_crawl',
          arguments: {
            url: 'https://example.com/sitemap.xml',
            follow_links: true,
            max_depth: 2,
            bypass_cache: true,
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Smart crawl detected content type:');
      },
      TEST_TIMEOUTS.long, // Increase timeout for sitemap processing
    );
  });
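
  // crawl_recursive follows internal links up to max_depth; depth 0 should
  // mean "initial page only", and a page with no internal links should end
  // the crawl after a single page.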
  describe('crawl_recursive edge cases', () => {
    it(
      'should respect max_depth limit of 0',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/links/5/0',
            max_depth: 0, // Should only crawl the initial page
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        // The result may report 0 pages if the URL fails, or 1 if it succeeds
        expect(content[0].text).toMatch(/Pages crawled: [01]/);
        // If a page was crawled, the max-depth message should be present
        if (content[0].text?.includes('Pages crawled: 1')) {
          expect(content[0].text).toContain('Max depth reached: 0');
        }
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle sites with no internal links',
      async () => {
        const result = await client.callTool({
          name: 'crawl_recursive',
          arguments: {
            url: 'https://httpbin.org/json', // JSON endpoint has no links
            max_depth: 2,
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Pages crawled: 1');
        expect(content[0].text).toContain('Internal links found: 0');
      },
      TEST_TIMEOUTS.medium,
    );
  });
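
  // Tool-level failures (e.g. DNS errors) are expected to come back as error
  // text in the result content rather than as a rejected callTool() promise.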
  describe('parse_sitemap error handling', () => {
    it(
      'should handle non-existent sitemap URLs',
      async () => {
        const result = await client.callTool({
          name: 'parse_sitemap',
          arguments: {
            url: 'https://this-domain-does-not-exist-12345.com/sitemap.xml',
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Error');
        expect(content[0].text?.toLowerCase()).toMatch(/failed|error|not found/);
      },
      TEST_TIMEOUTS.medium,
    );
  });
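
  // The crawl tool exposes many optional parameters; these tests spot-check
  // image/filtering thresholds, schema validation of bad input, and session
  // reuse via manage_session.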
  describe('crawl method edge cases', () => {
    it(
      'should handle crawl with all image and filtering parameters',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            word_count_threshold: 50,
            image_description_min_word_threshold: 10,
            image_score_threshold: 0.5,
            exclude_social_media_links: true,
            cache_mode: 'BYPASS',
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].type).toBe('text');
        // Should successfully crawl with these parameters
        expect(content[0].text).not.toContain('Error');
      },
      TEST_TIMEOUTS.medium,
    );
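
    // js_code is presumably declared as a string in the tool schema, so an
    // explicit null should fail validation rather than be silently dropped.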
    it(
      'should handle js_code as null with validation error',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            js_code: null as unknown as string, // Intentionally pass null
          },
        });
        expect(result).toBeDefined();
        const content = (result as ToolResult).content;
        expect(content[0].text).toContain('Invalid parameters for crawl');
        expect(content[0].text).toContain('js_code');
      },
      TEST_TIMEOUTS.short,
    );
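
    // Sessions let successive crawls share state; the lifecycle exercised here
    // is create -> crawl with session_id -> clear.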
    it(
      'should work with session_id parameter using manage_session',
      async () => {
        // First create a session using manage_session
        const sessionResult = await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'create',
            session_id: 'test-crawl-session-new',
          },
        });
        expect(sessionResult).toBeDefined();

        // Then use it for crawling
        const crawlResult = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://example.com',
            session_id: 'test-crawl-session-new',
          },
        });
        expect(crawlResult).toBeDefined();
        const content = (crawlResult as ToolResult).content;
        expect(content[0].type).toBe('text');
        expect(content[0].text).not.toContain('Error');

        // Clean up using manage_session
        await client.callTool({
          name: 'manage_session',
          arguments: {
            action: 'clear',
            session_id: 'test-crawl-session-new',
          },
        });
      },
      TEST_TIMEOUTS.medium,
    );
  });
});
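
// Assuming a standard Jest setup, this suite can be run on its own with
// something like: npx jest crawl-handlers.integration.test.ts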