MCP Server for Crawl4AI

MIT License

Overview · Inspect (New) · Endpoints · Schema · Related Servers · Reviews · Score

crawl.integration.test.ts•46.6 kB

/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import {
  createTestClient,
  cleanupTestClient,
  generateSessionId,
  expectSuccessfulCrawl,
  expectScreenshot,
  delay,
  TEST_TIMEOUTS,
} from './test-utils.js';

/** A single entry of a tool response as inspected by these tests. */
interface ToolContentItem {
  type: string;
  text?: string;
  data?: string;
  mimeType?: string;
}

/** Shape the tests assume for the value resolved by `client.callTool`. */
interface ToolResult {
  content: ToolContentItem[];
}

/**
 * Integration tests for the `crawl` tool (plus one `extract_with_llm` case).
 *
 * Every test issues a real tool call through the shared test client against
 * public URLs, so the suite needs network access and a reachable server.
 */
describe('crawl Integration Tests', () => {
  let client: Client;

  /** Calls the `crawl` tool with the given arguments (passed through untouched). */
  const crawl = (args: Record<string, unknown>) => client.callTool({ name: 'crawl', arguments: args });

  /** Returns the content array of a tool response. */
  const contentOf = (result: unknown): ToolContentItem[] => (result as ToolResult).content;

  /** Returns the first `text` content item of a tool response, if any. */
  const textOf = (result: unknown): ToolContentItem | undefined => contentOf(result).find((c) => c.type === 'text');

  /** Returns the first `image` content item of a tool response, if any. */
  const imageOf = (result: unknown): ToolContentItem | undefined =>
    contentOf(result).find((c) => c.type === 'image');

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic Crawling', () => {
    it(
      'should crawl a simple page with basic configuration',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/html',
          cache_mode: 'BYPASS',
          word_count_threshold: 50,
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URL gracefully',
      async () => {
        const result = await crawl({ url: 'not-a-valid-url', cache_mode: 'BYPASS' });

        const content = contentOf(result);
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Our Zod validation catches this before it reaches the API
        expect(content[0].text).toContain('Invalid parameters for crawl');
        expect(content[0].text).toContain('Invalid url');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domain gracefully',
      async () => {
        const result = await crawl({
          url: 'https://this-domain-definitely-does-not-exist-12345.com',
          cache_mode: 'BYPASS',
        });

        const content = contentOf(result);
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Could be DNS error, connection error, or "Internal Server Error"
        expect(content[0].text).toMatch(/Failed to crawl|Internal Server Error|DNS|connection/i);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle browser configuration',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/user-agent',
          viewport_width: 1920,
          viewport_height: 1080,
          user_agent: 'MCP Integration Test Bot',
          cache_mode: 'DISABLED',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Dynamic Content Tests', () => {
    it(
      'should execute JavaScript on page',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/html',
          js_code: ['return document.querySelectorAll("a").length', 'return document.title'],
          wait_after_js: 1000,
          cache_mode: 'BYPASS',
          word_count_threshold: 10,
        });

        await expectSuccessfulCrawl(result);
        const text = textOf(result)?.text;
        expect(text).toBeTruthy();
        // The crawled page is expected to carry the Moby Dick excerpt
        expect(text?.toLowerCase()).toMatch(/herman|melville|moby/);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should wait for specific elements',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/delay/2',
          wait_for: 'body',
          wait_for_timeout: 5000,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle virtual scrolling for infinite feeds',
      async () => {
        const result = await crawl({
          url: 'https://github.com/trending',
          virtual_scroll_config: {
            container_selector: '.Box-row',
            scroll_count: 3,
            scroll_by: 'container_height',
            wait_after_scroll: 1.0,
          },
          cache_mode: 'BYPASS',
          wait_for: '.Box-row',
          word_count_threshold: 50,
        });

        await expectSuccessfulCrawl(result);
        const text = textOf(result)?.text;
        // Should have captured multiple trending repos after scrolling
        expect(text).toBeTruthy();
        expect(text?.length).toBeGreaterThan(1000);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Session Management Tests', () => {
    it(
      'should create and use a session',
      async () => {
        const sessionId = generateSessionId();

        // First crawl with session
        const first = await crawl({ url: 'https://github.com', session_id: sessionId, cache_mode: 'BYPASS' });
        await expectSuccessfulCrawl(first);

        // Second crawl reusing session
        const second = await crawl({
          url: 'https://github.com/features',
          session_id: sessionId,
          cache_mode: 'BYPASS',
        });
        await expectSuccessfulCrawl(second);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle cookies in session',
      async () => {
        const sessionId = generateSessionId();

        const result = await crawl({
          url: 'https://github.com',
          session_id: sessionId,
          cookies: [{ name: 'test_cookie', value: 'test_value', domain: '.github.com', path: '/' }],
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Content Extraction Tests', () => {
    it.skip(
      'should extract content using CSS selectors - SKIPPED: Not supported via REST API',
      async () => {
        // CSS extraction is not supported via the REST API due to Python class serialization limitations
        // This test is kept for documentation purposes but skipped
        const result = await crawl({
          url: 'https://www.google.com',
          extraction_type: 'css',
          css_selectors: {
            title: 'title',
            search_button: 'input[type="submit"]',
            logo: 'img[alt*="Google"]',
          },
          cache_mode: 'BYPASS',
          word_count_threshold: 10,
        });

        await expectSuccessfulCrawl(result);
        expect(textOf(result)?.text).toBeTruthy();
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should extract content using LLM via extract_with_llm tool',
      async () => {
        // Note: This test requires the Crawl4AI server to have an LLM provider configured
        let result: unknown;
        try {
          // Only the call itself is guarded, so an assertion failure further
          // down can never be mistaken for the "LLM not configured" skip path.
          result = await client.callTool({
            name: 'extract_with_llm',
            arguments: {
              url: 'https://httpbin.org/html',
              query: 'Extract the main page title and any author names mentioned',
            },
          });
        } catch (error) {
          // If the server doesn't have LLM configured, it will return an error
          if (error instanceof Error && error.message.includes('No LLM provider configured')) {
            console.log('⚠️ LLM extraction test skipped: Server needs LLM provider configured');
            return;
          }
          throw error;
        }

        expect(result).toBeTruthy();
        const text = textOf(result)?.text;
        expect(text).toBeTruthy();

        // Parse separately from asserting: wrapping the expectations in the same
        // try/catch would swallow their failures along with the parse errors.
        let parsed: unknown;
        try {
          parsed = JSON.parse(text ?? '');
        } catch {
          parsed = undefined;
        }

        if (typeof parsed === 'object' && parsed !== null) {
          // The response should be JSON with an "answer" field
          expect(parsed).toHaveProperty('answer');
          const { answer } = parsed as { answer?: unknown };
          expect(typeof answer).toBe('string');
          expect((answer as string).length).toBeGreaterThan(0);
        } else {
          // Not a JSON object: at least check we got text
          expect(text?.length ?? 0).toBeGreaterThan(0);
        }
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Media Handling Tests', () => {
    it(
      'should capture screenshots',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/html',
          screenshot: true,
          screenshot_wait_for: 1.0,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
        await expectScreenshot(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should generate PDF',
      async () => {
        const result = await crawl({ url: 'https://httpbin.org/html', pdf: true, cache_mode: 'BYPASS' });

        await expectSuccessfulCrawl(result);
        // PDF generation should return some content
        const text = textOf(result)?.text;
        expect(text).toBeTruthy();
        // Should contain some content from the page
        expect(text?.toLowerCase()).toContain('herman');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle image filtering',
      async () => {
        const result = await crawl({
          url: 'https://github.com',
          exclude_external_images: true,
          image_description_min_word_threshold: 20,
          image_score_threshold: 5,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Performance & Caching Tests', () => {
    it(
      'should respect cache modes',
      async () => {
        const url = 'https://httpbin.org/html'; // Use a simpler page for cache testing
        const withCacheMode = (mode: string) => ({ url, cache_mode: mode, word_count_threshold: 10 });

        // First request - populate cache with ENABLED mode
        const populated = await crawl(withCacheMode('ENABLED'));
        await expectSuccessfulCrawl(populated);
        const populatedText = textOf(populated)?.text;

        // Wait a bit to ensure cache is saved
        await delay(500);

        // Second request - should use cache (ENABLED mode)
        const cachedStart = Date.now();
        const cached = await crawl(withCacheMode('ENABLED'));
        const cacheTime = Date.now() - cachedStart;
        await expectSuccessfulCrawl(cached);

        // Content should be identical if cache was used
        expect(textOf(cached)?.text).toBe(populatedText);

        // Third request - bypass cache
        const bypassStart = Date.now();
        const bypassed = await crawl(withCacheMode('BYPASS'));
        const bypassTime = Date.now() - bypassStart;
        await expectSuccessfulCrawl(bypassed);

        // Cache hit should typically be faster, but we'll make this test more lenient
        // Just verify all requests succeeded
        expect(cacheTime).toBeGreaterThan(0);
        expect(bypassTime).toBeGreaterThan(0);

        // Fourth request - DISABLED mode should not use cache
        const uncached = await crawl(withCacheMode('DISABLED'));
        await expectSuccessfulCrawl(uncached);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle timeout configuration',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/delay/1',
          timeout: 20000,
          page_timeout: 15000,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Content Filtering Tests', () => {
    it(
      'should filter content by tags',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/html', // Simpler page for testing
          excluded_tags: ['script', 'style', 'nav', 'footer', 'header'],
          word_count_threshold: 10,
          cache_mode: 'BYPASS',
          only_text: true, // Force text-only output
          remove_overlay_elements: true,
        });

        await expectSuccessfulCrawl(result);
        const text = textOf(result)?.text;
        expect(text).toBeTruthy();
        // Just verify we got content back - the server's filtering behavior may vary
        // With all the filters applied, content might be minimal
        expect(text?.length).toBeGreaterThan(10);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should filter content by selectors',
      async () => {
        const result = await crawl({
          url: 'https://github.com',
          excluded_selector: '.footer, .header-nav, [aria-label="Advertisement"]',
          remove_overlay_elements: true,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle link filtering',
      async () => {
        const result = await crawl({
          url: 'https://github.com',
          exclude_external_links: true,
          exclude_social_media_links: true,
          exclude_domains: ['twitter.com', 'facebook.com', 'linkedin.com'],
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
        // Should not contain social media domains
        expect(textOf(result)?.text).not.toMatch(/twitter\.com|facebook\.com/);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Bot Detection Avoidance Tests', () => {
    it(
      'should simulate user behavior',
      async () => {
        const result = await crawl({
          url: 'https://github.com',
          simulate_user: true,
          override_navigator: true,
          magic: true,
          delay_before_scroll: 1000,
          scroll_delay: 500,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should use custom headers and user agent',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/headers',
          user_agent: 'Mozilla/5.0 (compatible; MCP Test Bot)',
          headers: {
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'X-Custom-Header': 'MCP-Test',
          },
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
        const text = textOf(result)?.text;
        // httpbin returns headers in response
        expect(text).toContain('MCP Test Bot');
        expect(text).toContain('X-Custom-Header');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Error Handling Tests', () => {
    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await crawl({ url: 'not-a-valid-url', cache_mode: 'BYPASS' });

        expect(textOf(result)?.text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await crawl({
          url: 'https://this-domain-definitely-does-not-exist-123456789.com',
          cache_mode: 'BYPASS',
        });

        expect(textOf(result)?.text?.toLowerCase()).toMatch(/error|failed/);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle JavaScript errors gracefully',
      async () => {
        const result = await crawl({
          url: 'https://httpbin.org/html',
          js_code: 'throw new Error("Test error")',
          cache_mode: 'BYPASS',
        });

        // Should still return content even if JS fails
        expect(textOf(result)).toBeDefined();
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Advanced Configurations', () => {
    it(
      'should handle complex multi-feature crawl',
      async () => {
        const sessionId = generateSessionId();

        const result = await crawl({
          url: 'https://httpbin.org/html',
          // Browser config
          viewport_width: 1920,
          viewport_height: 1080,
          user_agent: 'MCP Advanced Test Bot',
          // Session
          session_id: sessionId,
          // JavaScript
          js_code: 'return document.querySelectorAll("h1").length',
          wait_after_js: 1000,
          // Content filtering
          excluded_tags: ['script', 'style'],
          word_count_threshold: 50,
          remove_overlay_elements: true,
          // Media
          screenshot: true,
          screenshot_wait_for: 1.0,
          // Performance
          cache_mode: 'DISABLED',
          timeout: 60000,
          // Bot detection
          simulate_user: true,
          override_navigator: true,
        });

        await expectSuccessfulCrawl(result);
        // Screenshot might not always be returned in complex multi-feature crawls
        // especially with httpbin.org which is a simple HTML page
        expect(textOf(result)).toBeDefined();
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle proxy configuration',
      async () => {
        // Test that proxy configuration is accepted, even without a real proxy
        const result = await crawl({
          url: 'https://httpbin.org/ip',
          proxy_server: 'http://example-proxy.com:8080',
          proxy_username: 'testuser',
          proxy_password: 'testpass',
          cache_mode: 'BYPASS',
          word_count_threshold: 10,
        });

        // The request should complete (even if proxy doesn't exist, the config should be accepted)
        expect(result).toBeDefined();
        expect(textOf(result)).toBeDefined();
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should process iframes',
      async () => {
        const result = await crawl({
          url: 'https://www.w3schools.com/html/html_iframe.asp',
          process_iframes: true,
          cache_mode: 'BYPASS',
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Browser Configuration Tests', () => {
    describe('Cookie handling', () => {
      it(
        'should set and send cookies correctly',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/cookies',
            cookies: [{ name: 'test_cookie', value: 'test_value', domain: '.httpbin.org', path: '/' }],
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // httpbin returns cookies as JSON in the response
          expect(text).toContain('test_cookie');
          expect(text).toContain('test_value');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle multiple cookies',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/cookies',
            cookies: [
              { name: 'session_id', value: 'abc123', domain: '.httpbin.org', path: '/' },
              { name: 'user_pref', value: 'dark_mode', domain: '.httpbin.org', path: '/' },
            ],
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // Verify both cookies are present
          expect(text).toContain('session_id');
          expect(text).toContain('abc123');
          expect(text).toContain('user_pref');
          expect(text).toContain('dark_mode');
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Custom headers', () => {
      it(
        'should send custom headers',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/headers',
            headers: {
              'X-Custom-Header': 'test-value',
              'X-Request-ID': '12345',
              'Accept-Language': 'en-US,en;q=0.9',
            },
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // httpbin returns headers in the response
          // Note: Some headers may be filtered by the browser
          // Just verify our custom header got through
          expect(text).toContain('X-Custom-Header');
          expect(text).toContain('test-value');
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('User-Agent configuration', () => {
      it(
        'should set custom user agent',
        async () => {
          const customUserAgent = 'MCP-Crawl4AI-Test/1.0 (Integration Tests)';

          const result = await crawl({
            url: 'https://httpbin.org/user-agent',
            user_agent: customUserAgent,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // httpbin returns the user-agent in the response
          expect(text).toContain(customUserAgent);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Viewport sizes and screenshots', () => {
      // One screenshot test per viewport; `minLength` is the lower bound on the
      // base64 payload length that the suite treats as a plausible screenshot.
      const viewports = [
        { label: 'mobile', width: 375, height: 667, minLength: 10000 }, // At least 10KB
        { label: 'tablet', width: 768, height: 1024, minLength: 15000 }, // At least 15KB
        { label: 'HD', width: 1280, height: 720, minLength: 20000 }, // At least 20KB
      ] as const;

      for (const { label, width, height, minLength } of viewports) {
        it(
          `should capture screenshot at ${label} size (${width}x${height})`,
          async () => {
            const result = await crawl({
              url: 'https://httpbin.org/html',
              viewport_width: width,
              viewport_height: height,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            });

            await expectSuccessfulCrawl(result);
            await expectScreenshot(result);

            // Check screenshot was captured
            const image = imageOf(result);
            expect(image).toBeDefined();
            expect(image?.data).toBeTruthy();

            // Verify reasonable data size for this viewport
            const dataLength = image?.data?.length ?? 0;
            expect(dataLength).toBeGreaterThan(minLength);
            expect(dataLength).toBeLessThan(3000000); // Less than 3MB (base64 encoded)
          },
          TEST_TIMEOUTS.medium,
        );
      }

      it(
        'should fail gracefully for very large viewport (1920x1080)',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            viewport_width: 1920,
            viewport_height: 1080,
            screenshot: true,
            screenshot_wait_for: 1,
            cache_mode: 'BYPASS',
          });

          // This should either timeout or return an error based on testing
          // We expect either an error or no screenshot data
          const text = textOf(result);
          const image = imageOf(result);

          // "Graceful" means the call resolved with something usable: either a
          // text item (content or an error message) or an image item.
          expect(text ?? image).toBeDefined();

          // If a screenshot did come back it must actually carry data.
          if (image) {
            expect(image.data).toBeTruthy();
          }
        },
        TEST_TIMEOUTS.long,
      );
    });

    describe('Combined browser configurations', () => {
      it(
        'should handle cookies, headers, and custom viewport together',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/anything',
            viewport_width: 768,
            viewport_height: 1024,
            user_agent: 'MCP-Test-Bot/2.0',
            cookies: [{ name: 'auth_token', value: 'secret123', domain: '.httpbin.org', path: '/' }],
            headers: {
              'X-Test-Header': 'combined-test',
            },
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();

          // httpbin/anything endpoint returns all request data
          // Verify all configurations were applied
          expect(text).toContain('MCP-Test-Bot/2.0');
          expect(text).toContain('auth_token');
          expect(text).toContain('X-Test-Header');
          expect(text).toContain('combined-test');
        },
        TEST_TIMEOUTS.medium,
      );
    });
  });

  describe('Crawler Configuration Advanced Tests', () => {
    describe('Content filtering parameters', () => {
      it(
        'should remove forms when remove_forms is true',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/forms/post',
            remove_forms: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // Forms should be removed, so no form-related text should appear
          expect(text).not.toContain('<form');
          expect(text).not.toContain('type="submit"');
          expect(text).not.toContain('input type=');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should keep forms when remove_forms is false',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/forms/post',
            remove_forms: false,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // httpbin forms page should have form elements, so expect a sizeable body
          expect((text ?? '').length).toBeGreaterThan(100);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should preserve data attributes when keep_data_attributes is true',
        async () => {
          const result = await crawl({
            url: 'https://getbootstrap.com/docs/4.0/components/alerts/',
            keep_data_attributes: true,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // Should contain alert content
          expect(text).toContain('alert');
        },
        TEST_TIMEOUTS.medium,
      );
    });

    describe('JavaScript execution parameters', () => {
      it(
        'should return only JS results when js_only is true',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            js_code: ['return document.title', 'return document.querySelectorAll("p").length'],
            js_only: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();

          // Should contain JS execution results but not the full HTML content
          // The result should be much shorter than full page content
          expect(text?.length).toBeLessThan(1000);
          // Should not contain the full Moby Dick text from the page
          expect(text).not.toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle js_only with session_id',
        async () => {
          const sessionId = generateSessionId();

          const result = await crawl({
            url: 'https://httpbin.org/html',
            session_id: sessionId,
            js_code: 'return window.location.href',
            js_only: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Page visibility parameters', () => {
      it(
        'should extract content when body is hidden and ignore_body_visibility is true',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
            ignore_body_visibility: true,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // Should still extract content despite hidden body
          expect(text).toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect body visibility when ignore_body_visibility is false',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
            ignore_body_visibility: false,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          });

          await expectSuccessfulCrawl(result);
          // Content extraction behavior may vary when body is hidden
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Debug and logging parameters', () => {
      it(
        'should capture console logs when log_console is true',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            js_code: [
              'console.log("Test log message 1")',
              'console.warn("Test warning")',
              'console.error("Test error")',
              'return "logs executed"',
            ],
            log_console: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          // Note: Console logs may be captured in a separate field or in verbose output
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should provide verbose output when verbose is true',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            verbose: true,
            cache_mode: 'BYPASS',
            word_count_threshold: 50,
          });

          await expectSuccessfulCrawl(result);
          // Verbose output may include additional debugging information
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Media filtering parameters', () => {
      it(
        'should exclude external images when exclude_external_images is true',
        async () => {
          // First, let's create a page with external images via JS
          const result = await crawl({
            url: 'https://httpbin.org/html',
            js_code: ` const img1 = document.createElement('img'); img1.src = 'https://httpbin.org/image/png'; img1.alt = 'External PNG'; document.body.appendChild(img1); const img2 = document.createElement('img'); img2.src = '/local-image.png'; img2.alt = 'Local image'; document.body.appendChild(img2); return document.images.length; `,
            exclude_external_images: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          expect(textOf(result)?.text).toBeTruthy();
          // The external image references should be filtered out
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should include external images when exclude_external_images is false',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            exclude_external_images: false,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Combined crawler configuration tests', () => {
      it(
        'should handle multiple filtering options together',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/forms/post',
            remove_forms: true,
            exclude_external_links: true,
            exclude_external_images: true,
            only_text: true,
            word_count_threshold: 10,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // Should have filtered content
          expect(text).not.toContain('<form');
          expect(text).not.toContain('type="submit"');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle debug options with content extraction',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/html',
            verbose: true,
            log_console: true,
            js_code: 'console.log("Debug test"); return document.title',
            keep_data_attributes: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('New crawler parameters (0.7.3/0.7.4)', () => {
      it(
        'should accept undetected browser type',
        async () => {
          const result = await crawl({
            url: 'https://httpbin.org/user-agent',
            browser_type: 'undetected',
          });

          await expectSuccessfulCrawl(result);
          // The undetected browser should mask automation indicators
          // but we can at least verify the request was accepted
          expect(textOf(result)?.text).toBeTruthy();
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should filter content using css_selector',
        async () => {
          const result = await crawl({
            url: 'https://example.com',
            css_selector: 'h1',
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);
          const text = textOf(result)?.text;
          expect(text).toBeTruthy();
          // css_selector returns ONLY the selected element content
          expect(text?.toLowerCase()).toContain('example domain');
          // Should NOT contain the paragraph text that's outside the h1
          expect(text).not.toContain('use in illustrative examples');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should include links when include_links is true',
        async () => {
          const result = await crawl({
            url: 'https://example.com',
            include_links: true,
          });

          await expectSuccessfulCrawl(result);
          // Check if links section is included
          const hasLinksInfo = contentOf(result).some(
            (item) => item.type === 'text' && item.text?.includes('Links:'),
          );
          expect(hasLinksInfo).toBe(true);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect delay_before_return_html parameter',
        async () => {
          const startTime = Date.now();

          const result = await crawl({
            url: 'https://httpbin.org/delay/1', // 1 second delay from server
            delay_before_return_html: 2, // Additional 2 second delay (in seconds, not ms)
            cache_mode: 'BYPASS',
          });

          const elapsed = Date.now() - startTime;

          await expectSuccessfulCrawl(result);
          // Total time should be at least 3 seconds (1s from endpoint + 2s delay)
          expect(elapsed).toBeGreaterThanOrEqual(3000);
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should convert relative URLs when resolve_absolute_urls is true',
        async () => {
          const result = await crawl({
            url: 'https://example.com',
            resolve_absolute_urls: true,
            include_links: true,
            cache_mode: 'BYPASS',
          });

          await expectSuccessfulCrawl(result);

          // Links should be in a separate content item
          const linksContent = contentOf(result).find((c) => c.type === 'text' && c.text?.includes('Links:'));

          // With include_links=true, links info should be present
          expect(linksContent).toBeDefined();
          expect(linksContent?.text).toContain('External: 1');
        },
        TEST_TIMEOUTS.short,
      );
    });
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/omgwtfwow/mcp-crawl4ai-ts'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.