MCP Server for Crawl4AI

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

index.test.ts•14.2 kB

import { jest } from '@jest/globals';
import { z } from 'zod';

// Mock the MCP SDK so importing server modules never starts a real server or stdio transport.
jest.mock('@modelcontextprotocol/sdk/server/index.js');
jest.mock('@modelcontextprotocol/sdk/server/stdio.js');

describe('MCP Server Validation', () => {
  /**
   * Runs `schema.safeParse(input)` and returns the resulting ZodError.
   *
   * Unlike a bare `try { parse } catch { expect(...) }`, this fails the test
   * when validation unexpectedly succeeds, so assertions can never be skipped.
   *
   * @param schema - Schema under test.
   * @param input - Value expected to be rejected by the schema.
   * @returns The ZodError produced by the failed validation.
   * @throws Error if the schema accepts the input.
   */
  const expectValidationFailure = (schema: z.ZodTypeAny, input: unknown): z.ZodError => {
    const outcome = schema.safeParse(input);
    if (outcome.success) {
      throw new Error('Expected validation to fail, but it succeeded');
    }
    return outcome.error;
  };

  describe('Stateless tool validation', () => {
    /**
     * Local copy of the createStatelessSchema helper under test.
     *
     * Wraps `schema` so that any input carrying `session_id` is rejected with a
     * message naming the tool and, where available, tool-specific guidance.
     * Valid input is returned with `session_id` stripped.
     *
     * @param schema - Schema describing the tool's own parameters.
     * @param toolName - Tool name used in the error message and guidance lookup.
     * @returns A schema that rejects `session_id` and otherwise validates `schema`.
     */
    const createStatelessSchema = <T extends z.ZodTypeAny>(schema: T, toolName: string) => {
      // Tool-specific guidance for common scenarios
      const toolGuidance: Record<string, string> = {
        capture_screenshot: 'To capture screenshots with sessions, use crawl(session_id, screenshot: true)',
        generate_pdf: 'To generate PDFs with sessions, use crawl(session_id, pdf: true)',
        execute_js: 'To run JavaScript with sessions, use crawl(session_id, js_code: [...])',
        get_html: 'To get HTML with sessions, use crawl(session_id)',
        extract_with_llm:
          'To extract data with sessions, first use crawl(session_id) then extract from the response',
      };

      const guidance = toolGuidance[toolName] ?? 'For persistent operations, use crawl with session_id.';
      const message = `${toolName} does not support session_id. This tool is stateless - each call creates a new browser. ${guidance}`;

      return z
        .object({
          // z.never() makes any provided session_id a validation error carrying `message`.
          session_id: z.never({ message }).optional(),
        })
        .passthrough()
        .and(schema)
        .transform((data) => {
          const { session_id, ...rest } = data as Record<string, unknown> & { session_id?: unknown };
          // Defensive second check in case a session_id value slipped past the schema.
          if (session_id !== undefined) {
            throw new Error(message);
          }
          return rest;
        });
    };

    it('should reject session_id for stateless tools', () => {
      const ExecuteJsSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
          js_code: z.union([z.string(), z.array(z.string())]),
        }),
        'execute_js',
      );

      // Should reject with session_id
      expect(() => {
        ExecuteJsSchema.parse({
          url: 'https://example.com',
          js_code: 'return document.title',
          session_id: 'test-session',
        });
      }).toThrow('execute_js does not support session_id');
    });

    it('should accept valid parameters without session_id', () => {
      const ExecuteJsSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
          js_code: z.union([z.string(), z.array(z.string())]),
        }),
        'execute_js',
      );

      const result = ExecuteJsSchema.parse({
        url: 'https://example.com',
        js_code: 'return document.title',
      });

      expect(result).toEqual({
        url: 'https://example.com',
        js_code: 'return document.title',
      });
    });

    it('should provide helpful error message when session_id is used', () => {
      const GetMarkdownSchema = createStatelessSchema(
        z.object({
          url: z.string().url(),
        }),
        'get_markdown',
      );

      const zodError = expectValidationFailure(GetMarkdownSchema, {
        url: 'https://example.com',
        session_id: 'my-session',
      });

      expect(zodError).toBeInstanceOf(z.ZodError);
      const firstMessage = zodError.issues[0]?.message;
      expect(firstMessage).toContain('get_markdown does not support session_id');
      expect(firstMessage).toContain('For persistent operations, use crawl');
    });

    it('should provide tool-specific guidance for common tools', () => {
      // Each entry pairs a tool name with the guidance its rejection message must contain.
      const guidanceCases: ReadonlyArray<readonly [string, string]> = [
        ['capture_screenshot', 'use crawl(session_id, screenshot: true)'],
        ['generate_pdf', 'use crawl(session_id, pdf: true)'],
        ['execute_js', 'use crawl(session_id, js_code: [...])'],
      ];

      for (const [toolName, expectedGuidance] of guidanceCases) {
        const schema = createStatelessSchema(z.object({ url: z.string().url() }), toolName);
        const zodError = expectValidationFailure(schema, {
          url: 'https://example.com',
          session_id: 'test',
        });
        expect(zodError.issues[0]?.message).toContain(expectedGuidance);
      }
    });

    it('should validate all stateless tools', () => {
      const statelessTools = [
        'get_markdown',
        'capture_screenshot',
        'generate_pdf',
        'execute_js',
        'batch_crawl',
        'smart_crawl',
        'get_html',
        'extract_links',
        'crawl_recursive',
        'parse_sitemap',
        'extract_with_llm',
      ];

      for (const toolName of statelessTools) {
        const schema = createStatelessSchema(
          z.object({
            url: z.string().url(),
          }),
          toolName,
        );

        // Should reject session_id
        expect(() => {
          schema.parse({
            url: 'https://example.com',
            session_id: 'test',
          });
        }).toThrow(`${toolName} does not support session_id`);

        // Should accept without session_id
        const result = schema.parse({
          url: 'https://example.com',
        });
        expect(result).toEqual({
          url: 'https://example.com',
        });
      }
    });
  });

  describe('Extract links tool', () => {
    it('should validate extract_links parameters', () => {
      const ExtractLinksSchema = z.object({
        url: z.string().url(),
        categorize: z.boolean().optional().default(true),
      });

      // Valid input with categorize true
      const result1 = ExtractLinksSchema.parse({
        url: 'https://example.com',
        categorize: true,
      });
      expect(result1.categorize).toBe(true);

      // Valid input with categorize false
      const result2 = ExtractLinksSchema.parse({
        url: 'https://example.com',
        categorize: false,
      });
      expect(result2.categorize).toBe(false);

      // Default categorize should be true
      const result3 = ExtractLinksSchema.parse({
        url: 'https://example.com',
      });
      expect(result3.categorize).toBe(true);
    });
  });

  describe('Session management tools', () => {
    it('should validate create_session parameters', () => {
      const CreateSessionSchema = z.object({
        session_id: z.string(),
        initial_url: z.string().optional(),
        browser_type: z.string().optional(),
      });

      // Valid input
      const result = CreateSessionSchema.parse({
        session_id: 'my-session',
        initial_url: 'https://example.com',
      });
      expect(result.session_id).toBe('my-session');

      // Missing required session_id
      expect(() => {
        CreateSessionSchema.parse({
          initial_url: 'https://example.com',
        });
      }).toThrow();
    });

    it('should validate clear_session parameters', () => {
      const ClearSessionSchema = z.object({
        session_id: z.string(),
      });

      // Valid input
      const result = ClearSessionSchema.parse({
        session_id: 'my-session',
      });
      expect(result.session_id).toBe('my-session');

      // Missing required session_id
      expect(() => {
        ClearSessionSchema.parse({});
      }).toThrow();
    });
  });

  describe('crawl validation', () => {
    it('should accept session_id for crawl', () => {
      const CrawlWithConfigSchema = z.object({
        url: z.string().url(),
        session_id: z.string().optional(),
        js_code: z.union([z.string(), z.array(z.string())]).optional(),
      });

      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'my-session',
        js_code: 'document.querySelector("button").click()',
      });

      expect(result.session_id).toBe('my-session');
    });

    it('should work without session_id', () => {
      const CrawlWithConfigSchema = z.object({
        url: z.string().url(),
        session_id: z.string().optional(),
      });

      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
      });

      expect(result.session_id).toBeUndefined();
    });

    it('should require js_only when using js_code with session_id WITHOUT output options', () => {
      // Create a schema that mirrors the real one's refinement
      const CrawlWithConfigSchema = z
        .object({
          url: z.string().url(),
          session_id: z.string().optional(),
          js_code: z.union([z.string(), z.array(z.string())]).optional(),
          js_only: z.boolean().optional(),
          screenshot: z.boolean().optional(),
          pdf: z.boolean().optional(),
        })
        .refine(
          (data) => {
            // Only require js_only when using js_code + session_id WITHOUT any output options
            if (data.js_code && data.session_id && !data.js_only && !data.screenshot && !data.pdf) {
              return false;
            }
            return true;
          },
          {
            message:
              'When using js_code with session_id WITHOUT screenshot or pdf, you MUST set js_only: true to prevent server errors. If you want screenshots/PDFs, you can omit js_only. Correct usage: crawl({url, session_id, js_code: [...], js_only: true})',
          },
        );

      // Should fail without js_only when no output options
      expect(() => {
        CrawlWithConfigSchema.parse({
          url: 'https://example.com',
          session_id: 'test-session',
          js_code: ['document.querySelector("button").click()'],
        });
      }).toThrow('When using js_code with session_id WITHOUT screenshot or pdf');

      // Should pass with js_only: true
      const result = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        js_only: true,
      });
      expect(result.js_only).toBe(true);

      // Should pass with screenshot (no js_only required)
      const result2 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        screenshot: true,
      });
      expect(result2.screenshot).toBe(true);
      expect(result2.js_only).toBeUndefined();

      // Should pass with pdf (no js_only required)
      const result3 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
        js_code: ['document.querySelector("button").click()'],
        pdf: true,
      });
      expect(result3.pdf).toBe(true);
      expect(result3.js_only).toBeUndefined();

      // Should pass without js_code
      const result4 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        session_id: 'test-session',
      });
      expect(result4.session_id).toBe('test-session');

      // Should pass without session_id
      const result5 = CrawlWithConfigSchema.parse({
        url: 'https://example.com',
        js_code: ['document.querySelector("button").click()'],
      });
      expect(result5.js_code).toBeDefined();
    });
  });

  describe('JavaScript code validation', () => {
    /**
     * Heuristic check for JavaScript that was mangled in transit.
     *
     * @param code - JavaScript source to inspect.
     * @returns `false` when the code contains HTML entities, HTML document tags,
     *   or a literal backslash-n sequence sitting between statements; `true` otherwise.
     */
    const validateJavaScriptCode = (code: string): boolean => {
      // HTML entities (named or numeric). Plain quotes and angle brackets are valid JS
      // and must NOT match here, otherwise ordinary string literals would be rejected.
      if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
        return false;
      }
      // HTML document tags mean markup was passed instead of script.
      if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
        return false;
      }
      // Literal "\n" (backslash + n) right after a statement end or right before an opener.
      if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
        return false;
      }
      // Literal "\n" between a statement end and the next identifier.
      if (/[;})]\s*\\n\s*\w/.test(code)) {
        return false;
      }
      return true;
    };

    // Single validated-string schema shared by the scalar and array forms.
    const ValidatedJsString = z.string().refine(validateJavaScriptCode, {
      message:
        'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
    });

    const JsCodeSchema = z.union([ValidatedJsString, z.array(ValidatedJsString)]);

    it('should reject JavaScript with HTML entities', () => {
      expect(() => {
        JsCodeSchema.parse('document.querySelector(&quot;button&quot;).click()');
      }).toThrow('Invalid JavaScript: Contains HTML entities');
    });

    it('should reject JavaScript with literal \\n between statements', () => {
      expect(() => {
        JsCodeSchema.parse('console.log("line1");\\nconsole.log("line2")');
      }).toThrow('Invalid JavaScript: Contains HTML entities');
    });

    it('should accept valid JavaScript with \\n inside strings', () => {
      const result = JsCodeSchema.parse('console.log("line1\\nline2")');
      expect(result).toBe('console.log("line1\\nline2")');
    });

    it('should accept valid multiline JavaScript', () => {
      // Real newline characters (not the two-character sequence backslash + n) are valid.
      const code = ['// Comment', "document.querySelector('button').click();", 'return true;'].join('\n');
      const result = JsCodeSchema.parse(code);
      expect(result).toBe(code);
    });

    it('should validate arrays of JavaScript code', () => {
      // Invalid array
      expect(() => {
        JsCodeSchema.parse(['document.querySelector(&quot;input&quot;).value = &quot;test&quot;', 'form.submit()']);
      }).toThrow('Invalid JavaScript: Contains HTML entities');

      // Valid array
      const validArray = ['document.querySelector("input").value = "test"', 'form.submit()'];
      const result = JsCodeSchema.parse(validArray);
      expect(result).toEqual(validArray);
    });
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/omgwtfwow/mcp-crawl4ai-ts'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.