# test_tools.py • 17.7 kB
"""
Unit tests for all MCP tools in the Crawl4AI MCP Server.
This module tests server_status, get_page_structure, crawl_with_schema,
and take_screenshot tools using in-memory FastMCP client.
"""
import pytest
import json
from pathlib import Path
import fastmcp
from crawl4ai_mcp_server import mcp
from tests.mocks import (
MockCrawlerRunConfig,
create_mock_crawler,
create_mock_extraction_strategy
)
class TestServerStatusTool:
    """Test cases for the server_status tool."""

    @staticmethod
    def _json_payload(result):
        """Decode the JSON text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return json.loads(content.text)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_server_status_success(self):
        """Test server_status tool returns correct status information."""
        # Deliberately uses a fresh in-memory client (not the mcp_client
        # fixture) so the raw connect/call path is exercised end-to-end.
        async with fastmcp.Client(mcp) as client:
            result = await client.call_tool("server_status", {})
            assert result is not None
            assert hasattr(result, 'content')
            assert result.content is not None
            data = self._json_payload(result)
            # All required top-level fields must be present.
            for field in (
                "server_name",
                "version",
                "status",
                "transport",
                "working_directory",
                "capabilities",
                "dependencies",
                "message",
            ):
                assert field in data
            # Static identity/values reported by the server.
            assert data["server_name"] == "Crawl4AI-MCP-Server"
            assert data["version"] == "1.0.0"
            assert data["status"] == "operational"
            assert data["transport"] == "stdio"
            assert isinstance(data["capabilities"], list)
            assert isinstance(data["dependencies"], dict)
            # Every advertised capability must be listed.
            expected_capabilities = [
                "web_crawling",
                "content_extraction",
                "screenshot_capture",
                "schema_based_extraction"
            ]
            for capability in expected_capabilities:
                assert capability in data["capabilities"]
            # Core dependencies must be reported.
            for dependency in ("crawl4ai", "fastmcp", "playwright"):
                assert dependency in data["dependencies"]

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_server_status_working_directory(self, mcp_client):
        """Test that server_status includes correct working directory."""
        result = await mcp_client.call_tool("server_status", {})
        data = self._json_payload(result)
        # Working directory must be a non-empty path string.
        working_dir = data["working_directory"]
        assert working_dir is not None
        assert len(working_dir) > 0
        # The server runs in-process, so it should report this process's CWD.
        assert working_dir == str(Path.cwd())
class TestGetPageStructureTool:
    """Test cases for the get_page_structure tool."""

    @staticmethod
    def _text(result):
        """Return the plain-text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return content.text

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_html_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with HTML format."""
        # Configure the mocked crawler to return a successful fetch.
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com",
            "format": "html"
        })
        assert result is not None
        html_content = self._text(result)
        # Content should be a substantial, well-formed HTML document.
        assert len(html_content) > 100
        assert "<html>" in html_content
        assert "</html>" in html_content
        assert "<title>" in html_content

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_markdown_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with markdown format."""
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com",
            "format": "markdown"
        })
        assert result is not None
        markdown_content = self._text(result)
        # Markdown output should be non-trivial and contain headers.
        assert len(markdown_content) > 50
        assert "#" in markdown_content

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_default_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with default (HTML) format."""
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        # No "format" argument: the tool should default to HTML output.
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com"
        })
        assert result is not None
        assert "<html>" in self._text(result)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_invalid_url(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with invalid URL."""
        # A failed crawl should surface as a tool-call error.
        mock_crawler = create_mock_crawler(success=False)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        with pytest.raises(Exception):
            await mcp_client.call_tool("get_page_structure", {
                "url": "https://invalid-url-test.com"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_invalid_format(self, mcp_client):
        """Test get_page_structure with invalid format parameter."""
        # An unsupported format value should be rejected by the tool.
        with pytest.raises(Exception):
            await mcp_client.call_tool("get_page_structure", {
                "url": "https://example.com",
                "format": "invalid_format"
            })
class TestCrawlWithSchemaTool:
    """Test cases for the crawl_with_schema tool."""

    @staticmethod
    def _json_response(result):
        """Decode the JSON text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return json.loads(content.text)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_valid_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with valid schema."""
        # Configure mocks: successful crawl plus a strategy that yields data.
        mock_crawler = create_mock_crawler(success=True)
        mock_strategy = create_mock_extraction_strategy(
            extracted_data=[{"title": "Test Title", "price": "$19.99"}]
        )
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        schema = json.dumps({
            "title": "h1",
            "price": ".price"
        })
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": schema
        })
        assert result is not None
        response = self._json_response(result)
        # Response envelope must report success and carry a data list.
        assert "success" in response
        assert "extracted_data" in response
        assert response["success"] is True
        assert isinstance(response["extracted_data"], list)
        assert len(response["extracted_data"]) > 0
        # The mocked record's fields must survive the round trip.
        extracted = response["extracted_data"][0]
        assert "title" in extracted
        assert "price" in extracted

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_invalid_json(self, mcp_client):
        """Test crawl_with_schema with invalid JSON schema."""
        # Malformed schema JSON should be rejected before any crawling.
        with pytest.raises(Exception):
            await mcp_client.call_tool("crawl_with_schema", {
                "url": "https://example.com",
                "extraction_schema": "invalid json {"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_empty_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with empty schema."""
        mock_crawler = create_mock_crawler(success=True)
        mock_strategy = create_mock_extraction_strategy(extracted_data=[])
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": "{}"
        })
        assert result is not None
        response = self._json_response(result)
        assert "success" in response
        assert "extracted_data" in response
        # Empty schema should still succeed but return empty data.
        assert isinstance(response["extracted_data"], list)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_crawl_failure(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema when crawling fails."""
        # A failed crawl should surface as a tool-call error even with a
        # valid schema.
        mock_crawler = create_mock_crawler(success=False)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        schema = json.dumps({"title": "h1"})
        with pytest.raises(Exception):
            await mcp_client.call_tool("crawl_with_schema", {
                "url": "https://invalid-url-test.com",
                "extraction_schema": schema
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_complex_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with complex schema."""
        mock_crawler = create_mock_crawler(success=True)
        # Record with nested list and link fields to exercise a richer schema.
        complex_data = [{
            "title": "Product Title",
            "price": "$29.99",
            "description": "Product description",
            "features": ["Feature 1", "Feature 2"],
            "link": "https://example.com/product"
        }]
        mock_strategy = create_mock_extraction_strategy(extracted_data=complex_data)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        complex_schema = json.dumps({
            "title": "h1",
            "price": ".price",
            "description": ".description",
            "features": ".features li",
            "link": "a.product-link"
        })
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": complex_schema
        })
        assert result is not None
        response = self._json_response(result)
        assert response["success"] is True
        # Every schema key must be present in the extracted record.
        extracted = response["extracted_data"][0]
        assert all(key in extracted for key in ["title", "price", "description", "features", "link"])
class TestTakeScreenshotTool:
    """Test cases for the take_screenshot tool."""

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_success(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot tool successful capture."""
        crawler = create_mock_crawler(success=True, screenshot_enabled=True)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        result = await mcp_client.call_tool("take_screenshot", {
            "url": "https://example.com"
        })
        assert result is not None
        if isinstance(result.content, list):
            payload = result.content[0]
        else:
            payload = result.content
        response = json.loads(payload.text)
        # The response must carry capture metadata plus the image payload.
        for key in ("success", "screenshot_data", "format", "url"):
            assert key in response
        assert response["success"] is True
        assert response["url"] == "https://example.com"
        assert response["format"] == "base64"
        # Base64 data should be substantial
        assert len(response["screenshot_data"]) > 50

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_invalid_url(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot with invalid URL."""
        crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=False)
        with pytest.raises(Exception):
            await mcp_client.call_tool("take_screenshot", {
                "url": "https://invalid-url-test.com"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_no_screenshot_data(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot when no screenshot data is returned."""
        # Crawl succeeds, but the mock produces no screenshot bytes.
        crawler = create_mock_crawler(success=True, screenshot_enabled=False)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        with pytest.raises(Exception):
            await mcp_client.call_tool("take_screenshot", {
                "url": "https://example.com"
            })
class TestToolIntegration:
    """Integration tests for tool interactions."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_all_tools_available(self, mcp_client):
        """Test that all expected tools are available."""
        # Only inspects registration; no tool is actually invoked.
        registered = {tool.name for tool in await mcp_client.list_tools()}
        expected = {
            "server_status",
            "get_page_structure",
            "crawl_with_schema",
            "take_screenshot",
        }
        for name in expected:
            assert name in registered

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_tools_have_proper_schemas(self, mcp_client):
        """Test that all tools have proper input schemas."""
        for tool in await mcp_client.list_tools():
            assert hasattr(tool, 'inputSchema')
            schema = tool.inputSchema
            assert schema is not None
            # Every tool's input schema must be a JSON-Schema object type.
            assert 'type' in schema
            assert schema['type'] == 'object'

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_sequential_tool_calls(self, mcp_client, crawl4ai_patches):
        """Test calling multiple tools in sequence."""
        # One mock configuration serves all three calls.
        crawler = create_mock_crawler(success=True, screenshot_enabled=True)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        # server_status, then page structure, then screenshot.
        status_result = await mcp_client.call_tool("server_status", {})
        assert status_result is not None
        page_result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com"
        })
        assert page_result is not None
        screenshot_result = await mcp_client.call_tool("take_screenshot", {
            "url": "https://example.com"
        })
        assert screenshot_result is not None
        # Each call must have succeeded independently of the others.
        for outcome in (status_result, page_result, screenshot_result):
            assert outcome is not None
# Additional parametrized tests
@pytest.mark.asyncio
@pytest.mark.unit
@pytest.mark.parametrize("format_type", ["html", "markdown"])
async def test_get_page_structure_formats(mcp_client, crawl4ai_patches, format_type):
    """Test get_page_structure with different formats."""
    crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=True)
    outcome = await mcp_client.call_tool("get_page_structure", {
        "url": "https://example.com",
        "format": format_type
    })
    assert outcome is not None
    if isinstance(outcome.content, list):
        body = outcome.content[0]
    else:
        body = outcome.content
    # Either format must yield a non-trivial amount of text.
    assert len(body.text) > 50
@pytest.mark.asyncio
@pytest.mark.unit
@pytest.mark.parametrize("url", [
    "https://example.com",
    "https://httpbin.org/html",
    "https://www.google.com"
])
async def test_tools_with_different_urls(mcp_client, crawl4ai_patches, url):
    """Test tools with different valid URLs."""
    crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=True)
    # get_page_structure should accept each URL.
    structure_outcome = await mcp_client.call_tool("get_page_structure", {"url": url})
    assert structure_outcome is not None
    # crawl_with_schema should accept the same URL with a minimal schema.
    minimal_schema = json.dumps({"title": "h1"})
    schema_outcome = await mcp_client.call_tool("crawl_with_schema", {
        "url": url,
        "extraction_schema": minimal_schema
    })
    assert schema_outcome is not None