# test_tools.py • 17.7 kB
"""
Unit tests for all MCP tools in the Crawl4AI MCP Server.
This module tests server_status, get_page_structure, crawl_with_schema,
and take_screenshot tools using in-memory FastMCP client.
"""
import pytest
import json
from pathlib import Path
import fastmcp
from crawl4ai_mcp_server import mcp
from tests.mocks import (
MockCrawlerRunConfig,
create_mock_crawler,
create_mock_extraction_strategy
)
class TestServerStatusTool:
    """Test cases for the server_status tool."""

    @staticmethod
    def _json_payload(result):
        """Decode the JSON text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return json.loads(content.text)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_server_status_success(self):
        """Test server_status tool returns correct status information."""
        # Deliberately uses a fresh in-memory client (not the mcp_client
        # fixture) so the raw connect/call path is exercised end-to-end.
        async with fastmcp.Client(mcp) as client:
            result = await client.call_tool("server_status", {})
            assert result is not None
            assert hasattr(result, 'content')
            assert result.content is not None
            data = self._json_payload(result)
            # All required top-level fields must be present.
            for field in (
                "server_name",
                "version",
                "status",
                "transport",
                "working_directory",
                "capabilities",
                "dependencies",
                "message",
            ):
                assert field in data
            # Static identity/values reported by the server.
            assert data["server_name"] == "Crawl4AI-MCP-Server"
            assert data["version"] == "1.0.0"
            assert data["status"] == "operational"
            assert data["transport"] == "stdio"
            assert isinstance(data["capabilities"], list)
            assert isinstance(data["dependencies"], dict)
            # Every advertised capability must be listed.
            expected_capabilities = [
                "web_crawling",
                "content_extraction",
                "screenshot_capture",
                "schema_based_extraction"
            ]
            for capability in expected_capabilities:
                assert capability in data["capabilities"]
            # Core dependencies must be reported.
            for dependency in ("crawl4ai", "fastmcp", "playwright"):
                assert dependency in data["dependencies"]

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_server_status_working_directory(self, mcp_client):
        """Test that server_status includes correct working directory."""
        result = await mcp_client.call_tool("server_status", {})
        data = self._json_payload(result)
        # Working directory must be a non-empty path string.
        working_dir = data["working_directory"]
        assert working_dir is not None
        assert len(working_dir) > 0
        # The server runs in-process, so it should report this process's CWD.
        assert working_dir == str(Path.cwd())
class TestGetPageStructureTool:
    """Test cases for the get_page_structure tool."""

    @staticmethod
    def _text(result):
        """Return the plain-text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return content.text

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_html_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with HTML format."""
        # Configure the mocked crawler to return a successful fetch.
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com",
            "format": "html"
        })
        assert result is not None
        html_content = self._text(result)
        # Content should be a substantial, well-formed HTML document.
        assert len(html_content) > 100
        assert "<html>" in html_content
        assert "</html>" in html_content
        assert "<title>" in html_content

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_markdown_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with markdown format."""
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com",
            "format": "markdown"
        })
        assert result is not None
        markdown_content = self._text(result)
        # Markdown output should be non-trivial and contain headers.
        assert len(markdown_content) > 50
        assert "#" in markdown_content

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_default_format(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with default (HTML) format."""
        mock_crawler = create_mock_crawler(success=True)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        # No "format" argument: the tool should default to HTML output.
        result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com"
        })
        assert result is not None
        assert "<html>" in self._text(result)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_invalid_url(self, mcp_client, crawl4ai_patches):
        """Test get_page_structure with invalid URL."""
        # A failed crawl should surface as a tool-call error.
        mock_crawler = create_mock_crawler(success=False)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        with pytest.raises(Exception):
            await mcp_client.call_tool("get_page_structure", {
                "url": "https://invalid-url-test.com"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_get_page_structure_invalid_format(self, mcp_client):
        """Test get_page_structure with invalid format parameter."""
        # An unsupported format value should be rejected by the tool.
        with pytest.raises(Exception):
            await mcp_client.call_tool("get_page_structure", {
                "url": "https://example.com",
                "format": "invalid_format"
            })
class TestCrawlWithSchemaTool:
    """Test cases for the crawl_with_schema tool."""

    @staticmethod
    def _json_response(result):
        """Decode the JSON text body of a tool-call result.

        Handles both result shapes seen from the FastMCP client: a list of
        content items (first item is used) or a single content object.
        """
        content = result.content[0] if isinstance(result.content, list) else result.content
        return json.loads(content.text)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_valid_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with valid schema."""
        # Configure mocks: successful crawl plus a strategy that yields data.
        mock_crawler = create_mock_crawler(success=True)
        mock_strategy = create_mock_extraction_strategy(
            extracted_data=[{"title": "Test Title", "price": "$19.99"}]
        )
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        schema = json.dumps({
            "title": "h1",
            "price": ".price"
        })
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": schema
        })
        assert result is not None
        response = self._json_response(result)
        # Response envelope must report success and carry a data list.
        assert "success" in response
        assert "extracted_data" in response
        assert response["success"] is True
        assert isinstance(response["extracted_data"], list)
        assert len(response["extracted_data"]) > 0
        # The mocked record's fields must survive the round trip.
        extracted = response["extracted_data"][0]
        assert "title" in extracted
        assert "price" in extracted

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_invalid_json(self, mcp_client):
        """Test crawl_with_schema with invalid JSON schema."""
        # Malformed schema JSON should be rejected before any crawling.
        with pytest.raises(Exception):
            await mcp_client.call_tool("crawl_with_schema", {
                "url": "https://example.com",
                "extraction_schema": "invalid json {"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_empty_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with empty schema."""
        mock_crawler = create_mock_crawler(success=True)
        mock_strategy = create_mock_extraction_strategy(extracted_data=[])
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": "{}"
        })
        assert result is not None
        response = self._json_response(result)
        assert "success" in response
        assert "extracted_data" in response
        # Empty schema should still succeed but return empty data.
        assert isinstance(response["extracted_data"], list)

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_crawl_failure(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema when crawling fails."""
        # A failed crawl should surface as a tool-call error even with a
        # valid schema.
        mock_crawler = create_mock_crawler(success=False)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        schema = json.dumps({"title": "h1"})
        with pytest.raises(Exception):
            await mcp_client.call_tool("crawl_with_schema", {
                "url": "https://invalid-url-test.com",
                "extraction_schema": schema
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_crawl_with_schema_complex_schema(self, mcp_client, crawl4ai_patches):
        """Test crawl_with_schema with complex schema."""
        mock_crawler = create_mock_crawler(success=True)
        # Record with nested list and link fields to exercise a richer schema.
        complex_data = [{
            "title": "Product Title",
            "price": "$29.99",
            "description": "Product description",
            "features": ["Feature 1", "Feature 2"],
            "link": "https://example.com/product"
        }]
        mock_strategy = create_mock_extraction_strategy(extracted_data=complex_data)
        crawl4ai_patches['crawler'].return_value = mock_crawler
        crawl4ai_patches['strategy'].return_value = mock_strategy
        complex_schema = json.dumps({
            "title": "h1",
            "price": ".price",
            "description": ".description",
            "features": ".features li",
            "link": "a.product-link"
        })
        result = await mcp_client.call_tool("crawl_with_schema", {
            "url": "https://example.com",
            "extraction_schema": complex_schema
        })
        assert result is not None
        response = self._json_response(result)
        assert response["success"] is True
        # Every schema key must be present in the extracted record.
        extracted = response["extracted_data"][0]
        assert all(key in extracted for key in ["title", "price", "description", "features", "link"])
class TestTakeScreenshotTool:
    """Test cases for the take_screenshot tool."""

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_success(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot tool successful capture."""
        crawler = create_mock_crawler(success=True, screenshot_enabled=True)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        result = await mcp_client.call_tool("take_screenshot", {
            "url": "https://example.com"
        })
        assert result is not None
        if isinstance(result.content, list):
            payload = result.content[0]
        else:
            payload = result.content
        response = json.loads(payload.text)
        # The response must carry capture metadata plus the image payload.
        for key in ("success", "screenshot_data", "format", "url"):
            assert key in response
        assert response["success"] is True
        assert response["url"] == "https://example.com"
        assert response["format"] == "base64"
        # Base64 data should be substantial
        assert len(response["screenshot_data"]) > 50

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_invalid_url(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot with invalid URL."""
        crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=False)
        with pytest.raises(Exception):
            await mcp_client.call_tool("take_screenshot", {
                "url": "https://invalid-url-test.com"
            })

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_take_screenshot_no_screenshot_data(self, mcp_client, crawl4ai_patches):
        """Test take_screenshot when no screenshot data is returned."""
        # Crawl succeeds, but the mock produces no screenshot bytes.
        crawler = create_mock_crawler(success=True, screenshot_enabled=False)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        with pytest.raises(Exception):
            await mcp_client.call_tool("take_screenshot", {
                "url": "https://example.com"
            })
class TestToolIntegration:
    """Integration tests for tool interactions."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_all_tools_available(self, mcp_client):
        """Test that all expected tools are available."""
        # Only inspects registration; no tool is actually invoked.
        registered = {tool.name for tool in await mcp_client.list_tools()}
        expected = {
            "server_status",
            "get_page_structure",
            "crawl_with_schema",
            "take_screenshot",
        }
        for name in expected:
            assert name in registered

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_tools_have_proper_schemas(self, mcp_client):
        """Test that all tools have proper input schemas."""
        for tool in await mcp_client.list_tools():
            assert hasattr(tool, 'inputSchema')
            schema = tool.inputSchema
            assert schema is not None
            # Every tool's input schema must be a JSON-Schema object type.
            assert 'type' in schema
            assert schema['type'] == 'object'

    @pytest.mark.asyncio
    @pytest.mark.unit
    async def test_sequential_tool_calls(self, mcp_client, crawl4ai_patches):
        """Test calling multiple tools in sequence."""
        # One mock configuration serves all three calls.
        crawler = create_mock_crawler(success=True, screenshot_enabled=True)
        crawl4ai_patches['crawler'].return_value = crawler
        crawl4ai_patches['config'].return_value = MockCrawlerRunConfig(screenshot=True)
        # server_status, then page structure, then screenshot.
        status_result = await mcp_client.call_tool("server_status", {})
        assert status_result is not None
        page_result = await mcp_client.call_tool("get_page_structure", {
            "url": "https://example.com"
        })
        assert page_result is not None
        screenshot_result = await mcp_client.call_tool("take_screenshot", {
            "url": "https://example.com"
        })
        assert screenshot_result is not None
        # Each call must have succeeded independently of the others.
        for outcome in (status_result, page_result, screenshot_result):
            assert outcome is not None
# Additional parametrized tests
@pytest.mark.asyncio
@pytest.mark.unit
@pytest.mark.parametrize("format_type", ["html", "markdown"])
async def test_get_page_structure_formats(mcp_client, crawl4ai_patches, format_type):
    """Test get_page_structure with different formats."""
    crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=True)
    outcome = await mcp_client.call_tool("get_page_structure", {
        "url": "https://example.com",
        "format": format_type
    })
    assert outcome is not None
    if isinstance(outcome.content, list):
        body = outcome.content[0]
    else:
        body = outcome.content
    # Either format must yield a non-trivial amount of text.
    assert len(body.text) > 50
@pytest.mark.asyncio
@pytest.mark.unit
@pytest.mark.parametrize("url", [
    "https://example.com",
    "https://httpbin.org/html",
    "https://www.google.com"
])
async def test_tools_with_different_urls(mcp_client, crawl4ai_patches, url):
    """Test tools with different valid URLs."""
    crawl4ai_patches['crawler'].return_value = create_mock_crawler(success=True)
    # get_page_structure should accept each URL.
    structure_outcome = await mcp_client.call_tool("get_page_structure", {"url": url})
    assert structure_outcome is not None
    # crawl_with_schema should accept the same URL with a minimal schema.
    minimal_schema = json.dumps({"title": "h1"})
    schema_outcome = await mcp_client.call_tool("crawl_with_schema", {
        "url": url,
        "extraction_schema": minimal_schema
    })
    assert schema_outcome is not None