# mocks.py — Crawl4AI MCP Server test mocks (~12.2 kB)
"""
Mock infrastructure for Crawl4AI MCP Server testing.
This module provides comprehensive mocking for external dependencies including
AsyncWebCrawler, Crawl4AI extraction strategies, and HTTP requests.
"""
from typing import Any, Dict, List, Optional
class MockCrawlResult:
    """Mock stand-in for a Crawl4AI crawl result.

    Any content argument left as ``None`` (or otherwise falsy) is replaced
    with deterministic default content, so tests always have material to
    assert against.  ``status_code`` is derived from ``success`` (200 / 500).
    """

    def __init__(
        self,
        url: str = "https://example.com",
        success: bool = True,
        cleaned_html: Optional[str] = None,
        markdown: Optional[str] = None,
        screenshot: Optional[str] = None,
        extracted_content: Optional[List[Dict]] = None,
        error_message: Optional[str] = None
    ):
        """Store crawl-result fields, filling falsy content with defaults.

        Args:
            url: URL the (mock) crawl targeted.
            success: Whether the crawl is reported as successful.
            cleaned_html: HTML payload; defaults to a small sample page.
            markdown: Markdown payload; defaults to a sample rendering.
            screenshot: Base64 screenshot; defaults to a 1x1 PNG.
            extracted_content: Structured extraction output; defaults to [].
            error_message: Error text for failed crawls, else None.
        """
        self.url = url
        self.success = success
        # Falsy values (None or "") fall back to the canned defaults.
        self.cleaned_html = cleaned_html or self._default_html()
        self.markdown = markdown or self._default_markdown()
        self.screenshot = screenshot or self._default_screenshot()
        self.extracted_content = extracted_content or []
        self.error_message = error_message
        # Additional attributes that consuming code may access.
        self.status_code = 200 if success else 500
        self.response_headers = {"content-type": "text/html"}
        self.crawl_time = 1.5

    def _default_html(self) -> str:
        """Generate default HTML content."""
        return """
        <!DOCTYPE html>
        <html>
        <head>
        <title>Test Page</title>
        </head>
        <body>
        <h1>Main Title</h1>
        <div class="content">
        <p class="description">This is a test page</p>
        <span class="price">$29.99</span>
        <ul class="features">
        <li>Feature 1</li>
        <li>Feature 2</li>
        </ul>
        </div>
        </body>
        </html>
        """

    def _default_markdown(self) -> str:
        """Generate default markdown content."""
        return """
# Main Title
This is a test page
**Price:** $29.99
## Features
- Feature 1
- Feature 2
"""

    def _default_screenshot(self) -> str:
        """Generate default base64 screenshot data."""
        # Minimal valid PNG data (1x1 transparent pixel)
        return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
class MockAsyncWebCrawler:
    """Async-context-manager mock that mimics Crawl4AI's AsyncWebCrawler."""

    def __init__(self, verbose: bool = False, **kwargs):
        """Record configuration; the crawler starts in the stopped state."""
        self.verbose = verbose
        self.config = kwargs
        self._is_started = False

    async def __aenter__(self):
        """Enter the async context: start the crawler and hand it back."""
        await self.astart()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context by shutting the crawler down."""
        await self.aclose()

    async def astart(self):
        """Flag the crawler as running."""
        self._is_started = True

    async def aclose(self):
        """Flag the crawler as stopped."""
        self._is_started = False

    async def arun(
        self,
        url: str,
        extraction_strategy = None,
        crawler_run_config = None,
        **kwargs
    ) -> MockCrawlResult:
        """Fake a crawl of *url*, keying failure modes off URL substrings.

        Args:
            url: URL to crawl.
            extraction_strategy: Optional extraction strategy; when given,
                a canned sample record is attached to the result.
            crawler_run_config: Optional run configuration (its
                ``screenshot`` attribute is consulted).
            **kwargs: Ignored extra arguments.

        Returns:
            MockCrawlResult: Fabricated crawl result.

        Raises:
            RuntimeError: If the crawler was never started.
        """
        if not self._is_started:
            raise RuntimeError("Crawler not started")
        # URL substrings that trigger canned failure results, checked in order.
        failure_modes = (
            ("invalid-url", "Invalid URL"),
            ("timeout", "Request timeout"),
            ("status/500", "HTTP 500 Internal Server Error"),
        )
        for marker, message in failure_modes:
            if marker in url:
                return MockCrawlResult(url=url, success=False, error_message=message)
        # Any extraction strategy yields the same canned sample record.
        sample_extraction = [{
            "title": "Extracted Title",
            "price": "$19.99",
            "description": "Extracted description"
        }] if extraction_strategy else []
        # NOTE(review): MockCrawlResult substitutes a default screenshot when
        # handed None, so this gating does not actually suppress screenshots —
        # confirm whether that is intended.
        screenshot_payload = None
        if getattr(crawler_run_config, 'screenshot', False):
            screenshot_payload = MockCrawlResult()._default_screenshot()
        return MockCrawlResult(
            url=url,
            success=True,
            extracted_content=sample_extraction,
            screenshot=screenshot_payload
        )
class MockJsonCssExtractionStrategy:
    """Mock of JsonCssExtractionStrategy that fabricates values per schema field."""

    def __init__(self, schema: Dict[str, str], verbose: bool = False):
        """Keep the field→selector schema and verbosity flag."""
        self.schema = schema
        self.verbose = verbose

    def extract(self, html: str) -> List[Dict[str, Any]]:
        """Fabricate one record of extracted data keyed off field names.

        Args:
            html: HTML content to extract from (contents are ignored).

        Returns:
            A single-item list with one value per schema field, or an
            empty list when the schema has no fields.
        """
        # Field-name aliases mapped to canned sample values.
        canned = (
            (('title', 'heading', 'name'), "Extracted Title"),
            (('price', 'cost', 'amount'), "$19.99"),
            (('description', 'content', 'text'), "Extracted description content"),
            (('link', 'url', 'href'), "https://example.com/link"),
        )
        record: Dict[str, Any] = {}
        for field in self.schema:
            match = next((value for names, value in canned
                          if field.lower() in names), None)
            record[field] = match if match is not None else f"Extracted {field}"
        return [record] if record else []
class MockCrawlerRunConfig:
    """Mock of CrawlerRunConfig: a screenshot flag plus arbitrary options."""

    def __init__(self, screenshot: bool = False, **kwargs):
        """Store the screenshot flag and expose every extra kwarg as an attribute."""
        self.screenshot = screenshot
        # Equivalent to setattr-per-key on a plain (non-slotted) class.
        self.__dict__.update(kwargs)
class MockHttpResponse:
    """Lightweight stand-in for an HTTP response object."""

    def __init__(
        self,
        status_code: int = 200,
        text: str = None,
        json_data: Dict = None,
        headers: Dict = None
    ):
        """Capture response fields, substituting defaults for falsy payloads.

        Args:
            status_code: HTTP status code to report.
            text: Response body; falsy values get a small HTML default.
            json_data: Payload returned by ``json()``; defaults to an OK blob.
            headers: Response headers; defaults to a text/html content type.
        """
        self.status_code = status_code
        self.text = text or "<html><body>Mock response</body></html>"
        self._json_data = json_data or {"status": "ok"}
        self.headers = headers or {"content-type": "text/html"}

    def json(self):
        """Return the stored JSON payload."""
        return self._json_data

    async def aclose(self):
        """No-op async close, matching the real client's response API."""
        pass
class MockAsyncHttpClient:
    """Mock async HTTP client usable as an async context manager."""

    def __init__(self):
        """Start in the open (not closed) state."""
        self.closed = False

    async def __aenter__(self):
        """Enter the async context, yielding the client itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit the async context, closing the client."""
        await self.aclose()

    async def get(self, url: str, **kwargs) -> MockHttpResponse:
        """Simulate a GET request, keying behavior off URL substrings.

        Raises a generic ``Exception`` for connection/timeout markers and
        returns a canned 500 or 200 response otherwise.
        """
        # Error markers checked in the same order as the real scenarios.
        raising = (
            ("invalid-url", "Connection error"),
            ("timeout", "Request timeout"),
        )
        for marker, reason in raising:
            if marker in url:
                raise Exception(reason)
        if "status/500" in url:
            return MockHttpResponse(status_code=500, text="Internal Server Error")
        return MockHttpResponse(status_code=200)

    async def aclose(self):
        """Mark the client as closed."""
        self.closed = True
# Factory functions for creating configured mocks
def create_mock_crawler(
    success: bool = True,
    extracted_data: List[Dict] = None,
    screenshot_enabled: bool = False
) -> MockAsyncWebCrawler:
    """Build a MockAsyncWebCrawler with its ``arun`` patched for a scenario.

    Args:
        success: Whether crawl operations should succeed.
        extracted_data: Data to return from extractions (when truthy).
        screenshot_enabled: Whether to include screenshot data.

    Returns:
        Configured MockAsyncWebCrawler.
    """
    crawler = MockAsyncWebCrawler()
    base_arun = crawler.arun  # keep the stock behavior to delegate to

    async def patched_arun(url, extraction_strategy=None, crawler_run_config=None, **kwargs):
        # Delegate first, then overlay the scenario-specific tweaks.
        outcome = await base_arun(url, extraction_strategy, crawler_run_config, **kwargs)
        if not success:
            outcome.success = False
            outcome.error_message = "Mock crawl failure"
        if extracted_data:
            outcome.extracted_content = extracted_data
        wants_screenshot = (
            screenshot_enabled
            and crawler_run_config
            and getattr(crawler_run_config, 'screenshot', False)
        )
        if wants_screenshot:
            outcome.screenshot = MockCrawlResult()._default_screenshot()
        return outcome

    crawler.arun = patched_arun
    return crawler
def create_mock_extraction_strategy(
    extracted_data: List[Dict] = None
) -> MockJsonCssExtractionStrategy:
    """Build a MockJsonCssExtractionStrategy, optionally with canned output.

    Args:
        extracted_data: When truthy, the strategy's ``extract`` is replaced
            so that it always returns exactly this data.

    Returns:
        Configured MockJsonCssExtractionStrategy.
    """
    strategy = MockJsonCssExtractionStrategy({})
    if extracted_data:
        # Shadow the instance's extract with a constant-returning callable.
        strategy.extract = lambda html: extracted_data
    return strategy
# Test data generators
def generate_test_html(
    title: str = "Test Page",
    content: str = "Test content",
    include_price: bool = True,
    include_features: bool = True
) -> str:
    """Build a small HTML document with configurable optional sections.

    Args:
        title: Used for both <title> and the <h1> heading.
        content: Body of the description paragraph.
        include_price: Emit the ``.price`` span when True.
        include_features: Emit the ``.features`` list when True.

    Returns:
        The assembled HTML document as a string.
    """
    # Optional fragments collapse to empty strings when disabled.
    price_block = '<span class="price">$29.99</span>' if include_price else ''
    features_block = ''
    if include_features:
        features_block = (
            '<ul class="features">\n'
            '<li>Feature 1</li>\n'
            '<li>Feature 2</li>\n'
            '</ul>'
        )
    return f"""
<!DOCTYPE html>
<html>
<head>
<title>{title}</title>
</head>
<body>
<h1>{title}</h1>
<div class="content">
<p class="description">{content}</p>
{price_block}
{features_block}
</div>
</body>
</html>
"""
def generate_test_schema(fields: List[str] = None) -> Dict[str, str]:
    """Build a field→CSS-selector extraction schema for tests.

    Args:
        fields: Field names to include; defaults to title/price/description.
            Unknown fields map to a class selector of the same name.

    Returns:
        Mapping of each requested field to its CSS selector.
    """
    wanted = ["title", "price", "description"] if fields is None else fields
    # Known fields get curated selectors; everything else falls back to ".<field>".
    known_selectors = {
        "title": "h1",
        "price": ".price",
        "description": ".description",
        "features": ".features li",
        "link": "a",
        "image": "img"
    }
    schema: Dict[str, str] = {}
    for name in wanted:
        schema[name] = known_selectors.get(name, f".{name}")
    return schema
def generate_extracted_data(
    schema: Dict[str, str] = None,
    num_items: int = 1
) -> List[Dict[str, Any]]:
    """Fabricate extracted-data records whose keys match a schema.

    Args:
        schema: Field→selector schema; defaults to the standard test schema.
        num_items: Number of (independent) copies of the record to return.

    Returns:
        ``num_items`` dict copies, one value per schema field.
    """
    if schema is None:
        schema = generate_test_schema()
    # Field-name aliases mapped to canned sample values.
    samples = (
        (('title', 'heading', 'name'), "Test Title"),
        (('price', 'cost', 'amount'), "$19.99"),
        (('description', 'content', 'text'), "Test description content"),
        (('link', 'url', 'href'), "https://example.com/test"),
    )
    record: Dict[str, Any] = {}
    for field in schema:
        match = next((value for names, value in samples
                      if field.lower() in names), None)
        record[field] = match if match is not None else f"Test {field}"
    # Independent copies so callers can mutate items separately.
    return [dict(record) for _ in range(num_items)]