test_server_mcp_tools.py (22.8 kB)
""" 单元测试:MCP Server 工具函数 测试所有 14 个 @app.tool() 装饰器的 MCP 工具函数 """ from unittest.mock import AsyncMock, Mock, patch import pytest import extractor.server as server_module # BaseModel request classes have been removed - tools now use individual parameters with Annotated Field # 获取实际的函数,而不是 FunctionTool 包装器 scrape_webpage = server_module.scrape_webpage.fn scrape_multiple_webpages = server_module.scrape_multiple_webpages.fn extract_links = server_module.extract_links.fn get_page_info = server_module.get_page_info.fn check_robots_txt = server_module.check_robots_txt.fn scrape_with_stealth = server_module.scrape_with_stealth.fn fill_and_submit_form = server_module.fill_and_submit_form.fn get_server_metrics = server_module.get_server_metrics.fn clear_cache = server_module.clear_cache.fn extract_structured_data = server_module.extract_structured_data.fn convert_webpage_to_markdown = server_module.convert_webpage_to_markdown.fn batch_convert_webpages_to_markdown = server_module.batch_convert_webpages_to_markdown.fn convert_pdf_to_markdown = server_module.convert_pdf_to_markdown.fn batch_convert_pdfs_to_markdown = server_module.batch_convert_pdfs_to_markdown.fn class TestMCPToolsScraping: """测试基础网页抓取 MCP 工具""" @pytest.mark.asyncio async def test_scrape_webpage_success(self): """测试单页面抓取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "url": "https://example.com", "status_code": 200, "title": "Test Page", "content": {"text": "Sample content"}, } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Now using individual parameters instead of request object result = await scrape_webpage( url="https://example.com", method="simple", extract_config=None, wait_for_element=None, ) assert result.success is True assert result.data == mock_result assert result.method == "simple" mock_scraper.scrape_url.assert_called_once() @pytest.mark.asyncio async def test_scrape_webpage_invalid_url(self): """测试无效URL处理 - 现在在函数内部验证""" result = await scrape_webpage( url="invalid-url", method="simple", extract_config=None, wait_for_element=None, ) assert result.success is False assert "Invalid URL format" in result.error @pytest.mark.asyncio async def test_scrape_webpage_invalid_method(self): """测试无效方法处理 - 现在在函数内部验证""" result = await scrape_webpage( url="https://example.com", method="invalid-method", extract_config=None, wait_for_element=None, ) assert result.success is False assert "Method must be one of" in result.error @pytest.mark.asyncio async def test_scrape_multiple_webpages_success(self): """测试批量抓取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_results = [ {"url": "https://example.com/1", "status_code": 200}, {"url": "https://example.com/2", "status_code": 200}, ] mock_scraper.scrape_multiple_urls = AsyncMock(return_value=mock_results) # Now using individual parameters result = await scrape_multiple_webpages( urls=["https://example.com/1", "https://example.com/2"], method="simple", extract_config=None, ) assert result.success is True assert result.summary["total"] == 2 assert result.summary["successful"] == 2 @pytest.mark.asyncio async def test_scrape_multiple_webpages_empty_list(self): """测试空URL列表处理 - 现在在函数内部验证""" result = await scrape_multiple_webpages( urls=[], method="simple", extract_config=None ) assert result.success is False assert "URLs list cannot be empty" in result.summary["error"] @pytest.mark.asyncio async def test_extract_links_success(self): """测试链接提取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "content": 
{ "links": [ {"url": "https://example.com/page1", "text": "Page 1"}, {"url": "https://external.com/page", "text": "External"}, ] } } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_links( url="https://example.com", filter_domains=None, exclude_domains=None, internal_only=True, ) assert result.success is True # 内部链接过滤应该只保留同域名链接 internal_links = [ link for link in result.links if "example.com" in link.url ] assert len(internal_links) >= 1 @pytest.mark.asyncio async def test_extract_links_domain_filtering(self): """测试域名过滤功能""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "content": { "links": [ {"url": "https://example.com/page1", "text": "Page 1"}, {"url": "https://allowed.com/page", "text": "Allowed"}, {"url": "https://blocked.com/page", "text": "Blocked"}, ] } } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_links( url="https://example.com", filter_domains=["example.com", "allowed.com"], exclude_domains=["blocked.com"], internal_only=False, ) assert result.success is True # 检查过滤结果 for link in result.links: assert "blocked.com" not in link.url class TestMCPToolsInformation: """测试页面信息获取 MCP 工具""" @pytest.mark.asyncio async def test_get_page_info_success(self): """测试页面信息获取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "url": "https://example.com", "status_code": 200, "title": "Test Page", "meta_description": "A test page", } mock_scraper.simple_scraper.scrape = AsyncMock(return_value=mock_result) # Using individual parameter result = await get_page_info(url="https://example.com") assert result.success is True assert result.title == "Test Page" assert result.status_code == 200 @pytest.mark.asyncio async def test_check_robots_txt_success(self): """测试robots.txt检查成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = {"content": {"text": "User-agent: *\nDisallow: /admin/"}} mock_scraper.simple_scraper.scrape = AsyncMock(return_value=mock_result) # Using individual parameter result = await check_robots_txt(url="https://example.com") assert result.success is True assert "User-agent" in result.robots_content assert "example.com" in result.url @pytest.mark.asyncio async def test_check_robots_txt_not_found(self): """测试robots.txt不存在""" with patch("extractor.server.web_scraper") as mock_scraper: mock_scraper.simple_scraper.scrape = AsyncMock( return_value={"error": "404 Not Found"} ) # Using individual parameter result = await check_robots_txt(url="https://example.com") assert result.success is False assert "Could not fetch robots.txt" in result.error class TestMCPToolsAdvanced: """测试高级功能 MCP 工具""" @pytest.mark.asyncio async def test_scrape_with_stealth_success(self): """测试反检测抓取成功""" with ( patch("extractor.server.anti_detection_scraper"), patch("extractor.server.rate_limiter") as mock_limiter, patch("extractor.server.cache_manager") as mock_cache, patch("extractor.server.retry_manager") as mock_retry, ): mock_limiter.wait = AsyncMock() mock_cache.get.return_value = None mock_result = { "url": "https://example.com", "status_code": 200, "content": {"text": "Stealth content"}, } mock_retry.retry_async = AsyncMock(return_value=mock_result) result = await scrape_with_stealth( url="https://example.com", method="selenium", extract_config=None, wait_for_element=None, scroll_page=False, ) assert result.success is True assert result.data == mock_result @pytest.mark.asyncio async def 
test_fill_and_submit_form_success(self): """测试表单填写成功""" with ( patch("extractor.server.rate_limiter") as mock_limiter, patch("selenium.webdriver.Chrome") as mock_driver, patch("extractor.server.settings") as mock_settings, ): mock_limiter.wait = AsyncMock() mock_settings.browser_headless = True mock_settings.browser_timeout = 10 mock_driver_instance = Mock() mock_driver.return_value = mock_driver_instance result = await fill_and_submit_form( url="https://example.com/form", form_data={"#username": "test", "#password": "secret"}, submit=False, submit_button_selector=None, method="selenium", wait_for_element=None, ) # 由于复杂的浏览器交互,这里主要测试参数验证 assert hasattr(result, "success") @pytest.mark.asyncio async def test_extract_structured_data_success(self): """测试结构化数据提取成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_result = { "content": { "text": "Contact us at info@example.com or call 123-456-7890", "links": [ {"url": "https://facebook.com/page", "text": "Facebook"}, {"url": "https://twitter.com/page", "text": "Twitter"}, ], }, "title": "Contact Page", "meta_description": "Contact information", } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_structured_data( url="https://example.com/contact", data_type="contact" ) assert result.success is True assert result.extracted_data is not None assert result.data_type == "contact" class TestMCPToolsServer: """测试服务器管理 MCP 工具""" @pytest.mark.asyncio async def test_get_server_metrics_success(self): """测试服务器指标获取成功""" with ( patch("extractor.server.metrics_collector") as mock_metrics, patch("extractor.server.cache_manager") as mock_cache, patch("extractor.server.settings") as mock_settings, ): mock_metrics.get_stats.return_value = { "total_requests": 100, "successful_requests": 95, "failed_requests": 5, } mock_cache.stats.return_value = {"cache_hits": 50, "cache_misses": 50} mock_settings.server_name = "Test Server" mock_settings.server_version = "0.1.6.1" result = await get_server_metrics() assert result.success is True assert result.total_requests == 100 assert result.successful_requests == 95 assert result.failed_requests == 5 @pytest.mark.asyncio async def test_clear_cache_success(self): """测试缓存清理成功""" with patch("extractor.server.cache_manager") as mock_cache: mock_cache.clear.return_value = None result = await clear_cache() assert result.success is True assert "Cache cleared successfully" in result.message mock_cache.clear.assert_called_once() class TestMCPToolsMarkdown: """测试 Markdown 转换 MCP 工具""" @pytest.mark.asyncio async def test_convert_webpage_to_markdown_success(self): """测试单页面Markdown转换成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.markdown_converter") as mock_converter, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_scrape_result = { "url": "https://example.com", "content": {"html": "<h1>Test</h1><p>Content</p>"}, "title": "Test Page", } mock_scraper.scrape_url = AsyncMock(return_value=mock_scrape_result) mock_conversion_result = { "success": True, "markdown": "# Test\n\nContent", "metadata": {"word_count": 2, "processing_time": 0.5}, } mock_converter.convert_webpage_to_markdown.return_value = ( mock_conversion_result ) result = await convert_webpage_to_markdown( url="https://example.com", method="simple", extract_main_content=True, include_metadata=True, custom_options=None, 
wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) assert result.success is True assert result.markdown_content == "# Test\n\nContent" @pytest.mark.asyncio async def test_batch_convert_webpages_to_markdown_success(self): """测试批量Markdown转换成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.markdown_converter") as mock_converter, ): mock_scrape_results = [ { "url": "https://example.com/1", "content": {"html": "<h1>Page 1</h1>"}, }, { "url": "https://example.com/2", "content": {"html": "<h1>Page 2</h1>"}, }, ] mock_scraper.scrape_multiple_urls = AsyncMock( return_value=mock_scrape_results ) mock_conversion_result = { "success": True, "results": [ {"success": True, "markdown": "# Page 1"}, {"success": True, "markdown": "# Page 2"}, ], "summary": {"total": 2, "successful": 2, "failed": 0}, } mock_converter.batch_convert_to_markdown.return_value = ( mock_conversion_result ) result = await batch_convert_webpages_to_markdown( urls=["https://example.com/1", "https://example.com/2"], method="simple", extract_main_content=True, include_metadata=True, custom_options=None, embed_images=False, embed_options=None, ) assert result.success is True assert result.total_urls == 2 class TestMCPToolsPDF: """测试 PDF 处理 MCP 工具""" @pytest.mark.asyncio async def test_convert_pdf_to_markdown_success(self): """测试PDF转Markdown成功""" with ( patch("extractor.server._get_pdf_processor") as mock_get_processor, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_processor = Mock() mock_processor.process_pdf = AsyncMock( return_value={ "success": True, "markdown": "# PDF Title\n\nPDF content", "metadata": {"pages": 10, "word_count": 500}, } ) mock_get_processor.return_value = mock_processor result = await convert_pdf_to_markdown( pdf_source="https://example.com/document.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", extract_images=True, extract_tables=True, extract_formulas=True, embed_images=False, enhanced_options=None, ) assert result.success is True assert result.content == "# PDF Title\n\nPDF content" @pytest.mark.asyncio async def test_convert_pdf_to_markdown_invalid_method(self): """测试PDF转换无效方法""" result = await convert_pdf_to_markdown( pdf_source="https://example.com/document.pdf", method="invalid-method", include_metadata=True, page_range=None, output_format="markdown", extract_images=True, extract_tables=True, extract_formulas=True, embed_images=False, enhanced_options=None, ) assert result.success is False assert "Method must be one of" in result.error @pytest.mark.asyncio async def test_batch_convert_pdfs_to_markdown_success(self): """测试批量PDF转换成功""" with ( patch("extractor.server._get_pdf_processor") as mock_get_processor, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_processor = Mock() mock_processor.batch_process_pdfs = AsyncMock( return_value={ "success": True, "results": [ {"success": True, "markdown": "# PDF 1"}, {"success": True, "markdown": "# PDF 2"}, ], "summary": {"total": 2, "successful": 2, "failed": 0}, } ) mock_get_processor.return_value = mock_processor result = await batch_convert_pdfs_to_markdown( pdf_sources=[ "https://example.com/doc1.pdf", "https://example.com/doc2.pdf", ], method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is True assert result.total_pdfs == 2 @pytest.mark.asyncio async def 
test_batch_convert_pdfs_to_markdown_empty_list(self): """测试批量PDF转换空列表""" result = await batch_convert_pdfs_to_markdown( pdf_sources=[], method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is False # Empty list should result in failed operation class TestMCPToolsValidation: """测试 MCP 工具参数验证""" @pytest.mark.asyncio async def test_invalid_urls_handling(self): """测试无效URL的一致性处理""" invalid_urls = [ "not-a-url", "ftp://example.com", # 非HTTP协议 "", # 空字符串 "http://", # 不完整URL ] for invalid_url in invalid_urls: # 测试单页面抓取 - 现在在函数内部验证 result = await scrape_webpage( url=invalid_url, method="simple", extract_config=None, wait_for_element=None, ) # 结果应该失败 assert result.success is False error_msg = result.error assert any( phrase in error_msg for phrase in [ "Invalid URL format", "No connection adapters", "Unsupported protocol", "Invalid schema", ] ) # 测试页面信息获取 result = await get_page_info(url=invalid_url) assert result.success is False error_msg = result.error assert any( phrase in error_msg for phrase in [ "Invalid URL format", "No connection adapters", "Unsupported protocol", "Invalid schema", ] ) @pytest.mark.asyncio async def test_method_validation_consistency(self): """测试方法参数验证的一致性""" invalid_methods = ["invalid", "unknown", "", "AUTO"] # 大写应该无效 for invalid_method in invalid_methods: # 测试不同工具的方法验证一致性 result = await scrape_webpage( url="https://example.com", method=invalid_method, extract_config=None, wait_for_element=None, ) assert result.success is False assert "Method must be one of" in result.error
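
The module-level comment at the top of the file explains why every tool is imported through its `.fn` attribute: with FastMCP, `@app.tool()` registers the coroutine and hands back a FunctionTool wrapper rather than the plain function, so unit tests must reach for the original callable. The following minimal sketch illustrates that pattern; it is not part of the test suite, and the `demo` server and `echo` tool are invented for illustration, assuming FastMCP 2.x wrapper behavior.

# sketch_tool_unwrapping.py - illustrative only, not from the repository
import asyncio

from fastmcp import FastMCP

app = FastMCP("demo")


@app.tool()
async def echo(text: str) -> str:
    """Return the input text unchanged."""
    return text


# ``echo`` is now a FunctionTool wrapper; the undecorated coroutine function
# is exposed as ``echo.fn``, which is what tests like the ones above await.
if __name__ == "__main__":
    print(asyncio.run(echo.fn("hello")))  # prints "hello"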
