Scrapy MCP Server

by ThreeFish-AI
test_server_mcp_tools.py (22.4 kB)
""" 单元测试:MCP Server 工具函数 测试所有 14 个 @app.tool() 装饰器的 MCP 工具函数 """ import pytest from unittest.mock import Mock, patch, AsyncMock import extractor.server as server_module # BaseModel request classes have been removed - tools now use individual parameters with Annotated Field # 获取实际的函数,而不是 FunctionTool 包装器 scrape_webpage = server_module.scrape_webpage.fn scrape_multiple_webpages = server_module.scrape_multiple_webpages.fn extract_links = server_module.extract_links.fn get_page_info = server_module.get_page_info.fn check_robots_txt = server_module.check_robots_txt.fn scrape_with_stealth = server_module.scrape_with_stealth.fn fill_and_submit_form = server_module.fill_and_submit_form.fn get_server_metrics = server_module.get_server_metrics.fn clear_cache = server_module.clear_cache.fn extract_structured_data = server_module.extract_structured_data.fn convert_webpage_to_markdown = server_module.convert_webpage_to_markdown.fn batch_convert_webpages_to_markdown = server_module.batch_convert_webpages_to_markdown.fn convert_pdf_to_markdown = server_module.convert_pdf_to_markdown.fn batch_convert_pdfs_to_markdown = server_module.batch_convert_pdfs_to_markdown.fn class TestMCPToolsScraping: """测试基础网页抓取 MCP 工具""" @pytest.mark.asyncio async def test_scrape_webpage_success(self): """测试单页面抓取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "url": "https://example.com", "status_code": 200, "title": "Test Page", "content": {"text": "Sample content"}, } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Now using individual parameters instead of request object result = await scrape_webpage( url="https://example.com", method="simple", extract_config=None, wait_for_element=None, ) assert result.success is True assert result.data == mock_result assert result.method == "simple" mock_scraper.scrape_url.assert_called_once() @pytest.mark.asyncio async def test_scrape_webpage_invalid_url(self): """测试无效URL处理 - 现在在函数内部验证""" result = await scrape_webpage( url="invalid-url", method="simple", extract_config=None, wait_for_element=None, ) assert result.success is False assert "Invalid URL format" in result.error @pytest.mark.asyncio async def test_scrape_webpage_invalid_method(self): """测试无效方法处理 - 现在在函数内部验证""" result = await scrape_webpage( url="https://example.com", method="invalid-method", extract_config=None, wait_for_element=None, ) assert result.success is False assert "Method must be one of" in result.error @pytest.mark.asyncio async def test_scrape_multiple_webpages_success(self): """测试批量抓取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_results = [ {"url": "https://example.com/1", "status_code": 200}, {"url": "https://example.com/2", "status_code": 200}, ] mock_scraper.scrape_multiple_urls = AsyncMock(return_value=mock_results) # Now using individual parameters result = await scrape_multiple_webpages( urls=["https://example.com/1", "https://example.com/2"], method="simple", extract_config=None, ) assert result.success is True assert result.summary["total"] == 2 assert result.summary["successful"] == 2 @pytest.mark.asyncio async def test_scrape_multiple_webpages_empty_list(self): """测试空URL列表处理 - 现在在函数内部验证""" result = await scrape_multiple_webpages( urls=[], method="simple", extract_config=None ) assert result.success is False assert "URLs list cannot be empty" in result.summary["error"] @pytest.mark.asyncio async def test_extract_links_success(self): """测试链接提取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "content": 
{ "links": [ {"url": "https://example.com/page1", "text": "Page 1"}, {"url": "https://external.com/page", "text": "External"}, ] } } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_links( url="https://example.com", filter_domains=None, exclude_domains=None, internal_only=True, ) assert result.success is True # 内部链接过滤应该只保留同域名链接 internal_links = [ link for link in result.links if "example.com" in link.url ] assert len(internal_links) >= 1 @pytest.mark.asyncio async def test_extract_links_domain_filtering(self): """测试域名过滤功能""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "content": { "links": [ {"url": "https://example.com/page1", "text": "Page 1"}, {"url": "https://allowed.com/page", "text": "Allowed"}, {"url": "https://blocked.com/page", "text": "Blocked"}, ] } } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_links( url="https://example.com", filter_domains=["example.com", "allowed.com"], exclude_domains=["blocked.com"], internal_only=False, ) assert result.success is True # 检查过滤结果 for link in result.links: assert "blocked.com" not in link.url class TestMCPToolsInformation: """测试页面信息获取 MCP 工具""" @pytest.mark.asyncio async def test_get_page_info_success(self): """测试页面信息获取成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = { "url": "https://example.com", "status_code": 200, "title": "Test Page", "meta_description": "A test page", } mock_scraper.simple_scraper.scrape = AsyncMock(return_value=mock_result) # Using individual parameter result = await get_page_info(url="https://example.com") assert result.success is True assert result.title == "Test Page" assert result.status_code == 200 @pytest.mark.asyncio async def test_check_robots_txt_success(self): """测试robots.txt检查成功""" with patch("extractor.server.web_scraper") as mock_scraper: mock_result = {"content": {"text": "User-agent: *\nDisallow: /admin/"}} mock_scraper.simple_scraper.scrape = AsyncMock(return_value=mock_result) # Using individual parameter result = await check_robots_txt(url="https://example.com") assert result.success is True assert "User-agent" in result.robots_content assert "example.com" in result.url @pytest.mark.asyncio async def test_check_robots_txt_not_found(self): """测试robots.txt不存在""" with patch("extractor.server.web_scraper") as mock_scraper: mock_scraper.simple_scraper.scrape = AsyncMock( return_value={"error": "404 Not Found"} ) # Using individual parameter result = await check_robots_txt(url="https://example.com") assert result.success is False assert "Could not fetch robots.txt" in result.error class TestMCPToolsAdvanced: """测试高级功能 MCP 工具""" @pytest.mark.asyncio async def test_scrape_with_stealth_success(self): """测试反检测抓取成功""" with ( patch("extractor.server.anti_detection_scraper"), patch("extractor.server.rate_limiter") as mock_limiter, patch("extractor.server.cache_manager") as mock_cache, patch("extractor.server.retry_manager") as mock_retry, ): mock_limiter.wait = AsyncMock() mock_cache.get.return_value = None mock_result = { "url": "https://example.com", "status_code": 200, "content": {"text": "Stealth content"}, } mock_retry.retry_async = AsyncMock(return_value=mock_result) result = await scrape_with_stealth( url="https://example.com", method="selenium", extract_config=None, wait_for_element=None, scroll_page=False, ) assert result.success is True assert result.data == mock_result @pytest.mark.asyncio async def 
test_fill_and_submit_form_success(self): """测试表单填写成功""" with ( patch("extractor.server.rate_limiter") as mock_limiter, patch("selenium.webdriver.Chrome") as mock_driver, patch("extractor.server.settings") as mock_settings, ): mock_limiter.wait = AsyncMock() mock_settings.browser_headless = True mock_settings.browser_timeout = 10 mock_driver_instance = Mock() mock_driver.return_value = mock_driver_instance result = await fill_and_submit_form( url="https://example.com/form", form_data={"#username": "test", "#password": "secret"}, submit=False, submit_button_selector=None, method="selenium", wait_for_element=None, ) # 由于复杂的浏览器交互,这里主要测试参数验证 assert hasattr(result, "success") @pytest.mark.asyncio async def test_extract_structured_data_success(self): """测试结构化数据提取成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_result = { "content": { "text": "Contact us at info@example.com or call 123-456-7890", "links": [ {"url": "https://facebook.com/page", "text": "Facebook"}, {"url": "https://twitter.com/page", "text": "Twitter"}, ], }, "title": "Contact Page", "meta_description": "Contact information", } mock_scraper.scrape_url = AsyncMock(return_value=mock_result) # Using individual parameters result = await extract_structured_data( url="https://example.com/contact", data_type="contact" ) assert result.success is True assert result.extracted_data is not None assert result.data_type == "contact" class TestMCPToolsServer: """测试服务器管理 MCP 工具""" @pytest.mark.asyncio async def test_get_server_metrics_success(self): """测试服务器指标获取成功""" with ( patch("extractor.server.metrics_collector") as mock_metrics, patch("extractor.server.cache_manager") as mock_cache, patch("extractor.server.settings") as mock_settings, ): mock_metrics.get_stats.return_value = { "total_requests": 100, "successful_requests": 95, "failed_requests": 5, } mock_cache.stats.return_value = {"cache_hits": 50, "cache_misses": 50} mock_settings.server_name = "Test Server" mock_settings.server_version = "0.1.5" result = await get_server_metrics() assert result.success is True assert result.total_requests == 100 assert result.successful_requests == 95 assert result.failed_requests == 5 @pytest.mark.asyncio async def test_clear_cache_success(self): """测试缓存清理成功""" with patch("extractor.server.cache_manager") as mock_cache: mock_cache.clear.return_value = None result = await clear_cache() assert result.success is True assert "Cache cleared successfully" in result.message mock_cache.clear.assert_called_once() class TestMCPToolsMarkdown: """测试 Markdown 转换 MCP 工具""" @pytest.mark.asyncio async def test_convert_webpage_to_markdown_success(self): """测试单页面Markdown转换成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.markdown_converter") as mock_converter, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_scrape_result = { "url": "https://example.com", "content": {"html": "<h1>Test</h1><p>Content</p>"}, "title": "Test Page", } mock_scraper.scrape_url = AsyncMock(return_value=mock_scrape_result) mock_conversion_result = { "success": True, "markdown": "# Test\n\nContent", "metadata": {"word_count": 2, "processing_time": 0.5}, } mock_converter.convert_webpage_to_markdown.return_value = ( mock_conversion_result ) result = await convert_webpage_to_markdown( url="https://example.com", method="simple", extract_main_content=True, include_metadata=True, custom_options=None, 
wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) assert result.success is True assert result.markdown_content == "# Test\n\nContent" @pytest.mark.asyncio async def test_batch_convert_webpages_to_markdown_success(self): """测试批量Markdown转换成功""" with ( patch("extractor.server.web_scraper") as mock_scraper, patch("extractor.server.markdown_converter") as mock_converter, ): mock_scrape_results = [ { "url": "https://example.com/1", "content": {"html": "<h1>Page 1</h1>"}, }, { "url": "https://example.com/2", "content": {"html": "<h1>Page 2</h1>"}, }, ] mock_scraper.scrape_multiple_urls = AsyncMock( return_value=mock_scrape_results ) mock_conversion_result = { "success": True, "results": [ {"success": True, "markdown": "# Page 1"}, {"success": True, "markdown": "# Page 2"}, ], "summary": {"total": 2, "successful": 2, "failed": 0}, } mock_converter.batch_convert_to_markdown.return_value = ( mock_conversion_result ) result = await batch_convert_webpages_to_markdown( urls=["https://example.com/1", "https://example.com/2"], method="simple", extract_main_content=True, include_metadata=True, custom_options=None, embed_images=False, embed_options=None, ) assert result.success is True assert result.total_urls == 2 class TestMCPToolsPDF: """测试 PDF 处理 MCP 工具""" @pytest.mark.asyncio async def test_convert_pdf_to_markdown_success(self): """测试PDF转Markdown成功""" with ( patch("extractor.server._get_pdf_processor") as mock_get_processor, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_processor = Mock() mock_processor.process_pdf = AsyncMock( return_value={ "success": True, "markdown": "# PDF Title\n\nPDF content", "metadata": {"pages": 10, "word_count": 500}, } ) mock_get_processor.return_value = mock_processor result = await convert_pdf_to_markdown( pdf_source="https://example.com/document.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is True assert result.content == "# PDF Title\n\nPDF content" @pytest.mark.asyncio async def test_convert_pdf_to_markdown_invalid_method(self): """测试PDF转换无效方法""" result = await convert_pdf_to_markdown( pdf_source="https://example.com/document.pdf", method="invalid-method", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is False assert "Method must be one of" in result.error @pytest.mark.asyncio async def test_batch_convert_pdfs_to_markdown_success(self): """测试批量PDF转换成功""" with ( patch("extractor.server._get_pdf_processor") as mock_get_processor, patch("extractor.server.rate_limiter") as mock_limiter, ): mock_limiter.wait = AsyncMock() mock_processor = Mock() mock_processor.batch_process_pdfs = AsyncMock( return_value={ "success": True, "results": [ {"success": True, "markdown": "# PDF 1"}, {"success": True, "markdown": "# PDF 2"}, ], "summary": {"total": 2, "successful": 2, "failed": 0}, } ) mock_get_processor.return_value = mock_processor result = await batch_convert_pdfs_to_markdown( pdf_sources=[ "https://example.com/doc1.pdf", "https://example.com/doc2.pdf", ], method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is True assert result.total_pdfs == 2 @pytest.mark.asyncio async def test_batch_convert_pdfs_to_markdown_empty_list(self): """测试批量PDF转换空列表""" result = await batch_convert_pdfs_to_markdown( pdf_sources=[], method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert result.success is False # Empty 
list should result in failed operation class TestMCPToolsValidation: """测试 MCP 工具参数验证""" @pytest.mark.asyncio async def test_invalid_urls_handling(self): """测试无效URL的一致性处理""" invalid_urls = [ "not-a-url", "ftp://example.com", # 非HTTP协议 "", # 空字符串 "http://", # 不完整URL ] for invalid_url in invalid_urls: # 测试单页面抓取 - 现在在函数内部验证 result = await scrape_webpage( url=invalid_url, method="simple", extract_config=None, wait_for_element=None, ) # 结果应该失败 assert result.success is False error_msg = result.error assert any( phrase in error_msg for phrase in [ "Invalid URL format", "No connection adapters", "Unsupported protocol", "Invalid schema", ] ) # 测试页面信息获取 result = await get_page_info(url=invalid_url) assert result.success is False error_msg = result.error assert any( phrase in error_msg for phrase in [ "Invalid URL format", "No connection adapters", "Unsupported protocol", "Invalid schema", ] ) @pytest.mark.asyncio async def test_method_validation_consistency(self): """测试方法参数验证的一致性""" invalid_methods = ["invalid", "unknown", "", "AUTO"] # 大写应该无效 for invalid_method in invalid_methods: # 测试不同工具的方法验证一致性 result = await scrape_webpage( url="https://example.com", method=invalid_method, extract_config=None, wait_for_element=None, ) assert result.success is False assert "Method must be one of" in result.error
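
The header comment in the file notes that the tools' BaseModel request classes were replaced by individual parameters declared with Annotated Field. As a rough sketch of what that parameter style looks like (assuming FastMCP's @app.tool() decorator and Pydantic's Field; the parameter names mirror the test calls above, while the real signatures live in extractor/server.py):

from typing import Annotated, Optional

from fastmcp import FastMCP
from pydantic import Field

app = FastMCP("extractor")

@app.tool()
async def scrape_webpage(
    url: Annotated[str, Field(description="Page URL (http/https)")],
    method: Annotated[str, Field(description="Scraping method, e.g. 'simple'")] = "simple",
    extract_config: Annotated[Optional[dict], Field(description="Optional extraction rules")] = None,
    wait_for_element: Annotated[Optional[str], Field(description="CSS selector to wait for")] = None,
):
    ...  # validation and scraping happen here in the real server

# @app.tool() wraps the function in a FunctionTool; the tests unwrap it via .fn,
# which is why the file above assigns scrape_webpage = server_module.scrape_webpage.fn.

With pytest and pytest-asyncio installed, the suite should run directly, e.g. pytest test_server_mcp_tools.py -v.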

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'
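
The same lookup from Python, as a minimal sketch using the requests library (the JSON field layout is an assumption; it is not documented on this page):

import requests

# Fetch this server's directory entry from the Glama MCP API.
response = requests.get(
    "https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp",
    timeout=10,
)
response.raise_for_status()
entry = response.json()  # exact schema is assumed, not shown here
print(entry)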

If you have feedback or need assistance with the MCP directory API, please join our Discord server.