
Scrapy MCP Server

by ThreeFish-AI
test_updated_comprehensive_integration.py (25.4 kB)
""" 集成测试:更新的综合集成测试 测试整个MCP服务器功能的端到端集成,包括所有14个工具的协同工作 """ import pytest import asyncio import time from unittest.mock import patch, AsyncMock from extractor.server import app from extractor.config import settings class TestUpdatedComprehensiveIntegration: """更新的综合集成测试,覆盖所有主要功能""" @pytest.fixture def sample_html_content(self): """样本HTML内容用于测试""" return """ <!DOCTYPE html> <html> <head> <title>综合测试文章</title> <meta name="description" content="用于测试的综合示例文章"> <meta name="author" content="测试作者"> </head> <body> <nav>导航菜单</nav> <main> <article> <header> <h1>综合测试文章</h1> <p class="byline">作者:测试作者</p> </header> <div class="content"> <p>这是文章的主要内容,包含<strong>粗体</strong>和<em>斜体</em>文本。</p> <h2>演示的功能</h2> <ul> <li>HTML到Markdown转换</li> <li>高级格式化选项</li> <li>内容提取</li> <li>链接和图片处理</li> </ul> <table> <thead> <tr> <th>功能</th> <th>状态</th> <th>备注</th> </tr> </thead> <tbody> <tr> <td>网页爬取</td> <td>✅ 完成</td> <td>支持多种方法</td> </tr> <tr> <td>Markdown转换</td> <td>✅ 完成</td> <td>高级格式化</td> </tr> <tr> <td>PDF处理</td> <td>✅ 完成</td> <td>多引擎支持</td> </tr> </tbody> </table> <blockquote> <p>这是一个引用块,展示了丰富的HTML结构。</p> </blockquote> <div class="code-example"> <pre><code class="language-python"> def example_function(): return "Hello, World!" </code></pre> </div> <p>文章还包含一些链接: <a href="https://example.com">示例链接</a>和 <a href="/internal/page">内部链接</a>。 </p> <img src="/images/example.jpg" alt="示例图片" width="300" height="200"> </div> </article> </main> <aside>侧边栏内容</aside> <footer>页脚信息</footer> </body> </html> """ @pytest.fixture def sample_pdf_content(self): """样本PDF内容用于测试""" return "PDF文档内容\n\n这是一个测试PDF文档。\n\n包含多段落内容。" @pytest.mark.asyncio async def test_end_to_end_scrape_to_markdown_workflow(self, sample_html_content): """测试从网页爬取到Markdown转换的端到端工作流""" scrape_result = { "url": "https://example.com/article", "status_code": 200, "title": "综合测试文章", "content": { "text": "综合测试文章 作者:测试作者 这是文章的主要内容...", "html": sample_html_content, "links": [ {"url": "https://example.com", "text": "示例链接"}, {"url": "https://example.com/internal/page", "text": "内部链接"}, ], "images": [{"src": "/images/example.jpg", "alt": "示例图片"}], }, "meta_description": "用于测试的综合示例文章", } with ( patch("extractor.scraper.WebScraper.scrape_url") as mock_scrape, patch( "extractor.markdown_converter.MarkdownConverter.convert_webpage_to_markdown" ) as mock_convert, ): mock_scrape.return_value = scrape_result mock_convert.return_value = { "success": True, "url": "https://example.com/article", "markdown": "# 综合测试文章\n\n这是文章的主要内容,包含**粗体**和*斜体*文本。", "metadata": { "title": "综合测试文章", "word_count": 150, "links_count": 2, "images_count": 1, }, } # 验证工具链可以协同工作 scrape_tool = await app.get_tool("scrape_webpage") convert_tool = await app.get_tool("convert_webpage_to_markdown") assert scrape_tool is not None assert convert_tool is not None @pytest.mark.asyncio async def test_batch_processing_integration(self): """测试批量处理集成""" urls = ["https://example1.com", "https://example2.com", "https://example3.com"] batch_scrape_result = { "results": [ { "url": "https://example1.com", "status_code": 200, "title": "页面1", "content": {"text": "内容1", "html": "<h1>页面1</h1>"}, }, { "url": "https://example2.com", "status_code": 200, "title": "页面2", "content": {"text": "内容2", "html": "<h1>页面2</h1>"}, }, { "url": "https://example3.com", "status_code": 404, "error": "页面未找到", }, ], "summary": {"total": 3, "successful": 2, "failed": 1, "success_rate": 0.67}, } batch_convert_result = { "success": True, "results": [ { "success": True, "url": "https://example1.com", "markdown": "# 页面1\n\n内容1", }, { "success": True, "url": 
"https://example2.com", "markdown": "# 页面2\n\n内容2", }, ], "summary": {"total": 2, "successful": 2, "failed": 0}, } with ( patch( "extractor.scraper.WebScraper.scrape_multiple_urls" ) as mock_batch_scrape, patch( "extractor.markdown_converter.MarkdownConverter.batch_convert_to_markdown" ) as mock_batch_convert, ): mock_batch_scrape.return_value = batch_scrape_result mock_batch_convert.return_value = batch_convert_result batch_scrape_tool = await app.get_tool("scrape_multiple_webpages") batch_convert_tool = await app.get_tool( "batch_convert_webpages_to_markdown" ) assert batch_scrape_tool is not None assert batch_convert_tool is not None @pytest.mark.asyncio async def test_advanced_features_integration(self): """测试高级功能集成""" # 测试隐身爬取 stealth_result = { "url": "https://protected-site.com", "title": "受保护的页面", "content": { "text": "通过隐身技术获取的内容", "html": "<div>受保护的内容</div>", }, } # 测试表单处理 form_result = { "success": True, "results": { "#username": {"success": True, "value": "testuser"}, "#password": {"success": True, "value": "testpass"}, "_submit": {"success": True, "new_url": "https://site.com/dashboard"}, }, } # 测试结构化数据提取 structured_result = { "url": "https://ecommerce.com", "data": { "products": [ {"name": "产品1", "price": "$99.99"}, {"name": "产品2", "price": "$149.99"}, ], "contact": {"email": "info@ecommerce.com", "phone": "+1-555-0123"}, }, } with ( patch( "extractor.advanced_features.AntiDetectionScraper.scrape_with_stealth" ) as mock_stealth, patch("extractor.advanced_features.FormHandler.fill_form") as mock_form, ): mock_stealth.return_value = stealth_result mock_form.return_value = form_result stealth_tool = await app.get_tool("scrape_with_stealth") form_tool = await app.get_tool("fill_and_submit_form") structured_tool = await app.get_tool("extract_structured_data") assert stealth_tool is not None assert form_tool is not None assert structured_tool is not None @pytest.mark.asyncio async def test_pdf_processing_integration(self, sample_pdf_content): """测试PDF处理集成""" pdf_result = { "success": True, "source": "https://example.com/document.pdf", "text": sample_pdf_content, "markdown": f"# PDF文档\n\n{sample_pdf_content}", "metadata": { "pages_processed": 3, "word_count": 20, "method_used": "pymupdf", "file_size_bytes": 1024000, }, "output_format": "markdown", } batch_pdf_result = { "success": True, "results": [pdf_result, pdf_result], "summary": { "total_pdfs": 2, "successful": 2, "failed": 0, "total_pages_processed": 6, "total_words_extracted": 40, }, } with ( patch("extractor.pdf_processor.PDFProcessor.process_pdf") as mock_pdf, patch( "extractor.pdf_processor.PDFProcessor.batch_process_pdfs" ) as mock_batch_pdf, ): mock_pdf.return_value = pdf_result mock_batch_pdf.return_value = batch_pdf_result pdf_tool = await app.get_tool("convert_pdf_to_markdown") batch_pdf_tool = await app.get_tool("batch_convert_pdfs_to_markdown") assert pdf_tool is not None assert batch_pdf_tool is not None @pytest.mark.asyncio async def test_server_management_integration(self): """测试服务器管理集成""" metrics_result = { "server_info": { "name": settings.server_name, "version": settings.server_version, "uptime": "1h 23m 45s", }, "performance": { "total_requests": 1234, "successful_requests": 1200, "failed_requests": 34, "success_rate": 0.972, "average_response_time": 1.25, }, "cache": {"size": 150, "hit_rate": 0.85, "memory_usage": "25MB"}, "methods": { "simple": 500, "scrapy": 600, "selenium": 100, "playwright": 34, }, } cache_clear_result = { "success": True, "message": "缓存已清理", "items_cleared": 150, "memory_freed": "25MB", 
} with ( patch("extractor.utils.MetricsCollector.get_stats") as mock_metrics, patch("extractor.utils.CacheManager.clear") as mock_cache_clear, ): mock_metrics.return_value = metrics_result mock_cache_clear.return_value = cache_clear_result metrics_tool = await app.get_tool("get_server_metrics") cache_tool = await app.get_tool("clear_cache") assert metrics_tool is not None assert cache_tool is not None @pytest.mark.asyncio async def test_information_extraction_integration(self): """测试信息提取工具集成""" page_info_result = { "url": "https://example.com", "status": 200, "title": "示例页面", "description": "这是一个示例页面", "content_type": "text/html", "content_length": 5000, "last_modified": "2024-01-15", "server": "nginx/1.18.0", } links_result = { "url": "https://example.com", "links": [ { "url": "https://example.com/page1", "text": "页面1", "type": "internal", }, {"url": "https://external.com", "text": "外部链接", "type": "external"}, {"url": "mailto:info@example.com", "text": "联系我们", "type": "email"}, ], "summary": { "total_links": 3, "internal_links": 1, "external_links": 1, "email_links": 1, }, } robots_result = { "url": "https://example.com/robots.txt", "exists": True, "content": "User-agent: *\nDisallow: /admin/\nAllow: /", "rules": [{"user_agent": "*", "disallow": ["/admin/"], "allow": ["/"]}], "sitemap_urls": ["https://example.com/sitemap.xml"], } with ( patch("extractor.server.get_page_info") as mock_page_info, patch("extractor.server.extract_links") as mock_links, patch("extractor.server.check_robots_txt") as mock_robots, ): mock_page_info.return_value = page_info_result mock_links.return_value = links_result mock_robots.return_value = robots_result page_info_tool = await app.get_tool("get_page_info") links_tool = await app.get_tool("extract_links") robots_tool = await app.get_tool("check_robots_txt") assert page_info_tool is not None assert links_tool is not None assert robots_tool is not None class TestPerformanceAndLoadIntegration: """性能和负载集成测试""" @pytest.mark.slow @pytest.mark.asyncio async def test_concurrent_tool_access_performance(self): """测试并发工具访问性能""" async def access_tools_concurrently(): tasks = [] tool_names = [ "scrape_webpage", "convert_webpage_to_markdown", "get_server_metrics", "extract_links", "get_page_info", ] for tool_name in tool_names: task = app.get_tool(tool_name) tasks.append(task) return await asyncio.gather(*tasks) start_time = time.time() results = await access_tools_concurrently() end_time = time.time() access_time = end_time - start_time # 验证所有工具都能成功访问 assert all(tool is not None for tool in results) # 并发访问应该在合理时间内完成 assert access_time < 1.0, f"并发工具访问时间 {access_time:.2f}s 过长" @pytest.mark.slow @pytest.mark.asyncio async def test_batch_processing_scalability(self): """测试批量处理可扩展性""" # 模拟大量URL的批量处理 large_url_list = [f"https://example{i}.com" for i in range(100)] batch_result = { "results": [ { "url": url, "status_code": 200, "title": f"页面{i}", "content": {"text": f"内容{i}"}, } for i, url in enumerate(large_url_list[:10]) # 只模拟前10个成功 ] + [ {"url": url, "error": "超时"} for url in large_url_list[10:15] # 5个失败 ], "summary": { "total": 100, "successful": 10, "failed": 5, "skipped": 85, # 其他跳过以节省测试时间 }, } with patch("extractor.scraper.WebScraper.scrape_multiple_urls") as mock_batch: mock_batch.return_value = batch_result batch_tool = await app.get_tool("scrape_multiple_webpages") assert batch_tool is not None @pytest.mark.asyncio async def test_memory_usage_integration(self): """测试内存使用集成""" try: import psutil except ImportError: pytest.skip("psutil not available for memory 
monitoring") import os process = psutil.Process(os.getpid()) initial_memory = process.memory_info().rss # 执行一系列操作 tools = await app.get_tools() # 访问所有工具 for tool_name in tools.keys(): tool = await app.get_tool(tool_name) assert tool is not None final_memory = process.memory_info().rss memory_increase = final_memory - initial_memory # 内存增长应该在合理范围内(比如小于50MB) assert memory_increase < 50 * 1024 * 1024, ( f"内存增长 {memory_increase / 1024 / 1024:.1f}MB 过大" ) class TestErrorHandlingAndResilience: """错误处理和恢复性集成测试""" @pytest.mark.asyncio async def test_network_failure_resilience(self): """测试网络故障恢复性""" with patch("extractor.scraper.WebScraper.scrape_url") as mock_scrape: # 模拟网络故障 mock_scrape.side_effect = [ Exception("连接超时"), Exception("DNS解析失败"), { # 第三次尝试成功 "url": "https://example.com", "status_code": 200, "title": "成功页面", "content": {"text": "成功获取内容"}, }, ] # 工具应该能够处理这些错误 scrape_tool = await app.get_tool("scrape_webpage") assert scrape_tool is not None @pytest.mark.asyncio async def test_invalid_input_handling(self): """测试无效输入处理""" # 所有工具都应该存在并能处理基本的验证 tools = await app.get_tools() critical_tools = [ "scrape_webpage", "convert_webpage_to_markdown", "convert_pdf_to_markdown", "scrape_with_stealth", ] for tool_name in critical_tools: assert tool_name in tools, f"关键工具 {tool_name} 未注册" tool = tools[tool_name] assert tool is not None, f"工具 {tool_name} 为None" @pytest.mark.asyncio async def test_resource_exhaustion_handling(self): """测试资源耗尽处理""" # 模拟资源耗尽情况 with patch("tempfile.mkdtemp") as mock_mkdtemp: mock_mkdtemp.side_effect = OSError("磁盘空间不足") # 系统应该能够优雅地处理这种情况 pdf_tool = await app.get_tool("convert_pdf_to_markdown") assert pdf_tool is not None @pytest.mark.asyncio async def test_configuration_validation(self): """测试配置验证""" # 验证关键配置项 assert settings.server_name is not None assert settings.server_version is not None assert settings.concurrent_requests > 0 assert settings.request_timeout > 0 assert settings.browser_timeout > 0 # 验证工具能够使用这些配置 tools = await app.get_tools() assert len(tools) == 14 class TestSecurityAndCompliance: """安全和合规集成测试""" @pytest.mark.asyncio async def test_robots_txt_compliance_integration(self): """测试robots.txt合规性集成""" robots_result = { "url": "https://example.com/robots.txt", "exists": True, "content": "User-agent: *\nDisallow: /private/\nCrawl-delay: 1", "rules": [{"user_agent": "*", "disallow": ["/private/"], "crawl_delay": 1}], } with patch( "extractor.server.web_scraper.simple_scraper.scrape", new_callable=AsyncMock ) as mock_scrape: # Mock the robots.txt scraping result - no error means success mock_scrape.return_value = { "content": {"text": robots_result["content"]}, "status_code": 200, } # Get the check_robots_txt tool from the FastMCP app from extractor.server import check_robots_txt result = await check_robots_txt.fn(url="https://example.com") assert result.success is True assert "robots.txt" in result.robots_txt_url assert "Disallow: /private/" in result.robots_content @pytest.mark.asyncio async def test_user_agent_and_rate_limiting(self): """测试User-Agent和速率限制""" # 验证配置中的User-Agent和速率限制设置 assert settings.use_random_user_agent is not None assert settings.default_user_agent is not None assert settings.rate_limit_requests_per_minute > 0 # 验证相关工具存在 scrape_tool = await app.get_tool("scrape_webpage") stealth_tool = await app.get_tool("scrape_with_stealth") assert scrape_tool is not None assert stealth_tool is not None @pytest.mark.asyncio async def test_data_privacy_compliance(self): """测试数据隐私合规""" # 验证工具不会意外存储敏感信息 tools = await app.get_tools() # 所有工具都应该正确注册且可访问 for 
tool_name, tool in tools.items(): assert tool is not None assert hasattr(tool, "name") assert tool.name == tool_name class TestBackwardCompatibilityAndUpgrade: """向后兼容性和升级测试""" @pytest.mark.asyncio async def test_api_backward_compatibility(self): """测试API向后兼容性""" # 验证所有预期的工具仍然存在 expected_core_tools = [ "scrape_webpage", "convert_webpage_to_markdown", "convert_pdf_to_markdown", ] tools = await app.get_tools() for tool_name in expected_core_tools: assert tool_name in tools, f"核心工具 {tool_name} 缺失,可能破坏向后兼容性" @pytest.mark.asyncio async def test_configuration_upgrade_compatibility(self): """测试配置升级兼容性""" # 验证配置系统能够处理新旧配置格式 assert hasattr(settings, "server_name") assert hasattr(settings, "server_version") assert hasattr(settings, "concurrent_requests") assert hasattr(settings, "enable_caching") # 新功能配置也应该存在 assert hasattr(settings, "browser_headless") assert hasattr(settings, "use_random_user_agent") @pytest.mark.asyncio async def test_tool_interface_stability(self): """测试工具接口稳定性""" tools = await app.get_tools() # 验证所有工具都有稳定的接口 for tool_name, tool in tools.items(): assert tool is not None assert hasattr(tool, "name") assert hasattr(tool, "description") assert tool.name == tool_name assert isinstance(tool.description, str) assert len(tool.description) > 0
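The tests above use @pytest.mark.asyncio and tag long-running cases with a slow marker, so running them presumably requires pytest with an asyncio plugin and the markers registered in the project's pytest configuration. A typical invocation that skips the slow cases (the tests/ path is an assumption about the repository layout):

pytest tests/test_updated_comprehensive_integration.py -m "not slow"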

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'
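The same endpoint can also be queried programmatically; a minimal Python sketch using only the standard library (the structure of the returned JSON is not documented here, so the example simply prints the payload rather than assuming specific fields):

import json
import urllib.request

url = "https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp"
with urllib.request.urlopen(url) as resp:
    server_info = json.load(resp)

# Dump whatever the directory returns for this server; the exact keys
# are an assumption, so no specific field is accessed here.
print(json.dumps(server_info, indent=2, ensure_ascii=False))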
