
Scrapy MCP Server

by ThreeFish-AI
test_cross_tool_integration.py (34.4 kB)
"""Cross-tool integration tests for combined functionality scenarios.""" import pytest import pytest_asyncio import asyncio from unittest.mock import patch from extractor.server import ( app, web_scraper, _get_pdf_processor, ) class TestCrossToolIntegration: """Integration tests for scenarios involving multiple tools working together.""" @pytest.fixture def pdf_processor(self): """创建 PDF 处理器实例用于测试""" return _get_pdf_processor() @pytest_asyncio.fixture async def all_tools(self, pdf_processor): """Get all MCP tools from the app.""" return await app.get_tools() @pytest.mark.asyncio async def test_webpage_to_pdf_to_markdown_workflow(self, all_tools, pdf_processor): """Test a complete workflow: scrape webpage, then process any PDFs found.""" scrape_tool = all_tools["scrape_webpage"] convert_pdf_tool = all_tools["convert_pdf_to_markdown"] # Mock webpage scraping that finds PDF links webpage_result = { "url": "https://example.com/research-page", "title": "Research Papers", "status_code": 200, "content": { "html": """ <html> <body> <h1>Research Papers</h1> <p>Here are some important research papers:</p> <ul> <li><a href="/papers/paper1.pdf">Machine Learning Basics</a></li> <li><a href="/papers/paper2.pdf">Deep Learning Advanced</a></li> <li><a href="https://external.com/paper3.pdf">Neural Networks</a></li> </ul> </body> </html> """, "links": [ { "url": "https://example.com/papers/paper1.pdf", "text": "Machine Learning Basics", }, { "url": "https://example.com/papers/paper2.pdf", "text": "Deep Learning Advanced", }, { "url": "https://external.com/paper3.pdf", "text": "Neural Networks", }, ], }, } pdf_processing_result = { "success": True, "text": "# Machine Learning Basics\n\nThis paper covers fundamental concepts...", "markdown": "# Machine Learning Basics\n\nThis paper covers fundamental concepts in machine learning.", "source": "https://example.com/papers/paper1.pdf", "method_used": "pymupdf", "pages_processed": 15, "word_count": 5000, "metadata": { "title": "Machine Learning Basics", "author": "Dr. 
Smith", "total_pages": 15, }, } with ( patch.object(web_scraper, "scrape_url") as mock_scrape, patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, ): mock_scrape.return_value = webpage_result mock_pdf.return_value = pdf_processing_result # Step 1: Scrape the webpage webpage_response = await scrape_tool.fn( url="https://example.com/research-page", method="simple", extract_config=None, wait_for_element=None, ) assert webpage_response.success is True # Extract PDF links from the scraped content pdf_links = [ link["url"] for link in webpage_response.data["content"]["links"] if link["url"].endswith(".pdf") ] assert len(pdf_links) == 3 # Step 2: Process the first PDF found first_pdf_url = pdf_links[0] pdf_response = await convert_pdf_tool.fn( pdf_source=first_pdf_url, method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert pdf_response.success is True assert pdf_response.pdf_source == first_pdf_url assert "Machine Learning Basics" in pdf_response.content # Verify the workflow executed correctly mock_scrape.assert_called_once_with( url="https://example.com/research-page", method="simple", extract_config=None, wait_for_element=None, ) mock_pdf.assert_called_once_with( pdf_source=first_pdf_url, method="auto", include_metadata=True, page_range=None, output_format="markdown", ) @pytest.mark.asyncio async def test_batch_scraping_with_pdf_extraction_workflow( self, all_tools, pdf_processor ): """Test batch webpage scraping followed by batch PDF processing.""" batch_scrape_tool = all_tools["scrape_multiple_webpages"] batch_pdf_tool = all_tools["batch_convert_pdfs_to_markdown"] # Mock batch scraping results with mixed content including PDFs batch_scrape_results = [ { "url": "https://site1.com", "title": "Site 1 - PDF Repository", "content": { "html": "<html><body><h1>PDF Collection</h1><a href='/doc1.pdf'>Document 1</a></body></html>", "links": [ {"url": "https://site1.com/doc1.pdf", "text": "Document 1"} ], }, }, { "url": "https://site2.com", "title": "Site 2 - Research Portal", "content": { "html": "<html><body><h1>Research</h1><a href='/research.pdf'>Research Paper</a></body></html>", "links": [ { "url": "https://site2.com/research.pdf", "text": "Research Paper", } ], }, }, ] batch_pdf_results = { "success": True, "results": [ { "success": True, "text": "Document 1 content...", "markdown": "# Document 1\n\nContent of document 1.", "source": "https://site1.com/doc1.pdf", "word_count": 1000, }, { "success": True, "text": "Research paper content...", "markdown": "# Research Paper\n\nContent of research paper.", "source": "https://site2.com/research.pdf", "word_count": 3000, }, ], "summary": { "total_pdfs": 2, "successful_count": 2, "failed": 0, "total_word_count": 4000, }, } with ( patch.object(web_scraper, "scrape_multiple_urls") as mock_batch_scrape, patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "batch_process_pdfs") as mock_batch_pdf, ): mock_batch_scrape.return_value = batch_scrape_results mock_batch_pdf.return_value = batch_pdf_results # Step 1: Batch scrape multiple websites scrape_urls = ["https://site1.com", "https://site2.com"] scrape_response = await batch_scrape_tool.fn( urls=scrape_urls, method="simple", extract_config=None, ) assert scrape_response.success is True assert len(scrape_response.results) == 2 # Extract all PDF links from scraped results all_pdf_links = [] for result in scrape_response.results: if ( result.success 
and result.data and "content" in result.data and "links" in result.data["content"] ): pdf_links = [ link["url"] for link in result.data["content"]["links"] if link["url"].endswith(".pdf") ] all_pdf_links.extend(pdf_links) assert len(all_pdf_links) == 2 # Step 2: Batch process all found PDFs pdf_response = await batch_pdf_tool.fn( pdf_sources=all_pdf_links, method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert pdf_response.success is True assert pdf_response.total_pdfs == 2 assert pdf_response.successful_count == 2 assert pdf_response.total_word_count == 4000 @pytest.mark.asyncio async def test_metrics_collection_across_multiple_tools( self, all_tools, pdf_processor ): """Test that metrics are collected properly across different tool usage.""" scrape_tool = all_tools["scrape_webpage"] pdf_tool = all_tools["convert_pdf_to_markdown"] markdown_tool = all_tools["convert_webpage_to_markdown"] metrics_tool = all_tools["get_server_metrics"] # Mock responses for different tools scrape_result = { "url": "https://test.com", "title": "Test Page", "content": {"html": "<html><body><h1>Test</h1></body></html>"}, } pdf_result = { "success": True, "text": "PDF content", "markdown": "# PDF Document\n\nContent", "source": "/test.pdf", "word_count": 100, } _ = { "success": True, "markdown": "# Webpage\n\nConverted content", "metadata": {"title": "Test Page", "word_count": 50}, } with ( patch.object(web_scraper, "scrape_url") as mock_scrape, patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, ): mock_scrape.return_value = scrape_result mock_pdf.return_value = pdf_result # Use multiple tools to generate metrics await scrape_tool.fn( url="https://test.com", method="auto", extract_config=None, wait_for_element=None, ) await pdf_tool.fn( pdf_source="/test.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) await markdown_tool.fn( url="https://test.com", method="auto", extract_main_content=True, include_metadata=True, custom_options=None, wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) # Check metrics collection metrics_response = await metrics_tool.fn() assert metrics_response.success is True # Verify metrics contain information about different operations # MetricsResponse has individual attributes, not a metrics dict assert hasattr(metrics_response, "total_requests") assert hasattr(metrics_response, "successful_requests") assert hasattr(metrics_response, "cache_stats") assert hasattr(metrics_response, "method_usage") @pytest.mark.asyncio async def test_error_propagation_across_tools(self, all_tools, pdf_processor): """Test how errors propagate when using multiple tools together.""" scrape_tool = all_tools["scrape_webpage"] pdf_tool = all_tools["convert_pdf_to_markdown"] # Mock a failed scraping operation with patch.object(web_scraper, "scrape_url") as mock_scrape: mock_scrape.side_effect = Exception("Network timeout") # First tool fails scrape_response = await scrape_tool.fn( url="https://unreachable.com", method="auto", extract_config=None, wait_for_element=None, ) assert scrape_response.success is False # Mock a failed PDF processing with ( patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, ): mock_pdf.return_value = { "success": False, "error": "PDF parsing failed", "source": "/corrupted.pdf", } # Second tool fails with proper error handling 
pdf_response = await pdf_tool.fn( pdf_source="/corrupted.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert pdf_response.success is False assert "PDF parsing failed" in ( pdf_response.error["message"] if isinstance(pdf_response.error, dict) else pdf_response.error ) @pytest.mark.asyncio async def test_resource_cleanup_across_multiple_tools( self, all_tools, pdf_processor ): """Test proper resource cleanup when using multiple tools.""" import gc # Track initial memory state gc.collect() initial_objects = len(gc.get_objects()) scrape_tool = all_tools["scrape_webpage"] pdf_tool = all_tools["convert_pdf_to_markdown"] batch_pdf_tool = all_tools["batch_convert_pdfs_to_markdown"] # Mock successful_count operations scrape_result = { "url": "https://test.com", "title": "Test", "content": {"html": "<html><body>Content</body></html>"}, } pdf_result = { "success": True, "text": "Content " * 1000, # Large content to test memory "markdown": "# Document\n\n" + "Paragraph.\n" * 500, "source": "/test.pdf", "word_count": 1000, } batch_pdf_result = { "success": True, "results": [pdf_result for _ in range(5)], "summary": {"total_pdfs": 5, "successful_count": 5, "failed": 0}, } with ( patch.object(web_scraper, "scrape_url") as mock_scrape, patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, patch.object(pdf_processor, "batch_process_pdfs") as mock_batch_pdf, ): mock_scrape.return_value = scrape_result mock_pdf.return_value = pdf_result mock_batch_pdf.return_value = batch_pdf_result # Perform multiple operations with large data for i in range(10): await scrape_tool.fn( url=f"https://test{i}.com", method="auto", extract_config=None, wait_for_element=None, ) await pdf_tool.fn( pdf_source=f"/test{i}.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) # Perform batch operation await batch_pdf_tool.fn( pdf_sources=[f"/batch{i}.pdf" for i in range(5)], method="auto", include_metadata=True, page_range=None, output_format="markdown", ) # Force garbage collection and check memory usage gc.collect() final_objects = len(gc.get_objects()) object_growth = final_objects - initial_objects # Allow reasonable object growth but detect potential leaks assert object_growth < 3000, ( f"Potential memory leak: {object_growth} new objects" ) @pytest.mark.asyncio async def test_concurrent_multi_tool_operations(self, all_tools, pdf_processor): """Test concurrent execution of different tools.""" scrape_tool = all_tools["scrape_webpage"] pdf_tool = all_tools["convert_pdf_to_markdown"] markdown_tool = all_tools["convert_webpage_to_markdown"] # Mock results for concurrent operations scrape_result = { "url": "https://concurrent-test.com", "title": "Concurrent Test", "content": {"html": "<html><body><h1>Test</h1></body></html>"}, } pdf_result = { "success": True, "text": "Concurrent PDF content", "markdown": "# Concurrent PDF\n\nContent", "source": "/concurrent.pdf", } _ = { "success": True, "markdown": "# Concurrent Markdown\n\nContent", "metadata": {"title": "Concurrent Test"}, } with ( patch.object(web_scraper, "scrape_url") as mock_scrape, patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, ): mock_scrape.return_value = scrape_result mock_pdf.return_value = pdf_result # Create concurrent tasks using different tools tasks = ( [ scrape_tool.fn( url=f"https://test{i}.com", method="auto", extract_config=None, 
wait_for_element=None, ) for i in range(3) ] + [ pdf_tool.fn( pdf_source=f"/test{i}.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) for i in range(3) ] + [ markdown_tool.fn( url=f"https://markdown{i}.com", method="auto", extract_main_content=True, include_metadata=True, custom_options=None, wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) for i in range(3) ] ) # Execute all concurrently results = await asyncio.gather(*tasks) # Verify all operations succeeded for result in results: assert result.success is True # Verify appropriate number of calls to each mock assert ( mock_scrape.call_count == 6 ) # 3 scrape + 3 markdown (which uses scraping) assert mock_pdf.call_count == 3 # 3 PDF operations class TestRealWorldIntegrationScenarios: """Integration tests simulating real-world usage scenarios.""" @pytest.fixture def pdf_processor(self): """创建 PDF 处理器实例用于测试""" return _get_pdf_processor() @pytest_asyncio.fixture async def scenario_tools(self): """Get tools commonly used together in real scenarios.""" tools = await app.get_tools() return { "scrape_webpage": tools["scrape_webpage"], "scrape_multiple_webpages": tools["scrape_multiple_webpages"], "convert_webpage_to_markdown": tools["convert_webpage_to_markdown"], "convert_pdf_to_markdown": tools["convert_pdf_to_markdown"], "batch_convert_pdfs_to_markdown": tools["batch_convert_pdfs_to_markdown"], "extract_links": tools["extract_links"], "get_server_metrics": tools["get_server_metrics"], "clear_cache": tools["clear_cache"], } @pytest.mark.asyncio async def test_research_paper_collection_scenario( self, scenario_tools, pdf_processor ): """Test a complete research paper collection workflow.""" # Scenario: User wants to collect and convert all research papers from an academic site # Step 1: Extract all links from the main page extract_links_tool = scenario_tools["extract_links"] links_result = { "url": "https://academic-site.com/papers", "links": [ { "url": "https://academic-site.com/paper1.pdf", "text": "Machine Learning", }, { "url": "https://academic-site.com/paper2.pdf", "text": "Deep Learning", }, { "url": "https://academic-site.com/paper3.html", "text": "Overview Page", }, { "url": "https://academic-site.com/paper4.pdf", "text": "Neural Networks", }, ], } with patch.object(web_scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = { "url": "https://academic-site.com/papers", "content": {"links": links_result["links"]}, } # Extract all links links_response = await extract_links_tool.fn( url="https://academic-site.com/papers", filter_domains=None, exclude_domains=None, internal_only=False, ) assert links_response.success is True # Step 2: Filter PDF links and batch process them pdf_links = [ "https://academic-site.com/paper1.pdf", "https://academic-site.com/paper2.pdf", "https://academic-site.com/paper4.pdf", ] batch_pdf_tool = scenario_tools["batch_convert_pdfs_to_markdown"] batch_result = { "success": True, "results": [ { "success": True, "markdown": f"# Paper {i}\n\nResearch content {i}.", "source": pdf_links[i - 1], "word_count": 1000 * i, "metadata": {"title": f"Research Paper {i}"}, } for i in range(1, 4) ], "summary": { "total_pdfs": 3, "successful_count": 3, "failed": 0, "total_word_count": 6000, }, } with ( patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "batch_process_pdfs") as mock_batch_pdf, ): mock_batch_pdf.return_value = batch_result batch_response = await batch_pdf_tool.fn( 
pdf_sources=pdf_links, method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert batch_response.success is True assert batch_response.successful_count == 3 assert batch_response.total_word_count == 6000 # Step 3: Also convert the overview HTML page to markdown markdown_tool = scenario_tools["convert_webpage_to_markdown"] with patch.object(web_scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = { "url": "https://academic-site.com/paper3.html", "title": "Research Overview", "content": { "html": "<html><body><h1>Research Overview</h1><p>Summary of all papers.</p></body></html>" }, } markdown_response = await markdown_tool.fn( url="https://academic-site.com/paper3.html", method="auto", extract_main_content=True, include_metadata=True, custom_options=None, wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) assert markdown_response.success is True assert "Research Overview" in markdown_response.markdown_content # Step 4: Check final metrics metrics_tool = scenario_tools["get_server_metrics"] metrics_response = await metrics_tool.fn() assert metrics_response.success is True @pytest.mark.asyncio async def test_website_documentation_backup_scenario( self, scenario_tools, pdf_processor ): """Test creating a complete backup of website documentation.""" # Scenario: User wants to backup all documentation pages as markdown # Step 1: Scrape the main documentation index scrape_tool = scenario_tools["scrape_webpage"] index_result = { "url": "https://docs.example.com", "title": "Documentation Index", "content": { "html": """ <html> <body> <h1>Documentation</h1> <nav> <ul> <li><a href="/getting-started">Getting Started</a></li> <li><a href="/api-reference">API Reference</a></li> <li><a href="/tutorials">Tutorials</a></li> <li><a href="/faq.pdf">FAQ (PDF)</a></li> </ul> </nav> </body> </html> """, "links": [ { "url": "https://docs.example.com/getting-started", "text": "Getting Started", }, { "url": "https://docs.example.com/api-reference", "text": "API Reference", }, {"url": "https://docs.example.com/tutorials", "text": "Tutorials"}, {"url": "https://docs.example.com/faq.pdf", "text": "FAQ (PDF)"}, ], }, } with patch.object(web_scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = index_result index_response = await scrape_tool.fn( url="https://docs.example.com", method="auto", extract_config=None, wait_for_element=None, ) assert index_response.success is True # Step 2: Batch convert all HTML pages to markdown html_pages = [ "https://docs.example.com/getting-started", "https://docs.example.com/api-reference", "https://docs.example.com/tutorials", ] batch_markdown_tool = scenario_tools["convert_webpage_to_markdown"] # Process each HTML page (simulate batch by calling individually) html_results = [] for i, url in enumerate(html_pages): with patch.object(web_scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = { "url": url, "title": f"Documentation Page {i + 1}", "content": { "html": f"<html><body><h1>Page {i + 1}</h1><p>Content for page {i + 1}</p></body></html>" }, } result = await batch_markdown_tool.fn( url=url, method="auto", extract_main_content=True, include_metadata=True, custom_options=None, wait_for_element=None, formatting_options=None, embed_images=False, embed_options=None, ) assert result.success is True html_results.append(result) # Step 3: Convert the PDF to markdown pdf_tool = scenario_tools["convert_pdf_to_markdown"] with ( patch("extractor.server._get_pdf_processor", 
return_value=pdf_processor), patch.object(pdf_processor, "process_pdf") as mock_pdf, ): mock_pdf.return_value = { "success": True, "markdown": "# FAQ\n\n## Q: How to get started?\nA: Follow the getting started guide.", "source": "https://docs.example.com/faq.pdf", "word_count": 50, } pdf_result = await pdf_tool.fn( pdf_source="https://docs.example.com/faq.pdf", method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert pdf_result.success is True # Verify complete documentation backup assert len(html_results) == 3 for result in html_results: assert "Page" in result.markdown_content assert "FAQ" in pdf_result.content @pytest.mark.asyncio async def test_competitive_analysis_scenario(self, scenario_tools, pdf_processor): """Test competitive analysis workflow across multiple competitor sites.""" # Scenario: Analyze multiple competitor websites and their resources competitor_urls = [ "https://competitor1.com", "https://competitor2.com", "https://competitor3.com", ] # Step 1: Batch scrape all competitor sites batch_scrape_tool = scenario_tools["scrape_multiple_webpages"] competitor_results = [ { "url": url, "title": f"Competitor {i + 1}", "content": { "html": f""" <html> <body> <h1>Competitor {i + 1}</h1> <p>Product features: Feature A, Feature B</p> <a href="/whitepaper{i + 1}.pdf">Download Whitepaper</a> </body> </html> """, "links": [ { "url": f"{url}/whitepaper{i + 1}.pdf", "text": "Download Whitepaper", } ], }, } for i, url in enumerate(competitor_urls) ] with patch.object(web_scraper, "scrape_multiple_urls") as mock_batch_scrape: mock_batch_scrape.return_value = competitor_results scrape_response = await batch_scrape_tool.fn( urls=competitor_urls, method="simple", extract_config=None, ) assert scrape_response.success is True assert len(scrape_response.results) == 3 # Step 2: Extract all whitepaper PDFs found pdf_urls = [] for result in competitor_results: if "content" in result and "links" in result["content"]: pdf_links = [ link["url"] for link in result["content"]["links"] if link["url"].endswith(".pdf") ] pdf_urls.extend(pdf_links) assert len(pdf_urls) == 3 # Step 3: Batch process all competitor whitepapers batch_pdf_tool = scenario_tools["batch_convert_pdfs_to_markdown"] whitepaper_results = { "success": True, "results": [ { "success": True, "markdown": f"# Competitor {i + 1} Whitepaper\n\nProduct analysis and features.", "source": pdf_urls[i], "word_count": (i + 1) * 500, "metadata": {"title": f"Competitor {i + 1} Whitepaper"}, } for i in range(3) ], "summary": { "total_pdfs": 3, "successful_count": 3, "total_word_count": 3000, # 500 + 1000 + 1500 }, } with ( patch("extractor.server._get_pdf_processor", return_value=pdf_processor), patch.object(pdf_processor, "batch_process_pdfs") as mock_batch_pdf, ): mock_batch_pdf.return_value = whitepaper_results pdf_response = await batch_pdf_tool.fn( pdf_sources=pdf_urls, method="auto", include_metadata=True, page_range=None, output_format="markdown", ) assert pdf_response.success is True assert pdf_response.total_word_count == 3000 # Step 4: Generate final metrics for the analysis metrics_tool = scenario_tools["get_server_metrics"] clear_cache_tool = scenario_tools["clear_cache"] # Clear cache and get final metrics await clear_cache_tool.fn() metrics_response = await metrics_tool.fn() assert metrics_response.success is True # In a real scenario, metrics would show the analysis activity
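For readers skimming the tests above, the short sketch below shows how the webpage to PDF to Markdown workflow exercised by test_webpage_to_pdf_to_markdown_workflow might be driven directly against the registered tools, without mocks. It is a minimal sketch, not part of the test suite: it assumes the tools can be invoked through their .fn(...) callables exactly as the test fixtures do, and the URL passed at the bottom is a placeholder.

import asyncio

from extractor.server import app


async def collect_pdfs_as_markdown(page_url: str):
    """Scrape a page, gather its PDF links, and batch-convert them to Markdown."""
    tools = await app.get_tools()
    scrape_tool = tools["scrape_webpage"]
    batch_pdf_tool = tools["batch_convert_pdfs_to_markdown"]

    # Step 1: scrape the page; the tests read links from response.data["content"]["links"].
    page = await scrape_tool.fn(
        url=page_url,
        method="simple",
        extract_config=None,
        wait_for_element=None,
    )
    pdf_links = [
        link["url"]
        for link in page.data["content"]["links"]
        if link["url"].endswith(".pdf")
    ]

    # Step 2: convert every discovered PDF in a single batch call.
    return await batch_pdf_tool.fn(
        pdf_sources=pdf_links,
        method="auto",
        include_metadata=True,
        page_range=None,
        output_format="markdown",
    )


if __name__ == "__main__":
    # Placeholder URL; the batch response exposes success, successful_count,
    # and total_word_count, as asserted in the tests above.
    asyncio.run(collect_pdfs_as_markdown("https://example.com/research-page"))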
