test_scrapers.py
""" Unit tests for the scraper implementations. """ import pytest from unittest.mock import patch, MagicMock, AsyncMock from services.scraping.requests_scraper import RequestsScraper from services.scraping.puppeteer_scraper import PuppeteerScraper from services.scraping.adaptive_scraper import AdaptiveScraper from services.scraping.scraper_factory import ScraperFactory class TestRequestsScraper: """Tests for the RequestsScraper implementation.""" @pytest.mark.asyncio async def test_scrape_url_impl(self): """Test the _scrape_url_impl method.""" # Mock aiohttp.ClientSession mock_session = AsyncMock() mock_response = AsyncMock() mock_response.status = 200 mock_response.text = AsyncMock(return_value="<html><head><title>Test</title></head><body><main>Content</main></body></html>") mock_session.__aenter__.return_value = mock_session mock_session.get.return_value.__aenter__.return_value = mock_response with patch("aiohttp.ClientSession", return_value=mock_session): scraper = RequestsScraper() result = await scraper._scrape_url_impl("https://example.com") # Check result assert result is not None assert "html" in result assert "title" in result assert result["method"] == "requests" # Check that session.get was called with the correct arguments mock_session.get.assert_called_once() args, kwargs = mock_session.get.call_args assert args[0] == "https://example.com" assert "timeout" in kwargs assert "headers" in kwargs class TestPuppeteerScraper: """Tests for the PuppeteerScraper implementation.""" @pytest.mark.asyncio async def test_scrape_url_impl(self): """Test the _scrape_url_impl method.""" # Mock PuppeteerPool mock_pool = MagicMock() mock_browser = AsyncMock() mock_page = AsyncMock() # Configure mocks mock_pool.get_browser = AsyncMock(return_value=mock_browser) mock_pool.release_browser = AsyncMock() mock_browser.newPage = AsyncMock(return_value=mock_page) mock_page.setRequestInterception = AsyncMock() mock_page.setDefaultNavigationTimeout = AsyncMock() mock_page.goto = AsyncMock() mock_page.content = AsyncMock(return_value="<html><head><title>Test</title></head><body>Content</body></html>") mock_page.evaluate = AsyncMock(side_effect=["Test Title", "Test Description"]) mock_page.close = AsyncMock() # Create scraper with mock pool scraper = PuppeteerScraper(pool=mock_pool) # Call method result = await scraper._scrape_url_impl("https://example.com") # Check result assert result is not None assert "html" in result assert "title" in result assert "description" in result assert result["method"] == "puppeteer" # Check that pool methods were called mock_pool.get_browser.assert_called_once() mock_pool.release_browser.assert_called_once_with(mock_browser) # Check that browser and page methods were called mock_browser.newPage.assert_called_once() mock_page.goto.assert_called_once() mock_page.content.assert_called_once() assert mock_page.evaluate.call_count == 2 mock_page.close.assert_called_once() class TestAdaptiveScraper: """Tests for the AdaptiveScraper implementation.""" @pytest.mark.asyncio async def test_scrape_url_impl_requests_success(self): """Test the _scrape_url_impl method when requests method succeeds.""" # Create scraper with mock scrapers scraper = AdaptiveScraper() # Mock the scrapers scraper.requests_scraper = MagicMock() scraper.puppeteer_scraper = MagicMock() # Configure mocks scraper.requests_scraper._scrape_url_impl = AsyncMock(return_value={ "html": "<html><body>Content</body></html>", "title": "Test", "description": "Description", "method": "requests" }) # Call method result = await 
scraper._scrape_url_impl("https://example.com") # Check result assert result is not None assert result["method"] == "requests" # Check that requests scraper was called but puppeteer scraper was not scraper.requests_scraper._scrape_url_impl.assert_called_once() scraper.puppeteer_scraper._scrape_url_impl.assert_not_called() @pytest.mark.asyncio async def test_scrape_url_impl_requests_fallback(self): """Test the _scrape_url_impl method when requests method fails and falls back to puppeteer.""" # Create scraper with mock scrapers scraper = AdaptiveScraper() # Mock the scrapers scraper.requests_scraper = MagicMock() scraper.puppeteer_scraper = MagicMock() # Configure mocks scraper.requests_scraper._scrape_url_impl = AsyncMock(return_value=None) scraper.puppeteer_scraper._scrape_url_impl = AsyncMock(return_value={ "html": "<html><body>Content</body></html>", "title": "Test", "description": "Description", "method": "puppeteer" }) # Call method result = await scraper._scrape_url_impl("https://example.com") # Check result assert result is not None assert result["method"] == "puppeteer" # Check that both scrapers were called scraper.requests_scraper._scrape_url_impl.assert_called_once() scraper.puppeteer_scraper._scrape_url_impl.assert_called_once() @pytest.mark.asyncio async def test_scrape_url_impl_js_required_domain(self): """Test the _scrape_url_impl method with a domain that requires JavaScript.""" # Create scraper with mock scrapers scraper = AdaptiveScraper() # Add test domain to JS_REQUIRED_DOMAINS scraper.JS_REQUIRED_DOMAINS.add("twitter.example.com") # Mock the scrapers scraper.requests_scraper = MagicMock() scraper.puppeteer_scraper = MagicMock() # Configure mocks scraper.puppeteer_scraper._scrape_url_impl = AsyncMock(return_value={ "html": "<html><body>Content</body></html>", "title": "Test", "description": "Description", "method": "puppeteer" }) # Call method result = await scraper._scrape_url_impl("https://twitter.example.com/page") # Check result assert result is not None assert result["method"] == "puppeteer" # Check that only puppeteer scraper was called scraper.requests_scraper._scrape_url_impl.assert_not_called() scraper.puppeteer_scraper._scrape_url_impl.assert_called_once() class TestScraperFactory: """Tests for the ScraperFactory.""" def test_create_scraper(self): """Test the create_scraper method.""" # Clear existing instances ScraperFactory._instances = {} ScraperFactory._puppeteer_pool = None # Create scrapers requests_scraper = ScraperFactory.create_scraper("requests") puppeteer_scraper = ScraperFactory.create_scraper("puppeteer") adaptive_scraper = ScraperFactory.create_scraper("adaptive") # Check types assert isinstance(requests_scraper, RequestsScraper) assert isinstance(puppeteer_scraper, PuppeteerScraper) assert isinstance(adaptive_scraper, AdaptiveScraper) # Check singleton pattern assert ScraperFactory.create_scraper("requests") is requests_scraper assert ScraperFactory.create_scraper("puppeteer") is puppeteer_scraper assert ScraperFactory.create_scraper("adaptive") is adaptive_scraper # Check puppeteer pool assert ScraperFactory._puppeteer_pool is not None assert puppeteer_scraper.pool is ScraperFactory._puppeteer_pool
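For context, a minimal sketch of the fallback behavior the TestAdaptiveScraper cases exercise. This is an assumption-based reconstruction, not the real services.scraping.adaptive_scraper implementation: the attribute names (requests_scraper, puppeteer_scraper, JS_REQUIRED_DOMAINS) and the None-means-failure convention are inferred from what the tests mock and assert.

    from urllib.parse import urlparse


    class AdaptiveScraperSketch:
        """Hypothetical sketch of the fallback logic the tests above imply."""

        # Domains assumed to require JavaScript rendering (browser-based scraping).
        JS_REQUIRED_DOMAINS = set()

        def __init__(self, requests_scraper, puppeteer_scraper):
            self.requests_scraper = requests_scraper
            self.puppeteer_scraper = puppeteer_scraper

        async def _scrape_url_impl(self, url: str):
            domain = urlparse(url).netloc
            # Known JS-heavy domains skip the lightweight HTTP scraper entirely,
            # matching test_scrape_url_impl_js_required_domain.
            if domain in self.JS_REQUIRED_DOMAINS:
                return await self.puppeteer_scraper._scrape_url_impl(url)
            # Try the cheap HTTP scraper first; a None result signals failure
            # and triggers the browser fallback, matching
            # test_scrape_url_impl_requests_fallback.
            result = await self.requests_scraper._scrape_url_impl(url)
            if result is not None:
                return result
            return await self.puppeteer_scraper._scrape_url_impl(url)

Note that the @pytest.mark.asyncio markers require the pytest-asyncio plugin to be installed alongside pytest for the suite to run.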
