test_scraper.py
""" Unit tests for WebScraper core functionality. ## WebScraper 引擎测试 (`test_scraper.py`) ### DataExtractor 类测试 测试基本 CSS 选择器数据提取、多元素提取、元素属性(href、src)提取和不存在选择器的处理。 ### WebScraper 核心类测试 测试 WebScraper 实例的正确创建、默认 HTTP 请求头生成、自动方法选择 (auto/simple/scrapy/selenium)、 不同方法的网页抓取、多 URL 并发抓取、网络错误和异常处理、响应时间、内容长度等元数据提取。 """ import pytest from unittest.mock import patch, Mock from bs4 import BeautifulSoup from extractor.scraper import WebScraper class TestDataExtractor: """ DataExtractor 类测试 - **简单选择器提取**: 测试基本 CSS 选择器数据提取 - **多元素提取**: 测试 `multiple: true` 配置的多元素提取 - **属性提取**: 测试元素属性(href、src)提取 - **错误处理**: 测试不存在选择器的处理 """ def test_simple_selector_extraction(self, sample_html): """测试基本 CSS 选择器数据提取""" soup = BeautifulSoup(sample_html, "html.parser") # 简单文本选择器 title = soup.select_one("title").get_text() assert title == "Test Page" heading = soup.select_one("h1").get_text() assert heading == "Test Heading" def test_multiple_element_extraction(self, sample_html): """测试多元素提取 (multiple: true 配置)""" soup = BeautifulSoup(sample_html, "html.parser") # 多段落提取 paragraphs = soup.select(".content p") assert len(paragraphs) == 2 # 提取所有链接 links = soup.select("a") assert len(links) >= 1 # 验证多元素内容提取 link_texts = [link.get_text() for link in links] assert any(link_texts) def test_attribute_extraction(self, sample_html): """测试元素属性(href、src)提取""" soup = BeautifulSoup(sample_html, "html.parser") # 提取链接 href 属性 link = soup.select_one("a") href = link.get("href") assert href == "https://example.com" # 提取其他属性 if link.has_attr("class"): classes = link.get("class") assert isinstance(classes, list) def test_nonexistent_selector_handling(self, sample_html): """测试不存在选择器的处理""" soup = BeautifulSoup(sample_html, "html.parser") # 选择不存在的元素 nonexistent = soup.select_one("#nonexistent") assert nonexistent is None # 选择不存在的多个元素 nonexistent_multiple = soup.select(".nonexistent-class") assert len(nonexistent_multiple) == 0 class TestBasicScraping: """Test basic scraping functionality.""" def test_html_parsing(self, sample_html): """Test HTML parsing with BeautifulSoup.""" soup = BeautifulSoup(sample_html, "html.parser") title = soup.find("title") assert title.get_text() == "Test Page" links = soup.find_all("a") assert len(links) >= 1 assert links[0].get("href") == "https://example.com" def test_css_selector_extraction(self, sample_html): """Test CSS selector based extraction.""" soup = BeautifulSoup(sample_html, "html.parser") # Test simple selector heading = soup.select_one("h1") assert heading.get_text() == "Test Heading" # Test multiple elements paragraphs = soup.select(".content p") assert len(paragraphs) == 2 # Test attribute extraction link_href = soup.select_one("a")["href"] assert link_href == "https://example.com" class TestWebScraper: """ WebScraper 核心类测试 - **初始化测试**: 验证配置正确加载 - **请求头生成**: 测试默认 HTTP 请求头生成 - **方法选择逻辑**: 测试自动方法选择 (auto/simple/scrapy/selenium) - **URL 抓取**: 测试不同方法的网页抓取 - **批量抓取**: 测试多 URL 并发抓取 - **错误恢复**: 测试网络错误和异常处理 - **元数据提取**: 测试响应时间、内容长度等元数据提取 """ @pytest.fixture def scraper(self): """WebScraper instance for testing.""" return WebScraper() def test_scraper_initialization(self, scraper): """ 测试 WebScraper 实例的正确创建和配置加载 验证 WebScraper 实例包含所有必要的组件 (scrapy_wrapper, selenium_scraper, simple_scraper) """ assert scraper is not None assert hasattr(scraper, "scrapy_wrapper") assert hasattr(scraper, "selenium_scraper") assert hasattr(scraper, "simple_scraper") def test_default_headers_generation(self, scraper): """ 测试默认 HTTP 请求头生成 验证 WebScraper 生成适当的默认 HTTP 请求头,包括 User-Agent、Accept 等标准头部 """ if hasattr(scraper, 
"_get_default_headers"): headers = scraper._get_default_headers() # 验证标准头部存在 assert "User-Agent" in headers assert "Accept" in headers assert "Accept-Language" in headers # 验证头部格式正确 assert isinstance(headers["User-Agent"], str) assert len(headers["User-Agent"]) > 0 else: pytest.skip("_get_default_headers method not found") @pytest.mark.asyncio async def test_method_selection_logic(self, scraper): """ 测试自动方法选择逻辑 (auto/simple/scrapy/selenium) 验证当 method="auto" 时,系统能够智能选择最合适的抓取方法 """ # Test that auto method defaults to something reasonable with patch.object(scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = {"url": "test", "method": "simple"} # This should not raise an error result = await scraper.scrape_url("https://example.com", method="auto") # Verify method selection worked assert mock_scrape.called @pytest.mark.asyncio async def test_scrape_url_simple_method(self, scraper): """ 测试简单 HTTP 方法抓取 验证使用 method="simple" 时能够正确进行基本的 HTTP 请求并返回预期的数据结构 """ # Mock the simple scraper mock_result = { "url": "https://example.com/", "status_code": 200, "title": "Mock Page", "content": {"text": "Mock Content", "links": [], "images": []}, } with patch.object(scraper.simple_scraper, "scrape", return_value=mock_result): result = await scraper.scrape_url("https://example.com", method="simple") assert result["url"] == "https://example.com/" assert result["status_code"] == 200 assert "content" in result @pytest.mark.asyncio async def test_scrape_url_with_extraction(self, scraper): """ 测试带数据提取配置的网页抓取 验证当提供 extract_config 参数时,能够从页面中提取指定数据 """ mock_result = { "url": "https://example.com", "status_code": 200, "title": "Mock Page", "content": {"text": "Mock Content", "links": [], "images": []}, "extracted_data": {"title": "Mock Page"}, } with patch.object(scraper.simple_scraper, "scrape", return_value=mock_result): result = await scraper.scrape_url( "https://example.com", method="simple", extract_config={"title": "title"}, ) # Check if extracted_data exists or if title is directly available if "extracted_data" in result: assert result["extracted_data"]["title"] == "Mock Page" else: assert result["title"] == "Mock Page" @pytest.mark.asyncio async def test_scrape_multiple_urls(self, scraper): """ 测试多 URL 并发抓取 验证能够同时处理多个 URL,提高抓取效率,并返回正确的结果结构 """ mock_result = { "url": "https://example.com", "status_code": 200, "title": "Mock Page", "content": {"text": "Mock Content", "links": [], "images": []}, } with patch.object(scraper, "scrape_url", return_value=mock_result): urls = ["https://example.com", "https://test.com"] results = await scraper.scrape_multiple_urls(urls, method="simple") assert len(results) == 2 # Check the actual structure returned if isinstance(results[0], dict) and "status_code" in results[0]: assert all(r["status_code"] == 200 for r in results) else: # Results might be wrapped differently assert len(results) == 2 @pytest.mark.asyncio async def test_scrape_url_error_handling(self, scraper): """ 测试网络错误和异常处理 验证当发生网络错误或其他异常时,系统能够优雅地处理并返回适当的错误信息 """ with patch.object( scraper.simple_scraper, "scrape", side_effect=Exception("Network error") ): result = await scraper.scrape_url("https://example.com", method="simple") # Check if error is handled properly - could be in different formats assert ( ("error" in result) or (result is None) or ("Network error" in str(result)) ) def test_extract_page_metadata(self, scraper): """ 测试响应时间、内容长度等元数据提取 验证能够从 HTTP 响应中提取响应时间、内容长度、内容类型等元数据信息 """ # Check if the method exists before testing if hasattr(scraper, "_extract_page_metadata"): mock_response = Mock() 
mock_response.headers = { "content-length": "1000", "content-type": "text/html", } mock_response.url = "https://example.com" metadata = scraper._extract_page_metadata( mock_response, start_time=0, end_time=1.5 ) assert metadata["content_length"] > 0 assert metadata["response_time"] == 1.5 assert metadata["final_url"] == "https://example.com" assert metadata["content_type"] == "text/html" else: # Skip test if method doesn't exist pytest.skip("_extract_page_metadata method not found") @pytest.mark.asyncio async def test_scrapy_method_mock(self, scraper): """Test Scrapy method execution (mocked).""" # Check if the method exists before testing if hasattr(scraper, "_run_scrapy_spider"): with patch.object(scraper, "_run_scrapy_spider") as mock_spider: mock_spider.return_value = [ {"url": "https://example.com", "content": "test"} ] result = await scraper._scrape_with_scrapy("https://example.com") assert result["url"] == "https://example.com" assert result["content"] == "test" elif hasattr(scraper, "scrapy_wrapper"): # Test through the scrapy_wrapper component mock_result = { "url": "https://example.com", "status_code": 200, "content": {"text": "test content"}, } with patch.object( scraper.scrapy_wrapper, "scrape", return_value=[mock_result] ): result = await scraper.scrape_url( "https://example.com", method="scrapy" ) assert result["url"] == "https://example.com" assert result["status_code"] == 200 else: # Skip test if neither method exists pytest.skip("Scrapy method not found") @pytest.mark.asyncio async def test_method_selection_logic(self, scraper): """Test automatic method selection logic.""" # Test that auto method defaults to something reasonable with patch.object(scraper, "scrape_url") as mock_scrape: mock_scrape.return_value = {"url": "test", "method": "simple"} # This should not raise an error result = await scraper.scrape_url("https://example.com", method="auto") # Verify method selection worked assert mock_scrape.called def test_scraper_attributes(self, scraper): """Test that scraper has expected attributes.""" # Test that the main components exist assert hasattr(scraper, "simple_scraper") assert hasattr(scraper, "scrapy_wrapper") assert hasattr(scraper, "selenium_scraper") # These components should not be None assert scraper.simple_scraper is not None assert scraper.scrapy_wrapper is not None assert scraper.selenium_scraper is not None
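These tests rely on a sample_html fixture that is not defined in this file; in a pytest suite it would normally live in conftest.py. The fixture below is a minimal, hypothetical sketch inferred from the assertions above (a title of "Test Page", an h1 of "Test Heading", two paragraphs under .content, and a single link to https://example.com); the markup shipped with the real project may differ.

# conftest.py — hypothetical sample_html fixture assumed by test_scraper.py.
# The markup is inferred from the assertions in the tests; the project's actual
# fixture may contain richer HTML.
import pytest


@pytest.fixture
def sample_html() -> str:
    return """
    <html>
      <head><title>Test Page</title></head>
      <body>
        <h1>Test Heading</h1>
        <div class="content">
          <p>First paragraph.</p>
          <p>Second paragraph.</p>
        </div>
        <a href="https://example.com">Example link</a>
      </body>
    </html>
    """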
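The DataExtractor docstring above describes selector configurations with multiple: true and attribute extraction, but the tests drive BeautifulSoup directly. The helper below is only an illustrative sketch of how such an extract_config could be interpreted with BeautifulSoup; the config schema and function name are assumptions for illustration, not the actual DataExtractor API in extractor.scraper.

# Illustrative only: interprets a config of the shape
# {"field": "selector"} or {"field": {"selector": "...", "multiple": True, "attr": "href"}}.
# The real DataExtractor may use a different schema.
from bs4 import BeautifulSoup


def extract(html: str, config: dict) -> dict:
    soup = BeautifulSoup(html, "html.parser")
    results = {}
    for field, spec in config.items():
        # Allow a bare selector string as shorthand for {"selector": ...}
        if isinstance(spec, str):
            spec = {"selector": spec}
        selector = spec["selector"]
        attr = spec.get("attr")  # e.g. "href" or "src"; None means element text
        if spec.get("multiple"):
            elements = soup.select(selector)
            results[field] = [
                el.get(attr) if attr else el.get_text(strip=True) for el in elements
            ]
        else:
            el = soup.select_one(selector)
            if el is None:
                # Mirror the "nonexistent selector" behaviour tested above
                results[field] = None
            else:
                results[field] = el.get(attr) if attr else el.get_text(strip=True)
    return results

Against the fixture sketched above, extract(sample_html, {"title": "title", "links": {"selector": "a", "multiple": True, "attr": "href"}}) would yield {"title": "Test Page", "links": ["https://example.com"]}.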
