Skip to main content
Glama

Scrapy MCP Server

by ThreeFish-AI
test_scraper_simple.py (3.67 kB)
"""Simplified unit tests for WebScraper core functionality.""" import pytest from bs4 import BeautifulSoup from extractor.scraper import WebScraper class TestBasicScraping: """Test basic scraping functionality.""" def test_html_parsing(self, sample_html): """Test HTML parsing with BeautifulSoup.""" soup = BeautifulSoup(sample_html, "html.parser") title = soup.find("title") assert title.get_text() == "Test Page" links = soup.find_all("a") assert len(links) >= 1 assert links[0].get("href") == "https://example.com" def test_css_selector_extraction(self, sample_html): """Test CSS selector based extraction.""" soup = BeautifulSoup(sample_html, "html.parser") # Test simple selector heading = soup.select_one("h1") assert heading.get_text() == "Test Heading" # Test multiple elements paragraphs = soup.select(".content p") assert len(paragraphs) == 2 # Test attribute extraction link_href = soup.select_one("a")["href"] assert link_href == "https://example.com" class TestWebScraperBasic: """Test basic WebScraper functionality.""" @pytest.fixture def scraper(self): """WebScraper instance.""" return WebScraper() def test_scraper_initialization(self, scraper): """Test WebScraper initializes correctly.""" assert scraper is not None assert hasattr(scraper, "scrapy_wrapper") assert hasattr(scraper, "selenium_scraper") assert hasattr(scraper, "simple_scraper") def test_scrape_url_method_exists(self, scraper): """Test scrape_url method exists.""" # Test that the method exists assert hasattr(scraper, "scrape_url") assert callable(getattr(scraper, "scrape_url")) @pytest.mark.asyncio async def test_scrape_multiple_urls_method_exists(self, scraper): """Test scrape_multiple_urls method exists.""" # Test that the method exists assert hasattr(scraper, "scrape_multiple_urls") assert callable(getattr(scraper, "scrape_multiple_urls")) class TestHTMLExtraction: """Test HTML content extraction.""" def test_title_extraction(self, sample_html): """Test title extraction from HTML.""" soup = 
BeautifulSoup(sample_html, "html.parser") title = soup.find("title") assert title is not None assert title.get_text().strip() == "Test Page" def test_link_extraction(self, sample_html): """Test link extraction from HTML.""" soup = BeautifulSoup(sample_html, "html.parser") links = soup.find_all("a") assert len(links) > 0 # Test first link first_link = links[0] assert first_link.get("href") == "https://example.com" assert first_link.get_text() == "Test Link" def test_form_detection(self, sample_html): """Test form detection in HTML.""" soup = BeautifulSoup(sample_html, "html.parser") forms = soup.find_all("form") assert len(forms) > 0 # Test form has inputs form = forms[0] inputs = form.find_all("input") assert len(inputs) >= 2 # username and password def test_list_extraction(self, sample_html): """Test list item extraction.""" soup = BeautifulSoup(sample_html, "html.parser") list_items = soup.select(".list li") assert len(list_items) == 3 item_texts = [item.get_text() for item in list_items] assert "Item 1" in item_texts assert "Item 2" in item_texts assert "Item 3" in item_texts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.