RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_scraper_models.py•9.97 KiB

"""Unit tests for scraper data models.""" from datetime import UTC, datetime from ragstack_common.scraper import ( ScrapeConfig, ScrapeJob, ScrapePage, ScrapeScope, ScrapeStatus, UrlStatus, ) class TestScrapeStatus: """Tests for ScrapeStatus enum.""" def test_enum_values(self): assert ScrapeStatus.PENDING.value == "pending" assert ScrapeStatus.DISCOVERING.value == "discovering" assert ScrapeStatus.PROCESSING.value == "processing" assert ScrapeStatus.COMPLETED.value == "completed" assert ScrapeStatus.COMPLETED_WITH_ERRORS.value == "completed_with_errors" assert ScrapeStatus.FAILED.value == "failed" assert ScrapeStatus.CANCELLED.value == "cancelled" def test_string_conversion(self): # str(Enum) inherits from str, so direct equality works assert ScrapeStatus.PENDING == "pending" assert ScrapeStatus.COMPLETED == "completed" class TestUrlStatus: """Tests for UrlStatus enum.""" def test_enum_values(self): assert UrlStatus.PENDING.value == "pending" assert UrlStatus.PROCESSING.value == "processing" assert UrlStatus.COMPLETED.value == "completed" assert UrlStatus.FAILED.value == "failed" assert UrlStatus.SKIPPED.value == "skipped" class TestScrapeScope: """Tests for ScrapeScope enum.""" def test_enum_values(self): assert ScrapeScope.SUBPAGES.value == "subpages" assert ScrapeScope.HOSTNAME.value == "hostname" assert ScrapeScope.DOMAIN.value == "domain" class TestScrapeConfig: """Tests for ScrapeConfig dataclass.""" def test_default_values(self): config = ScrapeConfig() assert config.max_pages == 1000 assert config.max_depth == 3 assert config.scope == ScrapeScope.SUBPAGES assert config.request_delay_ms == 500 assert config.include_patterns == [] assert config.exclude_patterns == [] assert config.cookies == {} assert config.headers == {} def test_custom_values(self): config = ScrapeConfig( max_pages=500, max_depth=5, scope=ScrapeScope.HOSTNAME, request_delay_ms=1000, include_patterns=["*/docs/*"], exclude_patterns=["*/blog/*"], cookies={"session": "abc123"}, headers={"Authorization": "Bearer token"}, ) assert config.max_pages == 500 assert config.max_depth == 5 assert config.scope == ScrapeScope.HOSTNAME assert config.request_delay_ms == 1000 assert config.include_patterns == ["*/docs/*"] assert config.exclude_patterns == ["*/blog/*"] assert config.cookies == {"session": "abc123"} assert config.headers == {"Authorization": "Bearer token"} def test_to_dict(self): config = ScrapeConfig(max_pages=100, scope=ScrapeScope.DOMAIN) data = config.to_dict() assert data["max_pages"] == 100 assert data["scope"] == "domain" assert data["max_depth"] == 3 assert data["request_delay_ms"] == 500 def test_from_dict(self): data = { "max_pages": 200, "max_depth": 4, "scope": "hostname", "request_delay_ms": 750, "include_patterns": ["*/api/*"], "exclude_patterns": [], "cookies": {}, "headers": {"X-Custom": "value"}, } config = ScrapeConfig.from_dict(data) assert config.max_pages == 200 assert config.max_depth == 4 assert config.scope == ScrapeScope.HOSTNAME assert config.request_delay_ms == 750 assert config.headers == {"X-Custom": "value"} def test_from_dict_defaults(self): config = ScrapeConfig.from_dict({}) assert config.max_pages == 1000 assert config.max_depth == 3 assert config.scope == ScrapeScope.SUBPAGES class TestScrapeJob: """Tests for ScrapeJob dataclass.""" def test_creation_minimal(self): job = ScrapeJob(job_id="job-123", base_url="https://example.com") assert job.job_id == "job-123" assert job.base_url == "https://example.com" assert job.status == ScrapeStatus.PENDING assert job.total_urls == 0 assert job.processed_count == 0 assert job.failed_count == 0 assert job.failed_urls == [] assert job.title is None assert job.step_function_arn is None assert isinstance(job.config, ScrapeConfig) def test_creation_full(self): config = ScrapeConfig(max_pages=50) job = ScrapeJob( job_id="job-456", base_url="https://docs.example.com", status=ScrapeStatus.PROCESSING, config=config, title="Example Docs", total_urls=100, processed_count=50, failed_count=2, failed_urls=[{"url": "https://example.com/404", "error": "Not found"}], step_function_arn="arn:aws:states:us-east-1:123456:execution:xyz", ) assert job.status == ScrapeStatus.PROCESSING assert job.title == "Example Docs" assert job.config.max_pages == 50 assert len(job.failed_urls) == 1 def test_to_dict(self): job = ScrapeJob( job_id="job-789", base_url="https://example.com", status=ScrapeStatus.COMPLETED, title="Test Job", ) data = job.to_dict() assert data["job_id"] == "job-789" assert data["base_url"] == "https://example.com" assert data["status"] == "completed" assert data["title"] == "Test Job" assert "created_at" in data assert "updated_at" in data assert isinstance(data["config"], dict) def test_to_dict_excludes_none(self): job = ScrapeJob(job_id="job-abc", base_url="https://example.com") data = job.to_dict() assert "title" not in data assert "step_function_arn" not in data def test_from_dict(self): now = datetime.now(UTC) data = { "job_id": "job-xyz", "base_url": "https://example.com/docs", "status": "processing", "config": {"max_pages": 75}, "title": "Documentation", "total_urls": 30, "processed_count": 10, "failed_count": 1, "failed_urls": [], "step_function_arn": "arn:aws:states:us-east-1:123:exec:abc", "created_at": now.isoformat(), "updated_at": now.isoformat(), } job = ScrapeJob.from_dict(data) assert job.job_id == "job-xyz" assert job.status == ScrapeStatus.PROCESSING assert job.config.max_pages == 75 assert job.title == "Documentation" assert job.step_function_arn == "arn:aws:states:us-east-1:123:exec:abc" def test_from_dict_defaults(self): data = {"job_id": "job-min", "base_url": "https://example.com"} job = ScrapeJob.from_dict(data) assert job.status == ScrapeStatus.PENDING assert job.total_urls == 0 assert job.config.max_pages == 1000 class TestScrapePage: """Tests for ScrapePage dataclass.""" def test_creation_minimal(self): page = ScrapePage(job_id="job-123", url="https://example.com/page1") assert page.job_id == "job-123" assert page.url == "https://example.com/page1" assert page.status == UrlStatus.PENDING assert page.depth == 0 assert page.content_hash is None assert page.document_id is None assert page.title is None assert page.error is None assert page.processed_at is None def test_creation_full(self): now = datetime.now(UTC) page = ScrapePage( job_id="job-456", url="https://example.com/page2", status=UrlStatus.COMPLETED, depth=2, content_hash="abc123def456", document_id="doc-789", title="Page Title", processed_at=now, ) assert page.status == UrlStatus.COMPLETED assert page.depth == 2 assert page.content_hash == "abc123def456" assert page.document_id == "doc-789" assert page.title == "Page Title" assert page.processed_at == now def test_to_dict(self): page = ScrapePage( job_id="job-abc", url="https://example.com/docs/intro", status=UrlStatus.COMPLETED, content_hash="hash123", document_id="doc-456", title="Introduction", ) data = page.to_dict() assert data["job_id"] == "job-abc" assert data["url"] == "https://example.com/docs/intro" assert data["status"] == "completed" assert data["content_hash"] == "hash123" assert data["document_id"] == "doc-456" assert data["title"] == "Introduction" def test_to_dict_excludes_none(self): page = ScrapePage(job_id="job-xyz", url="https://example.com") data = page.to_dict() assert "content_hash" not in data assert "document_id" not in data assert "title" not in data assert "error" not in data assert "processed_at" not in data def test_from_dict(self): now = datetime.now(UTC) data = { "job_id": "job-from", "url": "https://example.com/test", "status": "failed", "depth": 3, "error": "Connection timeout", "discovered_at": now.isoformat(), } page = ScrapePage.from_dict(data) assert page.job_id == "job-from" assert page.status == UrlStatus.FAILED assert page.depth == 3 assert page.error == "Connection timeout" def test_from_dict_defaults(self): data = {"job_id": "job-min", "url": "https://example.com/minimal"} page = ScrapePage.from_dict(data) assert page.status == UrlStatus.PENDING assert page.depth == 0 if __name__ == "__main__": import pytest pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_scraper_models.py•9.97 KiB