"""Unit tests for scrape_discover Lambda handler."""
import importlib.util
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
def _load_scrape_discover_module():
"""Load scrape_discover module using importlib (avoids 'lambda' keyword issue)."""
module_path = Path(__file__).parent.parent.parent.parent / "src/lambda/scrape_discover/index.py"
spec = importlib.util.spec_from_file_location("scrape_discover_index", module_path)
module = importlib.util.module_from_spec(spec)
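    # Register the module under a stable name so it can be looked up (and patched) via sys.modules.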
sys.modules["scrape_discover_index"] = module
spec.loader.exec_module(module)
return module
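# Small test-only helper (not part of the handler code) that builds the single-record SQS event
# used throughout these tests; the body shape mirrors the discovery messages the handler consumes.
def _sqs_event(job_id, url, depth=0):
    """Return an SQS event wrapping one discovery message for the given job, URL, and depth."""
    message = {"job_id": job_id, "url": url, "depth": depth}
    return {"Records": [{"body": json.dumps(message)}]}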
@pytest.fixture
def _mock_env(monkeypatch):
"""Set up environment variables for tests."""
monkeypatch.setenv("SCRAPE_JOBS_TABLE", "test-jobs-table")
monkeypatch.setenv("SCRAPE_URLS_TABLE", "test-urls-table")
monkeypatch.setenv("SCRAPE_DISCOVERY_QUEUE_URL", "https://sqs.us-east-1.amazonaws.com/123/disc")
monkeypatch.setenv(
"SCRAPE_PROCESSING_QUEUE_URL", "https://sqs.us-east-1.amazonaws.com/123/proc"
)
monkeypatch.setenv("REQUEST_DELAY_MS", "0") # No delay for tests
monkeypatch.setenv("LOG_LEVEL", "INFO")
monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1")
@pytest.fixture
def mock_aws(_mock_env):
"""Set up AWS mocks for DynamoDB and SQS."""
with patch("boto3.resource") as mock_resource, patch("boto3.client") as mock_client:
# Mock DynamoDB tables
mock_jobs_table = MagicMock()
mock_urls_table = MagicMock()
mock_dynamodb = MagicMock()
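        # Route dynamodb.Table(name) calls to the matching mock based on the table name.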
mock_dynamodb.Table.side_effect = lambda name: (
mock_jobs_table if "jobs" in name else mock_urls_table
)
mock_resource.return_value = mock_dynamodb
# Mock SQS
mock_sqs = MagicMock()
mock_client.return_value = mock_sqs
yield {
"jobs_table": mock_jobs_table,
"urls_table": mock_urls_table,
"sqs": mock_sqs,
}
@pytest.fixture
def _mock_fetcher():
"""Mock the HTTP fetcher (fixture provides side-effect patching)."""
with patch("ragstack_common.scraper.fetcher.HttpFetcher") as mock_cls:
mock_instance = MagicMock()
mock_instance.fetch.return_value = MagicMock(
error=None,
is_html=True,
content="<html><body><h1>Test</h1></body></html>",
status_code=200,
)
mock_cls.return_value = mock_instance
yield mock_instance
class TestScrapeDiscoverHandler:
"""Tests for scrape_discover lambda_handler."""
def test_missing_jobs_table_env(self, monkeypatch):
"""Test error when SCRAPE_JOBS_TABLE is missing."""
monkeypatch.delenv("SCRAPE_JOBS_TABLE", raising=False)
monkeypatch.setenv("SCRAPE_URLS_TABLE", "test-urls-table")
monkeypatch.setenv("SCRAPE_DISCOVERY_QUEUE_URL", "https://sqs.example.com/disc")
monkeypatch.setenv("SCRAPE_PROCESSING_QUEUE_URL", "https://sqs.example.com/proc")
monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1")
module = _load_scrape_discover_module()
with pytest.raises(ValueError, match="SCRAPE_JOBS_TABLE"):
module.lambda_handler({"Records": []}, None)
def test_sqs_message_parsing(self, mock_aws, _mock_fetcher):
"""Test SQS message parsing from event."""
mock_aws["jobs_table"].get_item.return_value = {
"Item": {
"job_id": "test-job-123",
"status": "running",
"base_url": "https://example.com",
"config": {"max_depth": 3, "max_pages": 100},
"total_urls": 0,
}
}
mock_aws["urls_table"].get_item.return_value = {} # URL not visited yet
module = _load_scrape_discover_module()
        event = _sqs_event("test-job-123", "https://example.com/page1")
result = module.lambda_handler(event, None)
assert result["processed"] == 1
mock_aws["urls_table"].put_item.assert_called_once()
def test_duplicate_url_handling(self, mock_aws, _mock_fetcher):
"""Test that already-visited URLs are skipped."""
mock_aws["jobs_table"].get_item.return_value = {
"Item": {"job_id": "test-job-123", "status": "running"}
}
mock_aws["urls_table"].get_item.return_value = {
"Item": {"job_id": "test-job-123", "url": "https://example.com/page1"}
}
module = _load_scrape_discover_module()
        event = _sqs_event("test-job-123", "https://example.com/page1")
result = module.lambda_handler(event, None)
assert result["skipped"] == 1
assert result["processed"] == 0
def test_job_cancelled_skips_processing(self, mock_aws, _mock_fetcher):
"""Test that cancelled jobs skip URL processing."""
mock_aws["jobs_table"].get_item.return_value = {
"Item": {"job_id": "test-job-123", "status": "cancelled"}
}
module = _load_scrape_discover_module()
        event = _sqs_event("test-job-123", "https://example.com/page1")
result = module.lambda_handler(event, None)
assert result["skipped"] == 1
mock_aws["urls_table"].put_item.assert_not_called()
def test_job_not_found(self, mock_aws, _mock_fetcher):
"""Test handling when job doesn't exist."""
mock_aws["jobs_table"].get_item.return_value = {}
module = _load_scrape_discover_module()
        event = _sqs_event("nonexistent-job", "https://example.com/page1")
result = module.lambda_handler(event, None)
assert result["processed"] == 0
class TestJobCounterUpdates:
"""Tests for job counter updates."""
def test_increments_total_urls(self, mock_aws, _mock_fetcher):
"""Test that total_urls counter is incremented."""
mock_aws["jobs_table"].get_item.return_value = {
"Item": {
"job_id": "test-job-123",
"status": "running",
"base_url": "https://example.com",
"config": {"max_depth": 3, "max_pages": 100},
"total_urls": 0,
}
}
mock_aws["urls_table"].get_item.return_value = {}
module = _load_scrape_discover_module()
        event = _sqs_event("test-job-123", "https://example.com/page1")
module.lambda_handler(event, None)
mock_aws["jobs_table"].update_item.assert_called()
call_args = mock_aws["jobs_table"].update_item.call_args
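        # Loose assertion: the update_item call (e.g. its UpdateExpression) should reference total_urls.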
assert "total_urls" in str(call_args)
class TestUrlDiscovery:
"""Tests for URL discovery from page content."""
def test_discovers_links_from_content(self, mock_aws, _mock_fetcher):
"""Test that links are extracted from fetched content."""
_mock_fetcher.fetch.return_value = MagicMock(
error=None,
is_html=True,
content="""
<html>
<body>
<a href="/page2">Link 1</a>
<a href="/page3">Link 2</a>
</body>
</html>
""",
status_code=200,
)
mock_aws["jobs_table"].get_item.return_value = {
"Item": {
"job_id": "test-job-123",
"status": "running",
"base_url": "https://example.com",
"config": {"max_depth": 3, "max_pages": 100, "scope": "hostname"},
"total_urls": 0,
}
}
mock_aws["urls_table"].get_item.return_value = {}
module = _load_scrape_discover_module()
        event = _sqs_event("test-job-123", "https://example.com/")
result = module.lambda_handler(event, None)
# Should discover at least 2 new URLs
assert result["discovered"] >= 2
# Should send messages to discovery queue
mock_aws["sqs"].send_message.assert_called()
if __name__ == "__main__":
pytest.main([__file__, "-v"])