RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_file_routing_integration.py•12.8 KiB

"""Integration tests for file type routing with mocked AWS services.""" import importlib.util import sys from pathlib import Path from unittest.mock import MagicMock, patch import boto3 import pytest from moto import mock_aws def _load_detect_file_type_module(): """Load detect_file_type module using importlib (avoids 'lambda' keyword issue).""" module_path = Path(__file__).parent.parent.parent / "src/lambda/detect_file_type/index.py" spec = importlib.util.spec_from_file_location("detect_file_type_index", module_path) module = importlib.util.module_from_spec(spec) sys.modules["detect_file_type_index"] = module spec.loader.exec_module(module) return module def _load_process_text_module(): """Load process_text module using importlib (avoids 'lambda' keyword issue).""" module_path = Path(__file__).parent.parent.parent / "src/lambda/process_text/index.py" spec = importlib.util.spec_from_file_location("process_text_index", module_path) module = importlib.util.module_from_spec(spec) sys.modules["process_text_index"] = module spec.loader.exec_module(module) return module @pytest.fixture def mock_env(monkeypatch): """Set up environment variables.""" monkeypatch.setenv("TRACKING_TABLE", "test-tracking-table") monkeypatch.setenv("GRAPHQL_ENDPOINT", "https://test.appsync.amazonaws.com/graphql") monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") @pytest.fixture def lambda_context(): """Create mock Lambda context.""" context = MagicMock() context.function_name = "DetectFileType" context.memory_limit_in_mb = 256 return context def setup_mocked_aws(): """Set up mocked S3 and DynamoDB resources.""" s3 = boto3.client("s3", region_name="us-east-1") s3.create_bucket(Bucket="test-bucket") dynamodb = boto3.resource("dynamodb", region_name="us-east-1") dynamodb.create_table( TableName="test-tracking-table", KeySchema=[{"AttributeName": "document_id", "KeyType": "HASH"}], AttributeDefinitions=[{"AttributeName": "document_id", "AttributeType": "S"}], BillingMode="PAY_PER_REQUEST", ) 
return s3, dynamodb @pytest.mark.integration class TestFileRoutingIntegration: """Integration tests for file type routing logic.""" @mock_aws def test_html_routes_to_text_path(self, mock_env, lambda_context): """Test HTML file detection and routing to text path.""" s3, _ = setup_mocked_aws() html_content = b"""<!DOCTYPE html> <html> <head><title>Test</title></head> <body><h1>Hello World</h1></body> </html>""" document_id = "route-test-html" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/page.html", Body=html_content, ) event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/page.html", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "text" assert result["detectedType"] == "html" assert result["document_id"] == document_id @mock_aws def test_pdf_routes_to_ocr_path(self, mock_env, lambda_context): """Test PDF file detection and routing to OCR path.""" s3, _ = setup_mocked_aws() # PDF magic bytes pdf_content = b"%PDF-1.4 fake pdf content for testing" document_id = "route-test-pdf" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/document.pdf", Body=pdf_content, ) event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/document.pdf", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "ocr" assert result["detectedType"] == "pdf" @mock_aws def test_markdown_routes_to_passthrough(self, mock_env, lambda_context): """Test markdown file detection and routing to passthrough path.""" s3, _ = setup_mocked_aws() # Markdown files don't need content check - extension is sufficient event = { "document_id": "route-test-md", "input_s3_uri": "s3://test-bucket/input/route-test-md/readme.md", 
"output_s3_prefix": "s3://test-bucket/output/route-test-md/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "passthrough" assert result["detectedType"] == "markdown" @mock_aws def test_csv_routes_to_text_path(self, mock_env, lambda_context): """Test CSV file detection and routing to text path.""" s3, _ = setup_mocked_aws() csv_content = b"""name,age,city Alice,30,NYC Bob,25,LA""" document_id = "route-test-csv" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/data.csv", Body=csv_content, ) event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/data.csv", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "text" assert result["detectedType"] == "csv" @mock_aws def test_json_routes_to_text_path(self, mock_env, lambda_context): """Test JSON file detection and routing to text path.""" s3, _ = setup_mocked_aws() json_content = b'{"key": "value", "number": 42}' document_id = "route-test-json" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/config.json", Body=json_content, ) event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/config.json", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "text" assert result["detectedType"] == "json" @mock_aws def test_image_routes_to_ocr_path(self, mock_env, lambda_context): """Test image file detection and routing to OCR path.""" s3, _ = setup_mocked_aws() # JPEG magic bytes jpeg_content = b"\xff\xd8\xff\xe0" + b"\x00" * 100 document_id = "route-test-jpg" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/photo.jpg", Body=jpeg_content, ) event = { "document_id": 
document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/photo.jpg", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } module = _load_detect_file_type_module() result = module.lambda_handler(event, lambda_context) assert result["fileType"] == "ocr" assert result["detectedType"] == "image" @pytest.mark.integration class TestEndToEndTextProcessing: """End-to-end tests simulating full Step Functions flow.""" @mock_aws def test_html_detection_then_processing(self, mock_env, lambda_context): """Test complete flow: DetectFileType -> ProcessText for HTML.""" s3, dynamodb = setup_mocked_aws() html_content = b"""<!DOCTYPE html> <html> <head><title>End-to-End Test</title></head> <body> <h1>Welcome</h1> <p>This tests the full pipeline flow.</p> </body> </html>""" document_id = "e2e-test-html" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/page.html", Body=html_content, ) initial_event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/page.html", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } # Step 1: DetectFileType detect_module = _load_detect_file_type_module() routing_result = detect_module.lambda_handler(initial_event, lambda_context) assert routing_result["fileType"] == "text" assert routing_result["detectedType"] == "html" # Step 2: ProcessText (using routing result as input) with patch("ragstack_common.appsync.publish_document_update"): process_module = _load_process_text_module() process_result = process_module.lambda_handler(routing_result, lambda_context) # Verify final result assert process_result["status"] == "ocr_complete" assert process_result["is_text_native"] is True assert "output_s3_uri" in process_result # Verify S3 output exists output = s3.get_object( Bucket="test-bucket", Key=f"output/{document_id}/full_text.txt", ) output_content = output["Body"].read().decode("utf-8") assert "End-to-End Test" in output_content or "Welcome" in output_content # Verify 
DynamoDB tracking table = dynamodb.Table("test-tracking-table") item = table.get_item(Key={"document_id": document_id})["Item"] assert item["status"] == "ocr_complete" @mock_aws def test_csv_detection_then_processing(self, mock_env, lambda_context): """Test complete flow: DetectFileType -> ProcessText for CSV.""" s3, dynamodb = setup_mocked_aws() csv_content = b"""product,price,stock Widget A,19.99,100 Widget B,29.99,50 Widget C,39.99,25""" document_id = "e2e-test-csv" s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/inventory.csv", Body=csv_content, ) initial_event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/inventory.csv", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } # Step 1: DetectFileType detect_module = _load_detect_file_type_module() routing_result = detect_module.lambda_handler(initial_event, lambda_context) assert routing_result["fileType"] == "text" assert routing_result["detectedType"] == "csv" # Step 2: ProcessText with patch("ragstack_common.appsync.publish_document_update"): process_module = _load_process_text_module() process_result = process_module.lambda_handler(routing_result, lambda_context) assert process_result["status"] == "ocr_complete" # Verify output contains smart extraction output = s3.get_object( Bucket="test-bucket", Key=f"output/{document_id}/full_text.txt", ) output_content = output["Body"].read().decode("utf-8") assert "product" in output_content or "Widget" in output_content @mock_aws def test_multiple_file_types_routing(self, mock_env, lambda_context): """Test routing decisions for multiple file types.""" s3, _ = setup_mocked_aws() test_cases = [ ("test.html", b"<html><body>HTML</body></html>", "text", "html"), ("test.txt", b"Plain text content", "text", "txt"), ("test.csv", b"a,b,c\n1,2,3\n4,5,6", "text", "csv"), ("test.json", b'{"key": "value"}', "text", "json"), ("test.xml", b'<?xml version="1.0"?><root/>', "text", "xml"), ("test.pdf", b"%PDF-1.4 fake 
pdf", "ocr", "pdf"), ("test.jpg", b"\xff\xd8\xff\xe0" + b"\x00" * 20, "ocr", "image"), ("test.md", b"# Markdown", "passthrough", "markdown"), ] detect_module = _load_detect_file_type_module() for filename, content, expected_route, expected_type in test_cases: document_id = f"route-multi-{filename.replace('.', '-')}" # Skip content upload for markdown (extension-based detection) if expected_type != "markdown": s3.put_object( Bucket="test-bucket", Key=f"input/{document_id}/{filename}", Body=content, ) event = { "document_id": document_id, "input_s3_uri": f"s3://test-bucket/input/{document_id}/{filename}", "output_s3_prefix": f"s3://test-bucket/output/{document_id}/", } result = detect_module.lambda_handler(event, lambda_context) assert result["fileType"] == expected_route, f"Failed for {filename}" assert result["detectedType"] == expected_type, f"Failed for {filename}"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_file_routing_integration.py•12.8 KiB