PDF MCP Server

test_agentic_features.py•87.9 KiB

""" Tests for v0.8.0+ Agentic AI Features with Multi-Backend Support (v0.9.0+). This module tests LLM-powered PDF processing capabilities: - auto_fill_pdf_form: Intelligent form filling with field mapping - extract_structured_data: Entity/section extraction - analyze_pdf_content: Document analysis and summarization - get_llm_backend_info: Check available LLM backends Supports multiple backends (v0.9.0+): - local: Local model server at localhost:8100 (free, no API costs) - ollama: Ollama models (free, local) - openai: OpenAI API (paid, requires OPENAI_API_KEY) All tests use mocked LLM responses for unit testing. """ import json import os import re from pathlib import Path from unittest.mock import MagicMock, patch import pytest from pdf_mcp import pdf_tools # ============================================================================ # Test Fixtures # ============================================================================ @pytest.fixture def sample_form_pdf(tmp_path): """Create a simple form PDF for testing.""" output = tmp_path / "form.pdf" pdf_tools.create_pdf_form( str(output), fields=[ {"name": "full_name", "type": "text", "x": 100, "y": 700, "width": 200, "height": 20}, {"name": "email", "type": "text", "x": 100, "y": 660, "width": 200, "height": 20}, {"name": "phone", "type": "text", "x": 100, "y": 620, "width": 200, "height": 20}, {"name": "address", "type": "text", "x": 100, "y": 580, "width": 300, "height": 20}, ] ) return str(output) @pytest.fixture def sample_text_pdf(tmp_path): """Create a PDF with sample text content for analysis.""" import pymupdf output = tmp_path / "text.pdf" doc = pymupdf.open() page = doc.new_page() # Add invoice-like content text = """ INVOICE #12345 Date: January 15, 2026 Bill To: John Smith 123 Main Street New York, NY 10001 Items: Widget A - $50.00 Widget B - $75.00 Service Fee - $25.00 Subtotal: $150.00 Tax (8%): $12.00 Total: $162.00 Payment Due: February 15, 2026 """ page.insert_text((72, 72), text, fontsize=11) 
doc.save(str(output)) doc.close() return str(output) @pytest.fixture def sample_passport_pdf(tmp_path): """Create a PDF with passport-like MRZ and labels for extraction.""" import pymupdf output = tmp_path / "passport.pdf" doc = pymupdf.open() page = doc.new_page() text = """ Passport Surname: ERIKSSON Given Names: ANNA MARIA Nationality: UTO Date of Issue: 01 Jan 2015 Issuing Authority: UTOPIA P<UTOERIKSSON<<ANNA<MARIA<<<<<<<<<<<<<<<<<<< L898902C36UTO7408122F1204159ZE184226B<<<<<10 """ page.insert_text((72, 72), text, fontsize=11) doc.save(str(output)) doc.close() return str(output) @pytest.fixture def sample_passport_label_only_pdf(tmp_path): """Create a PDF with passport labels but no MRZ.""" import pymupdf output = tmp_path / "passport_labels.pdf" doc = pymupdf.open() page = doc.new_page() text = """ Passport Surname: NGUYEN Given Names: THI MAI Nationality: VNM Issuing Country: VIETNAM Passport Number: B1234567 Date of Issue: 2016-07-21 Issuing Authority: IMMIGRATION DEPT """ page.insert_text((72, 72), text, fontsize=11) doc.save(str(output)) doc.close() return str(output) @pytest.fixture def mock_openai_response(): """Create a mock OpenAI response.""" def _create_mock(content): mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = content return mock_response return _create_mock # ============================================================================ # Test: auto_fill_pdf_form # ============================================================================ class TestAutoFillPdfForm: """Tests for LLM-powered form auto-fill.""" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_auto_fill_without_any_llm_returns_error(self, mock_local, sample_form_pdf, tmp_path): """Without any LLM backend, should return error with clear message.""" mock_local.return_value = False # Local server not available output 
= tmp_path / "filled.pdf" source_data = {"name": "John Smith", "email_address": "john@example.com"} try: result = pdf_tools.auto_fill_pdf_form( sample_form_pdf, str(output), source_data=source_data ) # Should return error or succeed with direct mapping only if "error" in result: # Check for common error indicators error_lower = result["error"].lower() assert "backend" in error_lower or "llm" in error_lower or "server" in error_lower else: # Direct mapping may have succeeded assert "filled_fields" in result or "mappings" in result except AttributeError: # pypdf compatibility issue pytest.skip("pypdf form filling compatibility issue") @patch("pdf_mcp.pdf_tools._call_llm") @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) def test_auto_fill_with_mocked_llm(self, mock_llm, sample_form_pdf, tmp_path): """With mocked LLM, should fill form fields correctly.""" # Skip if openai not available if not pdf_tools._HAS_OPENAI: pytest.skip("OpenAI library not installed") output = tmp_path / "filled.pdf" source_data = { "name": "John Smith", "email_address": "john@example.com", "phone_number": "555-123-4567", "home_address": "123 Main St, NYC" } # Mock LLM returns field mapping mock_llm.return_value = json.dumps({ "full_name": "John Smith", "email": "john@example.com", "phone": "555-123-4567", "address": "123 Main St, NYC" }) try: result = pdf_tools.auto_fill_pdf_form( sample_form_pdf, str(output), source_data=source_data ) # Should either succeed or fail gracefully if "error" not in result: assert result.get("filled_fields", 0) >= 0 assert Path(output).exists() except AttributeError as e: # pypdf has a bug with certain form structures if "get_object" in str(e): pytest.skip("pypdf bug: AttributeError in form filling (known issue)") @patch("pdf_mcp.pdf_tools._call_llm") @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) def test_auto_fill_reports_mapping_confidence(self, mock_llm, sample_form_pdf, tmp_path): """Should report confidence scores for field mappings.""" # 
Skip if openai not available if not pdf_tools._HAS_OPENAI: pytest.skip("OpenAI library not installed") output = tmp_path / "filled.pdf" source_data = {"full_name": "John Smith"} # Use exact field name for direct mapping mock_llm.return_value = json.dumps({ "full_name": "John Smith" }) try: result = pdf_tools.auto_fill_pdf_form( sample_form_pdf, str(output), source_data=source_data ) # With direct mapping, should succeed assert "mappings" in result or "filled_fields" in result or "error" in result except AttributeError as e: # pypdf has a bug with certain form structures if "get_object" in str(e): pytest.skip("pypdf bug: AttributeError in form filling (known issue)") def test_auto_fill_with_invalid_pdf_returns_error(self, tmp_path): """Invalid PDF path should return error.""" output = tmp_path / "filled.pdf" result = pdf_tools.auto_fill_pdf_form( "/nonexistent/path.pdf", str(output), source_data={"name": "Test"} ) assert "error" in result # ============================================================================ # Test: extract_structured_data # ============================================================================ @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) class TestExtractStructuredData: """Tests for entity/section extraction (LLM disabled - pattern matching only).""" def test_extract_invoice_data(self, mock_check, sample_text_pdf): """Extract invoice-specific fields.""" result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice" ) # Should return structured data (even without LLM, uses pattern matching) assert "error" not in result or "api" in result.get("error", "").lower() if "error" not in result: assert "data" in result or "extracted" in result def test_extract_with_custom_schema(self, mock_check, sample_text_pdf): """Extract data using custom schema definition.""" schema = { "invoice_number": "string", 
"total_amount": "number", "due_date": "date" } result = pdf_tools.extract_structured_data( sample_text_pdf, schema=schema ) # Should attempt extraction assert isinstance(result, dict) def test_extract_with_invalid_pdf_returns_error(self, mock_check): """Invalid PDF should return error.""" result = pdf_tools.extract_structured_data( "/nonexistent/path.pdf", data_type="invoice" ) assert "error" in result @patch("pdf_mcp.pdf_tools._HAS_OPENAI", True) @patch("pdf_mcp.pdf_tools._call_llm") def test_extract_with_mocked_llm(self, mock_llm, sample_text_pdf): """With mocked LLM, should return structured extraction.""" mock_llm.return_value = json.dumps({ "invoice_number": "12345", "date": "January 15, 2026", "total": 162.00, "items": [ {"name": "Widget A", "price": 50.00}, {"name": "Widget B", "price": 75.00}, {"name": "Service Fee", "price": 25.00} ] }) result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice" ) if "error" not in result: assert "data" in result or "extracted" in result # ============================================================================ # Test: analyze_pdf_content # ============================================================================ @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) class TestAnalyzePdfContent: """Tests for PDF content analysis and summarization (LLM disabled).""" def test_analyze_returns_document_type(self, mock_check, sample_text_pdf): """Should classify document type.""" result = pdf_tools.analyze_pdf_content(sample_text_pdf) # Even without LLM, should attempt classification assert isinstance(result, dict) if "error" not in result: assert "document_type" in result or "classification" in result or "analysis" in result def test_analyze_returns_summary(self, mock_check, sample_text_pdf): """Should generate summary.""" result = pdf_tools.analyze_pdf_content( sample_text_pdf, 
include_summary=True ) assert isinstance(result, dict) def test_analyze_detects_key_entities(self, mock_check, sample_text_pdf): """Should detect key entities like dates, amounts, names.""" result = pdf_tools.analyze_pdf_content( sample_text_pdf, detect_entities=True ) assert isinstance(result, dict) def test_analyze_with_invalid_pdf_returns_error(self, mock_check): """Invalid PDF should return error.""" result = pdf_tools.analyze_pdf_content("/nonexistent/path.pdf") assert "error" in result @patch("pdf_mcp.pdf_tools._HAS_OPENAI", True) @patch("pdf_mcp.pdf_tools._call_llm") def test_analyze_with_mocked_llm(self, mock_llm, sample_text_pdf): """With mocked LLM, should return full analysis.""" mock_llm.return_value = json.dumps({ "document_type": "invoice", "summary": "Invoice #12345 for $162.00 from January 15, 2026", "key_entities": { "invoice_number": "12345", "total_amount": "$162.00", "due_date": "February 15, 2026" }, "risk_flags": [], "completeness_score": 0.95 }) result = pdf_tools.analyze_pdf_content( sample_text_pdf, include_summary=True, detect_entities=True ) if "error" not in result: assert "analysis" in result or "document_type" in result # ============================================================================ # Test: LLM Integration Helpers # ============================================================================ class TestLLMHelpers: """Tests for LLM integration helper functions.""" def test_has_openai_flag_exists(self): """_HAS_OPENAI flag should exist.""" assert hasattr(pdf_tools, "_HAS_OPENAI") def test_call_llm_function_exists(self): """_call_llm helper should exist.""" assert hasattr(pdf_tools, "_call_llm") @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_call_llm_without_any_backend_returns_none(self, mock_local): """Without any LLM backend, should return None.""" mock_local.return_value = False # Local server not available 
result = pdf_tools._call_llm("test prompt") assert result is None # ============================================================================ # Test: MCP Tool Registration # ============================================================================ class TestMCPToolRegistration: """Verify agentic tools are exposed via MCP.""" def test_auto_fill_pdf_form_registered(self): """auto_fill_pdf_form should be a public function.""" assert hasattr(pdf_tools, "auto_fill_pdf_form") assert callable(pdf_tools.auto_fill_pdf_form) def test_extract_structured_data_registered(self): """extract_structured_data should be a public function.""" assert hasattr(pdf_tools, "extract_structured_data") assert callable(pdf_tools.extract_structured_data) def test_analyze_pdf_content_registered(self): """analyze_pdf_content should be a public function.""" assert hasattr(pdf_tools, "analyze_pdf_content") assert callable(pdf_tools.analyze_pdf_content) # ============================================================================ # Integration Tests (with real PDFs, no LLM) # ============================================================================ class TestAgenticIntegration: """Integration tests using real PDFs without LLM.""" @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_auto_fill_graceful_degradation(self, mock_check, sample_form_pdf, tmp_path): """Without LLM, should fall back gracefully.""" output = tmp_path / "filled.pdf" # Use exact field name to test direct mapping path try: result = pdf_tools.auto_fill_pdf_form( sample_form_pdf, str(output), source_data={"full_name": "Direct Match"} ) # Should either succeed with direct mapping or return helpful error assert isinstance(result, dict) # Either we got an error (no LLM) or we successfully filled if "error" not in result: assert "filled_fields" in result or "mappings" in result except AttributeError as e: 
# pypdf version compatibility issue with form filling # This is expected in some Python/pypdf version combinations pytest.skip(f"pypdf form filling compatibility issue: {e}") @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_extract_structured_data_pattern_matching(self, mock_check, sample_text_pdf): """Without LLM, should use pattern matching for common types.""" result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice" ) # Should attempt pattern-based extraction assert isinstance(result, dict) @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_extract_structured_data_passport_mrz(self, mock_check, sample_passport_pdf): """Should extract key passport fields from MRZ and labels.""" result = pdf_tools.extract_structured_data( sample_passport_pdf, data_type="passport" ) assert isinstance(result, dict) data = result.get("data", {}) assert data.get("passport_number") == "L898902C3" assert data.get("nationality") == "UTO" assert data.get("birth_date") == "1974-08-12" assert data.get("expiry_date") == "2012-04-15" assert data.get("sex") == "F" assert data.get("surname") == "ERIKSSON" assert data.get("given_names") == "ANNA MARIA" assert data.get("issuing_country") == "UTO" assert data.get("issue_date") == "2015-01-01" assert data.get("issuing_authority") == "UTOPIA" @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_extract_structured_data_passport_labels_only(self, mock_check, sample_passport_label_only_pdf): """Should extract key passport fields from labels when MRZ is missing.""" result = pdf_tools.extract_structured_data( sample_passport_label_only_pdf, 
data_type="passport" ) assert isinstance(result, dict) data = result.get("data", {}) assert data.get("surname") == "NGUYEN" assert data.get("given_names") == "THI MAI" assert data.get("nationality") == "VNM" assert data.get("issuing_country") == "VIETNAM" assert data.get("passport_number") == "B1234567" assert data.get("issue_date") == "2016-07-21" assert data.get("issuing_authority") == "IMMIGRATION DEPT" @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_analyze_pdf_basic_analysis(self, mock_check, sample_text_pdf): """Without LLM, should provide basic document analysis.""" result = pdf_tools.analyze_pdf_content(sample_text_pdf) # Should return basic metrics at minimum assert isinstance(result, dict) # ============================================================================ # Test: Multi-Backend Support (v0.9.0+) # ============================================================================ class TestMultiBackendSupport: """Tests for local VLM, Ollama, and OpenAI backend support.""" def test_get_llm_backend_info_exists(self): """get_llm_backend_info should be available.""" assert hasattr(pdf_tools, "get_llm_backend_info") assert callable(pdf_tools.get_llm_backend_info) def test_get_llm_backend_info_returns_dict(self): """Should return backend info dict.""" result = pdf_tools.get_llm_backend_info() assert isinstance(result, dict) assert "current_backend" in result assert "backends" in result assert "override_env" in result def test_backend_info_has_all_backends(self): """Should report on all backend types.""" result = pdf_tools.get_llm_backend_info() backends = result["backends"] assert "local" in backends assert "ollama" in backends assert "openai" in backends def test_local_backend_info_has_url(self): """Local backend should report URL.""" result = pdf_tools.get_llm_backend_info() local_info = result["backends"]["local"] assert "url" 
in local_info assert "localhost" in local_info["url"] or "127.0.0.1" in local_info["url"] def test_backends_report_cost(self): """All backends should report cost info.""" result = pdf_tools.get_llm_backend_info() for backend_name, backend_info in result["backends"].items(): assert "cost" in backend_info def test_local_and_ollama_are_free(self): """Local and Ollama should be marked as free.""" result = pdf_tools.get_llm_backend_info() assert "free" in result["backends"]["local"]["cost"] assert "free" in result["backends"]["ollama"]["cost"] def test_openai_is_paid(self): """OpenAI should be marked as paid.""" result = pdf_tools.get_llm_backend_info() assert "paid" in result["backends"]["openai"]["cost"] def test_backend_constants_exist(self): """Backend constants should be defined.""" assert hasattr(pdf_tools, "LLM_BACKEND_LOCAL") assert hasattr(pdf_tools, "LLM_BACKEND_OLLAMA") assert hasattr(pdf_tools, "LLM_BACKEND_OPENAI") assert pdf_tools.LLM_BACKEND_LOCAL == "local" assert pdf_tools.LLM_BACKEND_OLLAMA == "ollama" assert pdf_tools.LLM_BACKEND_OPENAI == "openai" def test_local_model_server_url_configurable(self): """LOCAL_MODEL_SERVER_URL should be configurable via env.""" assert hasattr(pdf_tools, "LOCAL_MODEL_SERVER_URL") # Default should include localhost assert "localhost" in pdf_tools.LOCAL_MODEL_SERVER_URL or "127.0.0.1" in pdf_tools.LOCAL_MODEL_SERVER_URL @patch("pdf_mcp.pdf_tools._check_local_model_server") def test_get_llm_backend_prefers_local(self, mock_check): """Should prefer local backend when available.""" mock_check.return_value = True result = pdf_tools._get_llm_backend() assert result == "local" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", True) def test_get_llm_backend_falls_back_to_ollama(self, mock_check): """Should fall back to Ollama when local unavailable.""" mock_check.return_value = False result = pdf_tools._get_llm_backend() # Should be ollama or openai (depends on whether OPENAI_API_KEY is 
set) assert result in ("ollama", "openai", "") @patch.dict(os.environ, {"PDF_MCP_LLM_BACKEND": "openai"}) def test_get_llm_backend_respects_override(self): """Should respect PDF_MCP_LLM_BACKEND env override.""" result = pdf_tools._get_llm_backend() assert result == "openai" class TestLocalVLMBackend: """Tests for local VLM backend at localhost:8100.""" def test_check_local_model_server_function_exists(self): """_check_local_model_server should exist.""" assert hasattr(pdf_tools, "_check_local_model_server") assert callable(pdf_tools._check_local_model_server) def test_call_local_llm_function_exists(self): """_call_local_llm should exist.""" assert hasattr(pdf_tools, "_call_local_llm") assert callable(pdf_tools._call_local_llm) @patch("pdf_mcp.pdf_tools._HAS_REQUESTS", False) def test_call_local_llm_without_requests_returns_none(self): """Without requests library, should return None.""" result = pdf_tools._call_local_llm("test prompt") assert result is None @patch("pdf_mcp.pdf_tools._HAS_REQUESTS", True) @patch("pdf_mcp.pdf_tools._resolve_local_model_name", return_value="test-model") @patch("pdf_mcp.pdf_tools._requests") def test_call_local_llm_with_mock_server(self, mock_requests, _mock_resolve): """With mocked server, should return response (OpenAI chat format).""" mock_response = MagicMock() mock_response.status_code = 200 mock_response.json.return_value = { "choices": [{"message": {"content": "Test response"}}] } mock_requests.post.return_value = mock_response result = pdf_tools._call_local_llm("test prompt") assert result == "Test response" class TestOllamaBackend: """Tests for Ollama backend.""" def test_call_ollama_llm_function_exists(self): """_call_ollama_llm should exist.""" assert hasattr(pdf_tools, "_call_ollama_llm") assert callable(pdf_tools._call_ollama_llm) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) def test_call_ollama_llm_without_ollama_returns_none(self): """Without ollama library, should return None.""" result = 
pdf_tools._call_ollama_llm("test prompt") assert result is None def test_call_ollama_llm_with_mock(self): """With mocked Ollama, should return response.""" if not pdf_tools._HAS_OLLAMA: pytest.skip("Ollama not installed") with patch("pdf_mcp.pdf_tools._ollama") as mock_ollama: # Ollama returns Pydantic ChatResponse; mock .message.content mock_response = MagicMock() mock_response.message.content = "Ollama response" mock_ollama.chat.return_value = mock_response result = pdf_tools._call_ollama_llm("test prompt") assert result == "Ollama response" # ============================================================================ # v0.9.0 Comprehensive Integration Tests # ============================================================================ class TestLocalVLMIntegration: """Integration tests for local VLM backend with agentic functions.""" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._call_local_llm") def test_auto_fill_uses_local_backend(self, mock_call_llm, mock_check, sample_form_pdf, tmp_path): """auto_fill_pdf_form should use local backend when available.""" mock_check.return_value = True mock_call_llm.return_value = json.dumps({"full_name": "Test User"}) output = tmp_path / "filled.pdf" try: result = pdf_tools.auto_fill_pdf_form( sample_form_pdf, str(output), source_data={"name": "Test User"}, backend="local" ) # Should either succeed or return meaningful result assert isinstance(result, dict) if "error" not in result: assert result.get("backend") == "local" or "filled_fields" in result except AttributeError as e: # pypdf version compatibility issue with form filling pytest.skip(f"pypdf form filling compatibility issue: {e}") @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._call_local_llm") def test_extract_structured_data_uses_local_backend(self, mock_call_llm, mock_check, sample_text_pdf): """extract_structured_data should use local backend when specified.""" mock_check.return_value = True 
mock_call_llm.return_value = json.dumps({ "invoice_number": "INV-001", "total": "100.00" }) result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice", backend="local" ) assert isinstance(result, dict) # Verify backend is tracked if "backend" in result: assert result["backend"] in ("local", None) @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._call_local_llm") def test_analyze_pdf_content_uses_local_backend(self, mock_call_llm, mock_check, sample_text_pdf): """analyze_pdf_content should use local backend when specified.""" mock_check.return_value = True mock_call_llm.return_value = json.dumps({ "summary": "This is a test document.", "document_type": "invoice", "key_findings": ["Invoice number found"] }) result = pdf_tools.analyze_pdf_content( sample_text_pdf, include_summary=True, backend="local" ) assert isinstance(result, dict) assert "document_type" in result class TestOllamaIntegration: """Integration tests for Ollama backend with agentic functions.""" @patch("pdf_mcp.pdf_tools._get_llm_backend") @patch("pdf_mcp.pdf_tools._call_ollama_llm") def test_extract_structured_data_with_ollama(self, mock_call_llm, mock_get_backend, sample_text_pdf): """extract_structured_data should work with Ollama backend.""" mock_get_backend.return_value = "ollama" mock_call_llm.return_value = json.dumps({ "invoice_number": "12345", "date": "January 15, 2026" }) result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice", backend="ollama" ) assert isinstance(result, dict) @patch("pdf_mcp.pdf_tools._get_llm_backend") @patch("pdf_mcp.pdf_tools._call_ollama_llm") def test_analyze_pdf_content_with_ollama(self, mock_call_llm, mock_get_backend, sample_text_pdf): """analyze_pdf_content should work with Ollama backend.""" mock_get_backend.return_value = "ollama" mock_call_llm.return_value = json.dumps({ "summary": "Invoice document for services.", "document_type": "invoice", "key_findings": [] }) result = 
pdf_tools.analyze_pdf_content( sample_text_pdf, backend="ollama" ) assert isinstance(result, dict) assert "document_type" in result class TestBackendFieldInResults: """Tests verifying backend field is returned in agentic function results.""" @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_extract_structured_data_returns_backend_field(self, mock_check, sample_text_pdf): """extract_structured_data should return backend field.""" result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice" ) assert isinstance(result, dict) # Backend field should be present (may be None if no LLM used) assert "backend" in result @patch("pdf_mcp.pdf_tools._check_local_model_server", return_value=False) @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_analyze_pdf_content_returns_backend_field(self, mock_check, sample_text_pdf): """analyze_pdf_content should return backend field.""" result = pdf_tools.analyze_pdf_content(sample_text_pdf) assert isinstance(result, dict) assert "backend" in result @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._call_local_llm") def test_backend_field_reflects_local_when_used(self, mock_call_llm, mock_check, sample_text_pdf): """When local backend is used, backend field should be 'local'.""" mock_check.return_value = True mock_call_llm.return_value = json.dumps({ "summary": "Test summary", "document_type": "invoice", "key_findings": [] }) result = pdf_tools.analyze_pdf_content( sample_text_pdf, backend="local" ) if "backend" in result and result["backend"] is not None: assert result["backend"] == "local" class TestBackendFallbackChain: """Tests for backend fallback behavior.""" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def 
test_no_backend_available_graceful_degradation(self, mock_check, sample_text_pdf): """When no backend available, should gracefully degrade to pattern matching.""" mock_check.return_value = False result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice" ) # Should still return results using pattern matching assert isinstance(result, dict) if "method" in result: assert result["method"] == "pattern" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", True) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", False) def test_fallback_from_local_to_ollama(self, mock_check): """When local unavailable, should fall back to Ollama.""" mock_check.return_value = False backend = pdf_tools._get_llm_backend() assert backend == "ollama" @patch("pdf_mcp.pdf_tools._check_local_model_server") @patch("pdf_mcp.pdf_tools._HAS_OLLAMA", False) @patch("pdf_mcp.pdf_tools._HAS_OPENAI", True) @patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) def test_fallback_from_local_to_openai(self, mock_check): """When local and Ollama unavailable, should fall back to OpenAI.""" mock_check.return_value = False backend = pdf_tools._get_llm_backend() assert backend == "openai" class TestBackendEnvironmentConfiguration: """Tests for environment variable configuration.""" @patch.dict(os.environ, {"PDF_MCP_LLM_BACKEND": "local"}) def test_env_override_forces_local(self): """PDF_MCP_LLM_BACKEND=local should force local backend.""" backend = pdf_tools._get_llm_backend() assert backend == "local" @patch.dict(os.environ, {"PDF_MCP_LLM_BACKEND": "ollama"}) def test_env_override_forces_ollama(self): """PDF_MCP_LLM_BACKEND=ollama should force ollama backend.""" backend = pdf_tools._get_llm_backend() assert backend == "ollama" @patch.dict(os.environ, {"PDF_MCP_LLM_BACKEND": "openai"}) def test_env_override_forces_openai(self): """PDF_MCP_LLM_BACKEND=openai should force openai backend.""" backend = pdf_tools._get_llm_backend() assert backend == "openai" 
@patch.dict(os.environ, {"LOCAL_MODEL_SERVER_URL": "http://custom:9999"}) def test_custom_local_server_url(self): """LOCAL_MODEL_SERVER_URL should be configurable.""" # Note: This tests the module-level constant which may already be set # The actual URL would be read at import time assert hasattr(pdf_tools, "LOCAL_MODEL_SERVER_URL") class TestUnifiedCallLLM: """Tests for unified _call_llm function with backend routing.""" def test_call_llm_exists(self): """_call_llm should exist and be callable.""" assert hasattr(pdf_tools, "_call_llm") assert callable(pdf_tools._call_llm) @patch("pdf_mcp.pdf_tools._call_local_llm") def test_call_llm_routes_to_local(self, mock_local): """_call_llm should route to local when specified.""" mock_local.return_value = "local response" result = pdf_tools._call_llm("test", backend="local") mock_local.assert_called_once() assert result == "local response" @patch("pdf_mcp.pdf_tools._call_ollama_llm") def test_call_llm_routes_to_ollama(self, mock_ollama): """_call_llm should route to ollama when specified.""" mock_ollama.return_value = "ollama response" result = pdf_tools._call_llm("test", backend="ollama") mock_ollama.assert_called_once() assert result == "ollama response" @patch("pdf_mcp.pdf_tools._call_openai_llm") def test_call_llm_routes_to_openai(self, mock_openai): """_call_llm should route to openai when specified.""" mock_openai.return_value = "openai response" result = pdf_tools._call_llm("test", backend="openai") mock_openai.assert_called_once() assert result == "openai response" class TestMCPToolRegistrationV090: """Verify v0.9.0 tools are exposed via MCP.""" def test_get_llm_backend_info_registered(self): """get_llm_backend_info should be a public function.""" assert hasattr(pdf_tools, "get_llm_backend_info") assert callable(pdf_tools.get_llm_backend_info) def test_all_agentic_tools_have_backend_param(self): """All agentic tools should accept backend parameter.""" import inspect # Check auto_fill_pdf_form sig = 
inspect.signature(pdf_tools.auto_fill_pdf_form) assert "backend" in sig.parameters # Check extract_structured_data sig = inspect.signature(pdf_tools.extract_structured_data) assert "backend" in sig.parameters # Check analyze_pdf_content sig = inspect.signature(pdf_tools.analyze_pdf_content) assert "backend" in sig.parameters # ============================================================================ # E2E Tests with Real LLM (v0.9.2+) # These tests actually call the local model server when available # Mark with pytest.mark.slow for CI/CD to optionally skip # ============================================================================ def _is_local_server_running() -> bool: """Check if local model server is running at localhost:8100.""" try: import requests response = requests.get(f"{pdf_tools.LOCAL_MODEL_SERVER_URL}/health", timeout=2) return response.status_code == 200 except Exception: return False @pytest.mark.slow class TestE2ELocalVLM: """ End-to-end tests with REAL local model server (not mocked). These tests require the local model server to be running: ./scripts/run_local_vlm.sh Tests are skipped if server is not available. """ @pytest.fixture(autouse=True) def skip_if_no_server(self): """Skip all tests in this class if local server not running.""" if not _is_local_server_running(): pytest.skip("Local model server not running at localhost:8100") def test_e2e_get_llm_backend_info_detects_local(self): """With server running, should detect local backend.""" result = pdf_tools.get_llm_backend_info() assert result["backends"]["local"]["available"] is True # Local should be selected as current backend (highest priority) assert result["current_backend"] == "local" def test_e2e_call_local_llm_returns_response(self): """With server running, should get actual LLM response.""" result = pdf_tools._call_local_llm( "What is 2+2? Reply with just the number." 
) assert result is not None assert len(result) > 0 # Response should contain "4" somewhere assert "4" in result def test_e2e_extract_structured_data_with_local_llm(self, sample_text_pdf): """E2E test: extract_structured_data with real local LLM.""" result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice", backend="local" ) assert isinstance(result, dict) assert "error" not in result assert "data" in result # Backend is "local" if LLM was used, None if pattern matching was sufficient # Both are valid outcomes - pattern matching success is actually preferred assert result.get("backend") in ("local", None) assert result.get("method") in ("pattern", "llm", "llm+pattern") def test_e2e_analyze_pdf_content_with_local_llm(self, sample_text_pdf): """E2E test: analyze_pdf_content with real local LLM.""" result = pdf_tools.analyze_pdf_content( sample_text_pdf, include_summary=True, detect_entities=True, backend="local" ) assert isinstance(result, dict) assert "error" not in result assert "document_type" in result # With real LLM, should have a summary if "summary" in result: assert len(result["summary"]) > 10 # Backend should be "local" assert result.get("backend") == "local" def test_e2e_local_llm_timeout_handling(self): """E2E test: local LLM should handle requests without hanging.""" import time start = time.time() result = pdf_tools._call_local_llm( "Reply with a single word: hello" ) elapsed = time.time() - start # Should complete within reasonable time (2 min max for slow first load) assert elapsed < 120 assert result is not None def _ollama_model_available(model: str) -> bool: """Check if specific Ollama model is available.""" from pdf_mcp import llm_setup if not llm_setup.ollama_is_installed(): return False models = llm_setup.ollama_list_models() return model in models @pytest.mark.slow class TestE2EOllama: """ End-to-end tests with REAL Ollama (not mocked). 
Requires Ollama installed and a model pulled: ollama pull qwen2.5:7b """ @pytest.fixture(autouse=True) def skip_if_no_ollama(self): """Skip all tests if Ollama not available.""" if not pdf_tools._HAS_OLLAMA: pytest.skip("Ollama library not installed (pip install ollama)") from pdf_mcp import llm_setup # Check if Ollama CLI is installed if not llm_setup.ollama_is_installed(): pytest.skip("Ollama CLI not found (install: curl -fsSL https://ollama.ai/install.sh | sh)") # Check if Ollama service is running try: import ollama ollama.list() except Exception as e: pytest.skip(f"Ollama service not running: {e}") # Check if any model is available models = llm_setup.ollama_list_models() if not models: pytest.skip("No Ollama models found (run: ollama pull qwen2.5:1.5b)") def test_e2e_ollama_llm_returns_response(self): """With Ollama running, should get actual LLM response.""" # Try smaller models first for speed test_models = ["qwen2.5:1.5b", "qwen2.5:7b", "llama3.2:1b"] model_to_use = None for model in test_models: if _ollama_model_available(model): model_to_use = model break if model_to_use is None: pytest.skip(f"None of {test_models} found; pull one: ollama pull qwen2.5:1.5b") result = pdf_tools._call_ollama_llm( "What is 2+2? Reply with just the number.", model=model_to_use ) assert result is not None assert len(result) > 0 def test_e2e_extract_structured_data_with_ollama(self, sample_text_pdf): """E2E test: extract_structured_data with real Ollama.""" result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice", backend="ollama" ) assert isinstance(result, dict) @pytest.mark.slow class TestE2EOpenAI: """ End-to-end tests with REAL OpenAI API (not mocked). Requires OPENAI_API_KEY environment variable. WARNING: These tests incur actual API costs! 
""" @pytest.fixture(autouse=True) def skip_if_no_openai(self): """Skip all tests if OpenAI not available.""" if not pdf_tools._HAS_OPENAI: pytest.skip("OpenAI library not installed") if not os.environ.get("OPENAI_API_KEY"): pytest.skip("OPENAI_API_KEY not set") def test_e2e_openai_llm_returns_response(self): """With OpenAI API key, should get actual response.""" result = pdf_tools._call_openai_llm( "What is 2+2? Reply with just the number.", model="gpt-4o-mini" ) assert result is not None assert "4" in result def test_e2e_analyze_pdf_content_with_openai(self, sample_text_pdf): """E2E test: analyze_pdf_content with real OpenAI.""" result = pdf_tools.analyze_pdf_content( sample_text_pdf, include_summary=True, backend="openai" ) assert isinstance(result, dict) assert "document_type" in result # ============================================================================ # Backend Comparison Tests (v0.9.2+) # ============================================================================ @pytest.mark.slow class TestBackendComparison: """Compare outputs across different backends.""" def test_all_backends_return_consistent_structure(self, sample_text_pdf): """All backends should return same result structure.""" backends_to_test = [] # Check which backends are available if _is_local_server_running(): backends_to_test.append("local") if pdf_tools._HAS_OLLAMA: try: import ollama ollama.list() backends_to_test.append("ollama") except Exception: pass if pdf_tools._HAS_OPENAI and os.environ.get("OPENAI_API_KEY"): backends_to_test.append("openai") if not backends_to_test: pytest.skip("No LLM backends available for comparison") results = {} for backend in backends_to_test: result = pdf_tools.extract_structured_data( sample_text_pdf, data_type="invoice", backend=backend ) results[backend] = result # All results should have same structure for backend, result in results.items(): assert "data" in result, f"{backend} missing 'data' field" assert "confidence" in result, f"{backend} 
missing 'confidence' field" assert "method" in result, f"{backend} missing 'method' field" assert "backend" in result, f"{backend} missing 'backend' field" # ============================================================================ # Test: MRZ Checksum Validation (v1.2.0) # ============================================================================ class TestMrzChecksumValidation: """Tests for MRZ checksum validation per ICAO 9303.""" def test_mrz_check_digit_valid(self): """Valid MRZ check digit should return correct value per ICAO 9303.""" assert pdf_tools._mrz_check_digit("L898902C3") == 6 def test_mrz_check_digit_all_zeros(self): """All zeros should return 0.""" assert pdf_tools._mrz_check_digit("000000000") == 0 def test_mrz_check_digit_letters(self): """Letters should be converted to numbers (A=10..Z=35).""" # Per ICAO: A=10, B=11, ..., Z=35, <(filler)=0 result = pdf_tools._mrz_check_digit("A") assert isinstance(result, int) assert 0 <= result <= 9 def test_mrz_validate_field_correct(self): """Field with matching check digit should validate.""" assert pdf_tools._mrz_validate_field("L898902C3", 6) is True def test_mrz_validate_field_incorrect(self): """Field with wrong check digit should fail.""" assert pdf_tools._mrz_validate_field("L898902C3", 9) is False def test_extract_passport_fields_with_checksum(self): """MRZ extraction should include checksum_valid field.""" mrz_text = ( "P<UTOERIKSSON<<ANNA<MARIA<<<<<<<<<<<<<<<<<<<\n" "L898902C36UTO7408122F1204159ZE184226B<<<<<10\n" ) extracted, confidence = pdf_tools._extract_passport_fields(mrz_text) assert extracted.get("passport_number") == "L898902C3" # After checksum implementation, confidence should be higher for validated fields assert confidence.get("passport_number", 0) >= 0.8 class TestMrzTd1Format: """Tests for TD1 ID card format (3 lines x 30 chars).""" def test_extract_mrz_lines_td1(self): """Should detect TD1 format (3 lines of 30 chars).""" td1_text = ( "I<UTOD231458907<<<<<<<<<<<<<<<\n" 
"7408122F1204159UTO<<<<<<<<<<<6\n" "ERIKSSON<<ANNA<MARIA<<<<<<<<<<\n" ) result = pdf_tools._extract_mrz_lines(td1_text) assert result is not None assert len(result) >= 2 def test_extract_passport_fields_td1(self): """Should extract fields from TD1 (ID card) format.""" td1_text = ( "I<UTOD231458907<<<<<<<<<<<<<<<\n" "7408122F1204159UTO<<<<<<<<<<<6\n" "ERIKSSON<<ANNA<MARIA<<<<<<<<<<\n" ) extracted, confidence = pdf_tools._extract_passport_fields(td1_text) # Should extract at least some fields from TD1 assert isinstance(extracted, dict) # Surname should be extractable from line 3 if extracted.get("surname"): assert "ERIKSSON" in extracted["surname"] class TestMrzOcrErrorCorrection: """Tests for OCR error correction in MRZ text.""" def test_mrz_ocr_correction_common_mistakes(self): """Should correct common OCR mistakes in MRZ: O->0, I->1, B->8.""" # Line2 with OCR errors: O instead of 0, I instead of 1 corrected = pdf_tools._correct_mrz_ocr_errors( "L89890ZC36UTO74O8I22FI2O4I59ZEI84226B<<<<<IO" ) assert "0" in corrected # O should become 0 in digit positions assert isinstance(corrected, str) def test_mrz_ocr_correction_preserves_valid(self): """Should not modify already-correct MRZ text.""" valid = "L898902C36UTO7408122F1204159ZE184226B<<<<<10" corrected = pdf_tools._correct_mrz_ocr_errors(valid) assert corrected == valid # ============================================================================ # Test: Non-standard Form Heuristics (v1.2.0) # ============================================================================ class TestFormHeuristicsLcs: """Tests for improved label matching using LCS fuzzy similarity.""" def test_lcs_similarity_exact_match(self): """Exact match should return 1.0.""" assert pdf_tools._lcs_similarity("name", "name") == 1.0 def test_lcs_similarity_partial_match(self): """Partial match should return > 0 and < 1.""" score = pdf_tools._lcs_similarity("fullname", "firstname") assert 0 < score < 1 def test_lcs_similarity_no_match(self): 
"""Completely different strings should return 0.""" assert pdf_tools._lcs_similarity("xyz", "abc") == 0.0 def test_lcs_similarity_empty(self): """Empty strings should return 0.""" assert pdf_tools._lcs_similarity("", "abc") == 0.0 def test_improved_score_label_match_fuzzy(self): """Improved scoring should handle fuzzy matches like full_name -> name.""" score = pdf_tools._score_label_match("full_name", "name", ["name"]) assert score >= 1 # Should match via token overlap def test_improved_score_label_match_abbreviation(self): """Should handle abbreviations like DOB -> date of birth.""" score = pdf_tools._score_label_match("dob", "dateofbirth", ["date", "birth"]) assert score >= 1 class TestFieldAliasMatching: """Tests for field alias groups -- semantically equivalent field names should match at score 3 (exact equivalent), not just fuzzy score 1.""" def test_dob_matches_date_of_birth(self): """'dob' and 'date_of_birth' are aliases and should score 3.""" score = pdf_tools._score_label_match("dob", "dateofbirth", ["date", "birth"]) assert score == 3, f"Expected alias score 3, got {score}" def test_first_name_matches_given_name(self): """'first_name' and 'given_name' are aliases and should score 3.""" score = pdf_tools._score_label_match( "first_name", "givenname", ["given", "name"] ) assert score == 3, f"Expected alias score 3, got {score}" def test_last_name_matches_surname(self): """'last_name' and 'surname' are aliases.""" score = pdf_tools._score_label_match("last_name", "surname", ["surname"]) assert score == 3, f"Expected alias score 3, got {score}" def test_phone_matches_telephone(self): """'phone' and 'telephone' are aliases.""" score = pdf_tools._score_label_match("phone", "telephone", ["telephone"]) assert score == 3, f"Expected alias score 3, got {score}" def test_sex_matches_gender(self): """'sex' and 'gender' are aliases.""" score = pdf_tools._score_label_match("sex", "gender", ["gender"]) assert score == 3, f"Expected alias score 3, got {score}" def 
test_zip_matches_postal_code(self): """'zip' and 'postal_code' are aliases.""" score = pdf_tools._score_label_match("zip", "postalcode", ["postal", "code"]) assert score == 3, f"Expected alias score 3, got {score}" def test_email_matches_email_address(self): """'email' and 'email_address' are aliases.""" score = pdf_tools._score_label_match("email", "emailaddress", ["email", "address"]) assert score == 3, f"Expected alias score 3, got {score}" def test_non_alias_still_works(self): """Non-alias fields should still use regular scoring.""" score = pdf_tools._score_label_match("company", "organization", ["organization"]) assert score < 3, f"Non-alias should not score 3, got {score}" def test_alias_is_bidirectional(self): """Alias matching should work in both directions.""" score_a = pdf_tools._score_label_match("surname", "lastname", ["last", "name"]) score_b = pdf_tools._score_label_match("last_name", "surname", ["surname"]) assert score_a == 3 and score_b == 3, ( f"Alias should be bidirectional: {score_a}, {score_b}" ) class TestFormHeuristicsGeometricCheckbox: """Tests for geometric checkbox/radio button detection.""" def test_detect_form_fields_has_geometric_detection(self, sample_text_pdf): """detect_form_fields should include geometric checkbox detection.""" result = pdf_tools.detect_form_fields(sample_text_pdf) assert isinstance(result, dict) # Should have page analysis with checkbox info for page_analysis in result.get("page_analysis", []): assert "detected_checkboxes" in page_analysis def test_detect_form_fields_multiline_areas(self, sample_text_pdf): """detect_form_fields should detect large blank areas as multi-line fields.""" result = pdf_tools.detect_form_fields(sample_text_pdf) assert isinstance(result, dict) # Structure should exist even if no multi-line areas found for page_analysis in result.get("page_analysis", []): assert "detected_multiline_areas" in page_analysis # ============================================================================ # Test: 
BUG-002 - Passport branch should use VLM when available (v1.2.1) # ============================================================================ class TestBug002PassportVlmIntegration: """BUG-002: extract_structured_data(data_type='passport') must not bypass VLM when a backend is available. The passport branch previously did an early-return with backend=None, completely skipping the LLM enhancement code path.""" # Standard TD3 MRZ: 2 lines x 44 chars each MRZ_LINE1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<" # 44 chars MRZ_LINE2 = "AB12345674UTO8001011M3012315<<<<<<<<<<<<<<06" # 44 chars def test_passport_branch_reports_backend_when_available(self): """When VLM is available, passport extraction should report it.""" fake_text = ( f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" "Date of Issue: 01 Jan 2020\n" "Issuing Authority: Test Immigration Office\n" ) fake_text_result = { "text": fake_text, "pages_extracted": 2, "page_details": [{"page": 1, "text": fake_text}], } fake_llm_response = '{"issue_date": "2020-01-01", "issuing_authority": "Test Immigration Office"}' with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/passport.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/passport.pdf", data_type="passport", backend="local", ) # BUG-002: backend must NOT be None when VLM is available assert result["backend"] is not None, ( "BUG-002: passport branch still bypasses VLM (backend is None)" ) assert result["backend_available"] is not None def test_passport_branch_method_includes_llm_when_enhanced(self): """Method should be 'passport+llm' when VLM enhances results.""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = { "text": fake_text, "pages_extracted": 1, } fake_llm_response = '{"issuing_authority": "Immigration 
Office"}' with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/passport.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/passport.pdf", data_type="passport", backend="local", ) assert "llm" in result.get("method", ""), ( "BUG-002: method should contain 'llm' when VLM enhances passport data" ) def test_passport_branch_still_works_without_vlm(self): """Without VLM, passport extraction should still work (pattern only).""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = { "text": fake_text, "pages_extracted": 1, } with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/passport.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value=None), ): result = pdf_tools.extract_structured_data( "/fake/passport.pdf", data_type="passport", ) assert result["method"] == "passport" assert result["backend"] is None # MRZ should still extract the passport number assert result["data"].get("passport_number") is not None def test_passport_vlm_enhances_low_confidence_fields(self): """VLM should fill in fields that pattern extraction missed or got low confidence on (issue_date, issuing_authority, place_of_birth).""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} fake_llm_response = json.dumps({ "issue_date": "2020-06-15", "issuing_authority": "Department of Immigration", "place_of_birth": "Springfield", }) with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/passport.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", 
return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/passport.pdf", data_type="passport", backend="local", ) data = result["data"] assert data.get("issuing_authority") == "Department of Immigration" assert data.get("issue_date") == "2020-06-15" assert data.get("place_of_birth") == "Springfield" # ============================================================================ # Test: BUG-002a - Passport regex quality issues (v1.2.1) # ============================================================================ class TestBug002aPassportRegexQuality: """BUG-002a: _extract_passport_fields regex should not capture junk prefixes in passport_number or match 'Bearer's signature' as authority.""" def test_passport_number_no_junk_prefix(self): """Passport number fallback should strip common prefixes.""" ocr_text = ( "PASSPORT | P CHN EK2547928\n" "Some other text\n" ) # No MRZ lines, so it falls back to label-value regex extracted, confidence = pdf_tools._extract_passport_fields(ocr_text) pn = extracted.get("passport_number", "") # Must NOT contain "PASSPORT" or "P CHN" prefix assert "PASSPORT" not in pn, ( f"BUG-002a: passport_number contains junk prefix: {pn}" ) if pn: # Should be clean alphanumeric assert pn == "EK2547928" or len(pn) <= 12 def test_authority_does_not_match_bearer_signature(self): """Issuing authority regex must not capture 'Bearer's signature'.""" ocr_text = ( "持照大签名 /Bearer's signature x\n" "Issuing Authority: National Immigration Administration\n" ) extracted, confidence = pdf_tools._extract_passport_fields(ocr_text) auth = extracted.get("issuing_authority", "") assert "signature" not in auth.lower(), ( f"BUG-002a: issuing_authority matched signature field: {auth}" ) def test_authority_does_not_match_signature_garbage(self): """Authority should not capture OCR noise like '#9 A %'.""" ocr_text = ( "#9 A % /Bearer's signature S\n" "Authority: Immigration Office\n" ) extracted, confidence = 
pdf_tools._extract_passport_fields(ocr_text) auth = extracted.get("issuing_authority", "") assert "signature" not in auth.lower(), ( f"BUG-002a: issuing_authority matched garbage: {auth}" ) # ============================================================================ # Test: BUG-002b - page_count should reflect actual pages (v1.2.1) # ============================================================================ class TestBug002bPageCount: """BUG-002b: extract_structured_data returns page_count: 0 because it reads 'page_count' from text_result but extract_text returns 'pages_extracted' instead.""" def test_page_count_uses_pages_extracted(self): """page_count in result should reflect actual pages from extract_text.""" fake_text_result = { "text": "Some document text", "pages_extracted": 3, } with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/doc.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value=None), ): result = pdf_tools.extract_structured_data( "/fake/doc.pdf", data_type="invoice", ) assert result["page_count"] == 3, ( f"BUG-002b: page_count is {result['page_count']}, expected 3" ) def test_page_count_passport_uses_pages_extracted(self): """Passport branch should also correctly report page_count.""" fake_text = ( "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<\n" # 44 chars "AB12345674UTO8001011M3012315<<<<<<<<<<<<<<06\n" # 44 chars ) fake_text_result = { "text": fake_text, "pages_extracted": 2, } with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/passport.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value=None), ): result = pdf_tools.extract_structured_data( "/fake/passport.pdf", data_type="passport", ) assert result["page_count"] == 2, ( f"BUG-002b: passport page_count is {result['page_count']}, expected 2" ) # 
============================================================================ # Test: MRZ-GAP-001 - MRZ fields must appear in passport+llm output (v1.2.2) # ============================================================================ class TestMrzGap001FieldsInOutput: """MRZ-GAP-001: When MRZ is successfully parsed, ALL MRZ-derived fields (surname, given_names, nationality, birth_date, sex, expiry_date) must appear in the final passport output, even when VLM enhances it.""" # Standard TD3 MRZ: 2 lines x 44 chars each MRZ_LINE1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<" # 44 chars MRZ_LINE2 = "AB12345674UTO8001011M3012315<<<<<<<<<<<<<<06" # 44 chars def test_mrz_fields_present_in_passport_output(self): """All MRZ-derived fields should be in the output.""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value=None), ): result = pdf_tools.extract_structured_data( "/fake/p.pdf", data_type="passport", ) data = result["data"] expected_fields = [ "passport_number", "surname", "given_names", "nationality", "birth_date", "sex", "expiry_date", ] for field in expected_fields: assert field in data, ( f"MRZ-GAP-001: '{field}' missing from passport output" ) def test_mrz_fields_preserved_after_vlm_enhancement(self): """MRZ fields must NOT be overwritten by VLM (MRZ is higher confidence).""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} fake_llm_response = json.dumps({ "issue_date": "2020-01-01", "issuing_authority": "Test Office", }) with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", 
return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/p.pdf", data_type="passport", backend="local", ) data = result["data"] # MRZ fields must be present assert data.get("surname") == "DOE" assert data.get("given_names") == "JOHN JAMES" assert data.get("passport_number") == "AB1234567" # VLM fields should also be present assert data.get("issuing_authority") == "Test Office" # ============================================================================ # Test: MRZ-GAP-002 - Noisy OCR MRZ lines must be tolerated (v1.2.2) # ============================================================================ class TestMrzGap002NoisyOcrTolerance: """MRZ-GAP-002: Real OCR often produces MRZ lines that are 43-46 chars instead of exactly 44. The parser must tolerate near-length matches.""" def test_mrz_detection_tolerates_extra_chars(self): """MRZ lines with 1-2 extra trailing chars should still be detected.""" # Simulate real OCR noise: 45 chars (extra trailing char) noisy_line1 = "POCHNNI<<XIUYING<<<<<<<<<<<<<<<<<<<<<<<<<<<K<" # 45 chars noisy_line2 = "EK25479281CHN4905088F3304064MENPNAODNDKCA930" # 45 chars mrz = pdf_tools._extract_mrz_lines(f"{noisy_line1}\n{noisy_line2}\n") assert mrz is not None, ( "MRZ-GAP-002: MRZ detection failed on 45-char lines" ) # Returned lines should be trimmed to 44 assert len(mrz[0]) == 44 assert len(mrz[1]) == 44 def test_mrz_detection_tolerates_short_line(self): """MRZ lines with 1 missing char (43) should still be detected.""" short_line1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<" # 43 chars normal_line2 = "AB12345674UTO8001011M3012315<<<<<<<<<<<<<<06" # 44 chars mrz = pdf_tools._extract_mrz_lines(f"{short_line1}\n{normal_line2}\n") # Should still attempt to detect -- pad short lines assert mrz is not None, ( "MRZ-GAP-002: MRZ detection failed on 43-char line" ) def test_passport_number_from_vlm_when_mrz_fails(self): """When MRZ parsing fails, VLM 
should extract passport_number too.""" # No MRZ lines in text fake_text = "Some passport text without clear MRZ lines\nPassport No: ABC123\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} fake_llm_response = json.dumps({ "passport_number": "EK2544770", "issuing_authority": "Immigration Office", "place_of_birth": "Fujian", }) with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/p.pdf", data_type="passport", backend="local", ) assert result["data"].get("passport_number") == "EK2544770", ( "MRZ-GAP-002: VLM should provide passport_number when MRZ fails" ) # ============================================================================ # Test: VLM-QUALITY-001 - Cross-validate VLM dates with MRZ expiry (v1.2.2) # ============================================================================ class TestVlmQuality001DateCrossValidation: """VLM-QUALITY-001: When VLM returns an issue_date that matches the MRZ expiry_date, it should be flagged and corrected using domain knowledge.""" MRZ_LINE1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<" # 44 chars MRZ_LINE2 = "AB12345674UTO8001011M3012315<<<<<<<<<<<<<<06" # 44 chars # MRZ expiry = 301231 -> 2030-12-31 def test_vlm_issue_date_not_same_as_mrz_expiry(self): """If VLM returns expiry as issue_date, it should be corrected.""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} # VLM mistakenly returns the expiry date as issue_date fake_llm_response = json.dumps({ "issue_date": "2030-12-31", # Same as MRZ expiry! 
"issuing_authority": "Immigration Office", }) with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/p.pdf", data_type="passport", backend="local", ) data = result["data"] # issue_date must NOT be the same as expiry_date if data.get("issue_date") and data.get("expiry_date"): assert data["issue_date"] != data["expiry_date"], ( "VLM-QUALITY-001: issue_date should not equal expiry_date" ) def test_domain_knowledge_passport_validity(self): """When VLM date matches MRZ expiry, apply 10-year validity rule.""" fake_text = f"{self.MRZ_LINE1}\n{self.MRZ_LINE2}\n" fake_text_result = {"text": fake_text, "pages_extracted": 1} # VLM returns the expiry date (301231 in MRZ = 2030-12-31) fake_llm_response = json.dumps({ "issue_date": "301231", # Raw MRZ format of expiry "issuing_authority": "Immigration Office", }) with ( patch.object(pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")), patch.object(pdf_tools, "extract_text", return_value=fake_text_result), patch.object(pdf_tools, "_get_llm_backend", return_value="local"), patch.object(pdf_tools, "_call_llm", return_value=fake_llm_response), ): result = pdf_tools.extract_structured_data( "/fake/p.pdf", data_type="passport", backend="local", ) data = result["data"] # If issue_date was corrected via domain knowledge, it should be ~10 years before expiry issue = data.get("issue_date", "") if issue: assert "2030" not in str(issue), ( "VLM-QUALITY-001: issue_date still contains expiry year after correction" ) # ============================================================================ # Test: BUG-003 - MRZ expiry_date century must use future context (v1.2.3) # ============================================================================ 
class TestBug003ExpiryDateCentury:
    """BUG-003: _parse_mrz_date uses a single threshold that fails for
    future expiry dates. Year '33' should be 2033 (expiry), not 1933."""

    def test_expiry_date_future_year(self):
        """MRZ expiry '330406' should produce 2033-04-06, not 1933."""
        result = pdf_tools._parse_mrz_date("330406", is_expiry=True)
        assert result is not None
        assert result.startswith("2033"), (
            f"BUG-003: expiry '330406' returned {result}, expected 2033-04-06"
        )

    def test_expiry_date_far_future(self):
        """MRZ expiry '450101' should produce 2045-01-01."""
        result = pdf_tools._parse_mrz_date("450101", is_expiry=True)
        assert result is not None
        assert result.startswith("2045"), (
            f"BUG-003: expiry '450101' returned {result}, expected 2045-01-01"
        )

    def test_birth_date_past_year(self):
        """MRZ birth '490508' should produce 1949-05-08 (DOB is in the past)."""
        result = pdf_tools._parse_mrz_date("490508")
        assert result is not None
        assert result.startswith("1949"), (
            f"BUG-003: birth '490508' returned {result}, expected 1949-05-08"
        )

    def test_birth_date_recent(self):
        """MRZ birth '050101' should produce 2005-01-01 (young person)."""
        result = pdf_tools._parse_mrz_date("050101")
        assert result is not None
        assert result.startswith("2005"), (
            f"BUG-003: birth '050101' returned {result}, expected 2005-01-01"
        )

    def test_expiry_cascades_to_passport_output(self):
        """Full passport extraction must produce correct expiry year."""
        # TD3 MRZ with expiry 330406 (April 6, 2033)
        line1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<"  # 44 chars
        line2 = "AB12345674UTO8001011M3304064<<<<<<<<<<<<<<06"  # 44 chars

        result = _run_passport_extraction(f"{line1}\n{line2}\n")

        expiry = result["data"].get("expiry_date", "")
        assert "2033" in str(expiry), (
            f"BUG-003: expiry_date in passport output is {expiry}, expected 2033"
        )


# ============================================================================
# Test: BUG-004 - personal_number OCR noise filtering (v1.2.3)
# ============================================================================


class TestBug004PersonalNumberNoise:
    """BUG-004: personal_number from MRZ positions 28-42 often contains
    OCR garbage. Should be None when no meaningful content is found."""

    def test_noise_personal_number_filtered(self):
        """OCR noise like 'MENPNAODNDKCA9' should become None."""
        # MRZ line2 with noise in personal_number positions
        line1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<"  # 44 chars
        line2 = "AB12345674UTO8001011M3012315MENPNAODNDKCA906"  # 44 chars

        result = _run_passport_extraction(f"{line1}\n{line2}\n")

        pn = result["data"].get("personal_number")
        if pn is not None:
            # If present, it should not be pure noise: anything that is not a
            # pure [A-Z0-9] run, or is at most two characters long, is allowed.
            assert re.match(r"^[A-Z0-9]+$", pn) is None or len(pn) <= 2, (
                f"BUG-004: personal_number '{pn}' looks like OCR noise"
            )

    def test_valid_personal_number_preserved(self):
        """A legitimate personal_number (digits) should be kept."""
        line1 = "P<UTODOE<<JOHN<JAMES<<<<<<<<<<<<<<<<<<<<<<<<"  # 44 chars
        line2 = "AB12345674UTO8001011M301231512345678<<<<<<06"  # 44 chars

        result = _run_passport_extraction(f"{line1}\n{line2}\n")

        pn = result["data"].get("personal_number")
        assert pn == "12345678", (
            f"BUG-004: valid personal_number should be '12345678', got '{pn}'"
        )


# ============================================================================
# Test: VLM-QUALITY-003 - VLM authority prompt improvement (v1.2.3)
# ============================================================================


class TestVlmQuality003AuthorityPrompt:
    """VLM-QUALITY-003: VLM prompt should specify to look at the data page,
    not the introductory page, for issuing authority. Also for Chinese
    passports, post-process MFA -> NIA if appropriate."""

    def test_chinese_passport_authority_post_processing(self):
        """If issuing_country=CHN and authority mentions 'Foreign Affairs',
        it should be corrected to NIA."""
        line1 = "P<CHNTEST<<PERSON<<<<<<<<<<<<<<<<<<<<<<<<<<<"  # 44 chars
        line2 = "AB12345674CHN8001011M3012315<<<<<<<<<<<<<<06"  # 44 chars

        # VLM returns MFA (from intro page)
        fake_llm_response = json.dumps({
            "issue_date": "2020-12-31",
            "issuing_authority": "Ministry of Foreign Affairs",
            "place_of_birth": "Beijing",
        })

        result = _run_passport_extraction(
            f"{line1}\n{line2}\n", llm_response=fake_llm_response
        )

        authority = result["data"].get("issuing_authority", "")
        assert "Foreign Affairs" not in authority, (
            f"VLM-QUALITY-003: authority should not be MFA, got '{authority}'"
        )


# ============================================================================
# Test: BUG-005 - passport issue_date off by one day
# ============================================================================


class TestBug005IssueDateOffByOneDay:
    """BUG-005 (CRITICAL): extract_structured_data passport issue_date is
    off by -1 day.

    The VLM derives issue_date = expiry - 10 years, but passports expire the
    day BEFORE the 10-year anniversary of issuance.

    Correct formula: issue_date = expiry_date + 1 day - 10 years
    Example: expiry 2033-04-06 -> issue 2023-04-07 (NOT 2023-04-06)

    See: https://github.com/nfsarch33/pdf-mcp-server/issues/34
    """

    def test_cross_validate_corrects_with_plus_one_day(self):
        """When VLM returns expiry as issue_date, the correction should
        produce expiry + 1 day - 10 years, not expiry - 10 years."""
        data = {"issue_date": "2033-04-06", "expiry_date": "2033-04-06"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        # The helper is expected to mutate both dicts in place.
        pdf_tools._cross_validate_passport_dates(data, conf)

        # Corrected issue_date should be 2023-04-07 (expiry + 1 day - 10 years)
        assert data["issue_date"] == "2023-04-07", (
            f"BUG-005: expected 2023-04-07, got {data['issue_date']}"
        )
        # Confidence should be lowered for derived values
        assert conf["issue_date"] <= 0.55, (
            f"BUG-005: derived date confidence should be <= 0.55, got {conf['issue_date']}"
        )

    def test_cross_validate_jan_boundary(self):
        """Expiry 2033-01-01 -> issue should be 2023-01-02 (not 2023-01-01)."""
        data = {"issue_date": "2033-01-01", "expiry_date": "2033-01-01"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        assert data["issue_date"] == "2023-01-02", (
            f"BUG-005: expected 2023-01-02, got {data['issue_date']}"
        )

    def test_cross_validate_dec31_year_boundary(self):
        """Expiry 2033-12-31 -> issue should be 2024-01-01.
        Dec 31 + 1 day = Jan 1, 2034, - 10 years = 2024-01-01."""
        data = {"issue_date": "2033-12-31", "expiry_date": "2033-12-31"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        assert data["issue_date"] == "2024-01-01", (
            f"BUG-005: expected 2024-01-01, got {data['issue_date']}"
        )

    def test_detect_vlm_derived_issue_date(self):
        """When VLM returns issue_date that is exactly expiry - 10 years
        (same day/month), detect and apply +1 day correction.
        This is the MAIN BUG-005 scenario from the real-world report."""
        data = {"issue_date": "2023-04-06", "expiry_date": "2033-04-06"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        # Should detect derivation and correct to +1 day
        assert data["issue_date"] == "2023-04-07", (
            f"BUG-005: expected 2023-04-07, got {data['issue_date']}"
        )
        assert conf["issue_date"] <= 0.55, (
            f"BUG-005: derived confidence should be <= 0.55, got {conf['issue_date']}"
        )

    def test_detect_vlm_derived_second_passport(self):
        """Second passport from bug report: expiry 2033-04-05, VLM issue 2023-04-05.
        Should be corrected to 2023-04-06."""
        data = {"issue_date": "2023-04-05", "expiry_date": "2033-04-05"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        assert data["issue_date"] == "2023-04-06", (
            f"BUG-005: expected 2023-04-06, got {data['issue_date']}"
        )

    def test_no_correction_for_legitimate_dates(self):
        """If issue_date is NOT exactly expiry - 10 years, do NOT modify it.
        Example: issue 2023-04-07, expiry 2033-04-06 (correct, no change needed)."""
        data = {"issue_date": "2023-04-07", "expiry_date": "2033-04-06"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        assert data["issue_date"] == "2023-04-07", (
            f"BUG-005: legitimate date should not be modified, got {data['issue_date']}"
        )
        assert conf["issue_date"] == 0.85, (
            f"BUG-005: legitimate confidence should not change, got {conf['issue_date']}"
        )

    def test_full_pipeline_corrects_issue_date(self):
        """End-to-end: extract_structured_data should return the corrected
        issue_date for a Chinese passport where VLM derives the date."""
        # MRZ with expiry 330406 -> 2033-04-06
        line1 = "P<CHNTEST<<PERSON<<<<<<<<<<<<<<<<<<<<<<<<<<<"  # 44 chars
        line2 = "EK25479284CHN8001011M3304065<<<<<<<<<<<<<<06"  # 44 chars

        # VLM derives issue_date = expiry - 10 years (the BUG-005 pattern)
        fake_llm_response = json.dumps({
            "issue_date": "2023-04-06",  # Wrong: should be 2023-04-07
            "issuing_authority": "National Immigration Administration, PRC",
        })

        result = _run_passport_extraction(
            f"{line1}\n{line2}\n", llm_response=fake_llm_response
        )

        issue = result["data"].get("issue_date", "")
        assert issue == "2023-04-07", (
            f"BUG-005: full pipeline issue_date should be 2023-04-07, got {issue}"
        )

    def test_vlm_prompt_does_not_instruct_derivation(self):
        """The VLM prompt should NOT tell the VLM to derive issue_date from expiry_date.
        It should instruct to READ it directly."""
        # This is a code inspection test -- verify the prompt guidance
        import inspect

        source = inspect.getsource(pdf_tools.extract_structured_data)
        # The prompt should NOT contain "10 years BEFORE the expiry"
        # (exact, case-sensitive substring match against the function source).
        assert "10 years BEFORE the expiry" not in source, (
            "BUG-005: VLM prompt should not instruct derivation of issue_date"
        )

    def test_derived_flag_on_corrected_dates(self):
        """When issue_date is derived from expiry, the output should include
        a 'derived_fields' key listing fields that were computed, not read."""
        data = {"issue_date": "2023-04-06", "expiry_date": "2033-04-06"}
        conf = {"issue_date": 0.85, "expiry_date": 0.95}

        pdf_tools._cross_validate_passport_dates(data, conf)

        assert "issue_date" in data.get("derived_fields", []), (
            "BUG-005: derived dates should be flagged in 'derived_fields'"
        )


# ============================================================================
# Shared helper for the passport pipeline tests above
# ============================================================================


def _run_passport_extraction(text, llm_response=None):
    """Run ``pdf_tools.extract_structured_data`` for a passport on fake text.

    File lookup and text extraction are always patched, so no real PDF is
    read; ``text`` becomes the extracted page text.

    Args:
        text: Text the patched ``pdf_tools.extract_text`` reports.
        llm_response: When ``None``, ``_get_llm_backend`` is patched to return
            ``None`` and no ``backend`` argument is passed. Otherwise the
            backend is patched to ``"local"``, ``_call_llm`` is patched to
            return this string, and ``backend="local"`` is passed.

    Returns:
        Whatever ``pdf_tools.extract_structured_data`` returns.
    """
    text_result = {"text": text, "pages_extracted": 1}

    if llm_response is None:
        with (
            patch.object(
                pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")
            ),
            patch.object(pdf_tools, "extract_text", return_value=text_result),
            patch.object(pdf_tools, "_get_llm_backend", return_value=None),
        ):
            return pdf_tools.extract_structured_data(
                "/fake/p.pdf",
                data_type="passport",
            )

    with (
        patch.object(
            pdf_tools, "_ensure_file", return_value=Path("/fake/p.pdf")
        ),
        patch.object(pdf_tools, "extract_text", return_value=text_result),
        patch.object(pdf_tools, "_get_llm_backend", return_value="local"),
        patch.object(pdf_tools, "_call_llm", return_value=llm_response),
    ):
        return pdf_tools.extract_structured_data(
            "/fake/p.pdf",
            data_type="passport",
            backend="local",
        )

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nfsarch33/pdf-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_agentic_features.py•87.9 KiB