PDF MCP Server

test_ocr.py•24.3 KiB

""" Comprehensive OCR tests for pdf-mcp-server. Tests cover: - PDF type detection (searchable vs image-based vs hybrid) - Native text extraction - OCR text extraction with fallback chain - Text block extraction with positions - All PDF fixtures in tests/ directory """ from pathlib import Path from typing import Dict, Any import pytest from pdf_mcp import pdf_tools from pdf_mcp.pdf_tools import PdfToolError # ============================================================================= # Test Fixtures - All PDFs in tests/ # ============================================================================= TESTS_DIR = Path(__file__).parent # Categorize test PDFs by expected type (based on actual analysis) # Classifications determined by text_coverage_ratio and image_coverage_ratio PDF_FIXTURES = { # Form PDFs - searchable with native text layer "1006.pdf": {"expected_type": "searchable", "has_forms": True}, # PDFs with native text layer (may have OCR layer embedded) "TestOCR.pdf": {"expected_type": "searchable", "has_forms": False}, # Has OCR text layer "scanned_example_1.pdf": {"expected_type": "searchable", "has_forms": False}, # Has OCR text layer # Image-based PDFs (no native text layer, need OCR) "pdf_sample2.pdf": {"expected_type": "image_based", "has_forms": False}, "scansmpl.pdf": {"expected_type": "image_based", "has_forms": False}, "image-based-pdf-sample.pdf": {"expected_type": "image_based", "has_forms": False}, "non-text-searchable.pdf": {"expected_type": "image_based", "has_forms": False}, "Sbizhub_C2219080509040.pdf": {"expected_type": "image_based", "has_forms": False}, "PublicWaterMassMailing.pdf": {"expected_type": "image_based", "has_forms": False}, } def get_available_fixtures() -> Dict[str, Dict[str, Any]]: """Return only fixtures that exist in the tests directory.""" available = {} for name, info in PDF_FIXTURES.items(): path = TESTS_DIR / name if path.exists(): available[name] = {**info, "path": str(path)} return available # 
# =============================================================================
# detect_pdf_type Tests
# =============================================================================

# Every classification value the tests accept from detect_pdf_type.
_VALID_CLASSIFICATIONS = ("searchable", "image_based", "hybrid")


def _require_fixture(name: str) -> str:
    """Return the path to fixture *name* as a string, or skip the calling test.

    The fixture is looked up in TESTS_DIR. When the file is absent,
    ``pytest.skip`` is invoked, which aborts the calling test as skipped.
    """
    path = TESTS_DIR / name
    if not path.exists():
        pytest.skip(f"{name} fixture not available")
    return str(path)


class TestDetectPdfType:
    """Tests for the detect_pdf_type function."""

    def test_detect_type_returns_required_fields(self):
        """Verify detect_pdf_type returns all required fields."""
        fixtures = get_available_fixtures()
        if not fixtures:
            pytest.skip("No test fixtures available")

        # Use first available fixture
        path = next(iter(fixtures.values()))["path"]
        result = pdf_tools.detect_pdf_type(path)

        # Check required fields; the message names the first missing key.
        required_fields = (
            "pdf_path",
            "classification",
            "total_pages",
            "pages_with_native_text",
            "pages_with_images",
            "total_native_chars",
            "total_images",
            "needs_ocr",
            "tesseract_available",
            "page_details",
        )
        for field in required_fields:
            assert field in result, f"missing field: {field}"
        assert result["classification"] in _VALID_CLASSIFICATIONS

    def test_detect_type_all_fixtures(self):
        """Test detect_pdf_type on all available fixtures."""
        fixtures = get_available_fixtures()
        results = {}

        for name, info in fixtures.items():
            result = pdf_tools.detect_pdf_type(info["path"])
            results[name] = result

            # Basic sanity checks
            assert result["total_pages"] >= 1
            assert result["classification"] in _VALID_CLASSIFICATIONS
            assert isinstance(result["page_details"], list)
            assert len(result["page_details"]) == result["total_pages"]

        # Print summary for debugging
        print("\n--- PDF Type Detection Summary ---")
        for name, result in results.items():
            print(f"{name}: {result['classification']}, "
                  f"pages={result['total_pages']}, "
                  f"text_ratio={result['text_coverage_ratio']}, "
                  f"needs_ocr={result['needs_ocr']}")

    @pytest.mark.parametrize("fixture_name,expected_info", list(PDF_FIXTURES.items()))
    def test_detect_type_classification(self, fixture_name: str, expected_info: Dict):
        """Test that PDFs are classified as expected."""
        path = TESTS_DIR / fixture_name
        if not path.exists():
            pytest.skip(f"Fixture {fixture_name} not available")

        result = pdf_tools.detect_pdf_type(str(path))

        # For image-based PDFs, either image_based or hybrid is acceptable
        # since some might have minimal text layers
        expected = expected_info["expected_type"]
        actual = result["classification"]

        if expected == "image_based":
            assert actual in ("image_based", "hybrid"), \
                f"{fixture_name}: expected {expected}, got {actual}"
        else:
            # For searchable PDFs, we're more lenient
            assert actual in ("searchable", "hybrid"), \
                f"{fixture_name}: expected searchable/hybrid, got {actual}"

    def test_detect_type_file_not_found(self, tmp_path: Path):
        """Test error handling for missing file."""
        with pytest.raises(PdfToolError) as exc:
            pdf_tools.detect_pdf_type(str(tmp_path / "nonexistent.pdf"))
        # Checked after the ``with`` body: statements following the raising
        # call inside the body would never execute.
        assert "not found" in str(exc.value).lower()

    def test_detect_type_1006_has_text(self):
        """Test that 1006.pdf (form PDF) is detected as searchable."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.detect_pdf_type(pdf_path)
        assert result["classification"] == "searchable"
        assert result["total_native_chars"] > 100
        assert result["needs_ocr"] is False


# =============================================================================
# extract_text_native Tests
# =============================================================================

class TestExtractTextNative:
    """Tests for native text extraction."""

    def test_extract_native_searchable_pdf(self):
        """Test native extraction on a searchable PDF."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.extract_text_native(pdf_path)

        assert result["method"] == "native"
        assert result["total_chars"] > 0
        assert "text" in result
        assert len(result["text"]) > 0
        assert result["pages_extracted"] >= 1

    def test_extract_native_specific_pages(self):
        """Test native extraction of specific pages."""
        pdf_path = _require_fixture("1006.pdf")

        # Extract only page 1
        result = pdf_tools.extract_text_native(pdf_path, pages=[1])

        assert result["pages_extracted"] == 1
        assert len(result["page_details"]) == 1
        assert result["page_details"][0]["page"] == 1

    def test_extract_native_image_based_pdf(self):
        """Test native extraction on image-based PDF (should return minimal text)."""
        fixtures = get_available_fixtures()
        image_pdfs = [name for name, info in fixtures.items()
                      if info["expected_type"] == "image_based"]

        if not image_pdfs:
            pytest.skip("No image-based PDF fixtures available")

        path = fixtures[image_pdfs[0]]["path"]
        result = pdf_tools.extract_text_native(path)

        # Image-based PDFs should have minimal native text
        # (but might have some due to PDF metadata or embedded fonts)
        assert result["method"] == "native"
        # The text might be very short or empty
        assert "text" in result

    def test_extract_native_all_fixtures(self):
        """Test native extraction on all available fixtures."""
        fixtures = get_available_fixtures()

        print("\n--- Native Text Extraction Summary ---")
        for name, info in fixtures.items():
            result = pdf_tools.extract_text_native(info["path"])
            print(f"{name}: {result['total_chars']} chars from {result['pages_extracted']} pages")

            assert "text" in result
            assert result["pages_extracted"] >= 1


# =============================================================================
# extract_text_ocr Tests
# =============================================================================

class TestExtractTextOcr:
    """Tests for OCR text extraction."""

    def test_extract_ocr_auto_engine(self):
        """Test auto engine mode (native with OCR fallback)."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.extract_text_ocr(pdf_path, engine="auto")

        assert result["engine_requested"] == "auto"
        assert result["method_used"] in ("native", "ocr", "hybrid")
        assert "text" in result

    def test_extract_ocr_native_engine(self):
        """Test native-only engine mode (no OCR)."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.extract_text_ocr(pdf_path, engine="native")

        assert result["engine_requested"] == "native"
        assert result["method_used"] == "native"

    def test_extract_ocr_invalid_engine(self):
        """Test error handling for invalid engine."""
        pdf_path = _require_fixture("1006.pdf")

        with pytest.raises(PdfToolError) as exc:
            pdf_tools.extract_text_ocr(pdf_path, engine="invalid_engine")
        assert "Invalid engine" in str(exc.value)

    def test_extract_ocr_specific_pages(self):
        """Test OCR extraction of specific pages."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.extract_text_ocr(pdf_path, pages=[1], engine="native")

        assert result["pages_extracted"] == 1
        assert len(result["page_details"]) == 1

    def test_extract_ocr_tesseract_required_but_missing(self):
        """Test error when Tesseract is required but not available."""
        # This test only runs if Tesseract is NOT installed
        if pdf_tools._HAS_TESSERACT:
            pytest.skip("Tesseract is available, skipping unavailable test")

        pdf_path = _require_fixture("1006.pdf")

        with pytest.raises(PdfToolError) as exc:
            pdf_tools.extract_text_ocr(pdf_path, engine="tesseract")
        message = str(exc.value)
        assert "Tesseract" in message or "not available" in message.lower()

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_ocr_tesseract_engine(self):
        """Test Tesseract OCR engine (requires tesseract-ocr installed)."""
        fixtures = get_available_fixtures()
        image_pdfs = [name for name, info in fixtures.items()
                      if info["expected_type"] == "image_based"]

        if not image_pdfs:
            pytest.skip("No image-based PDF fixtures available")

        path = fixtures[image_pdfs[0]]["path"]
        result = pdf_tools.extract_text_ocr(path, engine="tesseract", dpi=150)

        assert result["engine_requested"] == "tesseract"
        assert result["method_used"] in ("ocr", "hybrid")
        assert result["dpi"] == 150

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_ocr_force_ocr_engine(self):
        """Test force_ocr engine (OCR even on searchable PDFs)."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.extract_text_ocr(pdf_path, engine="force_ocr", pages=[1])

        assert result["engine_requested"] == "force_ocr"
        assert "ocr" in result["method_used"]

    def test_extract_ocr_all_fixtures_auto(self):
        """Test auto OCR extraction on all available fixtures."""
        fixtures = get_available_fixtures()

        print("\n--- OCR Text Extraction Summary (auto mode) ---")
        for name, info in fixtures.items():
            result = pdf_tools.extract_text_ocr(info["path"], engine="auto")
            print(f"{name}: method={result['method_used']}, "
                  f"{result['total_chars']} chars from {result['pages_extracted']} pages")

            assert "text" in result
            assert result["pages_extracted"] >= 1


# =============================================================================
# get_pdf_text_blocks Tests
# =============================================================================

class TestGetPdfTextBlocks:
    """Tests for text block extraction with positions."""

    def test_get_text_blocks_returns_structure(self):
        """Test that text blocks returns proper structure."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.get_pdf_text_blocks(pdf_path)

        assert "pdf_path" in result
        assert "total_pages" in result
        assert "pages_analyzed" in result
        assert "page_blocks" in result
        assert isinstance(result["page_blocks"], list)

    def test_get_text_blocks_has_bbox(self):
        """Test that text blocks include bounding boxes."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.get_pdf_text_blocks(pdf_path, pages=[1])

        assert len(result["page_blocks"]) == 1
        page_data = result["page_blocks"][0]

        assert "page" in page_data
        assert "width" in page_data
        assert "height" in page_data
        assert "blocks" in page_data

        # Check that blocks have proper structure
        for block in page_data["blocks"]:
            assert "type" in block
            assert block["type"] in ("text", "image")
            assert "bbox" in block

    def test_get_text_blocks_specific_pages(self):
        """Test text block extraction for specific pages."""
        pdf_path = _require_fixture("1006.pdf")

        result = pdf_tools.get_pdf_text_blocks(pdf_path, pages=[1])

        assert result["pages_analyzed"] == 1
        assert len(result["page_blocks"]) == 1
        assert result["page_blocks"][0]["page"] == 1

    def test_get_text_blocks_all_fixtures(self):
        """Test text block extraction on all available fixtures."""
        fixtures = get_available_fixtures()

        print("\n--- Text Block Extraction Summary ---")
        for name, info in fixtures.items():
            result = pdf_tools.get_pdf_text_blocks(info["path"])

            # Flatten once, then count by type without building throwaway lists.
            all_blocks = [
                block
                for page in result["page_blocks"]
                for block in page["blocks"]
            ]
            total_blocks = len(all_blocks)
            text_blocks = sum(1 for block in all_blocks if block["type"] == "text")
            image_blocks = sum(1 for block in all_blocks if block["type"] == "image")

            print(f"{name}: {result['pages_analyzed']} pages, "
                  f"{total_blocks} blocks ({text_blocks} text, {image_blocks} image)")


# =============================================================================
# MCP Layer Integration Tests
# =============================================================================

class TestMcpLayerOcr:
    """Test OCR tools through the MCP layer."""

    @staticmethod
    def _call_tool(tool_name, arguments):
        """Invoke *tool_name* via ``server.mcp.call_tool`` and return its result.

        The coroutine is driven to completion with ``asyncio.run``. The second
        element of the returned pair must be a dict with a "result" entry, and
        that entry must not contain an "error" key; otherwise the calling
        test fails.
        """
        # Imported lazily, as the individual tests did, so the server module
        # is only loaded when an MCP-layer test actually runs.
        import asyncio
        from pdf_mcp import server

        async def call():
            _content, meta = await server.mcp.call_tool(tool_name, arguments)
            return meta

        meta = asyncio.run(call())
        assert isinstance(meta, dict)
        assert "result" in meta
        result = meta["result"]
        assert "error" not in result, result.get("error")
        return result

    def test_mcp_detect_pdf_type(self):
        """Test detect_pdf_type via MCP layer."""
        pdf_path = _require_fixture("1006.pdf")

        result = self._call_tool("detect_pdf_type", {"pdf_path": pdf_path})

        assert result["classification"] in _VALID_CLASSIFICATIONS

    def test_mcp_extract_text_native(self):
        """Test extract_text_native via MCP layer."""
        pdf_path = _require_fixture("1006.pdf")

        result = self._call_tool("extract_text_native", {"pdf_path": pdf_path})

        assert result["method"] == "native"
        assert "text" in result

    def test_mcp_extract_text_ocr(self):
        """Test extract_text_ocr via MCP layer."""
        pdf_path = _require_fixture("1006.pdf")

        result = self._call_tool(
            "extract_text_ocr",
            {"pdf_path": pdf_path, "engine": "auto"},
        )

        assert result["engine_requested"] == "auto"
        assert "text" in result

    def test_mcp_get_pdf_text_blocks(self):
        """Test get_pdf_text_blocks via MCP layer."""
        pdf_path = _require_fixture("1006.pdf")

        result = self._call_tool(
            "get_pdf_text_blocks",
            {"pdf_path": pdf_path, "pages": [1]},
        )

        assert "page_blocks" in result
        assert len(result["page_blocks"]) == 1


#
# =============================================================================
# Regression Tests for Specific PDFs
# =============================================================================

class TestSpecificPdfRegression:
    """Regression tests for specific PDF fixtures."""

    def test_1006_pdf_full_workflow(self):
        """Comprehensive test on 1006.pdf (InDesign form)."""
        path = TESTS_DIR / "1006.pdf"
        if not path.exists():
            pytest.skip("1006.pdf fixture not available")

        # Detect type
        detect_result = pdf_tools.detect_pdf_type(str(path))
        assert detect_result["classification"] == "searchable"
        assert detect_result["needs_ocr"] is False

        # Native extraction
        native_result = pdf_tools.extract_text_native(str(path))
        assert native_result["total_chars"] > 100

        # OCR extraction (should use native)
        ocr_result = pdf_tools.extract_text_ocr(str(path), engine="auto")
        assert ocr_result["method_used"] in ("native", "hybrid")

        # Text blocks
        blocks_result = pdf_tools.get_pdf_text_blocks(str(path))
        assert len(blocks_result["page_blocks"]) >= 1

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_image_pdf_ocr_extraction(self):
        """Test OCR extraction on image-based PDFs."""
        fixtures = get_available_fixtures()
        image_pdfs = [name for name, info in fixtures.items()
                      if info["expected_type"] == "image_based"]

        if not image_pdfs:
            pytest.skip("No image-based PDF fixtures available")

        # Test first available image PDF
        fixture_name = image_pdfs[0]
        path = fixtures[fixture_name]["path"]

        # Detect type
        detect_result = pdf_tools.detect_pdf_type(path)
        assert detect_result["classification"] in ("image_based", "hybrid")

        # OCR extraction; only the method used is asserted, the amount of
        # extracted text is reported below but not checked.
        ocr_result = pdf_tools.extract_text_ocr(path, engine="tesseract", dpi=200)
        assert ocr_result["method_used"] in ("ocr", "hybrid")

        print(f"\n--- OCR Result for {fixture_name} ---")
        print(f"Characters extracted: {ocr_result['total_chars']}")
        if ocr_result['total_chars'] > 0:
            preview = ocr_result['text'][:200].replace('\n', ' ')
            print(f"Preview: {preview}...")

    def test_scansmpl_pdf(self):
        """Test scansmpl.pdf (scanner sample)."""
        path = TESTS_DIR / "scansmpl.pdf"
        if not path.exists():
            pytest.skip("scansmpl.pdf fixture not available")

        # Smoke test: passes as long as detection and the summary keys work.
        detect_result = pdf_tools.detect_pdf_type(str(path))
        print(f"\nscansmpl.pdf: {detect_result['classification']}, "
              f"native_chars={detect_result['total_native_chars']}, "
              f"images={detect_result['total_images']}")

    def test_testocr_pdf(self):
        """Test TestOCR.pdf (OCR test document)."""
        path = TESTS_DIR / "TestOCR.pdf"
        if not path.exists():
            pytest.skip("TestOCR.pdf fixture not available")

        # Smoke test: passes as long as detection and the summary keys work.
        detect_result = pdf_tools.detect_pdf_type(str(path))
        print(f"\nTestOCR.pdf: {detect_result['classification']}, "
              f"native_chars={detect_result['total_native_chars']}, "
              f"images={detect_result['total_images']}")

    def test_public_water_mass_mailing(self):
        """Test PublicWaterMassMailing.pdf (large multi-page document)."""
        path = TESTS_DIR / "PublicWaterMassMailing.pdf"
        if not path.exists():
            pytest.skip("PublicWaterMassMailing.pdf fixture not available")

        detect_result = pdf_tools.detect_pdf_type(str(path))
        print(f"\nPublicWaterMassMailing.pdf: {detect_result['classification']}, "
              f"pages={detect_result['total_pages']}, "
              f"native_chars={detect_result['total_native_chars']}")

        # Test extraction of first few pages only (for speed)
        native_result = pdf_tools.extract_text_native(str(path), pages=[1, 2])
        assert native_result["pages_extracted"] == 2


# =============================================================================
# Performance Tests
# =============================================================================

class TestOcrPerformance:
    """Performance-related tests for OCR operations."""

    def test_detect_type_performance(self):
        """Ensure detect_pdf_type completes in reasonable time."""
        import time

        path = TESTS_DIR / "1006.pdf"
        if not path.exists():
            pytest.skip("1006.pdf fixture not available")

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        pdf_tools.detect_pdf_type(str(path))
        elapsed = time.perf_counter() - start

        # Should complete in under 2 seconds for a typical PDF
        assert elapsed < 2.0, f"detect_pdf_type took {elapsed:.2f}s"

    def test_native_extraction_performance(self):
        """Ensure native extraction completes quickly."""
        import time

        path = TESTS_DIR / "1006.pdf"
        if not path.exists():
            pytest.skip("1006.pdf fixture not available")

        # Monotonic clock for interval timing (see test_detect_type_performance
        # rationale: wall-clock time may jump).
        start = time.perf_counter()
        pdf_tools.extract_text_native(str(path))
        elapsed = time.perf_counter() - start

        # Native extraction should be very fast
        assert elapsed < 1.0, f"extract_text_native took {elapsed:.2f}s"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nfsarch33/pdf-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_ocr.py•24.3 KiB