PDF MCP Server

test_phase2_features.py•33.1 KiB

"""
Integration tests for Phase 2 features:
- OCR Phase 2: Multi-language support, confidence scores
- Table extraction
- Image extraction
- Smart/hybrid text extraction
- Form auto-detection

Tests that need a fixture PDF skip themselves when the file is not present
next to this module, so the suite can run on a checkout without fixtures.
"""

import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional

import pytest

from pdf_mcp import pdf_tools
from pdf_mcp.pdf_tools import PdfToolError


# =============================================================================
# Test Fixtures
# =============================================================================

# Fixture PDFs are looked up in the same directory as this test module.
TESTS_DIR = Path(__file__).parent


def get_test_pdf(name: str) -> Optional[str]:
    """Return the path to a fixture PDF, or ``None`` if it does not exist.

    Args:
        name: File name of the PDF, resolved relative to ``TESTS_DIR``.

    Returns:
        The path as a string when the file exists, otherwise ``None`` so the
        caller can decide to skip the test.
    """
    path = TESTS_DIR / name
    if path.exists():
        return str(path)
    return None


# =============================================================================
# OCR Phase 2 Tests: Multi-language and Confidence Scores
# =============================================================================


class TestOcrLanguages:
    """Tests for OCR language support."""

    def test_get_ocr_languages_structure(self):
        """Test that get_ocr_languages returns proper structure."""
        result = pdf_tools.get_ocr_languages()

        assert "tesseract_available" in result
        assert isinstance(result["tesseract_available"], bool)
        assert "installed_languages" in result
        assert isinstance(result["installed_languages"], list)
        assert "common_language_codes" in result
        assert isinstance(result["common_language_codes"], dict)

    def test_common_languages_include_english(self):
        """Test that common languages include English."""
        result = pdf_tools.get_ocr_languages()

        assert "eng" in result["common_language_codes"]
        assert result["common_language_codes"]["eng"] == "English"

    def test_tesseract_languages_include_common(self):
        """Test common language codes are documented."""
        result = pdf_tools.get_ocr_languages()
        common = result["common_language_codes"]

        # Should have major languages
        expected_codes = ["eng", "fra", "deu", "spa", "chi_sim", "jpn"]
        for code in expected_codes:
            assert code in common, f"Missing common language: {code}"


class TestExtractTextWithConfidence:
    """Tests for OCR with confidence scores."""

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_with_confidence_structure(self):
        """Test extract_text with include_confidence=True returns proper structure."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(pdf_path, pages=[1], include_confidence=True)

        assert "pdf_path" in result
        assert "language" in result
        assert "dpi" in result
        assert "min_confidence" in result
        assert "pages_extracted" in result
        assert "total_words" in result
        assert "overall_average_confidence" in result
        assert "text" in result
        assert "page_details" in result

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_with_confidence_word_level(self):
        """Test that word-level details include confidence."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(pdf_path, pages=[1], include_confidence=True)

        assert result["pages_extracted"] == 1
        page_detail = result["page_details"][0]

        assert "words" in page_detail
        assert "average_confidence" in page_detail

        # Check word structure if words exist
        if page_detail["words"]:
            word = page_detail["words"][0]
            assert "text" in word
            assert "confidence" in word
            assert isinstance(word["confidence"], int)

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_with_min_confidence_filter(self):
        """Test that min_confidence filters low-confidence words."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        # Extract all words
        result_all = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, min_confidence=0
        )

        # Extract only high confidence words
        result_high = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, min_confidence=80
        )

        # High confidence should have same or fewer words
        assert result_high["total_words"] <= result_all["total_words"]

    # Declared as a skipif marker (rather than an in-body skip) so it mirrors
    # the inverse condition used by the other tests in this class.
    @pytest.mark.skipif(pdf_tools._HAS_TESSERACT, reason="Tesseract is available")
    def test_extract_with_confidence_no_tesseract(self):
        """Test error when Tesseract not available and confidence requested."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        with pytest.raises(PdfToolError) as exc:
            pdf_tools.extract_text(pdf_path, include_confidence=True)
        assert "Tesseract" in str(exc.value)


# =============================================================================
# Table Extraction Tests
# =============================================================================


class TestExtractTables:
    """Tests for table extraction."""

    def test_extract_tables_structure(self):
        """Test that extract_tables returns proper structure."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_tables(pdf_path)

        assert "pdf_path" in result
        assert "total_pages" in result
        assert "pages_analyzed" in result
        assert "total_tables" in result
        assert "output_format" in result
        assert "page_tables" in result
        assert isinstance(result["page_tables"], list)

    def test_extract_tables_list_format(self):
        """Test table extraction with list format."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_tables(pdf_path, output_format="list")

        assert result["output_format"] == "list"

    def test_extract_tables_dict_format(self):
        """Test table extraction with dict format (uses headers)."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_tables(pdf_path, output_format="dict")

        assert result["output_format"] == "dict"

    def test_extract_tables_invalid_format(self):
        """Test error for invalid output format."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        with pytest.raises(PdfToolError) as exc:
            pdf_tools.extract_tables(pdf_path, output_format="invalid")
        assert "output_format" in str(exc.value)

    def test_extract_tables_specific_pages(self):
        """Test table extraction for specific pages."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_tables(pdf_path, pages=[1])

        assert result["pages_analyzed"] == 1

    def test_extract_tables_all_fixtures(self):
        """Test table extraction on all available PDFs.

        Smoke test: it only checks that extraction runs and reports a table
        count for every fixture that is present; missing fixtures are ignored.
        """
        test_pdfs = ["1006.pdf", "PublicWaterMassMailing.pdf", "TestOCR.pdf"]

        print("\n--- Table Extraction Summary ---")
        for name in test_pdfs:
            pdf_path = get_test_pdf(name)
            if not pdf_path:
                continue

            result = pdf_tools.extract_tables(pdf_path)
            print(f"{name}: {result['total_tables']} tables found")


# =============================================================================
# Image Extraction Tests
# =============================================================================


class TestImageExtraction:
    """Tests for image extraction."""

    def test_get_image_info_structure(self):
        """Test that get_image_info returns proper structure."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.get_image_info(pdf_path)

        assert "pdf_path" in result
        assert "total_pages" in result
        assert "pages_analyzed" in result
        assert "total_images" in result
        assert "page_images" in result
        assert isinstance(result["page_images"], list)

    def test_get_image_info_image_details(self):
        """Test that image info includes dimensions."""
        # Use an image-based PDF, falling back to the second fixture.
        pdf_path = get_test_pdf("image-based-pdf-sample.pdf") or get_test_pdf(
            "scansmpl.pdf"
        )
        if not pdf_path:
            pytest.skip("No image-based PDF available")

        result = pdf_tools.get_image_info(pdf_path)

        if result["total_images"] > 0:
            page_info = result["page_images"][0]
            if page_info["images"]:
                img = page_info["images"][0]
                assert "width" in img
                assert "height" in img
                assert "xref" in img

    def test_extract_images_creates_files(self):
        """Test that extract_images creates image files."""
        # Find a PDF with images, falling back to the second fixture.
        pdf_path = get_test_pdf("image-based-pdf-sample.pdf") or get_test_pdf(
            "scansmpl.pdf"
        )
        if not pdf_path:
            pytest.skip("No image-based PDF available")

        with tempfile.TemporaryDirectory() as tmpdir:
            result = pdf_tools.extract_images(pdf_path, tmpdir, pages=[1])

            assert "images_extracted" in result
            assert "images" in result
            assert result["output_dir"] == tmpdir

            # Check if files were created. This must stay inside the ``with``
            # block: the directory is deleted as soon as the block exits.
            if result["images_extracted"] > 0:
                for img_info in result["images"]:
                    if "output_path" in img_info:
                        assert Path(img_info["output_path"]).exists()

    def test_extract_images_min_dimensions(self):
        """Test that min dimensions filter works."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        with tempfile.TemporaryDirectory() as tmpdir:
            # Extract with very high min dimensions
            result = pdf_tools.extract_images(
                pdf_path, tmpdir, min_width=5000, min_height=5000
            )

            # Should skip most/all images
            # NOTE(review): the second clause looks always-true if
            # "images_skipped" is a non-negative count, so this assertion can
            # only fail with a KeyError. Tighten it once the fixture's real
            # image sizes are confirmed.
            assert result["images_extracted"] == 0 or result["images_skipped"] >= 0

    @pytest.mark.parametrize("fmt", ["png", "jpeg"])
    def test_extract_images_formats(self, fmt):
        """Test different output formats (one test case per format)."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        with tempfile.TemporaryDirectory() as tmpdir:
            result = pdf_tools.extract_images(
                pdf_path, tmpdir, pages=[1], image_format=fmt
            )
            assert "images" in result

    def test_extract_images_invalid_format(self):
        """Test error for invalid image format."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        with tempfile.TemporaryDirectory() as tmpdir:
            with pytest.raises(PdfToolError) as exc:
                pdf_tools.extract_images(pdf_path, tmpdir, image_format="invalid")
            assert "image_format" in str(exc.value)
# =============================================================================
# Smart/Hybrid Text Extraction Tests
# =============================================================================


class TestSmartExtraction:
    """Tests for smart/hybrid text extraction using extract_text(engine='smart')."""

    def test_extract_text_smart_structure(self):
        """Test that extract_text with engine='smart' returns proper structure."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(pdf_path, engine="smart")

        assert "pdf_path" in result
        assert "method" in result
        assert result["method"] == "smart"
        assert "pages_extracted" in result
        assert "total_chars" in result
        assert "native_threshold" in result
        assert "native_pages" in result
        assert "ocr_pages" in result
        assert "ocr_available" in result
        assert "text" in result
        assert "page_details" in result

    def test_extract_text_smart_per_page_method(self):
        """Test that each page has method info."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(pdf_path, engine="smart")

        for page_detail in result["page_details"]:
            assert "page" in page_detail
            assert "method" in page_detail
            assert page_detail["method"] in ("native", "ocr")
            assert "text" in page_detail
            assert "char_count" in page_detail
            assert "native_chars" in page_detail

    def test_extract_text_smart_threshold(self):
        """Test native_threshold parameter."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        # With very high threshold, more pages should try OCR
        result_high = pdf_tools.extract_text(
            pdf_path, engine="smart", native_threshold=10000
        )

        # With low threshold, most pages should use native
        result_low = pdf_tools.extract_text(
            pdf_path, engine="smart", native_threshold=10
        )

        # Low threshold should have more native pages
        assert result_low["native_pages"] >= result_high["native_pages"]

    def test_extract_text_smart_hybrid_document(self):
        """Test smart extraction on image-based PDF."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        result = pdf_tools.extract_text(pdf_path, engine="smart")

        # Checked unconditionally so the test still verifies something when
        # Tesseract is not installed.
        assert "native_pages" in result
        assert "ocr_pages" in result

        # Image-based PDFs should have mostly OCR pages if Tesseract available
        if pdf_tools._HAS_TESSERACT:
            # At least some pages should attempt OCR
            print(f"\nSmart extraction: {result['native_pages']} native, "
                  f"{result['ocr_pages']} OCR")

    def test_extract_text_smart_all_fixtures(self):
        """Test smart extraction on multiple PDFs.

        Smoke test: missing fixtures are ignored, and for the rest only the
        presence of the summary keys is exercised (through the print).
        """
        test_pdfs = ["1006.pdf", "scansmpl.pdf", "TestOCR.pdf"]

        print("\n--- Smart Extraction Summary ---")
        for name in test_pdfs:
            pdf_path = get_test_pdf(name)
            if not pdf_path:
                continue

            result = pdf_tools.extract_text(pdf_path, engine="smart")
            print(f"{name}: {result['native_pages']} native, "
                  f"{result['ocr_pages']} OCR, "
                  f"{result['total_chars']} chars")


# =============================================================================
# Form Auto-Detection Tests
# =============================================================================


class TestFormDetection:
    """Tests for form field auto-detection."""

    def test_detect_form_fields_structure(self):
        """Test that detect_form_fields returns proper structure."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.detect_form_fields(pdf_path)

        assert "pdf_path" in result
        assert "total_pages" in result
        assert "pages_analyzed" in result
        assert "has_existing_acroform" in result
        assert "existing_field_count" in result
        assert "detected_potential_fields" in result
        assert "detected_fields" in result
        assert "page_analysis" in result
        assert "recommendation" in result

    def test_detect_form_fields_on_form_pdf(self):
        """Test form detection on a PDF with AcroForm fields."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.detect_form_fields(pdf_path)

        # 1006.pdf has AcroForm fields
        assert result["has_existing_acroform"] is True
        assert result["existing_field_count"] > 0
        assert "AcroForm" in result["recommendation"]

    def test_detect_form_fields_labels(self):
        """Test that label patterns are detected."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.detect_form_fields(pdf_path)

        # Check page analysis structure
        for page_analysis in result["page_analysis"]:
            assert "page" in page_analysis
            assert "detected_labels" in page_analysis
            assert "detected_checkboxes" in page_analysis

    def test_detect_form_fields_specific_pages(self):
        """Test form detection on specific pages."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.detect_form_fields(pdf_path, pages=[1])

        assert result["pages_analyzed"] == 1
        assert len(result["page_analysis"]) == 1

    def test_detect_form_fields_suggestion_type(self):
        """Test that detected fields have suggested types."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.detect_form_fields(pdf_path)

        for field in result["detected_fields"]:
            assert "type" in field
            if field["type"] == "label":
                assert "suggested_field_type" in field

    def test_detect_form_fields_all_fixtures(self):
        """Test form detection on multiple PDFs.

        Smoke test: missing fixtures are ignored, and for the rest only the
        presence of the summary keys is exercised (through the print).
        """
        test_pdfs = ["1006.pdf", "PublicWaterMassMailing.pdf", "TestOCR.pdf"]

        print("\n--- Form Detection Summary ---")
        for name in test_pdfs:
            pdf_path = get_test_pdf(name)
            if not pdf_path:
                continue

            result = pdf_tools.detect_form_fields(pdf_path)
            print(f"{name}: AcroForm={result['has_existing_acroform']}, "
                  f"existing={result['existing_field_count']}, "
                  f"detected={result['detected_potential_fields']}")


# =============================================================================
# MCP Layer Integration Tests
# =============================================================================


def _call_mcp_tool(tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Invoke a tool through ``server.mcp.call_tool`` and return its result.

    Shared by every MCP-layer test so the asyncio plumbing lives in one place.

    Args:
        tool_name: Name of the registered MCP tool to call.
        arguments: Arguments passed to the tool.

    Returns:
        The value stored under ``meta["result"]`` in the tool response.

    Raises:
        AssertionError: If the result contains an ``"error"`` key.
    """
    # Imported lazily, as the original tests did, so that merely collecting
    # this module does not import the server.
    import asyncio

    from pdf_mcp import server

    async def call():
        # The response is unpacked as (content, meta); only meta is used.
        _content, meta = await server.mcp.call_tool(tool_name, arguments)
        result = meta["result"]
        assert "error" not in result, result.get("error")
        return result

    return asyncio.run(call())


class TestMcpLayerPhase2:
    """Test Phase 2 tools through the MCP layer."""

    def test_mcp_get_ocr_languages(self):
        """Test get_ocr_languages via MCP layer."""
        result = _call_mcp_tool("get_ocr_languages", {})
        assert "tesseract_available" in result

    def test_mcp_extract_tables(self):
        """Test extract_tables via MCP layer."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = _call_mcp_tool(
            "extract_tables",
            {"pdf_path": pdf_path, "pages": [1]},
        )
        assert "total_tables" in result

    def test_mcp_get_image_info(self):
        """Test get_image_info via MCP layer."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        result = _call_mcp_tool(
            "get_image_info",
            {"pdf_path": pdf_path},
        )
        assert "total_images" in result

    def test_mcp_extract_text_smart(self):
        """Test extract_text with engine='smart' via MCP layer."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = _call_mcp_tool(
            "extract_text",
            {"pdf_path": pdf_path, "pages": [1], "engine": "smart"},
        )
        assert "native_pages" in result

    def test_mcp_detect_form_fields(self):
        """Test detect_form_fields via MCP layer."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = _call_mcp_tool(
            "detect_form_fields",
            {"pdf_path": pdf_path},
        )
        assert "has_existing_acroform" in result


# =============================================================================
# End-to-End Workflow Tests
# =============================================================================


class TestEndToEndWorkflows:
    """End-to-end workflow tests combining multiple features."""

    def test_full_document_analysis_workflow(self):
        """Test complete document analysis: detect type, extract text, find tables."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        # Step 1: Detect PDF type
        type_result = pdf_tools.detect_pdf_type(pdf_path)
        assert type_result["classification"] in ("searchable", "image_based", "hybrid")

        # Step 2: Smart text extraction
        text_result = pdf_tools.extract_text(pdf_path, engine="smart")
        assert text_result["total_chars"] >= 0

        # Step 3: Table extraction
        table_result = pdf_tools.extract_tables(pdf_path)
        assert table_result["total_tables"] >= 0

        # Step 4: Image info
        image_result = pdf_tools.get_image_info(pdf_path)
        assert image_result["total_images"] >= 0

        # Step 5: Form detection
        form_result = pdf_tools.detect_form_fields(pdf_path)
        assert "recommendation" in form_result

        print(f"\n--- Document Analysis: {Path(pdf_path).name} ---")
        print(f"Type: {type_result['classification']}")
        print(f"Text: {text_result['total_chars']} chars "
              f"({text_result['native_pages']} native, "
              f"{text_result['ocr_pages']} OCR)")
        print(f"Tables: {table_result['total_tables']}")
        print(f"Images: {image_result['total_images']}")
        print(f"Form: {form_result['has_existing_acroform']}, "
              f"{form_result['detected_potential_fields']} detected")

    def test_image_based_document_workflow(self):
        """Test workflow for image-based documents."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        # Detect type
        type_result = pdf_tools.detect_pdf_type(pdf_path)

        # Smart extraction should handle image-based pages
        text_result = pdf_tools.extract_text(pdf_path, engine="smart")

        # Get image info
        image_result = pdf_tools.get_image_info(pdf_path)

        print(f"\n--- Image-based Document: {Path(pdf_path).name} ---")
        print(f"Type: {type_result['classification']}")
        print(f"Needs OCR: {type_result['needs_ocr']}")
        print(f"Images: {image_result['total_images']}")
        print(f"Extracted chars: {text_result['total_chars']}")

        # Only a document classified as image-based is required to have images.
        if type_result["classification"] == "image_based":
            assert image_result["total_images"] > 0

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_ocr_quality_assessment_workflow(self):
        """Test OCR quality assessment workflow."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        # Extract with confidence scores
        result = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, min_confidence=0
        )

        avg_conf = result["overall_average_confidence"]

        print("\n--- OCR Quality Assessment ---")
        print(f"Words: {result['total_words']}")
        print(f"Average Confidence: {avg_conf}%")

        # Filter for high-quality text
        if result["page_details"] and result["page_details"][0]["words"]:
            high_conf_words = [
                w for w in result["page_details"][0]["words"]
                if w["confidence"] >= 80
            ]
            print(f"High-confidence words (>=80%): {len(high_conf_words)}")


# =============================================================================
# Advanced OCR Tests (requires Tesseract)
# =============================================================================


class TestAdvancedOcr:
    """Advanced OCR tests that require Tesseract to be installed."""

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_ocr_different_dpi_settings(self):
        """Test OCR with different DPI settings."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        # Test with lower DPI (faster but less accurate)
        result_150 = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, dpi=150
        )

        # Test with higher DPI (slower but more accurate)
        result_300 = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, dpi=300
        )

        assert result_150["dpi"] == 150
        assert result_300["dpi"] == 300

        # Both should produce some text
        assert result_150["total_words"] >= 0
        assert result_300["total_words"] >= 0

        print("\n--- DPI Comparison ---")
        print(f"150 DPI: {result_150['total_words']} words, "
              f"avg confidence: {result_150['overall_average_confidence']}%")
        print(f"300 DPI: {result_300['total_words']} words, "
              f"avg confidence: {result_300['overall_average_confidence']}%")

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_ocr_on_multiple_scanned_documents(self):
        """Test OCR on various scanned document types."""
        scanned_pdfs = ["scansmpl.pdf", "image-based-pdf-sample.pdf", "TestOCR.pdf"]

        print("\n--- OCR on Multiple Documents ---")
        for name in scanned_pdfs:
            pdf_path = get_test_pdf(name)
            if not pdf_path:
                continue

            result = pdf_tools.extract_text(
                pdf_path, pages=[1], include_confidence=True, dpi=200, min_confidence=50
            )

            print(f"{name}: {result['total_words']} words (>=50% conf), "
                  f"avg: {result['overall_average_confidence']}%")

            assert "text" in result
            assert result["pages_extracted"] == 1

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_smart_extraction_uses_ocr_for_images(self):
        """Test that smart extraction uses OCR for image-based pages."""
        pdf_path = get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("scansmpl.pdf not available")

        # With low native threshold, should trigger OCR
        result = pdf_tools.extract_text(
            pdf_path,
            engine="smart",
            native_threshold=10,  # Very low threshold
            dpi=200
        )

        # Should use OCR for image-based pages
        if result["ocr_pages"] > 0:
            print(f"\nSmart extraction used OCR on {result['ocr_pages']} pages")
            assert result["total_chars"] > 0

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_confidence_score_ranges(self):
        """Test that confidence scores are in valid range (0-100)."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(pdf_path, pages=[1], include_confidence=True)

        for page in result["page_details"]:
            for word in page.get("words", []):
                assert 0 <= word["confidence"] <= 100, \
                    f"Invalid confidence: {word['confidence']}"

        assert 0 <= result["overall_average_confidence"] <= 100

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_extract_text_with_confidence_mcp_layer(self):
        """Test extract_text with include_confidence=True via MCP layer."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = _call_mcp_tool(
            "extract_text",
            {
                "pdf_path": pdf_path,
                "pages": [1],
                "include_confidence": True,
                "min_confidence": 50,
            },
        )
        assert "overall_average_confidence" in result
        assert result["min_confidence"] == 50

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_ocr_extracts_meaningful_text(self):
        """Test that OCR extracts actual meaningful text from documents."""
        # Prefer the dedicated OCR fixture, falling back to the scan sample.
        pdf_path = get_test_pdf("TestOCR.pdf") or get_test_pdf("scansmpl.pdf")
        if not pdf_path:
            pytest.skip("No suitable OCR test PDF available")

        result = pdf_tools.extract_text(
            pdf_path, pages=[1], engine="tesseract", dpi=300
        )

        # Should extract some text
        assert result["total_chars"] > 0, "OCR should extract some text"
        assert len(result["text"].strip()) > 0

        print("\n--- OCR Text Preview ---")
        preview = result["text"][:200].replace("\n", " ")
        print(f"Extracted: {preview}...")

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_force_ocr_on_searchable_pdf(self):
        """Test force_ocr engine on a searchable PDF."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        # Native extraction first
        native_result = pdf_tools.extract_text(pdf_path, pages=[1], engine="native")

        # Force OCR
        ocr_result = pdf_tools.extract_text(
            pdf_path, pages=[1], engine="force_ocr", dpi=200
        )

        # NOTE(review): the two results are read through different keys
        # ("method" vs "method_used"); confirm that this matches what
        # extract_text returns for each engine.
        assert native_result["method"] == "native"
        assert "ocr" in ocr_result["method_used"]

        # Both should produce text
        assert native_result["total_chars"] > 0
        assert ocr_result["total_chars"] > 0

        print("\n--- Native vs Force OCR ---")
        print(f"Native: {native_result['total_chars']} chars")
        print(f"Force OCR: {ocr_result['total_chars']} chars")


class TestOcrLanguageSupport:
    """Tests for OCR language support features."""

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_get_installed_languages(self):
        """Test that installed languages are reported correctly."""
        result = pdf_tools.get_ocr_languages()

        assert result["tesseract_available"] is True
        assert "eng" in result["installed_languages"], "English should be installed"

        print(f"\nInstalled languages: {result['installed_languages']}")

    @pytest.mark.skipif(not pdf_tools._HAS_TESSERACT, reason="Tesseract not installed")
    def test_ocr_with_english_language(self):
        """Test OCR with explicit English language setting."""
        pdf_path = get_test_pdf("1006.pdf")
        if not pdf_path:
            pytest.skip("1006.pdf not available")

        result = pdf_tools.extract_text(
            pdf_path, pages=[1], include_confidence=True, language="eng"
        )

        assert result["language"] == "eng"
        assert result["total_words"] > 0

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nfsarch33/pdf-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_phase2_features.py•33.1 KiB