Skip to main content
Glama

content-core

test_pymupdf_ocr.py (9.98 kB)
""" Tests for PyMuPDF OCR enhancement functionality. """ import pytest from unittest.mock import patch, MagicMock from content_core.processors.pdf import ( count_formula_placeholders, extract_page_with_ocr, convert_table_to_markdown, ) from content_core.config import ( set_pymupdf_ocr_enabled, set_pymupdf_formula_threshold, set_pymupdf_ocr_fallback, ) class TestFormulaDetection: """Test formula placeholder detection.""" def test_count_formula_placeholders_none(self): """Test counting when no formulas present.""" text = "This is regular text with no formulas." assert count_formula_placeholders(text) == 0 def test_count_formula_placeholders_single(self): """Test counting single formula placeholder.""" text = "Text before <!-- formula-not-decoded --> text after." assert count_formula_placeholders(text) == 1 def test_count_formula_placeholders_multiple(self): """Test counting multiple formula placeholders.""" text = """ First formula: <!-- formula-not-decoded --> Some text. Second formula: <!-- formula-not-decoded --> More text. 
Third formula: <!-- formula-not-decoded --> """ assert count_formula_placeholders(text) == 3 def test_count_formula_placeholders_empty_text(self): """Test counting on empty text.""" assert count_formula_placeholders("") == 0 assert count_formula_placeholders(None) == 0 class TestTableConversion: """Test table to markdown conversion.""" def test_convert_simple_table(self): """Test converting a simple table.""" table_data = [ ["Header 1", "Header 2", "Header 3"], ["Row 1 Col 1", "Row 1 Col 2", "Row 1 Col 3"], ["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"], ] result = convert_table_to_markdown(table_data) expected_lines = [ "| Header 1 | Header 2 | Header 3 |", "| --- | --- | --- |", "| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 |", "| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 |", ] for line in expected_lines: assert line in result def test_convert_table_with_empty_cells(self): """Test converting table with empty cells.""" table_data = [ ["Name", "Age", "City"], ["John", "", "New York"], ["", "25", "Boston"], ] result = convert_table_to_markdown(table_data) assert "| John | | New York |" in result assert "| | 25 | Boston |" in result def test_convert_empty_table(self): """Test converting empty table.""" assert convert_table_to_markdown([]) == "" assert convert_table_to_markdown(None) == "" assert convert_table_to_markdown([[]]) == "" def test_convert_table_with_only_empty_cells(self): """Test converting table with only empty or whitespace cells.""" empty_table = [ ["", " ", " "], [None, "", ""], [" ", None, " "], ] # Should still create a table structure even with empty cells result = convert_table_to_markdown(empty_table) assert "|" in result # Should have table structure assert "---" in result # Should have separator class TestOCRExtraction: """Test OCR extraction functionality.""" @patch('content_core.processors.pdf.fitz') def test_extract_page_with_ocr_success(self, mock_fitz): """Test successful OCR extraction.""" # Mock page and textpage mock_page = MagicMock() 
mock_textpage = MagicMock() mock_textpage.extractText.return_value = "OCR extracted text with formulas" mock_page.get_textpage_ocr.return_value = mock_textpage result = extract_page_with_ocr(mock_page, 1) assert result == "OCR extracted text with formulas" mock_page.get_textpage_ocr.assert_called_once() mock_textpage.extractText.assert_called_once() @patch('content_core.processors.pdf.fitz') def test_extract_page_with_ocr_failure(self, mock_fitz): """Test OCR extraction failure (Tesseract not available).""" mock_page = MagicMock() mock_page.get_textpage_ocr.side_effect = Exception("Tesseract not found") result = extract_page_with_ocr(mock_page, 1) assert result is None mock_page.get_textpage_ocr.assert_called_once() @patch('content_core.processors.pdf.fitz') def test_extract_page_with_ocr_empty_result(self, mock_fitz): """Test OCR extraction returning empty textpage.""" mock_page = MagicMock() mock_page.get_textpage_ocr.return_value = None result = extract_page_with_ocr(mock_page, 1) assert result is None class TestConfigurationFunctions: """Test PyMuPDF configuration functions.""" def test_set_pymupdf_ocr_enabled(self): """Test enabling/disabling OCR.""" from content_core.config import CONFIG # Test enabling set_pymupdf_ocr_enabled(True) assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('enable_formula_ocr') is True # Test disabling set_pymupdf_ocr_enabled(False) assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('enable_formula_ocr') is False def test_set_pymupdf_formula_threshold(self): """Test setting formula threshold.""" from content_core.config import CONFIG set_pymupdf_formula_threshold(5) assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('formula_threshold') == 5 set_pymupdf_formula_threshold(1) assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('formula_threshold') == 1 def test_set_pymupdf_ocr_fallback(self): """Test setting OCR fallback option.""" from content_core.config import CONFIG set_pymupdf_ocr_fallback(True) 
assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('ocr_fallback') is True set_pymupdf_ocr_fallback(False) assert CONFIG.get('extraction', {}).get('pymupdf', {}).get('ocr_fallback') is False @pytest.mark.asyncio class TestPDFExtractionIntegration: """Integration tests for PDF extraction with OCR.""" async def test_pdf_extraction_without_ocr(self): """Test PDF extraction with OCR disabled.""" from content_core.content.extraction import extract_content # Ensure OCR is disabled set_pymupdf_ocr_enabled(False) # Test with the sample PDF result = await extract_content({ 'file_path': 'tests/input_content/file.pdf', 'document_engine': 'simple' }) assert result.source_type == "file" assert len(result.content) > 0 # Should not contain OCR artifacts assert "OCR extracted" not in result.content async def test_pdf_extraction_with_ocr_disabled_by_threshold(self): """Test PDF extraction where OCR is enabled but threshold not met.""" from content_core.content.extraction import extract_content # Enable OCR but set high threshold set_pymupdf_ocr_enabled(True) set_pymupdf_formula_threshold(100) # Very high threshold result = await extract_content({ 'file_path': 'tests/input_content/file.pdf', 'document_engine': 'simple' }) assert result.source_type == "file" assert len(result.content) > 0 # OCR should not have been triggered due to high threshold @patch('content_core.processors.pdf.extract_page_with_ocr') async def test_pdf_extraction_with_ocr_fallback(self, mock_ocr): """Test PDF extraction with OCR failure and fallback.""" from content_core.content.extraction import extract_content # Mock OCR to fail mock_ocr.return_value = None # Enable OCR with low threshold set_pymupdf_ocr_enabled(True) set_pymupdf_formula_threshold(0) # Very low threshold set_pymupdf_ocr_fallback(True) result = await extract_content({ 'file_path': 'tests/input_content/file.pdf', 'document_engine': 'simple' }) assert result.source_type == "file" assert len(result.content) > 0 # Should have content from 
fallback extraction assert "Buenos Aires" in result.content # From the test PDF class TestEdgeCases: """Test edge cases and error conditions.""" def test_count_formula_placeholders_with_none(self): """Test formula counting with None input.""" # Should handle None gracefully try: result = count_formula_placeholders(None) assert result == 0 except (TypeError, AttributeError): # If it throws an error, that's also acceptable behavior pass def test_convert_table_to_markdown_malformed(self): """Test table conversion with malformed data.""" # Table with inconsistent row lengths malformed_table = [ ["Header 1", "Header 2"], ["Row 1 Col 1"], # Missing column ["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"], # Extra column ] result = convert_table_to_markdown(malformed_table) assert "Header 1" in result assert "Header 2" in result # Should handle gracefully without crashing @pytest.fixture def reset_config(): """Reset configuration after each test.""" yield # Reset to defaults set_pymupdf_ocr_enabled(False) set_pymupdf_formula_threshold(3) set_pymupdf_ocr_fallback(True)

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/lfnovo/content-core'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.