Skip to main content
Glama
juanqui
by juanqui
test_docling_integration.py (11.5 kB)
"""Integration tests for DoclingParser with the existing system."""

import tempfile
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from pdfkb.config import ServerConfig
from pdfkb.document_processor import DocumentProcessor as PDFProcessor
from pdfkb.parsers.parser_docling import DoclingParser


class TestDoclingIntegration:
    """Test DoclingParser integration with PDFProcessor and ServerConfig."""

    @pytest.fixture
    def temp_pdf_file(self):
        """Create a temporary PDF file for testing.

        Yields:
            Path to a small file with a ``%PDF-1.4`` header. The file is
            removed after the test if it still exists.
        """
        # delete=False keeps the file on disk after the handle is closed so the
        # test can reopen it by path; cleanup happens after the yield instead.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            f.write(b"%PDF-1.4\n%Test PDF content for integration testing\n")
            temp_path = Path(f.name)

        yield temp_path

        if temp_path.exists():
            temp_path.unlink()

    @pytest.fixture
    def mock_embedding_service(self):
        """Create a mock embedding service returning five fixed 3-dimensional vectors."""
        service = Mock()
        service.generate_embeddings.return_value = [[0.1, 0.2, 0.3]] * 5  # Mock embeddings
        return service

    def test_docling_parser_instantiation(self):
        """Test that DoclingParser can be instantiated without errors."""
        parser = DoclingParser()

        assert parser is not None
        # Only configuration defaults are checked here: processor_version is
        # added during parsing and is not a key of parser.config.
        assert "ocr_enabled" in parser.config
        assert parser.config["ocr_enabled"] is True
        assert parser.available_features is not None

    def test_config_system_docling_support(self):
        """Test that ServerConfig properly handles docling configuration."""
        # Test that docling is accepted as a valid parser
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "PDFKB_PDF_PARSER": "docling",
                "DOCLING_OCR_ENGINE": "tesseract",
                "DOCLING_TABLE_MODE": "ACCURATE",
                "DOCLING_FORMULA_ENRICHMENT": "true",
            },
            clear=True,
        ):
            config = ServerConfig.from_env()

            assert config.pdf_parser == "docling"
            assert hasattr(config, "docling_config")

            docling_config = getattr(config, "docling_config", {})
            assert docling_config.get("ocr_engine") == "tesseract"
            assert docling_config.get("table_processing_mode") == "ACCURATE"
            assert docling_config.get("formula_enrichment") is True

    @patch("pdfkb.parsers.parser_docling.DoclingParser._check_ocr_engine_available")
    def test_pdf_processor_docling_integration(self, mock_ocr_check, mock_embedding_service, temp_pdf_file):
        """Test DoclingParser integration with PDFProcessor."""
        mock_ocr_check.return_value = True

        # Create config for docling parser
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "PDFKB_PDF_PARSER": "docling",
            },
            clear=True,
        ):
            config = ServerConfig.from_env()

            # Create PDFProcessor with docling parser
            processor = PDFProcessor(config, mock_embedding_service)

            # Verify the correct parser was created
            assert isinstance(processor.parser, DoclingParser)
            assert processor.parser.config["ocr_enabled"] is True

    def test_docling_parser_fallback_logic(self, mock_embedding_service):
        """Test that DoclingParser fallback works when docling is not available."""
        # Create config for docling parser
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "PDFKB_PDF_PARSER": "docling",
            },
            clear=True,
        ):
            config = ServerConfig.from_env()

            # Mock docling import failure
            with patch(
                "pdfkb.parsers.parser_docling.DoclingParser.__init__",
                side_effect=ImportError("No module named 'docling'"),
            ):
                # PDFProcessor should fallback to another parser
                processor = PDFProcessor(config, mock_embedding_service)

                # Should fallback to UnstructuredPDFParser or PyMuPDF4LLMParser
                assert not isinstance(processor.parser, DoclingParser)
                # The specific fallback parser depends on what's available

    # NOTE(review): this coroutine test carries no explicit asyncio marker; it
    # presumably relies on pytest-asyncio running in auto mode - confirm against
    # the project's pytest configuration, otherwise the test body never runs.
    @patch("pdfkb.parsers.parser_docling.DoclingParser._check_ocr_engine_available")
    async def test_docling_parser_with_pdf_processor_processing(
        self, mock_ocr_check, mock_embedding_service, temp_pdf_file
    ):
        """Test end-to-end processing with DoclingParser through PDFProcessor."""
        mock_ocr_check.return_value = True

        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "PDFKB_PDF_PARSER": "docling",
                "PDFKB_DOCUMENT_CHUNKER": "langchain",
                "PDFKB_MIN_CHUNK_SIZE": "0",  # Disable chunk filtering for this test
            },
            clear=True,
        ):
            config = ServerConfig.from_env()

            # Mock docling components for testing
            with (
                patch("docling.document_converter.DocumentConverter") as mock_converter_class,
                patch("docling.datamodel.pipeline_options.PdfPipelineOptions") as mock_options_class,
            ):
                # Setup mock conversion result
                mock_result = Mock()
                mock_result.status = Mock()
                # Every equality comparison against the status is False, so it
                # never compares equal to a FAILURE value.
                mock_result.status.__eq__ = lambda self, other: False  # Not FAILURE

                mock_doc = Mock()
                mock_doc.pages = [Mock()]
                mock_doc.export_to_markdown.return_value = "# Test Document\n\nIntegration test content."
                mock_result.document = mock_doc

                mock_converter = Mock()
                mock_converter_class.return_value = mock_converter
                mock_converter.convert.return_value = mock_result

                mock_options = Mock()
                mock_options_class.return_value = mock_options

                # Create processor and process PDF
                processor = PDFProcessor(config, mock_embedding_service)

                # Verify we have the right parser
                assert isinstance(processor.parser, DoclingParser)

                # Process the PDF
                result = await processor.process_pdf(temp_pdf_file)

                # Verify processing was successful
                assert result.success is True
                assert result.document is not None
                assert result.document.title is not None
                assert len(result.document.chunks) > 0

                # Verify docling-specific metadata
                metadata = result.document.metadata
                assert metadata["processor_version"] == "docling"
                assert "docling_processing_time" in metadata
                assert "docling_features_used" in metadata


class TestDoclingEnvironmentVariables:
    """Test docling-specific environment variable handling."""

    def test_all_docling_environment_variables(self):
        """Test that all docling environment variables are properly parsed."""
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "PDFKB_PDF_PARSER": "docling",
                "DOCLING_OCR_ENGINE": "easyocr",
                "DOCLING_OCR_LANGUAGES": "en,es,fr",
                "DOCLING_TABLE_MODE": "FAST",
                "DOCLING_FORMULA_ENRICHMENT": "true",
                "DOCLING_PROCESSING_TIMEOUT": "600",
                "DOCLING_DEVICE": "cuda",
                "DOCLING_MAX_PAGES": "100",
            },
            clear=True,
        ):
            config = ServerConfig.from_env()

            assert config.pdf_parser == "docling"

            docling_config = getattr(config, "docling_config", {})
            assert docling_config["ocr_engine"] == "easyocr"
            assert docling_config["ocr_languages"] == ["en", "es", "fr"]
            assert docling_config["table_processing_mode"] == "FAST"
            assert docling_config["formula_enrichment"] is True
            assert docling_config["processing_timeout"] == 600
            assert docling_config["device_selection"] == "cuda"
            assert docling_config["max_pages"] == 100

    def test_invalid_docling_environment_variables(self):
        """Test handling of invalid docling environment variables."""
        # clear=True isolates each case from the ambient environment, matching
        # the other tests in this module. Because the assertion accepts any
        # Exception, a stray variable from the developer's shell could
        # otherwise make from_env() raise for an unrelated reason and let the
        # test pass without exercising the invalid value.

        # Test invalid timeout
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "DOCLING_PROCESSING_TIMEOUT": "invalid",
            },
            clear=True,
        ):
            with pytest.raises(Exception):  # Should raise ConfigurationError
                ServerConfig.from_env()

        # Test invalid max pages
        with patch.dict(
            "os.environ",
            {
                "OPENAI_API_KEY": "sk-test-key-12345",
                "DOCLING_MAX_PAGES": "invalid",
            },
            clear=True,
        ):
            with pytest.raises(Exception):  # Should raise ConfigurationError
                ServerConfig.from_env()


class TestDoclingParserRegistry:
    """Test that DoclingParser is properly registered in the parser system."""

    def test_docling_parser_in_imports(self):
        """Test that DoclingParser is available in parser imports."""
        from pdfkb.parsers import DoclingParser as ImportedDoclingParser
        from pdfkb.parsers.parser_docling import DoclingParser as DirectDoclingParser

        # Both import paths must resolve to the very same class object.
        assert ImportedDoclingParser is DirectDoclingParser

    def test_docling_parser_in_all_exports(self):
        """Test that DoclingParser is in __all__ exports."""
        from pdfkb.parsers import __all__ as parser_exports

        assert "DoclingParser" in parser_exports


@pytest.mark.performance
class TestDoclingParserPerformance:
    """Performance tests for DoclingParser."""

    @pytest.fixture
    def larger_pdf_content(self):
        """Create larger PDF content for performance testing."""
        # Create a larger mock PDF content
        content = b"%PDF-1.4\n"
        content += b"Mock PDF content " * 1000  # Repeat content to simulate larger file
        return content

    def test_memory_usage_estimation(self):
        """Test that DoclingParser has reasonable memory usage configuration."""
        parser = DoclingParser(
            config={
                "max_file_size": 50 * 1024 * 1024,  # 50MB limit
                "processing_timeout": 120,  # 2 minute timeout
                "table_processing_mode": "FAST",  # Use faster processing
            }
        )

        assert parser.config["max_file_size"] == 50 * 1024 * 1024
        assert parser.config["processing_timeout"] == 120
        assert parser.config["table_processing_mode"] == "FAST"

    def test_timeout_configuration(self):
        """Test timeout configuration for different processing stages."""
        parser = DoclingParser(config={"processing_timeout": 300})

        # Mock PdfPipelineOptions to test timeout application
        mock_options = Mock()
        mock_options.ocr_options = Mock()
        mock_options.table_options = Mock()

        configured_options = parser._apply_resource_limits(mock_options)

        # Verify that the options object was returned (even if timeouts couldn't be set)
        assert configured_options is mock_options

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.