PDF Knowledgebase MCP Server

pdfkb-mcp
tests

test_pdf_parser.py•6.96 KiB

"""Tests for the PDF parser module.""" import shutil from pathlib import Path from unittest.mock import AsyncMock, Mock, patch import pytest from pdfkb.config import ServerConfig from pdfkb.document_processor import DocumentProcessor as PDFProcessor from pdfkb.exceptions import PDFProcessingError from pdfkb.parsers import ParseResult, PyMuPDF4LLMParser, UnstructuredPDFParser class TestPDFParser: """Test cases for PDFParser classes.""" @pytest.fixture def sample_pdf(self, tmp_path): """Provide a copy of the sample PDF for testing.""" # Copy the sample PDF to a temp location to avoid modifying the original sample_pdf_path = Path(__file__).parent / "sample.pdf" test_pdf_path = tmp_path / "test.pdf" shutil.copy(sample_pdf_path, test_pdf_path) return test_pdf_path @pytest.fixture def config(self): """Create a test configuration.""" return ServerConfig( openai_api_key="sk-test-key", chunk_size=1000, chunk_overlap=200, ) @pytest.fixture def embedding_service(self): """Create a mock embedding service.""" service = Mock() service.generate_embeddings = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) return service @pytest.mark.asyncio async def test_unstructured_parser_creation(self): """Test UnstructuredPDFParser creation.""" parser = UnstructuredPDFParser(strategy="fast") assert parser.strategy == "fast" @pytest.mark.asyncio async def test_pymupdf4llm_parser_creation(self): """Test PyMuPDF4LLMParser creation.""" parser = PyMuPDF4LLMParser() assert parser.config == {} @pytest.mark.asyncio async def test_unstructured_parser_parse(self, sample_pdf): """Test UnstructuredPDFParser parse method.""" pdf_file = sample_pdf # Mock the unstructured partition function at the correct location with patch("unstructured.partition.pdf.partition_pdf") as mock_partition: mock_partition.return_value = ["test element 1", "test element 2"] parser = UnstructuredPDFParser(strategy="fast") result = await parser.parse(pdf_file) assert isinstance(result, ParseResult) assert len(result.pages) > 0 assert 
len(result.pages[0].markdown_content) > 0 assert "processor_version" in result.metadata assert result.metadata["processor_version"] == "unstructured" @pytest.mark.asyncio async def test_pymupdf4llm_parser_parse(self, sample_pdf): """Test PyMuPDF4LLMParser parse method.""" pdf_file = sample_pdf # Test with real PDF parsing parser = PyMuPDF4LLMParser() result = await parser.parse(pdf_file) assert isinstance(result, ParseResult) assert len(result.pages) > 0 # Some pages might be empty (e.g., cover pages), so check that at least one page has content pages_with_content = [p for p in result.pages if len(p.markdown_content) > 0] assert len(pages_with_content) > 0, "At least one page should have content" assert "processor_version" in result.metadata assert result.metadata["processor_version"] == "pymupdf4llm" @pytest.mark.asyncio async def test_pymupdf4llm_parser_parse_with_pages(self, sample_pdf): """Test PyMuPDF4LLMParser parse method with page chunks.""" pdf_file = sample_pdf # Test with real PDF parsing - sample.pdf should have multiple pages parser = PyMuPDF4LLMParser() result = await parser.parse(pdf_file) assert isinstance(result, ParseResult) assert len(result.pages) > 0 # Should have at least one page # Check that pages are properly numbered and at least some have content pages_with_content = 0 for page in result.pages: assert page.page_number > 0 if len(page.markdown_content) > 0: pages_with_content += 1 assert pages_with_content > 0, "At least one page should have content" assert "processor_version" in result.metadata @pytest.mark.asyncio async def test_pdf_processor_with_unstructured_parser(self, config, embedding_service, sample_pdf): """Test PDFProcessor with Unstructured parser.""" # Create a config with unstructured parser config.pdf_parser = "unstructured" # Mock the unstructured partition function with patch("unstructured.partition.pdf.partition_pdf") as mock_partition: mock_partition.return_value = ["test element 1", "test element 2"] processor = 
PDFProcessor(config, embedding_service) assert isinstance(processor.parser, UnstructuredPDFParser) @pytest.mark.asyncio async def test_pdf_processor_with_pymupdf4llm_parser(self, config, embedding_service, sample_pdf): """Test PDFProcessor with PyMuPDF4LLM parser.""" # Create a config with pymupdf4llm parser config.pdf_parser = "pymupdf4llm" # Just test that the processor creates the right parser type processor = PDFProcessor(config, embedding_service) assert isinstance(processor.parser, PyMuPDF4LLMParser) @pytest.mark.asyncio async def test_pdf_processor_parser_fallback(self, config, embedding_service, sample_pdf): """Test PDFProcessor parser fallback when primary parser is not available.""" # Create a config with pymupdf4llm parser config.pdf_parser = "pymupdf4llm" # Mock the PyMuPDF4LLMParser to raise ImportError during construction with patch("pdfkb.parsers.parser_pymupdf4llm.PyMuPDF4LLMParser.__init__") as mock_pymupdf_init: mock_pymupdf_init.side_effect = ImportError("pymupdf4llm not available") # Mock the unstructured partition function with patch("unstructured.partition.pdf.partition_pdf") as mock_partition: mock_partition.return_value = ["test element 1", "test element 2"] processor = PDFProcessor(config, embedding_service) # Should fall back to Unstructured parser assert isinstance(processor.parser, UnstructuredPDFParser) @pytest.mark.asyncio async def test_pdf_processor_both_parsers_unavailable(self, config, embedding_service): """Test PDFProcessor when both parsers are unavailable.""" # Create a config with pymupdf4llm parser config.pdf_parser = "pymupdf4llm" # Mock both parsers to be unavailable with patch("pdfkb.parsers.parser_pymupdf4llm.PyMuPDF4LLMParser.__init__") as mock_pymupdf_init: mock_pymupdf_init.side_effect = ImportError("pymupdf4llm not available") with patch("pdfkb.parsers.parser_unstructured.UnstructuredPDFParser.__init__") as mock_unstructured_init: mock_unstructured_init.side_effect = ImportError("unstructured not available") with 
pytest.raises(PDFProcessingError): PDFProcessor(config, embedding_service)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_pdf_parser.py•6.96 KiB