Skip to main content
Glama
juanqui
by juanqui
test_page_chunker_integration.py (10.2 kB)
"""Integration tests for PageChunker with document processing pipeline."""

import shutil
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, Mock

import pytest

from pdfkb.config import ServerConfig
from pdfkb.document_processor import DocumentProcessor


class TestPageChunkerIntegration:
    """Integration tests for PageChunker with the document processing pipeline."""

    @pytest.fixture
    def config_with_page_chunker(self):
        """Create a config that selects the "page" chunker.

        Small-page merging is enabled with a minimum chunk size of 50, and
        markdown is split on ``--[PAGE: N]--`` boundary markers.
        """
        return ServerConfig(
            openai_api_key="test-key",
            document_chunker="page",
            page_chunker_min_chunk_size=50,
            page_chunker_merge_small=True,
            markdown_page_boundary_pattern=r"--\[PAGE:\s*(\d+)\]--",
            markdown_split_on_page_boundaries=True,
        )

    @pytest.fixture
    def mock_embedding_service(self):
        """Create a mock embedding service.

        ``generate_embeddings`` returns one constant 1024-element vector per
        input text, so no real embedding backend is contacted.
        """
        service = Mock()

        async def generate_embeddings(texts):
            return [[0.1] * 1024 for _ in texts]

        service.generate_embeddings = AsyncMock(side_effect=generate_embeddings)
        return service

    @pytest.mark.asyncio
    async def test_process_markdown_with_page_markers(self, config_with_page_chunker, mock_embedding_service):
        """Test processing markdown with page markers using page chunker."""
        # Create markdown content with page markers
        content = """---
title: Test Document with Pages
author: Test Author
---

--[PAGE: 1]--

# Introduction

This is the introduction on page 1.
It contains important background information.

--[PAGE: 2]--

## Methods

The methods section on page 2.
Describes the approach taken.

--[PAGE: 3]--

## Results

Results are presented on page 3.
Shows the findings of the study.

--[PAGE: 4]--

## Conclusion

The conclusion on page 4.
Summarizes the key points."""

        # Create temp file
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False, encoding="utf-8") as f:
            f.write(content)
            temp_path = Path(f.name)

        try:
            # Create processor
            processor = DocumentProcessor(
                config=config_with_page_chunker,
                embedding_service=mock_embedding_service,
            )

            # Process the document
            result = await processor.process_markdown(temp_path)

            # Verify processing succeeded
            assert result.success is True
            assert result.document is not None

            # Check we got 4 chunks (one per page)
            assert len(result.document.chunks) == 4

            # Verify each chunk corresponds to a page
            for page_num, chunk in enumerate(result.document.chunks, start=1):
                assert f"page {page_num}" in chunk.text.lower() or f"[PAGE: {page_num}]" in chunk.text
                assert (
                    chunk.metadata.get("page_number") == page_num
                    or chunk.metadata.get("page_numbers") == [page_num]
                )
                assert chunk.metadata.get("chunk_strategy") == "page"

        finally:
            # missing_ok: a vanished temp file must not mask the real failure
            temp_path.unlink(missing_ok=True)

    @pytest.mark.asyncio
    async def test_process_markdown_without_page_markers(self, config_with_page_chunker, mock_embedding_service):
        """Test processing markdown without page markers."""
        content = """# Document Title

This is a document without page markers.

## Section 1

Content for section 1.

## Section 2

Content for section 2."""

        # Create temp file
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False, encoding="utf-8") as f:
            f.write(content)
            temp_path = Path(f.name)

        try:
            # Create processor
            processor = DocumentProcessor(
                config=config_with_page_chunker,
                embedding_service=mock_embedding_service,
            )

            # Process the document
            result = await processor.process_markdown(temp_path)

            # Verify processing succeeded
            assert result.success is True

            # Should have 1 chunk (entire document as single page)
            assert len(result.document.chunks) == 1
            assert result.document.chunks[0].metadata.get("chunk_strategy") == "page"
            assert "Document Title" in result.document.chunks[0].text

        finally:
            # missing_ok: a vanished temp file must not mask the real failure
            temp_path.unlink(missing_ok=True)

    @pytest.mark.asyncio
    async def test_page_merging_with_small_pages(self, mock_embedding_service):
        """Test that small pages are properly merged."""
        config = ServerConfig(
            openai_api_key="test-key",
            document_chunker="page",
            page_chunker_min_chunk_size=100,
            page_chunker_merge_small=True,
            markdown_page_boundary_pattern=r"--\[PAGE:\s*(\d+)\]--",
            markdown_split_on_page_boundaries=True,
        )

        content = """--[PAGE: 1]--

Short content.

--[PAGE: 2]--

Also short.

--[PAGE: 3]--

This is a much longer page with enough content to exceed the minimum chunk size threshold.

--[PAGE: 4]--

Another long page with substantial content that won't be merged with others."""

        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False, encoding="utf-8") as f:
            f.write(content)
            temp_path = Path(f.name)

        try:
            processor = DocumentProcessor(
                config=config,
                embedding_service=mock_embedding_service,
            )

            result = await processor.process_markdown(temp_path)

            assert result.success is True

            # After removing page markers, pages 1, 2, and 3 are all small enough to be merged
            # Should have 2 chunks: pages 1+2+3 merged, page 4 separate
            assert len(result.document.chunks) == 2

            # First chunk should be merged pages 1, 2, and 3
            first_chunk = result.document.chunks[0]
            assert "Short content" in first_chunk.text
            assert "Also short" in first_chunk.text
            assert "much longer page" in first_chunk.text

            # Check metadata indicates merging
            if "page_numbers" in first_chunk.metadata:
                assert first_chunk.metadata["page_numbers"] == [1, 2, 3]

        finally:
            # missing_ok: a vanished temp file must not mask the real failure
            temp_path.unlink(missing_ok=True)

    @pytest.mark.asyncio
    async def test_pdf_processing_with_page_chunker(self, config_with_page_chunker, mock_embedding_service):
        """Test that page chunker works with PDF processing."""
        # Use the real sample PDF
        sample_pdf_path = Path(__file__).parent / "sample.pdf"

        with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as f:
            temp_path = Path(f.name)

        try:
            # Copy sample PDF to temp location. Done inside the try so the
            # temp file is still removed if the sample is missing or the copy fails.
            shutil.copy(sample_pdf_path, temp_path)

            # Create processor
            processor = DocumentProcessor(
                config=config_with_page_chunker,
                embedding_service=mock_embedding_service,
            )

            # Mock the parser to return page-aware content
            from pdfkb.parsers.parser import PageContent, ParseResult

            pages = [
                PageContent(page_number=1, markdown_content="Content from PDF page 1", metadata={}),
                PageContent(page_number=2, markdown_content="Content from PDF page 2", metadata={}),
            ]
            mock_parse_result = ParseResult(pages=pages, metadata={"page_count": 2})
            processor.parser.parse = AsyncMock(return_value=mock_parse_result)

            # Process the PDF
            result = await processor.process_pdf(temp_path)

            # Should work with PDF content too
            assert result.success is True

            # With min_chunk_size=50 and merge_small=True, the two small pages
            # ("Content from PDF page 1" and "Content from PDF page 2") should be merged
            assert len(result.document.chunks) == 1

            # Check the merged chunk contains both pages
            chunk = result.document.chunks[0]
            assert "Content from PDF page 1" in chunk.text
            assert "Content from PDF page 2" in chunk.text
            assert chunk.metadata.get("chunk_strategy") == "page"
            assert chunk.metadata.get("page_numbers") == [1, 2]

        finally:
            # missing_ok: a vanished temp file must not mask the real failure
            temp_path.unlink(missing_ok=True)


class TestPageChunkerConfiguration:
    """Test configuration validation for page chunker."""

    def test_valid_page_chunker_config(self):
        """Test valid page chunker configuration."""
        config = ServerConfig(
            openai_api_key="test-key",
            document_chunker="page",
            page_chunker_min_chunk_size=100,
            page_chunker_max_chunk_size=5000,
            page_chunker_merge_small=True,
        )

        assert config.document_chunker == "page"
        assert config.pdf_chunker == "page"  # Backward compatibility
        assert config.page_chunker_min_chunk_size == 100
        assert config.page_chunker_max_chunk_size == 5000
        assert config.page_chunker_merge_small is True

    def test_page_chunker_from_env(self, monkeypatch):
        """Test loading page chunker config from environment."""
        monkeypatch.setenv("PDFKB_DOCUMENT_CHUNKER", "page")
        monkeypatch.setenv("PDFKB_PAGE_CHUNKER_MIN_CHUNK_SIZE", "200")
        monkeypatch.setenv("PDFKB_PAGE_CHUNKER_MAX_CHUNK_SIZE", "10000")
        monkeypatch.setenv("PDFKB_PAGE_CHUNKER_MERGE_SMALL", "false")

        config = ServerConfig.from_env()

        assert config.document_chunker == "page"
        assert config.page_chunker_min_chunk_size == 200
        assert config.page_chunker_max_chunk_size == 10000
        assert config.page_chunker_merge_small is False

    def test_markdown_page_boundary_config(self, monkeypatch):
        """Test markdown page boundary configuration."""
        monkeypatch.setenv("PDFKB_MARKDOWN_PAGE_BOUNDARY_PATTERN", r"<<<PAGE (\d+)>>>")
        monkeypatch.setenv("PDFKB_MARKDOWN_SPLIT_ON_PAGE_BOUNDARIES", "true")

        config = ServerConfig.from_env()

        assert config.markdown_page_boundary_pattern == r"<<<PAGE (\d+)>>>"
        assert config.markdown_split_on_page_boundaries is True

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.