Skip to main content
Glama
juanqui
by juanqui
test_page_chunker.py (5.94 kB)
"""Tests for the PageChunker implementation.""" from pdfkb.chunker.chunker_page import PageChunker from pdfkb.parsers.parser import PageContent class TestPageChunker: """Test suite for PageChunker.""" def test_basic_page_chunking(self): """Test basic page chunking with one chunk per page.""" chunker = PageChunker() pages = [ PageContent(page_number=1, markdown_content="Page 1 content", metadata={}), PageContent(page_number=2, markdown_content="Page 2 content", metadata={}), PageContent(page_number=3, markdown_content="Page 3 content", metadata={}), ] chunks = chunker.chunk_pages(pages) assert len(chunks) == 3 assert chunks[0].text == "Page 1 content" assert chunks[0].metadata["page_number"] == 1 assert chunks[1].text == "Page 2 content" assert chunks[1].metadata["page_number"] == 2 assert chunks[2].text == "Page 3 content" assert chunks[2].metadata["page_number"] == 3 def test_merge_small_pages(self): """Test merging of small pages.""" chunker = PageChunker(min_chunk_size=50, merge_small=True) pages = [ PageContent(page_number=1, markdown_content="Small", metadata={}), PageContent(page_number=2, markdown_content="Also small", metadata={}), PageContent( page_number=3, markdown_content="This is a much longer page with lots of content that exceeds the minimum", metadata={}, ), ] chunks = chunker.chunk_pages(pages) # First two pages should be merged assert len(chunks) == 2 assert "Small" in chunks[0].text assert "Also small" in chunks[0].text assert chunks[0].metadata["page_numbers"] == [1, 2] assert chunks[0].metadata["merged_pages"] is True # Third page should be separate assert chunks[1].metadata["page_number"] == 3 assert chunks[1].metadata["single_page"] is True def test_split_large_pages(self): """Test splitting of large pages.""" chunker = PageChunker(max_chunk_size=50) long_content = "This is a very long page. 
" * 10 # Creates content > 50 chars pages = [ PageContent(page_number=1, markdown_content=long_content, metadata={}), ] chunks = chunker.chunk_pages(pages) # Should be split into multiple chunks assert len(chunks) > 1 for chunk in chunks: assert len(chunk.text) <= 50 if "split_from_large_page" in chunk.metadata: assert chunk.metadata["split_from_large_page"] is True def test_no_merge_when_disabled(self): """Test that pages are not merged when merge_small is False.""" chunker = PageChunker(min_chunk_size=50, merge_small=False) pages = [ PageContent(page_number=1, markdown_content="Small", metadata={}), PageContent(page_number=2, markdown_content="Also small", metadata={}), ] chunks = chunker.chunk_pages(pages) # Should not merge even though they're small assert len(chunks) == 2 assert chunks[0].text == "Small" assert chunks[1].text == "Also small" def test_empty_pages_list(self): """Test handling of empty pages list.""" chunker = PageChunker() chunks = chunker.chunk_pages([]) assert chunks == [] def test_fallback_chunk_method(self): """Test the fallback chunk() method for non-page-aware content.""" chunker = PageChunker() text = "This is some text content" chunks = chunker.chunk(text, {"title": "Test"}) assert len(chunks) == 1 assert chunks[0].text == text assert chunks[0].metadata["title"] == "Test" assert chunks[0].metadata["warning"] == "PageChunker used without page-aware content" def test_metadata_preservation(self): """Test that document-level metadata is preserved in chunks.""" chunker = PageChunker() pages = [ PageContent(page_number=1, markdown_content="Page 1", metadata={"source": "pdf"}), ] doc_metadata = {"title": "Test Document", "author": "Test Author"} chunks = chunker.chunk_pages(pages, doc_metadata) assert len(chunks) == 1 assert chunks[0].metadata["title"] == "Test Document" assert chunks[0].metadata["author"] == "Test Author" assert chunks[0].metadata["chunk_strategy"] == "page" def test_chunk_indexing(self): """Test that chunks are properly 
indexed.""" chunker = PageChunker() pages = [ PageContent(page_number=1, markdown_content="Page 1", metadata={}), PageContent(page_number=2, markdown_content="Page 2", metadata={}), PageContent(page_number=3, markdown_content="Page 3", metadata={}), ] chunks = chunker.chunk_pages(pages) for i, chunk in enumerate(chunks): assert chunk.chunk_index == i assert chunk.metadata["chunk_index"] == i assert chunk.metadata["total_chunks"] == 3 def test_page_range_metadata(self): """Test page range metadata for merged chunks.""" chunker = PageChunker(min_chunk_size=50, merge_small=True) pages = [ PageContent(page_number=1, markdown_content="A", metadata={}), PageContent(page_number=2, markdown_content="B", metadata={}), PageContent(page_number=3, markdown_content="C", metadata={}), ] chunks = chunker.chunk_pages(pages) # All pages should be merged into one chunk assert len(chunks) == 1 assert chunks[0].metadata["page_range"] == "1-3" assert chunks[0].metadata["page_count"] == 3 assert chunks[0].metadata["merged_pages"] is True

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.