"""Comprehensive PDF indexing integration test.
Tests that PDF content is faithfully preserved across all chunks when indexed,
using the research paper PDF as a comprehensive test case.
"""
import pytest
import tempfile
import shutil
from pathlib import Path
from typing import List
from chunkhound.core.types.common import Language
from chunkhound.parsers.parser_factory import create_parser_for_language
from chunkhound.providers.database.duckdb_provider import DuckDBProvider
from chunkhound.services.indexing_coordinator import IndexingCoordinator
from chunkhound.services.search_service import SearchService
from chunkhound.core.models.chunk import Chunk
# Skip all tests in this module if PyMuPDF is not installed
try:
    import fitz  # PyMuPDF
PYMUPDF_AVAILABLE = True
except ImportError:
PYMUPDF_AVAILABLE = False
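# Module-level pytestmark applies this skip to every test in the file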
pytestmark = pytest.mark.skipif(not PYMUPDF_AVAILABLE, reason="PyMuPDF not available")
@pytest.fixture
def pdf_services(tmp_path):
"""Create services for PDF indexing without embeddings."""
db = DuckDBProvider(":memory:", base_directory=tmp_path)
db.connect()
parser = create_parser_for_language(Language.PDF)
coordinator = IndexingCoordinator(db, tmp_path, None, {Language.PDF: parser})
search_service = SearchService(db)
return {
"db": db,
"coordinator": coordinator,
"search": search_service,
"tmp_path": tmp_path
}
@pytest.fixture
def research_paper_pdf(tmp_path):
"""Copy the research paper PDF to test directory."""
    # Resolve the fixture path relative to this test file
fixtures_dir = Path(__file__).parent / "fixtures"
source_pdf = fixtures_dir / "cast_research_paper.pdf"
# Verify PDF exists
assert source_pdf.exists(), f"Research paper PDF not found at: {source_pdf}"
# Create test directory
test_dir = tmp_path / "pdf_test"
test_dir.mkdir()
# Copy PDF to test directory
test_pdf = test_dir / "cast_paper.pdf"
shutil.copy2(source_pdf, test_pdf)
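    # copy2 preserves file metadata (timestamps), mirroring a real on-disk file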
return test_pdf
@pytest.fixture
def expected_content():
"""Load the expected markdown content for comparison."""
fixtures_dir = Path(__file__).parent / "fixtures"
markdown_path = fixtures_dir / "cast_research_paper_content.md"
# Verify markdown file exists
assert markdown_path.exists(), f"Expected content file not found at: {markdown_path}"
    return markdown_path.read_text(encoding="utf-8")
class TestPDFIndexingIntegration:
"""Comprehensive PDF indexing integration tests."""
@pytest.mark.asyncio
async def test_pdf_complete_content_preservation(self, pdf_services, research_paper_pdf, expected_content):
"""Test that all PDF content is faithfully preserved in chunks."""
coordinator = pdf_services["coordinator"]
db = pdf_services["db"]
search = pdf_services["search"]
# INDEX: Process the PDF file
result = await coordinator.process_file(research_paper_pdf)
# Verify successful indexing
assert result["status"] == "success", f"PDF indexing failed: {result.get('error')}"
assert result["chunks"] > 0, "No chunks were created from PDF"
file_id = result["file_id"]
print(f"✅ PDF indexed successfully: {result['chunks']} chunks created")
# GET CHUNKS: Retrieve all chunks from database
chunks = db.get_chunks_by_file_id(file_id, as_model=True)
assert len(chunks) > 0, "No chunks found in database"
# Verify chunks are properly ordered and have PDF metadata
self._verify_chunk_structure(chunks)
# CONTENT VERIFICATION: Compare with expected content
await self._verify_content_completeness(chunks, expected_content, search)
print(f"✅ All content verification tests passed!")
def _verify_chunk_structure(self, chunks: List[Chunk]):
"""Verify chunk structure and metadata."""
print(f"🔍 Verifying chunk structure for {len(chunks)} chunks...")
# Verify all chunks are PDF language
for chunk in chunks:
assert chunk.language == Language.PDF, f"Wrong language: {chunk.language}"
# Verify page-based symbols
page_symbols = [chunk.symbol for chunk in chunks if chunk.symbol.startswith("page_")]
assert len(page_symbols) > 0, "No page-based symbols found"
# Verify chunks span multiple pages (paper has 11 pages)
        page_numbers = set()
        for chunk in chunks:
            if chunk.symbol.startswith("page_"):
                parts = chunk.symbol.split("_")
                # Guard against non-numeric suffixes (e.g. a "page_header" symbol)
                if len(parts) >= 2 and parts[1].isdigit():
                    page_numbers.add(int(parts[1]))
assert len(page_numbers) > 5, f"Expected multiple pages, got: {sorted(page_numbers)}"
assert min(page_numbers) == 1, "Should start from page 1"
print(f"✅ Chunks span pages: {sorted(page_numbers)}")
# Verify chunks have reasonable content length
total_chars = sum(len(chunk.code) for chunk in chunks)
assert total_chars > 10000, f"Total content seems too short: {total_chars} chars"
print(f"✅ Total content: {total_chars} characters across {len(chunks)} chunks")
async def _verify_content_completeness(self, chunks: List[Chunk], expected_content: str, search: SearchService):
"""Verify all expected content is present in chunks."""
print("🔍 Verifying content completeness...")
# Combine all chunk content
combined_content = "\n".join(chunk.code for chunk in chunks)
# Key content sections that must be present (using flexible matching)
required_sections = [
# Title and authors
"CAST: Enhancing Code Retrieval-Augmented Generation",
"Yilin Zhang", "Carnegie Mellon",
# Abstract key terms
"Retrieval-Augmented Generation", "chunking", "Abstract Syntax Tree",
"structure-aware", "coherent units",
# Technical content
"cAST", "recursive", "split-then-merge",
"syntactic integrity", "information density",
# Code examples (from figures)
"def normalize", "compute_stats",
# References and conclusions
"References", "Conclusion"
]
missing_sections = []
for section in required_sections:
if section.lower() not in combined_content.lower():
missing_sections.append(section)
if missing_sections:
print(f"❌ Missing sections: {missing_sections}")
# Show sample of content for debugging
print(f"Sample content (first 500 chars): {combined_content[:500]}")
        assert not missing_sections, f"Missing required content: {missing_sections}"
print(f"✅ All {len(required_sections)} required sections found")
# Test search functionality
await self._verify_search_functionality(search)
async def _verify_search_functionality(self, search: SearchService):
"""Test that indexed PDF content is searchable."""
print("🔍 Testing search functionality...")
# Test searches for key terms (using flexible patterns)
search_tests = [
("CAST", "algorithm name"),
("Abstract Syntax Tree", "core concept"),
("chunking", "main topic"),
("def normalize", "code example"),
("Carnegie", "institution"),
]
for pattern, description in search_tests:
results, _ = search.search_regex(pattern, page_size=10)
assert len(results) > 0, f"No results for {description} search: '{pattern}'"
# Verify results contain the search term
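            # (all patterns above are literal strings, so a substring check suffices)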
found = any(pattern.lower() in result.get("content", "").lower() for result in results)
assert found, f"Search results don't contain '{pattern}'"
print(f"✅ Found {len(results)} results for '{pattern}' ({description})")
@pytest.mark.asyncio
async def test_pdf_metadata_preservation(self, pdf_services, research_paper_pdf):
"""Test that PDF metadata is properly preserved."""
coordinator = pdf_services["coordinator"]
db = pdf_services["db"]
# Index PDF
result = await coordinator.process_file(research_paper_pdf)
assert result["status"] == "success"
# Get chunks
chunks = db.get_chunks_by_file_id(result["file_id"], as_model=True)
        # Verify the file path is preserved when chunks carry one; some chunk
        # implementations leave file_path unset, which is acceptable here
        pdf_filename_found = any(
            "cast_paper.pdf" in str(chunk.file_path) for chunk in chunks if chunk.file_path
        )
        if any(chunk.file_path for chunk in chunks):
assert pdf_filename_found, "Expected PDF filename in at least one chunk's file path"
# Verify line numbers are sequential and reasonable
line_numbers = [chunk.start_line for chunk in chunks]
assert min(line_numbers) >= 1, "Line numbers should start from 1"
assert max(line_numbers) > 10, "Should have content spanning multiple lines"
print(f"✅ Metadata preserved: line numbers {min(line_numbers)}-{max(line_numbers)}")
@pytest.mark.asyncio
async def test_pdf_multi_page_handling(self, pdf_services, research_paper_pdf):
"""Test that multi-page PDFs are handled correctly."""
coordinator = pdf_services["coordinator"]
db = pdf_services["db"]
# Index PDF
result = await coordinator.process_file(research_paper_pdf)
chunks = db.get_chunks_by_file_id(result["file_id"], as_model=True)
# Extract page numbers from chunk symbols
        page_info = {}
        for chunk in chunks:
            if chunk.symbol.startswith("page_"):
                parts = chunk.symbol.split("_")
                # Skip symbols without a numeric page component
                if len(parts) >= 2 and parts[1].isdigit():
                    page_info.setdefault(int(parts[1]), []).append(chunk)
# Verify multi-page structure
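        # (the paper has 11 pages; the threshold leaves slack for blank or
        # figure-only pages)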
assert len(page_info) >= 8, f"Expected at least 8 pages, got {len(page_info)}"
# Verify each page has content
for page_num, page_chunks in page_info.items():
total_content = sum(len(chunk.code) for chunk in page_chunks)
assert total_content > 50, f"Page {page_num} has insufficient content: {total_content} chars"
print(f"✅ Multi-page handling verified: {len(page_info)} pages processed")
@pytest.mark.asyncio
async def test_pdf_content_ordering(self, pdf_services, research_paper_pdf):
"""Test that PDF content maintains proper document order."""
coordinator = pdf_services["coordinator"]
db = pdf_services["db"]
# Index PDF
result = await coordinator.process_file(research_paper_pdf)
chunks = db.get_chunks_by_file_id(result["file_id"], as_model=True)
        # Sort chunks by line number for the content-order checks below
        sorted_chunks = sorted(chunks, key=lambda c: c.start_line)
        # Verify the chunks were retrieved in line number order; asserting on
        # the freshly sorted list itself would always pass
        retrieved_lines = [chunk.start_line for chunk in chunks]
        assert retrieved_lines == sorted(retrieved_lines), "Chunks should be returned in line number order"
# Verify logical content ordering
combined_text = " ".join(chunk.code for chunk in sorted_chunks)
# Abstract should come before Introduction
abstract_pos = combined_text.lower().find("abstract")
intro_pos = combined_text.lower().find("introduction")
if abstract_pos != -1 and intro_pos != -1:
assert abstract_pos < intro_pos, "Abstract should come before Introduction"
# Introduction should come before Conclusion
conclusion_pos = combined_text.lower().find("conclusion")
if intro_pos != -1 and conclusion_pos != -1:
assert intro_pos < conclusion_pos, "Introduction should come before Conclusion"
print("✅ Content ordering verified")
@pytest.mark.asyncio
async def test_pdf_search_precision(self, pdf_services, research_paper_pdf):
"""Test search precision on PDF content."""
coordinator = pdf_services["coordinator"]
search = pdf_services["search"]
# Index PDF
result = await coordinator.process_file(research_paper_pdf)
assert result["status"] == "success"
# Test precise searches (adjusted for PDF content)
precision_tests = [
("def normalize", "function signature"),
("Carnegie Mellon", "institution name"),
("CAST.*chunking", "algorithm description"),
]
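        # Note: "." does not match newlines by default, so "CAST.*chunking"
        # must match within a single line of chunk content (assuming the
        # search service uses default regex flags)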
for pattern, description in precision_tests:
results, _ = search.search_regex(pattern, page_size=5)
assert len(results) > 0, f"No precise results for {description}: '{pattern}'"
print(f"✅ Precise search for {description}: {len(results)} results")