PDF MCP Server

test_phase3_features.py•21.5 KiB

""" Integration tests for Phase 3 features: - Link extraction: Extract URLs, hyperlinks, internal references - PDF optimization: Compress/reduce PDF file size - Barcode/QR code detection: Detect and decode barcodes and QR codes - Page splitting: Split PDFs by bookmarks or content markers - PDF comparison: Diff two PDFs and highlight changes - Batch processing: Process multiple PDFs in a single call TDD Pattern: Tests written FIRST, implementation follows. """ from pathlib import Path from typing import Dict, Any, List import tempfile import shutil import os import pytest from pdf_mcp import pdf_tools from pdf_mcp.pdf_tools import PdfToolError # ============================================================================= # Test Fixtures # ============================================================================= TESTS_DIR = Path(__file__).parent def get_test_pdf(name: str) -> str: """Get path to test PDF if it exists.""" path = TESTS_DIR / name if path.exists(): return str(path) return None @pytest.fixture def temp_dir(): """Create a temporary directory for test outputs.""" tmp = tempfile.mkdtemp() yield tmp shutil.rmtree(tmp, ignore_errors=True) @pytest.fixture def sample_pdf(): """Get a sample PDF for testing.""" return get_test_pdf("1006.pdf") @pytest.fixture def multiple_pdfs(): """Get multiple PDFs for batch testing.""" pdfs = ["1006.pdf", "pdf_sample2.pdf", "PublicWaterMassMailing.pdf"] return [get_test_pdf(p) for p in pdfs if get_test_pdf(p)] # ============================================================================= # Link Extraction Tests # ============================================================================= class TestExtractLinks: """Tests for link extraction functionality.""" def test_extract_links_returns_structure(self, sample_pdf): """Test that extract_links returns proper structure.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.extract_links(sample_pdf) assert "pdf_path" in result assert 
"total_links" in result assert "links" in result assert isinstance(result["links"], list) assert "pages_scanned" in result def test_extract_links_specific_pages(self, sample_pdf): """Test extracting links from specific pages.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.extract_links(sample_pdf, pages=[1]) assert result["pages_scanned"] == 1 def test_extract_links_categories(self, sample_pdf): """Test that links are categorized by type.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.extract_links(sample_pdf) # Should have link type categories assert "link_types" in result # Common types: uri, internal, goto assert isinstance(result["link_types"], dict) def test_extract_links_invalid_pdf(self): """Test error handling for invalid PDF.""" with pytest.raises(PdfToolError): pdf_tools.extract_links("/nonexistent/file.pdf") def test_extract_links_contains_url_info(self, sample_pdf): """Test that URL links contain proper information.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.extract_links(sample_pdf) if result["total_links"] > 0: link = result["links"][0] assert "page" in link assert "type" in link # URI links should have a uri field if link["type"] == "uri": assert "uri" in link # ============================================================================= # PDF Optimization Tests # ============================================================================= class TestOptimizePdf: """Tests for PDF optimization/compression.""" def test_optimize_pdf_returns_structure(self, sample_pdf, temp_dir): """Test that optimize_pdf returns proper structure.""" if not sample_pdf: pytest.skip("Sample PDF not available") output_path = os.path.join(temp_dir, "optimized.pdf") result = pdf_tools.optimize_pdf(sample_pdf, output_path) assert "input_path" in result assert "output_path" in result assert "original_size" in result assert "optimized_size" in result assert 
"compression_ratio" in result assert "size_reduction_percent" in result def test_optimize_pdf_creates_file(self, sample_pdf, temp_dir): """Test that optimize_pdf creates output file.""" if not sample_pdf: pytest.skip("Sample PDF not available") output_path = os.path.join(temp_dir, "optimized.pdf") result = pdf_tools.optimize_pdf(sample_pdf, output_path) assert os.path.exists(output_path) assert result["optimized_size"] > 0 def test_optimize_pdf_quality_settings(self, sample_pdf, temp_dir): """Test optimization with different quality settings.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Test with low quality (max compression) output_low = os.path.join(temp_dir, "low_quality.pdf") result_low = pdf_tools.optimize_pdf(sample_pdf, output_low, quality="low") # Test with high quality (min compression) output_high = os.path.join(temp_dir, "high_quality.pdf") result_high = pdf_tools.optimize_pdf(sample_pdf, output_high, quality="high") # Low quality should generally be smaller assert result_low["optimized_size"] <= result_high["optimized_size"] def test_optimize_pdf_invalid_input(self, temp_dir): """Test error handling for invalid input.""" with pytest.raises(PdfToolError): pdf_tools.optimize_pdf("/nonexistent.pdf", os.path.join(temp_dir, "out.pdf")) def test_optimize_pdf_preserves_content(self, sample_pdf, temp_dir): """Test that optimization preserves page count.""" if not sample_pdf: pytest.skip("Sample PDF not available") output_path = os.path.join(temp_dir, "optimized.pdf") pdf_tools.optimize_pdf(sample_pdf, output_path) # Check page count is preserved import pymupdf with pymupdf.open(sample_pdf) as orig: with pymupdf.open(output_path) as opt: assert len(opt) == len(orig) # ============================================================================= # Barcode/QR Code Detection Tests # ============================================================================= class TestDetectBarcodes: """Tests for barcode/QR code detection.""" def 
test_detect_barcodes_returns_structure(self, sample_pdf): """Test that detect_barcodes returns proper structure.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.detect_barcodes(sample_pdf) assert "pdf_path" in result assert "total_barcodes" in result assert "barcodes" in result assert isinstance(result["barcodes"], list) assert "pages_scanned" in result assert "pyzbar_available" in result def test_detect_barcodes_specific_pages(self, sample_pdf): """Test detecting barcodes on specific pages.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.detect_barcodes(sample_pdf, pages=[1]) assert result["pages_scanned"] == 1 def test_detect_barcodes_barcode_info(self): """Test that detected barcodes contain proper info.""" # This test uses any PDF - barcodes may or may not be present pdf_path = get_test_pdf("1006.pdf") if not pdf_path: pytest.skip("Test PDF not available") result = pdf_tools.detect_barcodes(pdf_path) # If barcodes found, verify structure if result["total_barcodes"] > 0: barcode = result["barcodes"][0] assert "page" in barcode assert "type" in barcode # e.g., QRCODE, CODE128, EAN13 assert "data" in barcode assert "position" in barcode def test_detect_barcodes_invalid_pdf(self): """Test error handling for invalid PDF.""" with pytest.raises(PdfToolError): pdf_tools.detect_barcodes("/nonexistent/file.pdf") @pytest.mark.skipif( not hasattr(pdf_tools, "_HAS_PYZBAR") or not pdf_tools._HAS_PYZBAR, reason="pyzbar not installed" ) def test_detect_barcodes_with_pyzbar(self, sample_pdf): """Test barcode detection when pyzbar is available.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.detect_barcodes(sample_pdf) assert result["pyzbar_available"] is True # ============================================================================= # Page Splitting Tests # ============================================================================= class TestSplitPdfByBookmarks: """Tests 
for splitting PDFs by bookmarks using split_pdf(mode='bookmarks').""" def test_split_by_bookmarks_returns_structure(self, sample_pdf, temp_dir): """Test that split_pdf with mode='bookmarks' returns proper structure.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.split_pdf(sample_pdf, temp_dir, mode="bookmarks") assert "input_path" in result assert "output_dir" in result assert "total_bookmarks" in result assert "files_created" in result assert isinstance(result["files_created"], list) def test_split_by_bookmarks_creates_files(self, sample_pdf, temp_dir): """Test that splitting creates output files.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.split_pdf(sample_pdf, temp_dir, mode="bookmarks") # Even if no bookmarks, should report results assert "files_created" in result for file_info in result["files_created"]: assert "path" in file_info assert "title" in file_info assert "page_range" in file_info def test_split_by_bookmarks_no_bookmarks(self, temp_dir): """Test handling of PDFs without bookmarks.""" # Use a simple PDF without bookmarks pdf_path = get_test_pdf("pdf_sample2.pdf") if not pdf_path: pytest.skip("Test PDF not available") result = pdf_tools.split_pdf(pdf_path, temp_dir, mode="bookmarks") # Should indicate no bookmarks found assert result["total_bookmarks"] == 0 def test_split_by_bookmarks_invalid_pdf(self, temp_dir): """Test error handling for invalid PDF.""" with pytest.raises(PdfToolError): pdf_tools.split_pdf("/nonexistent.pdf", temp_dir, mode="bookmarks") def test_split_by_pages(self, sample_pdf, temp_dir): """Test splitting PDF by page ranges.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Split every 2 pages result = pdf_tools.split_pdf(sample_pdf, temp_dir, mode="pages", pages_per_split=2) assert "files_created" in result assert len(result["files_created"]) > 0 # ============================================================================= # PDF Comparison Tests 
# ============================================================================= class TestComparePdfs: """Tests for PDF comparison/diff functionality.""" def test_compare_pdfs_returns_structure(self, sample_pdf): """Test that compare_pdfs returns proper structure.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Compare PDF with itself (should be identical) result = pdf_tools.compare_pdfs(sample_pdf, sample_pdf) assert "pdf1_path" in result assert "pdf2_path" in result assert "are_identical" in result assert "differences" in result assert isinstance(result["differences"], list) assert "summary" in result def test_compare_pdfs_identical(self, sample_pdf): """Test comparing identical PDFs.""" if not sample_pdf: pytest.skip("Sample PDF not available") result = pdf_tools.compare_pdfs(sample_pdf, sample_pdf) assert result["are_identical"] is True assert len(result["differences"]) == 0 def test_compare_pdfs_different(self, multiple_pdfs): """Test comparing different PDFs.""" if len(multiple_pdfs) < 2: pytest.skip("Need at least 2 PDFs for comparison") result = pdf_tools.compare_pdfs(multiple_pdfs[0], multiple_pdfs[1]) assert result["are_identical"] is False assert len(result["differences"]) > 0 def test_compare_pdfs_text_diff(self, sample_pdf, temp_dir): """Test that comparison detects text differences.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Create a modified PDF import pymupdf modified_path = os.path.join(temp_dir, "modified.pdf") with pymupdf.open(sample_pdf) as doc: # Add some text to first page page = doc[0] page.insert_text((100, 100), "TEST MODIFICATION") doc.save(modified_path) result = pdf_tools.compare_pdfs(sample_pdf, modified_path) assert result["are_identical"] is False # Should detect text difference text_diffs = [d for d in result["differences"] if d.get("type") == "text"] assert len(text_diffs) > 0 def test_compare_pdfs_page_count_diff(self, sample_pdf, temp_dir): """Test that comparison detects page count 
differences.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Create a PDF with different page count import pymupdf modified_path = os.path.join(temp_dir, "extra_page.pdf") with pymupdf.open(sample_pdf) as doc: # Add a new page doc.new_page() doc.save(modified_path) result = pdf_tools.compare_pdfs(sample_pdf, modified_path) assert result["are_identical"] is False # Should detect page count difference page_diffs = [d for d in result["differences"] if d.get("type") == "page_count"] assert len(page_diffs) > 0 def test_compare_pdfs_invalid_input(self): """Test error handling for invalid inputs.""" with pytest.raises(PdfToolError): pdf_tools.compare_pdfs("/nonexistent1.pdf", "/nonexistent2.pdf") # ============================================================================= # Batch Processing Tests # ============================================================================= class TestBatchProcess: """Tests for batch processing multiple PDFs.""" def test_batch_process_returns_structure(self, multiple_pdfs): """Test that batch_process returns proper structure.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for batch testing") result = pdf_tools.batch_process( pdf_paths=multiple_pdfs, operation="get_info" ) assert "operation" in result assert "total_files" in result assert "successful" in result assert "failed" in result assert "results" in result assert isinstance(result["results"], list) def test_batch_process_get_info(self, multiple_pdfs): """Test batch getting PDF info.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for batch testing") result = pdf_tools.batch_process( pdf_paths=multiple_pdfs, operation="get_info" ) assert result["total_files"] == len(multiple_pdfs) assert result["successful"] == len(multiple_pdfs) assert len(result["results"]) == len(multiple_pdfs) def test_batch_process_extract_text(self, multiple_pdfs): """Test batch text extraction.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for 
batch testing") result = pdf_tools.batch_process( pdf_paths=multiple_pdfs, operation="extract_text" ) assert result["successful"] >= 1 for r in result["results"]: if r["success"]: assert "text" in r["result"] or "error" not in r def test_batch_process_extract_links(self, multiple_pdfs): """Test batch link extraction.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for batch testing") result = pdf_tools.batch_process( pdf_paths=multiple_pdfs, operation="extract_links" ) assert result["total_files"] == len(multiple_pdfs) for r in result["results"]: assert "pdf_path" in r def test_batch_process_invalid_operation(self, sample_pdf): """Test error handling for invalid operation.""" if not sample_pdf: pytest.skip("Sample PDF not available") with pytest.raises(PdfToolError): pdf_tools.batch_process([sample_pdf], operation="invalid_op") def test_batch_process_partial_failure(self, sample_pdf, temp_dir): """Test handling of partial failures in batch.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Mix valid and invalid paths pdf_paths = [sample_pdf, "/nonexistent/file.pdf"] result = pdf_tools.batch_process( pdf_paths=pdf_paths, operation="get_info" ) assert result["total_files"] == 2 assert result["successful"] == 1 assert result["failed"] == 1 def test_batch_process_optimize(self, multiple_pdfs, temp_dir): """Test batch PDF optimization.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for batch testing") result = pdf_tools.batch_process( pdf_paths=multiple_pdfs, operation="optimize", output_dir=temp_dir ) assert result["successful"] >= 1 # Check files were created for r in result["results"]: if r["success"]: assert os.path.exists(r["result"]["output_path"]) # ============================================================================= # MCP Layer Tests for Phase 3 # ============================================================================= class TestMcpLayerPhase3: """Test MCP tool wrappers for Phase 3 features.""" def 
test_mcp_extract_links_exists(self): """Test that MCP extract_links tool exists.""" from pdf_mcp import server # Check tool is registered assert hasattr(server, "extract_links") def test_mcp_optimize_pdf_exists(self): """Test that MCP optimize_pdf tool exists.""" from pdf_mcp import server assert hasattr(server, "optimize_pdf") def test_mcp_detect_barcodes_exists(self): """Test that MCP detect_barcodes tool exists.""" from pdf_mcp import server assert hasattr(server, "detect_barcodes") def test_mcp_split_pdf_exists(self): """Test that MCP split_pdf tool exists.""" from pdf_mcp import server assert hasattr(server, "split_pdf") def test_mcp_compare_pdfs_exists(self): """Test that MCP compare_pdfs tool exists.""" from pdf_mcp import server assert hasattr(server, "compare_pdfs") def test_mcp_batch_process_exists(self): """Test that MCP batch_process tool exists.""" from pdf_mcp import server assert hasattr(server, "batch_process") # ============================================================================= # End-to-End Workflow Tests # ============================================================================= class TestPhase3Workflows: """End-to-end workflow tests combining Phase 3 features.""" def test_analyze_and_optimize_workflow(self, sample_pdf, temp_dir): """Test workflow: extract info -> extract links -> optimize.""" if not sample_pdf: pytest.skip("Sample PDF not available") # Step 1: Extract links links_result = pdf_tools.extract_links(sample_pdf) # Step 2: Optimize output_path = os.path.join(temp_dir, "optimized.pdf") opt_result = pdf_tools.optimize_pdf(sample_pdf, output_path) # Verify workflow completed assert links_result["pdf_path"] == sample_pdf assert os.path.exists(opt_result["output_path"]) def test_batch_analysis_workflow(self, multiple_pdfs, temp_dir): """Test workflow: batch extract text -> batch extract links.""" if len(multiple_pdfs) < 2: pytest.skip("Need multiple PDFs for batch testing") # Step 1: Batch extract links links_result = 
pdf_tools.batch_process(multiple_pdfs, "extract_links") # Step 2: Batch get info info_result = pdf_tools.batch_process(multiple_pdfs, "get_info") assert links_result["total_files"] == len(multiple_pdfs) assert info_result["total_files"] == len(multiple_pdfs) def test_compare_and_report_workflow(self, multiple_pdfs): """Test workflow: compare PDFs and generate report.""" if len(multiple_pdfs) < 2: pytest.skip("Need at least 2 PDFs for comparison") # Compare first two PDFs result = pdf_tools.compare_pdfs(multiple_pdfs[0], multiple_pdfs[1]) # Generate a summary report summary = result["summary"] assert "pages" in summary or "text" in summary or isinstance(summary, str)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nfsarch33/pdf-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_phase3_features.py•21.5 KiB