pdf4vllm

test_page_range.py•8.51 KiB

""" Tests for start_page/end_page parameter handling in read_pdf and grep_pdf. Test cases: 1. Default values (start=1, end=None) → all pages 2. start_page only (start=3, end=None) → page 3 to end 3. end_page only (start=1, end=2) → pages 1-2 4. Both specified (start=2, end=3) → pages 2-3 5. Invalid range (start > end) → error 6. Out of range (start > total_pages) → error or empty 7. end_page > total_pages → reads up to last page """ import asyncio import json import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).parent.parent)) from src.pdf_tools import read_pdf_handler, grep_pdf_handler SAMPLE_PDF_DIR = Path(__file__).parent.parent / "sample_pdfs" def get_sample_pdf() -> str: """Get first available sample PDF path""" pdfs = list(SAMPLE_PDF_DIR.glob("*.pdf")) if not pdfs: pytest.skip("No sample PDFs available") return str(pdfs[0]) class TestReadPdfPageRange: """Tests for read_pdf start_page/end_page handling""" @pytest.mark.asyncio async def test_default_values(self): """Default: start=1, end=None → reads from page 1""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path }) result = json.loads(result_json) assert "error" not in result assert "pages" in result assert len(result["pages"]) >= 1 assert result["pages"][0]["page_number"] == 1 @pytest.mark.asyncio async def test_start_page_only(self): """start_page=2, end=None → reads from page 2 to end""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path, "start_page": 2 }) result = json.loads(result_json) # May error if PDF has only 1 page, or succeed starting from page 2 if "error" not in result: assert result["pages"][0]["page_number"] == 2 @pytest.mark.asyncio async def test_end_page_only(self): """start=1, end_page=1 → reads only page 1""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path, "start_page": 1, "end_page": 1 }) result = json.loads(result_json) 
assert "error" not in result assert len(result["pages"]) == 1 assert result["pages"][0]["page_number"] == 1 @pytest.mark.asyncio async def test_both_specified(self): """start=1, end=2 → reads pages 1-2""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path, "start_page": 1, "end_page": 2 }) result = json.loads(result_json) # May error if PDF has only 1 page if "error" not in result: assert len(result["pages"]) <= 2 assert result["pages"][0]["page_number"] == 1 @pytest.mark.asyncio async def test_invalid_range_start_greater_than_end(self): """start > end → should error (validation in Pydantic)""" pdf_path = get_sample_pdf() # Pydantic validation should catch this with pytest.raises(ValueError): from src.schemas import ReadPDFInput ReadPDFInput(file_path=pdf_path, start_page=5, end_page=2) @pytest.mark.asyncio async def test_start_page_exceeds_total(self): """start > total_pages → error INVALID_PAGE_RANGE""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path, "start_page": 9999 }) result = json.loads(result_json) assert "error" in result assert result["error"] == "INVALID_PAGE_RANGE" @pytest.mark.asyncio async def test_end_page_exceeds_total(self): """end > total_pages → reads up to last page (no error)""" pdf_path = get_sample_pdf() result_json, _ = await read_pdf_handler({ "file_path": pdf_path, "start_page": 1, "end_page": 9999 }) result = json.loads(result_json) # Should succeed (capped at total pages) or hit page limit # Either way, should not be INVALID_PAGE_RANGE for end > total if "error" in result: assert result["error"] in ["PAGE_LIMIT_EXCEEDED"] else: assert len(result["pages"]) >= 1 @pytest.mark.asyncio async def test_zero_start_page(self): """start=0 → should error (minimum is 1)""" pdf_path = get_sample_pdf() with pytest.raises(ValueError): from src.schemas import ReadPDFInput ReadPDFInput(file_path=pdf_path, start_page=0) @pytest.mark.asyncio async def 
test_negative_start_page(self): """start=-1 → should error""" pdf_path = get_sample_pdf() with pytest.raises(ValueError): from src.schemas import ReadPDFInput ReadPDFInput(file_path=pdf_path, start_page=-1) class TestGrepPdfPageRange: """Tests for grep_pdf start_page/end_page handling""" @pytest.fixture def check_pdfgrep(self): """Skip tests if pdfgrep is not installed""" import shutil if not shutil.which("pdfgrep"): pytest.skip("pdfgrep not installed") @pytest.mark.asyncio async def test_default_values(self, check_pdfgrep): """Default: start=1, end=None → searches all pages""" result_json, _ = await grep_pdf_handler({ "pattern": ".", # Match any character "working_directory": str(SAMPLE_PDF_DIR), "max_count": 5 }) result = json.loads(result_json) # Should not error (unless no PDFs match) if "error" not in result: assert "matches" in result @pytest.mark.asyncio async def test_start_page_only(self, check_pdfgrep): """start_page=2 → searches from page 2""" result_json, _ = await grep_pdf_handler({ "pattern": ".", "working_directory": str(SAMPLE_PDF_DIR), "start_page": 2, "max_count": 5 }) result = json.loads(result_json) if "error" not in result and result.get("matches"): # All matches should be from page 2 or later for match in result["matches"]: assert match["page"] >= 2 @pytest.mark.asyncio async def test_end_page_only(self, check_pdfgrep): """end_page=1 → searches only page 1""" result_json, _ = await grep_pdf_handler({ "pattern": ".", "working_directory": str(SAMPLE_PDF_DIR), "end_page": 1, "max_count": 5 }) result = json.loads(result_json) if "error" not in result and result.get("matches"): # All matches should be from page 1 for match in result["matches"]: assert match["page"] == 1 @pytest.mark.asyncio async def test_both_specified(self, check_pdfgrep): """start=1, end=2 → searches pages 1-2""" result_json, _ = await grep_pdf_handler({ "pattern": ".", "working_directory": str(SAMPLE_PDF_DIR), "start_page": 1, "end_page": 2, "max_count": 10 }) result = 
json.loads(result_json) if "error" not in result and result.get("matches"): for match in result["matches"]: assert 1 <= match["page"] <= 2 @pytest.mark.asyncio async def test_start_page_exceeds_total(self, check_pdfgrep): """start=9999 → no matches (pdfgrep handles gracefully)""" result_json, _ = await grep_pdf_handler({ "pattern": ".", "working_directory": str(SAMPLE_PDF_DIR), "start_page": 9999 }) result = json.loads(result_json) # pdfgrep should return no matches, not error if "error" not in result: assert result.get("matches", []) == [] @pytest.mark.asyncio async def test_no_page_limit_unlike_read_pdf(self, check_pdfgrep): """grep_pdf has no 10-page limit unlike read_pdf""" result_json, _ = await grep_pdf_handler({ "pattern": ".", "working_directory": str(SAMPLE_PDF_DIR), "start_page": 1, "end_page": 100, # Would exceed read_pdf's limit "max_count": 5 }) result = json.loads(result_json) # Should NOT return PAGE_LIMIT_EXCEEDED error if "error" in result: assert result["error"] != "PAGE_LIMIT_EXCEEDED"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PyJudge/pdf4vllm-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_page_range.py•8.51 KiB