Zotero Chunk RAG

test_index_report.py•16 KiB

"""Tests for indexing report functionality (Feature 5). Tests cover: - IndexReport dataclass: to_dict(), to_markdown() - CLI --report flag integration - Edge cases: empty results, all failures, no quality data """ from __future__ import annotations import json import tempfile from pathlib import Path from unittest.mock import MagicMock, patch import pytest # ============================================================================= # Fixtures # ============================================================================= @pytest.fixture def sample_index_result(): """Create a sample IndexResult for testing.""" from zotero_chunk_rag.indexer import IndexResult return IndexResult( item_key="ABC123", title="Test Paper Title", status="indexed", reason="", n_chunks=25, n_tables=3, quality_grade="A", ) @pytest.fixture def failed_index_result(): """Create a failed IndexResult for testing.""" from zotero_chunk_rag.indexer import IndexResult return IndexResult( item_key="DEF456", title="Failed Paper", status="failed", reason="PDF corrupted: invalid header", n_chunks=0, n_tables=0, quality_grade="F", ) @pytest.fixture def empty_index_result(): """Create an empty (no text) IndexResult.""" from zotero_chunk_rag.indexer import IndexResult return IndexResult( item_key="GHI789", title="Scanned Only Paper", status="empty", reason="No extractable text (scanned PDF)", n_chunks=0, n_tables=0, quality_grade="F", ) @pytest.fixture def sample_report(sample_index_result, failed_index_result, empty_index_result): """Create a sample IndexReport with mixed results.""" from zotero_chunk_rag.models import IndexReport return IndexReport( total_items=10, indexed=5, skipped=2, failed=1, empty=2, already_indexed=3, results=[sample_index_result, failed_index_result, empty_index_result], extraction_stats={ "total_pages": 100, "text_pages": 85, "ocr_pages": 10, "empty_pages": 5, }, quality_distribution={ "A": 3, "B": 1, "C": 0, "D": 0, "F": 1, }, ) # ============================================================================= # to_dict() Tests # ============================================================================= class TestIndexReportToDict: """Tests for IndexReport.to_dict() method.""" def test_summary_fields(self, sample_report): """Summary should contain all count fields.""" result = sample_report.to_dict() assert "summary" in result summary = result["summary"] assert summary["total_items"] == 10 assert summary["indexed"] == 5 assert summary["skipped"] == 2 assert summary["failed"] == 1 assert summary["empty"] == 2 assert summary["already_indexed"] == 3 def test_extraction_stats_included(self, sample_report): """Extraction stats should be included.""" result = sample_report.to_dict() assert "extraction_stats" in result stats = result["extraction_stats"] assert stats["total_pages"] == 100 assert stats["text_pages"] == 85 assert stats["ocr_pages"] == 10 assert stats["empty_pages"] == 5 def test_quality_distribution_included(self, sample_report): """Quality distribution should be included.""" result = sample_report.to_dict() assert "quality_distribution" in result dist = result["quality_distribution"] assert dist["A"] == 3 assert dist["B"] == 1 assert dist["F"] == 1 def test_failures_list(self, sample_report): """Failures should be listed with details.""" result = sample_report.to_dict() assert "failures" in result failures = result["failures"] assert len(failures) == 1 assert failures[0]["item_key"] == "DEF456" assert failures[0]["title"] == "Failed Paper" assert "corrupted" in failures[0]["reason"] assert failures[0]["quality_grade"] == "F" def test_empty_documents_list(self, sample_report): """Empty documents should be listed.""" result = sample_report.to_dict() assert "empty_documents" in result empty = result["empty_documents"] assert len(empty) == 1 assert empty[0]["item_key"] == "GHI789" assert "scanned" in empty[0]["reason"].lower() def test_indexed_documents_list(self, sample_report): """Indexed documents should be listed with stats.""" result = sample_report.to_dict() assert "indexed_documents" in result indexed = result["indexed_documents"] assert len(indexed) == 1 assert indexed[0]["item_key"] == "ABC123" assert indexed[0]["n_chunks"] == 25 assert indexed[0]["n_tables"] == 3 assert indexed[0]["quality_grade"] == "A" def test_json_serializable(self, sample_report): """Result should be JSON serializable.""" result = sample_report.to_dict() # Should not raise json_str = json.dumps(result) assert isinstance(json_str, str) # Should round-trip parsed = json.loads(json_str) assert parsed["summary"]["indexed"] == 5 class TestIndexReportToDictEdgeCases: """Edge cases for to_dict().""" def test_empty_results(self): """Empty results should produce valid dict.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=0, indexed=0, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, quality_distribution={}, ) result = report.to_dict() assert result["summary"]["total_items"] == 0 assert result["failures"] == [] assert result["empty_documents"] == [] assert result["indexed_documents"] == [] def test_all_failures(self, failed_index_result): """Report with only failures should work.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=3, indexed=0, skipped=0, failed=3, empty=0, already_indexed=0, results=[failed_index_result] * 3, extraction_stats={}, quality_distribution={"F": 3}, ) result = report.to_dict() assert len(result["failures"]) == 3 assert result["indexed_documents"] == [] # ============================================================================= # to_markdown() Tests # ============================================================================= class TestIndexReportToMarkdown: """Tests for IndexReport.to_markdown() method.""" def test_contains_summary_section(self, sample_report): """Markdown should contain summary section.""" md = sample_report.to_markdown() assert "# Indexing Report" in md assert "## Summary" in md assert "Total items processed:" in md assert "Newly indexed:" in md assert "Already in index:" in md assert "Empty (no text):" in md assert "Failed:" in md def test_summary_values_correct(self, sample_report): """Summary values should be correct.""" md = sample_report.to_markdown() assert "**Total items processed:** 10" in md assert "**Newly indexed:** 5" in md assert "**Already in index:** 3" in md def test_extraction_stats_section(self, sample_report): """Should include extraction statistics.""" md = sample_report.to_markdown() assert "## Extraction Statistics" in md assert "Total pages: 100" in md assert "Text pages: 85" in md assert "OCR pages: 10" in md assert "Empty pages: 5" in md def test_quality_distribution_table(self, sample_report): """Should include quality distribution table.""" md = sample_report.to_markdown() assert "## Quality Distribution" in md assert "| Grade | Count |" in md assert "| A | 3 |" in md assert "| B | 1 |" in md def test_failures_table(self, sample_report): """Should include failures table.""" md = sample_report.to_markdown() assert "## Failures" in md assert "| Item Key | Title | Error |" in md assert "`DEF456`" in md assert "Failed Paper" in md def test_empty_documents_table(self, sample_report): """Should include empty documents table.""" md = sample_report.to_markdown() assert "## Empty Documents" in md assert "`GHI789`" in md assert "Scanned Only Paper" in md def test_long_title_truncated(self): """Long titles should be truncated in tables.""" from zotero_chunk_rag.indexer import IndexResult from zotero_chunk_rag.models import IndexReport long_title = "A" * 100 # 100 character title result = IndexResult( item_key="LONG", title=long_title, status="failed", reason="Error", quality_grade="F", ) report = IndexReport( total_items=1, indexed=0, skipped=0, failed=1, empty=0, already_indexed=0, results=[result], extraction_stats={}, quality_distribution={}, ) md = report.to_markdown() # Title should be truncated to 40 chars + "..." assert "A" * 40 + "..." in md assert "A" * 50 not in md def test_pipe_characters_escaped(self): """Pipe characters in title/reason should be escaped.""" from zotero_chunk_rag.indexer import IndexResult from zotero_chunk_rag.models import IndexReport result = IndexResult( item_key="PIPE", title="Title | With | Pipes", status="failed", reason="Error | message", quality_grade="F", ) report = IndexReport( total_items=1, indexed=0, skipped=0, failed=1, empty=0, already_indexed=0, results=[result], extraction_stats={}, quality_distribution={}, ) md = report.to_markdown() # Pipes should be escaped assert "\\|" in md class TestIndexReportToMarkdownEdgeCases: """Edge cases for to_markdown().""" def test_empty_extraction_stats_omits_section(self): """Empty extraction stats should omit the section.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=1, indexed=1, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, # Empty quality_distribution={}, ) md = report.to_markdown() assert "## Extraction Statistics" not in md def test_empty_quality_distribution_omits_section(self): """Empty quality distribution should omit the section.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=1, indexed=1, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, quality_distribution={}, # Empty ) md = report.to_markdown() assert "## Quality Distribution" not in md def test_zero_quality_counts_omits_section(self): """All-zero quality distribution should omit the section.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=1, indexed=1, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, quality_distribution={"A": 0, "B": 0, "C": 0, "D": 0, "F": 0}, ) md = report.to_markdown() assert "## Quality Distribution" not in md def test_no_failures_omits_failures_section(self): """No failures should omit failures section.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=1, indexed=1, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, quality_distribution={}, ) md = report.to_markdown() assert "## Failures" not in md # ============================================================================= # CLI --report Flag Tests # ============================================================================= class TestCLIReportFlag: """Tests for the --report CLI flag.""" def test_json_report_output(self, tmp_path): """--report file.json should produce JSON output.""" from zotero_chunk_rag.models import IndexReport from zotero_chunk_rag.indexer import IndexResult report = IndexReport( total_items=5, indexed=3, skipped=1, failed=1, empty=0, already_indexed=2, results=[ IndexResult("A", "Title A", "indexed", n_chunks=10, quality_grade="A"), IndexResult("B", "Title B", "failed", reason="Error", quality_grade="F"), ], extraction_stats={"total_pages": 50}, quality_distribution={"A": 1, "F": 1}, ) report_path = tmp_path / "report.json" report_path.write_text(json.dumps(report.to_dict(), indent=2)) # Verify file exists and is valid JSON assert report_path.exists() content = json.loads(report_path.read_text()) assert content["summary"]["indexed"] == 3 def test_markdown_report_output(self, tmp_path): """--report file.md should produce Markdown output.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=5, indexed=3, skipped=1, failed=0, empty=1, already_indexed=2, results=[], extraction_stats={"total_pages": 50}, quality_distribution={"A": 2, "B": 1}, ) report_path = tmp_path / "report.md" report_path.write_text(report.to_markdown()) assert report_path.exists() content = report_path.read_text() assert "# Indexing Report" in content assert "**Newly indexed:** 3" in content def test_report_suffix_determines_format(self, tmp_path): """File suffix should determine output format.""" from zotero_chunk_rag.models import IndexReport report = IndexReport( total_items=1, indexed=1, skipped=0, failed=0, empty=0, already_indexed=0, results=[], extraction_stats={}, quality_distribution={}, ) # JSON suffix json_path = tmp_path / "test.json" if json_path.suffix == ".json": json_path.write_text(json.dumps(report.to_dict(), indent=2)) else: json_path.write_text(report.to_markdown()) content = json_path.read_text() assert content.startswith("{") # JSON starts with { # MD suffix md_path = tmp_path / "test.md" if md_path.suffix == ".json": md_path.write_text(json.dumps(report.to_dict(), indent=2)) else: md_path.write_text(report.to_markdown()) content = md_path.read_text() assert content.startswith("#") # Markdown starts with #

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_index_report.py•16 KiB