Zotero Chunk RAG

Overview Schema Related Servers Score Discussions

zotero-chunk-mcp
tests
test_feature_extraction

test_captions.py•10.9 KiB

"""Tests for unified captions module.""" from __future__ import annotations import dataclasses from unittest.mock import MagicMock import pytest from zotero_chunk_rag.feature_extraction.captions import ( DetectedCaption, _FIG_CAPTION_RE, _FIG_CAPTION_RE_RELAXED, _FIG_LABEL_ONLY_RE, _NUM_GROUP, _SUPP_PREFIX_RE, _TABLE_CAPTION_RE, _TABLE_CAPTION_RE_RELAXED, _TABLE_LABEL_ONLY_RE, _block_has_label_font_change, _block_is_bold, _block_label_on_own_line, _font_name_is_bold, _parse_caption_number, find_all_captions, is_in_references, ) # --------------------------------------------------------------------------- # TestDetectedCaption # --------------------------------------------------------------------------- class TestDetectedCaption: def test_frozen(self) -> None: """Assert mutation raises error.""" cap = DetectedCaption( text="Table 1. Results", bbox=(10.0, 20.0, 300.0, 40.0), y_center=30.0, caption_type="table", number="1", ) with pytest.raises(dataclasses.FrozenInstanceError): cap.text = "modified" # type: ignore[misc] def test_fields(self) -> None: """Construct with known values, assert all fields accessible.""" cap = DetectedCaption( text="Figure 3: Comparison of methods", bbox=(50.0, 100.0, 400.0, 120.0), y_center=110.0, caption_type="figure", number="3", ) assert cap.text == "Figure 3: Comparison of methods" assert cap.bbox == (50.0, 100.0, 400.0, 120.0) assert cap.y_center == 110.0 assert cap.caption_type == "figure" assert cap.number == "3" # --------------------------------------------------------------------------- # TestCaptionRegex # --------------------------------------------------------------------------- class TestCaptionRegex: def test_table_caption(self) -> None: """'Table 1. Results' matches table caption pattern, parses number '1'.""" assert _TABLE_CAPTION_RE.match("Table 1. Results") assert _parse_caption_number("Table 1. 
Results") == "1" def test_figure_caption(self) -> None: """'Figure 3: Comparison' matches figure caption pattern, parses number '3'.""" assert _FIG_CAPTION_RE.match("Figure 3: Comparison") assert _parse_caption_number("Figure 3: Comparison") == "3" def test_appendix_number(self) -> None: """'Table A.1' parses number 'A.1'.""" assert _TABLE_CAPTION_RE_RELAXED.match("Table A.1 Results") assert _parse_caption_number("Table A.1 Results") == "A.1" def test_supplementary_number(self) -> None: """'Figure S2' parses number 'S2'.""" # S2 followed by delimiter matches strict pattern assert _FIG_CAPTION_RE.match("Figure S2. Supplementary analysis") assert _parse_caption_number("Figure S2. Supplementary analysis") == "S2" def test_roman_numeral(self) -> None: """'Table IV' parses number 'IV'.""" assert _TABLE_CAPTION_RE_RELAXED.match("Table IV Results summary") assert _parse_caption_number("Table IV Results summary") == "IV" # --------------------------------------------------------------------------- # TestFontHelpers # --------------------------------------------------------------------------- class TestFontHelpers: def test_bold_detection(self) -> None: """Font names containing .B, -Bold, -bd detected as bold.""" assert _font_name_is_bold("TimesNewRoman.B") assert _font_name_is_bold("Helvetica-Bold") assert _font_name_is_bold("Arial-bd") def test_non_bold(self) -> None: """Font names like 'TimesNewRoman' not detected as bold.""" assert not _font_name_is_bold("TimesNewRoman") assert not _font_name_is_bold("Helvetica") assert not _font_name_is_bold("Courier-Oblique") # --------------------------------------------------------------------------- # Helpers for mock pages # --------------------------------------------------------------------------- def _make_block(text: str, bbox: tuple = (10, 100, 400, 120), font: str = "Regular", bold: bool = False) -> dict: """Create a mock text block dict for page.get_text('dict').""" flags = 16 if bold else 0 return { "type": 0, "bbox": 
bbox, "lines": [ { "spans": [ { "text": text, "font": font, "flags": flags, } ] } ], } def _make_font_change_block(label: str, body: str, bbox: tuple = (10, 100, 400, 120)) -> dict: """Create a block where label has different font from body (font change signal).""" return { "type": 0, "bbox": bbox, "lines": [ { "spans": [ {"text": label, "font": "Helvetica-Bold", "flags": 16}, {"text": body, "font": "TimesNewRoman", "flags": 0}, ] } ], } def _make_mock_page(blocks: list[dict]) -> MagicMock: """Create a mock pymupdf Page returning specific text blocks.""" page = MagicMock() page.get_text.return_value = {"blocks": blocks} return page # --------------------------------------------------------------------------- # TestFindAllCaptions # --------------------------------------------------------------------------- class TestFindAllCaptions: def test_finds_table_and_figure(self) -> None: """Mock page with blocks containing 'Table 1...' and 'Figure 2...', assert both found.""" blocks = [ _make_block("Table 1. Baseline demographics", bbox=(10, 100, 400, 120)), _make_block("Figure 2: Results comparison", bbox=(10, 300, 400, 320)), ] page = _make_mock_page(blocks) captions = find_all_captions(page) assert len(captions) == 2 types = {c.caption_type for c in captions} assert types == {"table", "figure"} table_cap = [c for c in captions if c.caption_type == "table"][0] assert table_cap.number == "1" fig_cap = [c for c in captions if c.caption_type == "figure"][0] assert fig_cap.number == "2" def test_filter_by_type(self) -> None: """include_figures=False returns only table captions.""" blocks = [ _make_block("Table 1. 
Results", bbox=(10, 100, 400, 120)), _make_block("Figure 2: Graph", bbox=(10, 300, 400, 320)), ] page = _make_mock_page(blocks) captions = find_all_captions(page, include_figures=False) assert len(captions) == 1 assert captions[0].caption_type == "table" def test_sorted_by_y(self) -> None: """Multiple captions returned in top-to-bottom order.""" blocks = [ _make_block("Figure 2: Bottom", bbox=(10, 500, 400, 520)), _make_block("Table 1. Top", bbox=(10, 100, 400, 120)), _make_block("Figure 1: Middle", bbox=(10, 300, 400, 320)), ] page = _make_mock_page(blocks) captions = find_all_captions(page) assert len(captions) == 3 y_centers = [c.y_center for c in captions] assert y_centers == sorted(y_centers) def test_relaxed_with_font_change(self) -> None: """Caption matching relaxed regex is accepted only with font-change structural signal.""" # Relaxed match: "Figure 1 something" (no delimiter after number) # WITH font change -- should match block_with_change = _make_font_change_block("Figure 1 ", "Results of the analysis", bbox=(10, 100, 400, 120)) # WITHOUT font change -- should NOT match block_no_change = _make_block("Figure 3 results of something", bbox=(10, 200, 400, 220), font="Regular") page = _make_mock_page([block_with_change, block_no_change]) captions = find_all_captions(page) # Only the font-change block should be detected assert len(captions) == 1 assert captions[0].number == "1" # --------------------------------------------------------------------------- # TestIsInReferences # --------------------------------------------------------------------------- class TestIsInReferences: def test_in_references(self) -> None: """Page within references section returns True.""" from zotero_chunk_rag.models import SectionSpan, PageExtraction sections = [ SectionSpan(label="introduction", char_start=0, char_end=1000, heading_text="Introduction", confidence=1.0), SectionSpan(label="references", char_start=1000, char_end=2000, heading_text="References", confidence=1.0), ] 
pages = [ PageExtraction(page_num=1, markdown="intro text", char_start=0), PageExtraction(page_num=2, markdown="ref text", char_start=1200), ] assert is_in_references(2, sections, pages) is True def test_not_in_references(self) -> None: """Page before references returns False.""" from zotero_chunk_rag.models import SectionSpan, PageExtraction sections = [ SectionSpan(label="introduction", char_start=0, char_end=1000, heading_text="Introduction", confidence=1.0), SectionSpan(label="references", char_start=1000, char_end=2000, heading_text="References", confidence=1.0), ] pages = [ PageExtraction(page_num=1, markdown="intro text", char_start=0), PageExtraction(page_num=2, markdown="ref text", char_start=1200), ] assert is_in_references(1, sections, pages) is False # --------------------------------------------------------------------------- # TestBackwardCompat # --------------------------------------------------------------------------- class TestCanonicalImports: def test_captions_module_exports(self) -> None: """Verify all caption functions are importable from the canonical location.""" from zotero_chunk_rag.feature_extraction.captions import ( _FIG_CAPTION_RE, _FIG_CAPTION_RE_RELAXED, _FIG_LABEL_ONLY_RE, _block_has_label_font_change, _block_is_bold, _block_label_on_own_line, _font_name_is_bold, is_in_references, _scan_lines_for_caption, _text_from_line_onward, find_all_captions, ) assert callable(_block_has_label_font_change) assert callable(_block_is_bold) assert callable(_block_label_on_own_line) assert callable(_font_name_is_bold) assert callable(is_in_references) assert callable(find_all_captions) assert callable(_scan_lines_for_caption) assert callable(_text_from_line_onward) assert _FIG_CAPTION_RE is not None assert _FIG_CAPTION_RE_RELAXED is not None assert _FIG_LABEL_ONLY_RE is not None

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_captions.py•10.9 KiB