Zotero Chunk RAG

test_content_quality.py•18.6 KiB

"""Unit tests for content quality detection functions.""" import pytest from zotero_chunk_rag.pdf_processor import ( _detect_garbled_spacing, _detect_interleaved_chars, _detect_encoding_artifacts, _check_content_readability, _normalize_ligatures, _classify_artifact, _parse_prose_rows, _find_column_gap_threshold, SYNTHETIC_CAPTION_PREFIX, ) from zotero_chunk_rag.feature_extraction.postprocessors.cell_cleaning import ( _looks_numeric, ) from zotero_chunk_rag.feature_extraction.methods._row_clustering import ( adaptive_row_tolerance as _adaptive_row_tolerance, ) from zotero_chunk_rag.models import ExtractedTable # ---- _detect_garbled_spacing ---- class TestDetectGarbledSpacing: def test_normal_text_not_garbled(self): ok, _ = _detect_garbled_spacing("The quick brown fox jumps over the lazy dog") assert not ok def test_merged_words_flagged(self): # Simulate merged words (no spaces) — avg word length >> 25 ok, reason = _detect_garbled_spacing( "ThisisaveryverylongstringwithnospacesthatrepresentsgarbageextractedPDFtext" ) assert ok assert "avg word length" in reason def test_empty_text_not_garbled(self): ok, _ = _detect_garbled_spacing("") assert not ok def test_whitespace_only_not_garbled(self): ok, _ = _detect_garbled_spacing(" \n\t ") assert not ok def test_short_words_pass(self): ok, _ = _detect_garbled_spacing("a b c d e f g h i j") assert not ok def test_borderline_passes(self): # 24 char word — just under threshold ok, _ = _detect_garbled_spacing("abcdefghijklmnopqrstuvwx normal words here") assert not ok def test_greek_letters_not_flagged(self): """Cells with Greek characters are technical content, not garbled.""" ok, _ = _detect_garbled_spacing("sπ,τ=1averylongmathexpressionwithgreek") assert not ok def test_math_operators_not_flagged(self): """Cells with math operators (=, ±, ×) are technical content.""" ok, _ = _detect_garbled_spacing("0992±0013iverylongconcatenatedfiltercoeff") assert not ok def test_superscripts_not_flagged(self): ok, _ = 
_detect_garbled_spacing("thisIsAVeryLongCellWith²exponents³") assert not ok # ---- _detect_interleaved_chars ---- class TestDetectInterleavedChars: def test_normal_text_not_interleaved(self): ok, _ = _detect_interleaved_chars("The quick brown fox jumps over the lazy dog") assert not ok def test_interleaved_flagged(self): # >40% single alpha-char tokens ok, reason = _detect_interleaved_chars("a b c d e f g h real word here") assert ok assert "single alpha chars" in reason def test_leading_decimals_not_flagged(self): """Cells with leading-decimal numbers should not be flagged.""" ok, _ = _detect_interleaved_chars(".906 .870 , . .432 .123 .456 .789") assert not ok def test_math_notation_sparse_not_flagged(self): """Single-letter variables diluted by normal words pass.""" ok, _ = _detect_interleaved_chars("C × A × V → R states of the world") assert not ok def test_empty_not_interleaved(self): ok, _ = _detect_interleaved_chars("") assert not ok def test_too_few_tokens_not_flagged(self): # <5 tokens — not flagged even if all single chars ok, _ = _detect_interleaved_chars("a b c d") assert not ok def test_normal_sentence_passes(self): ok, _ = _detect_interleaved_chars("Heart rate variability is a measure of autonomic function") assert not ok # ---- _detect_encoding_artifacts ---- class TestDetectEncodingArtifacts: def test_clean_text_no_artifacts(self): ok, found = _detect_encoding_artifacts("Figure 1. 
Normal caption text") assert not ok assert found == [] def test_fi_ligature_detected(self): ok, found = _detect_encoding_artifacts("The \ufb01rst finding shows") assert ok assert "\ufb01" in found def test_fl_ligature_detected(self): ok, found = _detect_encoding_artifacts("The \ufb02ow rate was") assert ok assert "\ufb02" in found def test_ffi_ligature_detected(self): ok, found = _detect_encoding_artifacts("The e\ufb03cient method") assert ok assert "\ufb03" in found def test_empty_text_no_artifacts(self): ok, found = _detect_encoding_artifacts("") assert not ok assert found == [] def test_multiple_ligatures(self): ok, found = _detect_encoding_artifacts("The \ufb01rst \ufb02ow was e\ufb03cient") assert ok assert len(found) == 3 # ---- _check_content_readability ---- class TestCheckContentReadability: def _make_table(self, rows, caption=None, headers=None): return ExtractedTable( page_num=1, table_index=0, bbox=(0, 0, 100, 100), headers=headers or [], rows=rows, caption=caption, ) def test_clean_table_passes(self): table = self._make_table( [["Cell A", "Cell B"], ["Value 1", "Value 2"]], caption="Table 1. Results", ) rpt = _check_content_readability(table) assert rpt["garbled_cells"] == 0 assert rpt["interleaved_cells"] == 0 assert not rpt["encoding_artifacts"] assert rpt["details"] == [] def test_garbled_cell_detected(self): table = self._make_table( [["ThisisaveryverylongstringwithnospacesthatrepresentsgarbageextractedPDFtext", "OK"]], ) rpt = _check_content_readability(table) assert rpt["garbled_cells"] >= 1 def test_interleaved_cell_detected(self): table = self._make_table( [["a b c d e f g h real word", "OK"]], ) rpt = _check_content_readability(table) assert rpt["interleaved_cells"] >= 1 def test_encoding_artifact_in_caption(self): table = self._make_table( [["Cell", "Cell"]], caption="Table 1. 
The \ufb01rst finding", ) rpt = _check_content_readability(table) assert rpt["encoding_artifacts"] def test_no_caption_no_encoding_check(self): table = self._make_table( [["Cell", "Cell"]], caption=None, ) rpt = _check_content_readability(table) assert not rpt["encoding_artifacts"] # ---- _normalize_ligatures ---- class TestNormalizeLigatures: def test_fi_ligature(self): assert _normalize_ligatures("The \ufb01rst \ufb01nding") == "The first finding" def test_fl_ligature(self): assert _normalize_ligatures("The \ufb02ow rate") == "The flow rate" def test_ffi_ligature(self): assert _normalize_ligatures("e\ufb03cient") == "efficient" def test_ffl_ligature(self): assert _normalize_ligatures("ba\ufb04e") == "baffle" def test_ff_ligature(self): assert _normalize_ligatures("co\ufb00ee") == "coffee" def test_multiple_ligatures(self): assert _normalize_ligatures("The \ufb01rst \ufb02ow was e\ufb03cient") == "The first flow was efficient" def test_none_passthrough(self): assert _normalize_ligatures(None) is None def test_empty_passthrough(self): assert _normalize_ligatures("") == "" def test_clean_text_unchanged(self): text = "Normal text without ligatures" assert _normalize_ligatures(text) == text # ---- _classify_artifact ---- class TestClassifyArtifact: """Tests for layout-artifact table detection and classification.""" def test_elsevier_article_info_box(self): """Elsevier article-info/abstract header is detected.""" table = ExtractedTable( page_num=1, table_index=0, bbox=(0, 0, 500, 300), headers=["a r t i c l e", "i n f o", "a b s t r a c t"], rows=[ ["Article history:", "Received 1 March 2021", "This paper presents..."], ["Keywords:", "Active inference", ""], ], caption="", ) assert _classify_artifact(table) is not None assert _classify_artifact(table) == "article_info_box" def test_elsevier_uppercase_variant(self): """Uppercase ARTICLE INFO variant is also detected.""" table = ExtractedTable( page_num=1, table_index=0, bbox=(0, 0, 500, 300), headers=["A R T I C L E", 
"I N F O", "A B S T R A C T"], rows=[["Article history:", "Received 2020", "We present..."]], caption="", ) assert _classify_artifact(table) is not None assert _classify_artifact(table) == "article_info_box" def test_table_of_contents(self): """TOC with section numbers and page numbers is detected.""" table = ExtractedTable( page_num=1, table_index=0, bbox=(0, 0, 500, 600), headers=["1 Introduction 904"], rows=[ ["2 Review of methods 907"], ["3 The empirical mode decomposition 912"], ["4 Hilbert spectral analysis 935"], ["5 Discussion 987"], ["6 Conclusions 991"], ], caption="", ) assert _classify_artifact(table) is not None assert _classify_artifact(table) == "table_of_contents" def test_toc_packed_single_cell(self): """TOC entries packed into one cell (after newline collapse).""" table = ExtractedTable( page_num=1, table_index=0, bbox=(0, 0, 500, 600), headers=[ "page 1 Introduction 904 . 2 Review of non-stationary data " "processing methods 907 . 3 The empirical mode decomposition 912 " "4 Hilbert spectral analysis 935", ], rows=[], caption="", ) assert _classify_artifact(table) == "table_of_contents" def test_toc_multicolumn(self): """TOC split across 3 columns: number | title | page.""" table = ExtractedTable( page_num=2, table_index=1, bbox=(0, 0, 500, 200), headers=[".10", "Discussion", "987"], rows=[ [".11", "Conclusions", "991"], ["", "References", "993"], ], caption="", ) assert _classify_artifact(table) == "table_of_contents" def test_block_diagram_as_table(self): """Sparse uncaptioned table with Figure N reference is detected.""" table = ExtractedTable( page_num=5, table_index=0, bbox=(0, 0, 500, 400), headers=["Interface to Human Body", "Analog", "Circuit", "Output"], rows=[ ["Electrode", "", "", ""], ["", "Amplifier", "", ""], ["Figure 3 Block diagram outlining the system", "", "", ""], ["", "", "ADC", ""], ["", "", "", "DAC"], ["", "", "", ""], ], caption="", # no caption -> uncaptioned ) assert _classify_artifact(table) is not None assert 
_classify_artifact(table) == "diagram_as_table" def test_real_data_table_not_filtered(self): """A normal data table with a caption is NOT an artifact.""" table = ExtractedTable( page_num=3, table_index=1, bbox=(0, 0, 500, 200), headers=["Parameter", "Value", "Unit"], rows=[ ["Heart rate", "72", "bpm"], ["Systolic BP", "120", "mmHg"], ["Diastolic BP", "80", "mmHg"], ], caption="Table 1. Patient demographics.", ) assert _classify_artifact(table) is None assert _classify_artifact(table) is None def test_sparse_table_with_caption_not_filtered(self): """A sparse table WITH a caption is NOT an artifact.""" table = ExtractedTable( page_num=10, table_index=3, bbox=(0, 0, 500, 200), headers=["Filter", "Pole 1", "Pole 2", "Pole 3"], rows=[ ["Comb", "0.914±0.119i", "", ""], ["Highpass", "", "0.707", ""], ], caption="Table 3. Poles of comb filters.", ) assert _classify_artifact(table) is None def test_abbreviation_glossary_not_filtered(self): """Abbreviation glossary is well-formed — kept (useful reference).""" table = ExtractedTable( page_num=22, table_index=8, bbox=(0, 0, 300, 400), headers=["ADC", "Analog-to-digital conversion"], rows=[ ["BLE", "Bluetooth Low Energy"], ["DAC", "Digital-to-analog converter"], ["EMG", "Electromyography"], ], caption="", ) # 100% fill, no figure ref, no TOC pattern → NOT an artifact assert _classify_artifact(table) is None def test_header_with_plain_abstract_not_filtered(self): """A real table whose header contains 'Abstract' is NOT an artifact.""" table = ExtractedTable( page_num=4, table_index=2, bbox=(0, 0, 500, 200), headers=["Study", "Abstract Concepts", "Concrete Concepts"], rows=[ ["Smith 2020", "4.2", "3.8"], ["Jones 2021", "5.1", "4.9"], ], caption="", ) assert _classify_artifact(table) is None assert _classify_artifact(table) is None def test_captioned_table_with_article_info_header_not_filtered(self): """A captioned table with 'article' in header is NOT an artifact.""" table = ExtractedTable( page_num=2, table_index=1, bbox=(0, 0, 
500, 200), headers=["Article", "Year", "Citations"], rows=[ ["Smith et al.", "2020", "42"], ], caption="Table 1. Summary of articles reviewed.", ) assert _classify_artifact(table) is None # ---- SYNTHETIC_CAPTION_PREFIX ---- class TestSyntheticCaptionPrefix: def test_prefix_value(self): """Ensure the prefix is stable for downstream checks.""" assert SYNTHETIC_CAPTION_PREFIX == "Uncaptioned " class TestParseProseRows: def test_definition_list_parsed(self): content = "ACC: Accuracy\nAUC: Area Under Curve\nPPV: Positive Predictive Value" rows = _parse_prose_rows(content) assert len(rows) == 3 assert rows[0] == ["ACC", "Accuracy"] assert rows[1] == ["AUC", "Area Under Curve"] def test_em_dash_delimiter(self): content = "Term1 \u2014 Definition1\nTerm2 \u2014 Definition2" rows = _parse_prose_rows(content) assert len(rows) == 2 assert rows[0] == ["Term1", "Definition1"] def test_plain_paragraph_single_cell(self): content = "This is a regular paragraph with no definition structure at all." rows = _parse_prose_rows(content) assert len(rows) == 1 assert rows[0] == [content] def test_single_line_single_cell(self): content = "Just one line" rows = _parse_prose_rows(content) assert len(rows) == 1 assert rows[0] == [content] def test_mixed_content_handled(self): content = "Section header\nTerm1: Def1\nTerm2: Def2\nTerm3: Def3" rows = _parse_prose_rows(content) # 3/4 lines have colons = 75% > 40% threshold assert len(rows) == 4 # First line has no delimiter, stays as single-cell assert rows[0] == ["Section header"] assert rows[1] == ["Term1", "Def1"] def test_below_threshold_stays_single_cell(self): content = "Line one\nLine two\nLine three\nTerm: Definition" rows = _parse_prose_rows(content) # Only 1/4 = 25% have delimiters, below 40% threshold assert len(rows) == 1 assert rows[0] == [content] # ---- _looks_numeric ---- class TestLooksNumeric: def test_plain_integer(self): assert _looks_numeric("42") is True def test_decimal(self): assert _looks_numeric("3.14") is True def 
test_leading_dot(self): assert _looks_numeric(".4198") is True def test_negative(self): assert _looks_numeric("-3.14") is True def test_with_stat_marker(self): assert _looks_numeric("0.4198*") is True def test_with_dagger(self): assert _looks_numeric("0.512\u2020") is True def test_multi_value(self): assert _looks_numeric(".9931 .1789") is True def test_text_not_numeric(self): assert _looks_numeric("hello") is False def test_dotfile_not_numeric(self): assert _looks_numeric(".gitignore") is False def test_empty(self): assert _looks_numeric("") is False class TestAdaptiveRowTolerance: def test_12pt_font(self): """12pt font (~12pt height) → tolerance ~3.6pt.""" # Simulate words with ~12pt height: y0=0, y1=12 words = [(0, 0, 50, 12, "word")] * 10 tol = _adaptive_row_tolerance(words) assert 3.0 <= tol <= 4.5 def test_24pt_font(self): """24pt font → tolerance ~7.2pt.""" words = [(0, 0, 50, 24, "word")] * 10 tol = _adaptive_row_tolerance(words) assert 6.0 <= tol <= 9.0 def test_8pt_font(self): """8pt font → tolerance ~2.4pt.""" words = [(0, 0, 50, 8, "word")] * 10 tol = _adaptive_row_tolerance(words) assert 1.5 <= tol <= 3.5 def test_empty_words_fallback(self): """Empty word list returns fallback derived from assumed 12pt height.""" assert _adaptive_row_tolerance([]) == pytest.approx(12.0 * 0.3) # ---- _find_column_gap_threshold ---- class TestFindColumnGapThreshold: def test_bimodal_gaps_separated(self): """Bimodal distribution: intra-word (1-3pt) vs inter-column (20-30pt).""" gaps = [1, 1.5, 2, 2.5, 3, 20, 22, 25, 28, 30] threshold = _find_column_gap_threshold(gaps) # Should separate 3pt cluster from 20pt cluster assert 3 < threshold < 20 def test_uniform_gaps_fallthrough(self): """Uniform distribution with no clear break.""" gaps = [5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5] threshold = _find_column_gap_threshold(gaps) # Should return a value derived from the data assert threshold > 0 def test_empty_returns_inf(self): """Empty gap list returns infinity (no column splits 
possible).""" assert _find_column_gap_threshold([]) == float("inf") def test_single_gap(self): """Single gap value doesn't crash.""" threshold = _find_column_gap_threshold([10.0]) assert threshold > 0

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_content_quality.py•18.6 KiB