Zotero Chunk RAG

test_section_quality.py•6.58 KiB

"""Section detection quality tests against real papers.""" # Only sections whose headings contain a standard keyword are tested. # Non-standard headings (e.g., "Modeling Diseases") become "unknown" — that is correct. NONAME1_KEYWORD_SECTIONS = [ ("Introduction", "introduction"), ("Summary and Outlook", "conclusion"), ("References", "references"), ] NONAME2_KEYWORD_SECTIONS = [ ("Introduction", "introduction"), ("Numerical methods", "methods"), ("Results", "results"), ("Discussion", "discussion"), ("Conclusions", "conclusion"), ("References", "references"), ] NONAME3_KEYWORD_SECTIONS = [ ("METHODS", "methods"), ("RESULTS", "results"), ("DISCUSSION", "discussion"), ("ACKNOWLEDGMENTS", "appendix"), ("GRANTS", "appendix"), ("REFERENCES", "references"), ] def _find_section(sections, heading_substring): """Find the first section whose heading_text contains the substring (case-insensitive).""" for s in sections: if heading_substring.lower() in s.heading_text.lower(): return s return None def test_noname1_keyword_sections(extracted_papers): ex = extracted_papers["noname1.pdf"] for heading_sub, expected_label in NONAME1_KEYWORD_SECTIONS: s = _find_section(ex.sections, heading_sub) assert s is not None, ( f"Section with heading containing {heading_sub!r} not found. " f"Sections: {[(s.heading_text[:40], s.label) for s in ex.sections]}" ) assert s.label == expected_label, ( f"Section {heading_sub!r} labelled {s.label!r}, expected {expected_label!r}" ) def test_noname2_keyword_sections(extracted_papers): ex = extracted_papers["noname2.pdf"] for heading_sub, expected_label in NONAME2_KEYWORD_SECTIONS: s = _find_section(ex.sections, heading_sub) assert s is not None, ( f"Section with heading containing {heading_sub!r} not found. 
" f"Sections: {[(s.heading_text[:40], s.label) for s in ex.sections]}" ) assert s.label == expected_label, ( f"Section {heading_sub!r} labelled {s.label!r}, expected {expected_label!r}" ) def test_noname3_keyword_sections(extracted_papers): ex = extracted_papers["noname3.pdf"] for heading_sub, expected_label in NONAME3_KEYWORD_SECTIONS: s = _find_section(ex.sections, heading_sub) assert s is not None, ( f"Section with heading containing {heading_sub!r} not found. " f"Sections: {[(s.heading_text[:40], s.label) for s in ex.sections]}" ) assert s.label == expected_label, ( f"Section {heading_sub!r} labelled {s.label!r}, expected {expected_label!r}" ) def test_noname1_non_keyword_sections_are_unknown(extracted_papers): """Headings without standard keywords must be labelled 'unknown', not force-fitted.""" ex = extracted_papers["noname1.pdf"] for s in ex.sections: if s.label not in ("preamble", "unknown", "introduction", "conclusion", "references", "abstract"): # This section was classified by keyword. Verify the keyword actually exists. 
heading_lower = s.heading_text.lower() from zotero_chunk_rag.section_classifier import categorize_heading cat, _ = categorize_heading(heading_lower) assert cat is not None, ( f"Section {s.heading_text[:40]!r} labelled {s.label!r} " f"but heading has no matching keyword — should be 'unknown'" ) def test_noname3_page_ids_filtered(extracted_papers): """Page identifiers like R1356, R1360, R1368 must not appear as section headings.""" ex = extracted_papers["noname3.pdf"] for s in ex.sections: cleaned = s.heading_text.strip().strip("#*_ ") assert not cleaned.startswith("R1"), f"Page identifier in sections: {s.heading_text!r}" def test_all_papers_sections_cover_full_text(extracted_papers): """Sections must cover the entire document with no gaps.""" for pdf_name in ["noname1.pdf", "noname2.pdf", "noname3.pdf"]: ex = extracted_papers[pdf_name] assert ex.sections, f"{pdf_name}: no sections detected" assert ex.sections[0].char_start == 0, ( f"{pdf_name}: first section starts at {ex.sections[0].char_start}, not 0" ) assert ex.sections[-1].char_end == len(ex.full_markdown), ( f"{pdf_name}: last section ends at {ex.sections[-1].char_end}, " f"not {len(ex.full_markdown)}" ) for i in range(len(ex.sections) - 1): assert ex.sections[i].char_end == ex.sections[i + 1].char_start, ( f"{pdf_name}: gap between sections {i} and {i+1}" ) def test_abstract_detection_does_not_crash(extracted_papers): """Three-tier abstract detection runs without error on all fixtures. 
If a paper has an abstract (keyword or font-detected), it must appear exactly once in sections.""" for pdf_name in ["noname1.pdf", "noname2.pdf", "noname3.pdf"]: ex = extracted_papers[pdf_name] abstract_sections = [s for s in ex.sections if s.label == "abstract"] # Must be 0 or 1 — never multiple assert len(abstract_sections) <= 1, ( f"{pdf_name}: {len(abstract_sections)} abstract sections — " f"should be at most 1" ) def test_abstract_detected_via_toc(extracted_papers): """If a paper's TOC already labels 'abstract', Tier 2 should recognise it and _detect_abstract should return None (not duplicate).""" # All three noname papers should have abstract detected by some tier for pdf_name in ["noname1.pdf", "noname2.pdf", "noname3.pdf"]: ex = extracted_papers[pdf_name] abstract_sections = [s for s in ex.sections if s.label == "abstract"] # At most 1 abstract section (no duplicates) assert len(abstract_sections) <= 1, ( f"{pdf_name}: {len(abstract_sections)} abstract sections detected — " f"should be at most 1" ) def test_noname1_no_methods_overcount(extracted_papers): """noname1 is a review paper. 'methods' should only appear for headings that actually contain 'method' in the text, not for every body section.""" ex = extracted_papers["noname1.pdf"] methods_sections = [s for s in ex.sections if s.label == "methods"] # noname1 has no headings containing "method" — 0 is expected assert len(methods_sections) <= 2, ( f"noname1 has {len(methods_sections)} methods sections — " f"position heuristics are over-applying. " f"Headings: {[s.heading_text[:40] for s in methods_sections]}" )

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_section_quality.py•6.58 KiB