Zotero Chunk RAG

test_journal_ranker.py•17.4 KiB

"""Tests for journal quality ranking. These tests verify: 1. Title normalization and abbreviation expansion 2. Journal lookup with exact, abbreviation, and fuzzy matching 3. Bundled CSV loading (Feature 4) 4. Override support (Feature 7) 5. Fuzzy threshold at 90% (Feature 7) Tests are designed to FAIL LOUDLY if: - Bundled CSV is missing or malformed - Overrides file has syntax errors - Fuzzy matching produces false positives """ from __future__ import annotations import csv import tempfile from pathlib import Path import pytest from zotero_chunk_rag.journal_ranker import ( JournalRanker, _expand_abbreviations, _normalize_title, ) # ============================================================================= # Test Title Normalization # ============================================================================= class TestNormalizeTitle: """Test title normalization.""" def test_lowercase(self): assert _normalize_title("Nature") == "nature" assert _normalize_title("SCIENCE") == "science" def test_punctuation_to_spaces(self): assert _normalize_title("A & B") == "a b" assert _normalize_title("A: B") == "a b" assert _normalize_title("A-B") == "a b" assert _normalize_title("A/B") == "a b" def test_collapse_spaces(self): assert _normalize_title("A B") == "a b" assert _normalize_title("A & B") == "a b" def test_strip(self): assert _normalize_title(" Nature ") == "nature" # ============================================================================= # Test Abbreviation Expansion # ============================================================================= class TestExpandAbbreviations: """Test abbreviation expansion.""" def test_single_abbreviation(self): expansions = _expand_abbreviations("J. Medicine") assert "journal medicine" in expansions def test_multiple_abbreviations(self): expansions = _expand_abbreviations("IEEE Trans. Biomed. Eng.") # Should contain full expansion assert any( "transactions" in e and "biomedical" in e and "engineering" in e for e in expansions ) def test_no_abbreviations(self): expansions = _expand_abbreviations("Nature") assert "nature" in expansions # ============================================================================= # Test JournalRanker with Test CSV # ============================================================================= class TestJournalRanker: """Test the journal ranker with a test CSV.""" @pytest.fixture def test_csv(self, tmp_path: Path) -> Path: """Create a test CSV file.""" csv_path = tmp_path / "test_scimago.csv" with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["title_normalized", "quartile"]) writer.writerow(["nature", "Q1"]) writer.writerow(["science", "Q1"]) writer.writerow(["plos one", "Q1"]) # Note: abbreviation expander produces "ieee transactions biomedical engineering" # (without "on"), so we match that form writer.writerow(["ieee transactions biomedical engineering", "Q1"]) writer.writerow(["journal of physiology", "Q2"]) writer.writerow(["medical hypotheses", "Q4"]) return csv_path def test_exact_match(self, test_csv: Path): ranker = JournalRanker(test_csv) assert ranker.lookup("Nature") == "Q1" assert ranker.lookup("NATURE") == "Q1" assert ranker.lookup("science") == "Q1" def test_abbreviation_expansion(self, test_csv: Path): """Test that abbreviations are expanded and matched.""" ranker = JournalRanker(test_csv) # "IEEE Trans. Biomed. Eng." expands to "ieee transactions biomedical engineering" result = ranker.lookup("IEEE Trans. Biomed. Eng.") assert result == "Q1" def test_fuzzy_match_requires_90_percent(self, test_csv: Path): """Fuzzy matching should require 90% similarity (Feature 7).""" ranker = JournalRanker(test_csv) # "Journal of Physiology" is exact match result = ranker.lookup("Journal of Physiology") assert result == "Q2" def test_fuzzy_match_rejects_low_similarity(self, test_csv: Path): """Fuzzy matching should reject matches below 90% threshold.""" ranker = JournalRanker(test_csv) # This is too different to match anything at 90% result = ranker.lookup("Journal of Something Completely Different") assert result is None, "Low similarity should not match at 90% threshold" def test_no_match(self, test_csv: Path): ranker = JournalRanker(test_csv) assert ranker.lookup("Unknown Journal of Stuff") is None def test_empty_publication(self, test_csv: Path): ranker = JournalRanker(test_csv) assert ranker.lookup("") is None assert ranker.lookup(None) is None def test_caching(self, test_csv: Path): ranker = JournalRanker(test_csv) # First call result1 = ranker.lookup("Nature") # Second call should be cached result2 = ranker.lookup("Nature") assert result1 == result2 == "Q1" assert "Nature" in ranker._cache def test_loaded_property(self, test_csv: Path): ranker = JournalRanker(test_csv) assert ranker.loaded is True def test_stats(self, test_csv: Path): ranker = JournalRanker(test_csv) stats = ranker.stats() assert stats["total_journals"] == 6 assert stats["quartile_counts"]["Q1"] == 4 assert stats["quartile_counts"]["Q2"] == 1 assert stats["quartile_counts"]["Q4"] == 1 def test_missing_csv(self, tmp_path: Path): """Test graceful handling of missing CSV.""" ranker = JournalRanker(tmp_path / "nonexistent.csv") assert ranker.loaded is False assert ranker.lookup("Nature") is None # ============================================================================= # Test Override Support (Feature 7) # ============================================================================= class TestJournalOverrides: """Test manual override functionality.""" @pytest.fixture def test_csv(self, tmp_path: Path) -> Path: """Create a test CSV file.""" csv_path = tmp_path / "test_scimago.csv" with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["title_normalized", "quartile"]) writer.writerow(["nature", "Q1"]) writer.writerow(["wrong journal match", "Q4"]) # Will be overridden return csv_path @pytest.fixture def test_overrides(self, tmp_path: Path) -> Path: """Create a test overrides file.""" overrides_path = tmp_path / "overrides.csv" with open(overrides_path, "w", encoding="utf-8") as f: f.write("# This is a comment\n") f.write("Correct Journal Name,Q1\n") f.write("# Another comment\n") f.write("wrong journal match,Q2\n") # Override the Q4 from main CSV return overrides_path def test_override_takes_precedence(self, test_csv: Path, test_overrides: Path): """Overrides should take precedence over SCImago data.""" ranker = JournalRanker(test_csv, overrides_path=test_overrides) # "wrong journal match" is Q4 in CSV but Q2 in overrides result = ranker.lookup("wrong journal match") assert result == "Q2", "Override should take precedence over SCImago" def test_override_exact_match(self, test_csv: Path, test_overrides: Path): """Overrides should work with exact matching.""" ranker = JournalRanker(test_csv, overrides_path=test_overrides) result = ranker.lookup("Correct Journal Name") assert result == "Q1" def test_override_with_abbreviation_expansion( self, test_csv: Path, tmp_path: Path ): """Overrides should work after abbreviation expansion.""" overrides_path = tmp_path / "overrides_abbrev.csv" with open(overrides_path, "w", encoding="utf-8") as f: # Note: "J. Biomed. Eng." expands to "journal biomedical engineering" # (without "of"), so override must match the expanded form f.write("journal biomedical engineering,Q1\n") ranker = JournalRanker(test_csv, overrides_path=overrides_path) # "J. Biomed. Eng." should expand and match override result = ranker.lookup("J. Biomed. Eng.") assert result == "Q1" def test_empty_overrides_file(self, test_csv: Path, tmp_path: Path): """Empty overrides file should not cause errors.""" overrides_path = tmp_path / "empty_overrides.csv" overrides_path.write_text("# Just comments\n# No actual overrides\n") ranker = JournalRanker(test_csv, overrides_path=overrides_path) assert ranker.loaded is True assert ranker.lookup("Nature") == "Q1" def test_missing_overrides_file(self, test_csv: Path, tmp_path: Path): """Missing overrides file should not cause errors.""" ranker = JournalRanker( test_csv, overrides_path=tmp_path / "nonexistent.csv" ) assert ranker.loaded is True assert ranker.lookup("Nature") == "Q1" # ============================================================================= # Test Bundled CSV (Feature 4) # ============================================================================= class TestBundledCSV: """Test that bundled CSV is accessible and valid. THESE TESTS FAIL LOUDLY if the bundled CSV is missing or malformed. """ def test_bundled_csv_exists(self): """Bundled CSV must exist in package data directory.""" import zotero_chunk_rag csv_path = ( Path(zotero_chunk_rag.__file__).parent / "data" / "scimago_quartiles.csv" ) assert csv_path.exists(), ( f"CRITICAL: Bundled SCImago CSV not found at {csv_path}. " "Feature 4 requires this file to be present. " "Run: python scripts/prepare_scimago.py --input <scimago_download.csv> " f"--output {csv_path}" ) def test_bundled_csv_is_valid(self): """Bundled CSV must have correct format and data.""" import zotero_chunk_rag csv_path = ( Path(zotero_chunk_rag.__file__).parent / "data" / "scimago_quartiles.csv" ) if not csv_path.exists(): pytest.fail( f"Bundled CSV not found at {csv_path}. " "Cannot verify format." ) with open(csv_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) # Verify header columns assert reader.fieldnames is not None, "CSV has no header" assert "title_normalized" in reader.fieldnames, ( "CSV missing 'title_normalized' column" ) assert "quartile" in reader.fieldnames, ( "CSV missing 'quartile' column" ) # Verify data row_count = 0 quartile_counts = {"Q1": 0, "Q2": 0, "Q3": 0, "Q4": 0} for row in reader: row_count += 1 title = row.get("title_normalized", "").strip() quartile = row.get("quartile", "").strip() assert title, f"Row {row_count} has empty title" assert quartile in ("Q1", "Q2", "Q3", "Q4"), ( f"Row {row_count} has invalid quartile: {quartile}" ) quartile_counts[quartile] += 1 assert row_count >= 10, ( f"Bundled CSV only has {row_count} journals. " "Expected at least 10 for minimal functionality." ) def test_bundled_csv_loads_automatically(self): """JournalRanker should load bundled data automatically.""" ranker = JournalRanker() # No path specified assert ranker.loaded, ( "JournalRanker should load bundled CSV automatically. " "Ensure data/scimago_quartiles.csv exists." ) def test_bundled_csv_has_common_journals(self): """Bundled CSV should include common high-impact journals.""" ranker = JournalRanker() assert ranker.loaded, ( "CRITICAL: JournalRanker failed to load bundled CSV. " "Ensure data/scimago_quartiles.csv exists in package." ) # These journals should definitely be in the bundled data common_journals = [ ("Nature", "Q1"), ("Science", "Q1"), ("PLOS ONE", "Q1"), ] for journal, expected_quartile in common_journals: result = ranker.lookup(journal) assert result == expected_quartile, ( f"Expected {journal} to be {expected_quartile}, got {result}. " "Bundled CSV may be incomplete." ) def test_bundled_overrides_exists(self): """Bundled overrides CSV must exist (can be empty).""" import zotero_chunk_rag overrides_path = ( Path(zotero_chunk_rag.__file__).parent / "data" / "journal_overrides.csv" ) assert overrides_path.exists(), ( f"CRITICAL: Bundled overrides CSV not found at {overrides_path}. " "Feature 7 requires this file to be present (can be empty with comments)." ) # ============================================================================= # Test Fuzzy Match Threshold (Feature 7) # ============================================================================= class TestFuzzyThreshold: """Test that fuzzy matching uses 90% threshold. THESE TESTS FAIL LOUDLY if false positives occur. """ @pytest.fixture def ranker_with_known_journals(self, tmp_path: Path) -> JournalRanker: """Create a ranker with specific journals for threshold testing.""" csv_path = tmp_path / "threshold_test.csv" with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["title_normalized", "quartile"]) writer.writerow(["journal of cardiology", "Q1"]) writer.writerow(["european journal of cardiology", "Q2"]) return JournalRanker(csv_path) def test_exact_match_works(self, ranker_with_known_journals: JournalRanker): """Exact matches should work.""" assert ranker_with_known_journals.lookup("Journal of Cardiology") == "Q1" def test_near_exact_match_works(self, ranker_with_known_journals: JournalRanker): """Very similar names should match.""" # "journal of cardiology" vs "Journal of Cardiology" - should match assert ranker_with_known_journals.lookup("Journal Of Cardiology") == "Q1" def test_false_positive_prevention( self, ranker_with_known_journals: JournalRanker ): """Different journals should NOT match due to 90% threshold. At 85% threshold, "Journal of Cardiology" might match "Journal of Neurology" (they share "Journal of ____ology"). At 90%, this should be rejected. """ # This is deliberately different and should NOT match result = ranker_with_known_journals.lookup("Journal of Neurology") assert result is None, ( "FALSE POSITIVE: 'Journal of Neurology' incorrectly matched. " "Fuzzy threshold may be too low." ) def test_very_different_no_match( self, ranker_with_known_journals: JournalRanker ): """Very different names should definitely not match.""" result = ranker_with_known_journals.lookup("Proceedings of the ACM") assert result is None # ============================================================================= # Test Hot Reload (existing feature) # ============================================================================= class TestHotReload: """Test hot reload functionality.""" def test_is_stale_after_modification(self, tmp_path: Path): """is_stale should detect when CSV has been modified.""" csv_path = tmp_path / "test.csv" with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["title_normalized", "quartile"]) writer.writerow(["nature", "Q1"]) ranker = JournalRanker(csv_path) assert not ranker.is_stale() # Modify the file import time time.sleep(0.1) # Ensure mtime changes with open(csv_path, "a", encoding="utf-8") as f: f.write("science,Q1\n") assert ranker.is_stale() def test_reload_if_stale(self, tmp_path: Path): """reload_if_stale should reload modified CSV.""" csv_path = tmp_path / "test.csv" with open(csv_path, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["title_normalized", "quartile"]) writer.writerow(["nature", "Q1"]) ranker = JournalRanker(csv_path) assert ranker.lookup("Science") is None # Modify the file to add Science import time time.sleep(0.1) with open(csv_path, "a", encoding="utf-8") as f: f.write("science,Q1\n") # Reload reloaded = ranker.reload_if_stale() assert reloaded is True # Now Science should be found assert ranker.lookup("Science") == "Q1"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_journal_ranker.py•17.4 KiB