Zotero Chunk RAG

Overview Schema Related Servers Score Discussions

zotero-chunk-mcp
src
zotero_chunk_rag

journal_ranker.py•8.71 KiB

"""
Journal quality ranking via SCImago quartile lookup.

Manual overrides are consulted first. After that a 3-tier matching
strategy is applied:
1. Exact match on normalized journal name
2. Abbreviation expansion then exact match
3. Fuzzy matching with rapidfuzz (score >= 90)
"""

import csv
import logging
import re
from pathlib import Path

from rapidfuzz import fuzz, process

logger = logging.getLogger(__name__)

# Minimum rapidfuzz score accepted by the tier 3 fuzzy match.
# Kept at 90 to reduce false positive matches.
FUZZY_SCORE_CUTOFF = 90

# Common journal abbreviations for tier 2 expansion.
# Iteration order matters: expansions are applied in dict order.
ABBREVIATIONS: dict[str, list[str]] = {
    "trans.": ["transactions"],
    "biomed.": ["biomedical"],
    "eng.": ["engineering"],
    "j.": ["journal"],
    "proc.": ["proceedings"],
    "int.": ["international"],
    "sci.": ["science", "sciences"],
    "rev.": ["review", "reviews"],
    "lett.": ["letters"],
    "comput.": ["computer", "computing", "computational"],
    "med.": ["medicine", "medical"],
    "phys.": ["physics", "physical"],
    "chem.": ["chemistry", "chemical"],
    "appl.": ["applied"],
    "res.": ["research"],
    "biol.": ["biology", "biological"],
    "conf.": ["conference"],
    "symp.": ["symposium"],
    "ann.": ["annual", "annals"],
    "eur.": ["european"],
    "am.": ["american"],
    "nat.": ["national", "nature", "natural"],
    "tech.": ["technology", "technical"],
    "syst.": ["systems"],
    "commun.": ["communications"],
    "electr.": ["electrical", "electronic", "electronics"],
    "rehabil.": ["rehabilitation"],
    "neurosci.": ["neuroscience"],
    "cardiovasc.": ["cardiovascular"],
    "physiol.": ["physiology", "physiological"],
}


def _normalize_title(title: str) -> str:
    """
    Normalize a journal title for lookup.

    - Lowercase
    - Replace punctuation (& : - /) with spaces
    - Collapse multiple spaces
    - Strip

    Other punctuation (periods, commas, ...) is left untouched.
    """
    title = title.lower()
    title = re.sub(r"[&:\-/]", " ", title)
    title = re.sub(r"\s+", " ", title)
    return title.strip()


def _expand_abbreviations(title: str) -> list[str]:
    """
    Generate all expansions of an abbreviated journal name.

    Each entry of ABBREVIATIONS is applied in turn to every candidate built
    so far. A candidate containing the abbreviation is replaced by one new
    candidate per full form; a candidate without it is carried over
    unchanged. The result is therefore the cross product of the alternatives.

    Args:
        title: Raw (un-normalized) journal title.

    Returns:
        List of normalized candidate titles. When no abbreviation occurs in
        the title this is just the normalized title itself.
    """
    expansions = [title.lower()]

    for abbrev, full_forms in ABBREVIATIONS.items():
        new_expansions = []
        for exp in expansions:
            # Plain substring test: this also fires inside longer tokens
            # (e.g. "biochem." -> "biochemistry" via "chem.").
            if abbrev in exp:
                # Only the first occurrence of the abbreviation is replaced.
                new_expansions.extend(
                    exp.replace(abbrev, full, 1) for full in full_forms
                )
            else:
                new_expansions.append(exp)
        expansions = new_expansions

    # Normalize all expansions
    return [_normalize_title(e) for e in expansions]


class JournalRanker:
    """
    SCImago-based journal quartile lookup.

    Loads a pre-processed CSV with normalized journal titles and their
    best quartile (Q1/Q2/Q3/Q4), plus an optional file of manual overrides
    that take precedence over the CSV.
    """

    def __init__(
        self,
        csv_path: Path | None = None,
        overrides_path: Path | None = None,
    ):
        """
        Load the lookup table.

        A missing SCImago CSV is not an error: a warning is logged and the
        ranker starts with an empty table. A missing overrides file is
        silently ignored.

        Args:
            csv_path: Path to scimago_quartiles.csv. If None, uses the
                bundled data file in the package data directory.
            overrides_path: Path to journal_overrides.csv. If None, uses the
                bundled overrides file in the package data directory.
        """
        data_dir = Path(__file__).parent / "data"

        if csv_path is None:
            csv_path = data_dir / "scimago_quartiles.csv"
        else:
            # Accept plain strings as well as Path objects.
            csv_path = Path(csv_path)

        self._lookup: dict[str, str] = {}
        self._all_titles: list[str] = []  # For fuzzy matching
        self._cache: dict[str, str | None] = {}  # Query cache
        self._overrides: dict[str, str] = {}  # Manual override mappings
        self._csv_path: Path | None = csv_path
        self._csv_mtime: float | None = None

        if csv_path.exists():
            self._load_csv(csv_path)
            self._csv_mtime = csv_path.stat().st_mtime
        else:
            logger.warning(
                "SCImago CSV not found at %s. "
                "Journal quartile ranking will be disabled. "
                "Run scripts/prepare_scimago.py to generate the file.",
                csv_path,
            )

        # Load overrides (takes precedence over SCImago lookups)
        if overrides_path is None:
            overrides_path = data_dir / "journal_overrides.csv"
        else:
            overrides_path = Path(overrides_path)
        if overrides_path.exists():
            self._load_overrides(overrides_path)

    def _load_csv(self, csv_path: Path) -> None:
        """
        Load the lookup table from CSV.

        Reads the "title_normalized" and "quartile" columns. Rows where
        either value is missing or blank are skipped. When a title occurs
        more than once the last row wins.
        """
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                # DictReader fills short rows with None, so guard before
                # calling .strip().
                title = (row.get("title_normalized") or "").strip()
                quartile = (row.get("quartile") or "").strip()
                if not title or not quartile:
                    continue
                # Keep the fuzzy-match candidate list free of duplicates.
                if title not in self._lookup:
                    self._all_titles.append(title)
                self._lookup[title] = quartile

    def _load_overrides(self, path: Path) -> None:
        """Load manual override mappings from CSV.

        Overrides take precedence over SCImago lookups. Use this to correct
        fuzzy matching mistakes or add journals not in the SCImago database.

        File format: input_title,correct_quartile (comments start with #)

        The line is split on its LAST comma, so titles may themselves
        contain commas. Lines without a comma, or with an empty title or
        quartile, are ignored. Titles are normalized before being stored.
        """
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                title, sep, quartile = line.rpartition(",")
                if not sep:
                    continue
                normalized = _normalize_title(title.strip())
                quartile = quartile.strip()
                if normalized and quartile:
                    self._overrides[normalized] = quartile

    def lookup(self, publication: str) -> str | None:
        """
        Look up journal quartile.

        Results (including misses) are memoized per raw publication string.

        Args:
            publication: Journal/publication name from Zotero

        Returns:
            The stored quartile (e.g. 'Q1', 'Q2', 'Q3', 'Q4'), or None if
            the publication is empty or not found
        """
        if not publication:
            return None

        # Check cache first
        if publication in self._cache:
            return self._cache[publication]

        result = self._lookup_uncached(publication)
        self._cache[publication] = result
        return result

    def _lookup_uncached(self, publication: str) -> str | None:
        """Perform the actual lookup (without caching)."""
        normalized = _normalize_title(publication)

        # Tier 0: Check manual overrides first (highest priority)
        if normalized in self._overrides:
            return self._overrides[normalized]

        # Tier 1: Exact match on normalized title
        if normalized in self._lookup:
            return self._lookup[normalized]

        # Tier 2: Expand abbreviations and try exact match
        for expanded in _expand_abbreviations(publication):
            if expanded in self._overrides:
                return self._overrides[expanded]
            if expanded in self._lookup:
                return self._lookup[expanded]

        # Tier 3: Fuzzy match against every loaded title. extractOne
        # returns None when nothing reaches the cutoff.
        if self._all_titles:
            match = process.extractOne(
                normalized,
                self._all_titles,
                scorer=fuzz.ratio,
                score_cutoff=FUZZY_SCORE_CUTOFF,
            )
            if match:
                # match is (choice, score, index); only the choice is needed.
                return self._lookup[match[0]]

        return None

    @property
    def loaded(self) -> bool:
        """Check if lookup table is loaded."""
        return len(self._lookup) > 0

    def is_stale(self) -> bool:
        """
        Check if CSV has been modified since loading.

        Returns False when no CSV path is set or the file does not exist.
        """
        if self._csv_path is None or not self._csv_path.exists():
            return False
        current_mtime = self._csv_path.stat().st_mtime
        return current_mtime != self._csv_mtime

    def reload_if_stale(self) -> bool:
        """Reload CSV if modified since loading. Returns True if reloaded.

        The lookup table, fuzzy-match title list and query cache are
        rebuilt. Manual overrides are left as loaded at construction time.

        Note: This is not called automatically. Callers should invoke this
        at appropriate points (e.g., before a batch indexing run) if they
        want hot-reload behavior. For the MCP server, reloading would
        require restarting the server process.
        """
        if not self.is_stale():
            return False

        self._lookup.clear()
        self._all_titles.clear()
        self._cache.clear()
        self._load_csv(self._csv_path)
        self._csv_mtime = self._csv_path.stat().st_mtime
        logger.info("Reloaded SCImago data: %d journals", len(self._lookup))
        return True

    def stats(self) -> dict:
        """
        Return statistics about the loaded data.

        Returns:
            Dict with "total_journals" (number of loaded titles) and
            "quartile_counts" (count per Q1..Q4; any other quartile value
            is counted in the total but not in the breakdown).
        """
        quartile_counts = {"Q1": 0, "Q2": 0, "Q3": 0, "Q4": 0}
        for q in self._lookup.values():
            if q in quartile_counts:
                quartile_counts[q] += 1
        return {
            "total_journals": len(self._lookup),
            "quartile_counts": quartile_counts,
        }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

journal_ranker.py•8.71 KiB