Zotero Chunk RAG

split_notes_to_footnotes.py•16.9 KiB

#!/usr/bin/env python """Split the 'notes' field in GT JSON files into 'notes' (reviewer commentary) and 'footnotes' (paper footnotes). Reads every table_*_gt.json in tests/ground_truth_workspace/, applies per-file splitting rules derived from manual review of all 44 files, writes back with the same formatting (2-space indent, trailing newline). Run from repo root: "./.venv/Scripts/python.exe" tests/split_notes_to_footnotes.py """ from __future__ import annotations import json import pathlib import re import sys from typing import NamedTuple sys.stdout.reconfigure(encoding="utf-8") class SplitResult(NamedTuple): notes: str footnotes: str # ── Per-file overrides ──────────────────────────────────────────────────────── # Keys are (paper_slug, table_index). # Values are either: # "all_reviewer" → entire notes stays in notes, footnotes = "" # "all_footnotes" → entire notes becomes footnotes, notes = "" # a callable(notes_text) -> SplitResult for mixed cases def _split_at_marker(text: str, marker: str) -> SplitResult: """Split text at the first occurrence of *marker*. Everything before the marker is reviewer commentary, everything from the marker onward is paper footnotes. Leading/trailing whitespace is stripped from both parts. """ idx = text.find(marker) if idx == -1: # marker not found — treat entire text as reviewer commentary return SplitResult(notes=text.strip(), footnotes="") return SplitResult( notes=text[:idx].strip(), footnotes=text[idx:].strip(), ) def _split_at_regex(text: str, pattern: str) -> SplitResult: """Like _split_at_marker but uses a regex to find the split point.""" m = re.search(pattern, text) if m is None: return SplitResult(notes=text.strip(), footnotes="") idx = m.start() return SplitResult( notes=text[:idx].strip(), footnotes=text[idx:].strip(), ) # ── active-inference-tutorial ──────────────────────────────────────────────── def _ait_table1(notes: str) -> SplitResult: # notes: "Table continues on next page. The header ... readability only." 
# All reviewer commentary — no paper footnotes return SplitResult(notes=notes, footnotes="") def _ait_table2(notes: str) -> SplitResult: # "Continuation of Table 1 from page 15. Footnote: *While, for consistency..." return _split_at_marker(notes, "Footnote: *While") def _ait_table3(notes: str) -> SplitResult: # "Table continues on next page. Equations contain mathematical notation. # B† denotes the transpose of B (dagger notation). Aᵀ denotes A-transpose. # ½ denotes one-half." # All reviewer commentary — notation clarifications from the reviewer return SplitResult(notes=notes, footnotes="") def _ait_table4(notes: str) -> SplitResult: # "Table continues on next page. Equations contain complex mathematical # notation. B† denotes... The rawtext has 'sightly' ..." # All reviewer commentary return SplitResult(notes=notes, footnotes="") def _ait_table5(notes: str) -> SplitResult: # "This is the final row of Table 2. Variable names from rawtext: ... # Table note: The term B†π,τ denotes the transpose of Bπ,τ ..." # Split at "Table note:" — that's the paper's actual footnote return _split_at_marker(notes, "Table note:") def _ait_table6(notes: str) -> SplitResult: # "Table continues on next page. The table has 4 columns: MDP Field, ..." # All reviewer commentary return SplitResult(notes=notes, footnotes="") def _ait_table7(notes: str) -> SplitResult: # "Continuation of Table 3 from page 30. Page bottom contains a footnote # (partially visible): 'Constant α = 512...' with similar direction # notation." # This describes a footnote but doesn't transcribe it — reviewer commentary return SplitResult(notes=notes, footnotes="") # ── hallett-tms-primer ──────────────────────────────────────────────────────── def _hallett_table0(notes: str) -> SplitResult: # "The image shows the table title ... The PAS row has multi-line cell # content in both Excitatory and Inhibitory columns joined with a space." 
# All reviewer commentary return SplitResult(notes=notes, footnotes="") # ── helm-coregulation ──────────────────────────────────────────────────────── def _helm_table0(notes: str) -> SplitResult: # "Note (footnote, not transcribed as row): Models 1–7 are described in # the text. −2 LL = the models −2 log-likelihood; df = the number of ..." # The ENTIRE content is a paper footnote (it starts with "Note (footnote") return SplitResult(notes="", footnotes=notes) def _helm_table1(notes: str) -> SplitResult: # "The image shows ... 'Baseline' and 'Conversation' are sub-group headings # ... Coefficient subscripts use m (male) and f (female). Note (footnote): # β₀ = the average RSA ... male and female, respectively." return _split_at_marker(notes, "Note (footnote):") # ── laird-fick-polyps ──────────────────────────────────────────────────────── def _laird_table0(notes: str) -> SplitResult: # "Two-level header: 'Sexᵃ' spans ... Lowest-level headers used. # Footnote a (on Sex): Percent in row might not sum to 100 due to # rounding. Chi-square tests ... Footnote b (on Total): Percent of # all patients (n = 13,881)." return _split_at_marker(notes, "Footnote a") def _laird_table1(notes: str) -> SplitResult: # "Two-level header: 'Sexᵃ' spans ... Lowest-level headers used. # Footnote a (on Sex): Percent in row might not sum to 100 due to # rounding. Footnote b (on Total): Percent of total patients." return _split_at_marker(notes, "Footnote a") def _laird_table2(notes: str) -> SplitResult: # "Two-level header: 'Sex' spans ... Lowest-level headers used. # Footnote a (on Total): Percent of patients (n = 9301)." return _split_at_marker(notes, "Footnote a") def _laird_table3(notes: str) -> SplitResult: # "Header row has two levels: ... Sub-group header rows ... transcribed as # rows with empty cells in other columns. Referent rows have no confidence # limits or p-value. Footnote a (on Adjusted): Estimates from multiple # logistic regression with Sex and Age." 
return _split_at_marker(notes, "Footnote a") def _laird_table4(notes: str) -> SplitResult: # "Two-level header: 'Sexᵃ' spans ... Lowest-level headers used. # Footnote a (on Sex): Percent in row might not sum to 100 due to # rounding. Footnote b (on Total): Percent of total patients." return _split_at_marker(notes, "Footnote a") # ── reyes-lf-hrv ───────────────────────────────────────────────────────────── def _reyes_table0(notes: str) -> SplitResult: # "IBI, VLF, LF, HF are sub-group header rows ... BL = baseline. The last # column header contains two effect sizes: one for BL comparison (LB) and # one for Task comparison. Note at bottom of table: BL = baseline, IBI = # interbeat interval, VLF = very low frequency, LF = low frequency, HF = # high frequency." return _split_at_marker(notes, "Note at bottom of table:") def _reyes_table1(notes: str) -> SplitResult: # "IBI, VLF, LF, HF are sub-group header rows ... F, p, and η² values # appear only once per parameter group (on the A row), reflecting the # interaction effect; S rows have empty values for those columns. Note. # HRV was analyzed by means of adaptive autoregressive models ..." return _split_at_regex(notes, r"Note\. HRV") def _reyes_table2(notes: str) -> SplitResult: # "Panel A and Panel B are sub-group header rows; other columns in those # rows are empty. IBI-BL and IBI-Task rows have empty IBI column because # correlation of IBI with itself is not reported. Note. BL = baseline, # Task = mental arithmetic, VLF = very low frequency, LF = low frequency, # HF = high frequency. +p < .05. *p < .01." return _split_at_regex(notes, r"Note\. BL = baseline") def _reyes_table3(notes: str) -> SplitResult: # "Panel A and Panel B are sub-group header rows; other columns in those # rows are empty. IBI-BL and IBI-Task rows have empty IBI column because # correlation of IBI with itself is not reported. Note. VLF = very low # frequency, LF = low frequency, HF = high frequency. +p < .05. *p < .01." 
return _split_at_regex(notes, r"Note\. VLF") # ── yang-ppv-meta ──────────────────────────────────────────────────────────── def _yang_table0(notes: str) -> SplitResult: # "Footnotes: MICU, medical intensive care unit; NA, not available; SICU, # surgical intensive care unit. aTwenty-two fluid challenges included. # bTwenty-eight fluid challenges included." # Entire content is paper footnotes return SplitResult(notes="", footnotes=notes) def _yang_table1(notes: str) -> SplitResult: # "CI, cardiac index; CO, cardiac output; Echo, echocardiography; HES, ..." # Entire content is paper footnotes (abbreviation definitions + superscript notes) return SplitResult(notes="", footnotes=notes) def _yang_table2(notes: str) -> SplitResult: # "Footnotes: fn, false negative; fp, false positive; PPV, pulse pressure # variation; ..." # Entire content is paper footnotes return SplitResult(notes="", footnotes=notes) # ── roland-emg-filter ──────────────────────────────────────────────────────── def _roland_table8(notes: str) -> SplitResult: # "The table has no visible header row in the original; column headers # 'Abbreviation' and 'Meaning' are inferred from context. The section # heading 'The following abbreviations are used in this manuscript:' # appears above the table (partially visible at top of image). The µC # entry uses the micro sign (µ)." 
# All reviewer commentary — structural observation return SplitResult(notes=notes, footnotes="") # ── Mapping table ───────────────────────────────────────────────────────────── OVERRIDES: dict[tuple[str, int], str | callable] = { # active-inference-tutorial ("active-inference-tutorial", 0): "all_reviewer", ("active-inference-tutorial", 1): _ait_table1, ("active-inference-tutorial", 2): _ait_table2, ("active-inference-tutorial", 3): _ait_table3, ("active-inference-tutorial", 4): _ait_table4, ("active-inference-tutorial", 5): _ait_table5, ("active-inference-tutorial", 6): _ait_table6, ("active-inference-tutorial", 7): _ait_table7, # fortune-impedance — all reviewer ("fortune-impedance", 0): "all_reviewer", ("fortune-impedance", 1): "all_reviewer", ("fortune-impedance", 2): "all_reviewer", ("fortune-impedance", 3): "all_reviewer", ("fortune-impedance", 4): "all_reviewer", ("fortune-impedance", 5): "all_reviewer", ("fortune-impedance", 6): "all_reviewer", # friston-life ("friston-life", 0): "all_reviewer", # hallett-tms-primer ("hallett-tms-primer", 0): _hallett_table0, # helm-coregulation ("helm-coregulation", 0): _helm_table0, ("helm-coregulation", 1): _helm_table1, # huang-emd-1998 ("huang-emd-1998", 0): "all_reviewer", ("huang-emd-1998", 1): "all_reviewer", # laird-fick-polyps ("laird-fick-polyps", 0): _laird_table0, ("laird-fick-polyps", 1): _laird_table1, ("laird-fick-polyps", 2): _laird_table2, ("laird-fick-polyps", 3): _laird_table3, ("laird-fick-polyps", 4): _laird_table4, # reyes-lf-hrv ("reyes-lf-hrv", 0): _reyes_table0, ("reyes-lf-hrv", 1): _reyes_table1, ("reyes-lf-hrv", 2): _reyes_table2, ("reyes-lf-hrv", 3): _reyes_table3, ("reyes-lf-hrv", 4): "all_footnotes", # roland-emg-filter ("roland-emg-filter", 0): "all_reviewer", ("roland-emg-filter", 1): "all_reviewer", ("roland-emg-filter", 2): "all_reviewer", ("roland-emg-filter", 3): "all_reviewer", ("roland-emg-filter", 4): "all_reviewer", ("roland-emg-filter", 5): "all_reviewer", ("roland-emg-filter", 6): 
"all_reviewer", ("roland-emg-filter", 7): "all_reviewer", ("roland-emg-filter", 8): _roland_table8, # yang-ppv-meta ("yang-ppv-meta", 0): _yang_table0, ("yang-ppv-meta", 1): _yang_table1, ("yang-ppv-meta", 2): _yang_table2, ("yang-ppv-meta", 3): "all_reviewer", } def split_notes(paper: str, table_index: int, notes: str) -> SplitResult: """Apply the splitting rule for (paper, table_index).""" key = (paper, table_index) rule = OVERRIDES.get(key) if rule is None: # Safety: if a file is not in the override table, keep notes as-is # and set footnotes to empty. print(f" WARNING: no override for {key}, keeping notes as-is") return SplitResult(notes=notes, footnotes="") if rule == "all_reviewer": return SplitResult(notes=notes, footnotes="") if rule == "all_footnotes": return SplitResult(notes="", footnotes=notes) if callable(rule): return rule(notes) raise ValueError(f"Unknown rule type for {key}: {rule!r}") def main() -> None: workspace = pathlib.Path(__file__).resolve().parent / "ground_truth_workspace" if not workspace.is_dir(): print(f"ERROR: workspace not found at {workspace}", file=sys.stderr) sys.exit(1) gt_files = sorted(workspace.rglob("table_*_gt.json")) print(f"Found {len(gt_files)} GT JSON files.\n") stats = { "all_reviewer": 0, "all_footnotes": 0, "mixed_split": 0, "already_has_footnotes": 0, "empty_notes": 0, } for fp in gt_files: data = json.loads(fp.read_text(encoding="utf-8")) paper = data["paper"] table_index = data["table_index"] original_notes = data.get("notes", "") rel = fp.relative_to(workspace) # Skip files that already have a footnotes field (idempotent) if "footnotes" in data: stats["already_has_footnotes"] += 1 print(f" SKIP (already has footnotes): {rel}") continue if not original_notes: stats["empty_notes"] += 1 data["footnotes"] = "" fp.write_text( json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) print(f" EMPTY notes: {rel}") continue result = split_notes(paper, table_index, original_notes) data["notes"] = 
result.notes data["footnotes"] = result.footnotes # Write back with 2-space indent and trailing newline fp.write_text( json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) # Classify for summary if result.footnotes == "" and result.notes == original_notes: category = "all_reviewer" elif result.notes == "" and result.footnotes == original_notes: category = "all_footnotes" else: category = "mixed_split" stats[category] += 1 # Print per-file report fn_preview = (result.footnotes[:80] + "...") if len(result.footnotes) > 80 else result.footnotes notes_preview = (result.notes[:80] + "...") if len(result.notes) > 80 else result.notes if category == "all_reviewer": print(f" REVIEWER ONLY: {rel}") elif category == "all_footnotes": print(f" FOOTNOTE ONLY: {rel}") print(f" footnotes = {fn_preview!r}") else: print(f" MIXED SPLIT: {rel}") print(f" notes = {notes_preview!r}") print(f" footnotes = {fn_preview!r}") # Print summary print("\n" + "=" * 70) print("SUMMARY") print("=" * 70) print(f" Total files processed: {len(gt_files)}") print(f" All reviewer commentary: {stats['all_reviewer']}") print(f" All paper footnotes: {stats['all_footnotes']}") print(f" Mixed (split): {stats['mixed_split']}") print(f" Empty notes: {stats['empty_notes']}") print(f" Already had footnotes: {stats['already_has_footnotes']}") print("\nDone.") if __name__ == "__main__": main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

split_notes_to_footnotes.py•16.9 KiB