Zotero Chunk RAG

extract_table_text.py•4.47 KiB

"""Extract raw text from PDF table regions for ground truth reconciliation. For each table in the ground truth workspace, this script: 1. Reads the bbox from the extraction JSON 2. Opens the PDF via ZoteroClient 3. Extracts text via page.get_text("text", clip=bbox) 4. Also extracts words via page.get_text("words", clip=bbox) for word-level analysis 5. Writes raw text to table_N_rawtext.txt alongside the GT files Usage:: "./.venv/Scripts/python.exe" tests/extract_table_text.py """ from __future__ import annotations import json import sys from pathlib import Path _PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(_PROJECT_ROOT / "src")) import pymupdf from zotero_chunk_rag.config import Config from zotero_chunk_rag.zotero_client import ZoteroClient def _resolve_pdf_paths() -> dict[str, Path]: config = Config.load() zotero = ZoteroClient(config.zotero_data_dir) items = zotero.get_all_items_with_pdfs() return { item.item_key: item.pdf_path for item in items if item.pdf_path and item.pdf_path.exists() } def extract_all(workspace_dir: Path) -> None: pdf_paths = _resolve_pdf_paths() paper_dirs = sorted( p for p in workspace_dir.iterdir() if p.is_dir() and (p / "manifest.json").exists() ) total = 0 skipped = 0 for paper_dir in paper_dirs: manifest = json.loads((paper_dir / "manifest.json").read_text(encoding="utf-8")) item_key = manifest["item_key"] short_name = manifest["paper"] pdf_path = pdf_paths.get(item_key) if not pdf_path: print(f" [{short_name}] SKIP — no PDF found for {item_key}") skipped += manifest["num_tables"] continue doc = pymupdf.open(str(pdf_path)) for table_info in manifest["tables"]: idx = table_info["table_index"] page_num = table_info["page_num"] # Read bbox from extraction JSON ext_path = paper_dir / table_info["extraction_path"] if not ext_path.exists(): print(f" [{short_name}] table_{idx} SKIP — no extraction JSON") skipped += 1 continue ext_data = json.loads(ext_path.read_text(encoding="utf-8")) bbox = ext_data.get("bbox") if not bbox or bbox == [0, 0, 0, 0]: print(f" [{short_name}] table_{idx} SKIP — no bbox") skipped += 1 continue # Extract text from PDF page = doc[page_num - 1] # 0-indexed clip = pymupdf.Rect(bbox) # Raw text extraction raw_text = page.get_text("text", clip=clip) # Word-level extraction (for detailed analysis) words = page.get_text("words", clip=clip) # words is list of (x0, y0, x1, y1, text, block_no, line_no, word_no) # Write output output = [] output.append(f"Paper: {short_name}") output.append(f"Table index: {idx}") output.append(f"Page: {page_num}") output.append(f"Caption: {table_info['caption']}") output.append(f"Artifact: {table_info.get('artifact_type', 'none')}") output.append(f"Bbox: {bbox}") output.append(f"Word count: {len(words)}") output.append("") output.append("=" * 60) output.append("RAW TEXT (get_text('text', clip=bbox))") output.append("=" * 60) output.append(raw_text) output.append("") output.append("=" * 60) output.append("WORDS (get_text('words', clip=bbox))") output.append("=" * 60) for w in words: x0, y0, x1, y1 = w[0], w[1], w[2], w[3] text = w[4] block_no, line_no, word_no = w[5], w[6], w[7] output.append(f" [{block_no}:{line_no}:{word_no}] ({x0:.1f},{y0:.1f})–({x1:.1f},{y1:.1f}) '{text}'") rawtext_path = paper_dir / f"table_{idx}_rawtext.txt" rawtext_path.write_text("\n".join(output), encoding="utf-8") total += 1 doc.close() print(f" [{short_name}] {len(manifest['tables'])} tables extracted") print(f"\nDone. {total} tables extracted, {skipped} skipped.") if __name__ == "__main__": workspace = _PROJECT_ROOT / "tests" / "ground_truth_workspace" print(f"Extracting text from table regions...") print(f"Workspace: {workspace}") extract_all(workspace)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

extract_table_text.py•4.47 KiB