Zotero Chunk RAG

Overview Schema Related Servers Score Discussions

prepare_qa.py•6.03 KiB

"""Prepare a QA workspace from the stress test debug database. This standalone script reads ``_stress_test_debug.db``, renders every non-artifact table as a 300 DPI PNG via ``render_table_image()``, writes each table's extraction data as JSON, and produces a manifest mapping every table to its image and extraction files. The workspace lives at ``tests/agent_qa/workspace/`` (gitignored). Usage:: "./.venv/Scripts/python.exe" tests/agent_qa/prepare_qa.py Requires a recent ``_stress_test_debug.db`` in the project root (produced by running ``tests/stress_test_real_library.py``). """ from __future__ import annotations import json import sqlite3 import sys from pathlib import Path _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(_PROJECT_ROOT / "src")) from zotero_chunk_rag.feature_extraction.ground_truth import make_table_id from zotero_chunk_rag.feature_extraction.render import render_table_image # Re-use PDF path resolution from the ground truth workspace script. from tests.create_ground_truth import _resolve_pdf_paths def prepare_qa_workspace( debug_db_path: Path, output_dir: Path, *, pdf_paths: dict[str, Path] | None = None, ) -> Path: """Build a QA workspace from a stress test debug database. Parameters ---------- debug_db_path: Path to ``_stress_test_debug.db``. output_dir: Root directory for the workspace (one subdirectory per paper). pdf_paths: Optional pre-resolved mapping of ``item_key -> pdf_path``. When *None*, the function resolves paths via the Zotero client. Returns ------- Path Path to the generated ``manifest.json``. 
""" debug_db_path = Path(debug_db_path) output_dir = Path(output_dir) if not debug_db_path.exists(): raise FileNotFoundError(f"Debug database not found: {debug_db_path}") if pdf_paths is None: pdf_paths = _resolve_pdf_paths() conn = sqlite3.connect(str(debug_db_path)) conn.row_factory = sqlite3.Row try: rows = conn.execute( """ SELECT et.id, et.item_key, et.table_index, et.page_num, et.caption, et.num_rows, et.num_cols, et.fill_rate, et.headers_json, et.rows_json, et.bbox, et.artifact_type, et.extraction_strategy, p.short_name FROM extracted_tables et JOIN papers p ON et.item_key = p.item_key WHERE et.artifact_type IS NULL ORDER BY p.short_name, et.table_index """ ).fetchall() manifest_entries: list[dict] = [] for table_row in rows: item_key = table_row["item_key"] short_name = table_row["short_name"] idx = table_row["table_index"] page_num = table_row["page_num"] caption = table_row["caption"] bbox_raw = table_row["bbox"] bbox_list = json.loads(bbox_raw) if bbox_raw else [0, 0, 0, 0] bbox_tuple = tuple(float(v) for v in bbox_list) headers = json.loads(table_row["headers_json"]) if table_row["headers_json"] else [] rows_data = json.loads(table_row["rows_json"]) if table_row["rows_json"] else [] table_id = make_table_id(item_key, caption, page_num, idx) paper_dir = output_dir / short_name paper_dir.mkdir(parents=True, exist_ok=True) # Render table image image_path = paper_dir / f"table_{idx}.png" pdf_path = pdf_paths.get(item_key) if pdf_path and pdf_path.exists(): render_table_image( pdf_path, page_num, bbox_tuple, image_path, dpi=300, ) # Write extraction JSON extraction_data = { "table_id": table_id, "paper": short_name, "item_key": item_key, "page_num": page_num, "table_index": idx, "caption": caption, "headers": headers, "rows": rows_data, "num_rows": table_row["num_rows"], "num_cols": table_row["num_cols"], "fill_rate": table_row["fill_rate"], "bbox": bbox_list, "extraction_strategy": table_row["extraction_strategy"], } extraction_path = paper_dir / 
f"table_{idx}_extraction.json" extraction_path.write_text( json.dumps(extraction_data, indent=2, ensure_ascii=False), encoding="utf-8", ) # Collect manifest entry manifest_entries.append({ "table_id": table_id, "paper": short_name, "item_key": item_key, "page_num": page_num, "table_index": idx, "caption": caption, "image_path": f"{short_name}/table_{idx}.png", "extraction_path": f"{short_name}/table_{idx}_extraction.json", "num_rows": table_row["num_rows"], "num_cols": table_row["num_cols"], }) # Write manifest manifest_path = output_dir / "manifest.json" manifest_path.write_text( json.dumps(manifest_entries, indent=2, ensure_ascii=False), encoding="utf-8", ) print(f"QA workspace: {len(manifest_entries)} non-artifact tables across {len(set(e['paper'] for e in manifest_entries))} papers") finally: conn.close() return manifest_path if __name__ == "__main__": db_path = _PROJECT_ROOT / "_stress_test_debug.db" workspace_dir = _PROJECT_ROOT / "tests" / "agent_qa" / "workspace" print(f"Creating QA workspace from {db_path}") print(f"Output directory: {workspace_dir}") prepare_qa_workspace(db_path, workspace_dir) print("Done.")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ccam80/zotero-chunk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

prepare_qa.py•6.03 KiB