Oxenstierna

converter.py•7.59 KiB

import json import logging import uuid import xml.etree.ElementTree as ET logger = logging.getLogger("ra_mcp.label.converter") _NS_ALTO = {"a": "http://www.loc.gov/standards/alto/ns-v4#"} _BASELINE_ASCENT = 40 _BASELINE_DESCENT = 15 def _int(value: str | None, default: int = 0) -> int: try: return int(value) if value else default except ValueError: return default def _polygon_from_baseline(baseline: str, ascent: int = _BASELINE_ASCENT, descent: int = _BASELINE_DESCENT) -> str: """Create a polygon strip from a BASELINE attribute.""" try: points = [(int(x), int(y)) for x, y in (p.split(",") for p in baseline.split())] if len(points) < 2: return "" top = [f"{x},{max(0, y - ascent)}" for x, y in points] bottom = [f"{x},{y + descent}" for x, y in reversed(points)] return " ".join(top + bottom) except (ValueError, IndexError): return "" def _parse_points(points_str: str) -> list[tuple[int, int]]: """Parse space-separated 'x,y' pairs into a list of (x, y) tuples.""" points: list[tuple[int, int]] = [] for pair in points_str.strip().split(): x, y = pair.split(",") points.append((int(x), int(y))) return points def parse_alto(alto_xml: str) -> dict: """Parse ALTO v4 XML string into an intermediate dict. Returns dict with keys: filename, page_width, page_height, textlines. Each textline has: points (list of (x,y) tuples), text, id. 
""" root = ET.fromstring(alto_xml) # Fall back to no-namespace queries if the document lacks xmlns ns = _NS_ALTO if root.tag.startswith("{") else {} prefix = "a:" if ns else "" page = root.find(f".//{prefix}Page", ns) if page is None: raise ValueError("No <Page> element found in ALTO XML") page_width = _int(page.get("WIDTH")) page_height = _int(page.get("HEIGHT")) filename_el = root.find(f".//{prefix}fileName", ns) filename = filename_el.text if filename_el is not None and filename_el.text else "unknown" textlines: list[dict] = [] tag = f"{{{_NS_ALTO['a']}}}TextLine" if ns else "TextLine" for tl in root.iter(tag): # Polygon fallback chain: Shape/Polygon → BASELINE → bbox polygon_el = tl.find(f"{prefix}Shape/{prefix}Polygon", ns) points_str = polygon_el.get("POINTS", "") if polygon_el is not None else "" if not points_str: baseline = tl.get("BASELINE", "") if baseline: points_str = _polygon_from_baseline(baseline) else: h, v, w, ht = _int(tl.get("HPOS")), _int(tl.get("VPOS")), _int(tl.get("WIDTH")), _int(tl.get("HEIGHT")) if w and ht: points_str = f"{h},{v} {h + w},{v} {h + w},{v + ht} {h},{v + ht}" if not points_str: continue points = _parse_points(points_str) strings = tl.findall(f"{prefix}String", ns) words = [s.get("CONTENT", "") for s in strings] text = " ".join(w for w in words if w) textlines.append( { "points": points, "text": text, "id": tl.get("ID", ""), } ) logger.info(f"Parsed ALTO: {filename}, {page_width}x{page_height}, {len(textlines)} textlines") return { "filename": filename, "page_width": page_width, "page_height": page_height, "textlines": textlines, } def to_label_studio_task( parsed: dict, image_url: str | None = None, feedback: list[str] | None = None, ) -> dict: """Convert parsed ALTO data into a Label Studio task dict. Args: parsed: Output from parse_alto(). image_url: Full URL to the page image. Falls back to local-files path. feedback: List of feedback choice values (e.g. 
["Transcription", "Segmentation"]) applied as per-region choices on every textline. Returns: Label Studio task dict with data and predictions. """ if image_url is None: image_url = f"/data/local-files/?d={parsed['filename']}" pw = parsed["page_width"] ph = parsed["page_height"] results: list[dict] = [] for tl in parsed["textlines"]: region_id = str(uuid.uuid4())[:8] vertices: list[dict] = [] for x, y in tl["points"]: point_id = f"pt-{uuid.uuid4().hex[:6]}" vertices.append( { "id": point_id, "x": x / pw * 100, "y": y / ph * 100, "prevPointId": vertices[-1]["id"] if vertices else None, "isBezier": False, } ) # VectorLabels region results.append( { "id": region_id, "type": "vectorlabels", "from_name": "label", "to_name": "image", "original_width": pw, "original_height": ph, "image_rotation": 0, "value": { "vertices": vertices, "closed": True, "vectorlabels": ["Textlines"], }, } ) # Transcription region (linked to same region_id) results.append( { "id": region_id, "type": "textarea", "from_name": "transcription", "to_name": "image", "original_width": pw, "original_height": ph, "image_rotation": 0, "value": { "vertices": vertices, "closed": True, "text": [tl["text"]], }, } ) # Feedback choices region (linked to same region_id) if feedback: results.append( { "id": region_id, "type": "choices", "from_name": "feedback", "to_name": "image", "original_width": pw, "original_height": ph, "image_rotation": 0, "value": { "vertices": vertices, "closed": True, "choices": feedback, }, } ) return { "data": {"ocr": image_url}, "predictions": [{"result": results}], } def convert_alto_to_tasks( alto_xmls: list[str], image_urls: list[str] | None = None, feedback_list: list[list[str]] | None = None, ) -> list[dict]: """Convert multiple ALTO XML strings to Label Studio tasks. Args: alto_xmls: List of ALTO XML strings. image_urls: List of image URLs paired by index with alto_xmls. feedback_list: List of feedback choice lists paired by index. Each entry is e.g. 
["Transcription"] or ["Transcription", "Segmentation"]. Returns: List of Label Studio task dicts. """ tasks: list[dict] = [] for i, alto_xml in enumerate(alto_xmls): parsed = parse_alto(alto_xml) image_url = image_urls[i] if image_urls and i < len(image_urls) else None feedback = feedback_list[i] if feedback_list and i < len(feedback_list) else None tasks.append(to_label_studio_task(parsed, image_url=image_url, feedback=feedback)) logger.info(f"Converted {len(tasks)} ALTO file(s) to Label Studio tasks") return tasks def tasks_to_json(tasks: list[dict]) -> str: """Serialize Label Studio tasks to JSON string.""" return json.dumps(tasks, indent=2, ensure_ascii=False)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/oxenstierna'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

converter.py•7.59 KiB