"""OCR core module using macOS Vision Framework."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import objc
import Quartz
import Vision
from PIL import Image
@dataclass
class OCRResult:
    """Result of OCR operation on a single page.

    Plain value object pairing a page's recognized text with its
    position in the document and a quality score.
    """
    # Index of the page this result belongs to — NOTE(review): not
    # constructed in this module; confirm 0- vs 1-based with callers.
    page_number: int
    # Recognized text for the page (as produced by the ocr_* helpers
    # below: one observation per line, joined with newlines).
    text: str
    # Recognition confidence; presumably Vision's 0.0–1.0 candidate
    # confidence — verify against whatever populates this field.
    confidence: float
def ocr_image(image_path: Path, languages: list[str] | None = None) -> str:
    """
    Run the macOS Vision text recognizer over an image file on disk.

    Args:
        image_path: Path to the image file
        languages: Recognition language codes (e.g., ["ko-KR", "en-US"]);
            Korean + English are used when omitted

    Returns:
        Recognized text, one observation per line, ordered top-to-bottom
        then left-to-right.

    Raises:
        ValueError: if the file cannot be loaded as a CIImage
        RuntimeError: if the Vision request itself fails
    """
    # Bridge the filesystem path into a CFURL and load it with CoreImage.
    url = Quartz.CFURLCreateWithFileSystemPath(
        None, str(image_path), Quartz.kCFURLPOSIXPathStyle, False
    )
    source = Quartz.CIImage.imageWithContentsOfURL_(url)
    if source is None:
        raise ValueError(f"Failed to load image: {image_path}")

    # Accurate (slower) recognition pass with language correction on.
    request = Vision.VNRecognizeTextRequest.alloc().init()
    request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
    request.setUsesLanguageCorrection_(True)
    request.setRecognitionLanguages_(languages if languages else ["ko-KR", "en-US"])

    # performRequests_error_ follows the PyObjC out-parameter convention:
    # it returns a (success, error) pair.
    handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(source, None)
    ok, err = handler.performRequests_error_([request], None)
    if not ok:
        detail = err.localizedDescription() if err else "Unknown error"
        raise RuntimeError(f"OCR failed: {detail}")

    observations = request.results()
    if not observations:
        return ""

    # Vision reports bounding boxes in a bottom-left-origin unit square,
    # so flip Y to sort observations into natural reading order.
    def reading_order(obs: objc.objc_object) -> tuple[float, float]:
        box = obs.boundingBox()
        return (1 - box.origin.y - box.size.height, box.origin.x)

    texts = []
    for obs in sorted(observations, key=reading_order):
        candidates = obs.topCandidates_(1)
        if candidates:
            texts.append(candidates[0].string())
    return "\n".join(texts)
def ocr_image_pil(pil_image: Image.Image, languages: list[str] | None = None) -> str:
    """
    Run the macOS Vision text recognizer over an in-memory PIL image.

    Args:
        pil_image: PIL Image object to recognize
        languages: Recognition language codes; Korean + English by default

    Returns:
        Recognized text, one observation per line, ordered top-to-bottom
        then left-to-right.

    Raises:
        ValueError: if the pixels cannot be bridged into a CIImage
        RuntimeError: if the Vision request itself fails
    """
    import io

    # Round-trip the pixels through PNG (lossless) so CoreImage can
    # decode them from an NSData blob.
    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format="PNG")
    payload = png_buffer.getvalue()
    blob = Quartz.NSData.dataWithBytes_length_(payload, len(payload))
    source = Quartz.CIImage.imageWithData_(blob)
    if source is None:
        raise ValueError("Failed to convert PIL Image to CIImage")

    # Accurate (slower) recognition pass with language correction on.
    request = Vision.VNRecognizeTextRequest.alloc().init()
    request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
    request.setUsesLanguageCorrection_(True)
    request.setRecognitionLanguages_(languages or ["ko-KR", "en-US"])

    # performRequests_error_ returns a (success, error) pair per PyObjC's
    # out-parameter convention.
    handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(source, None)
    ok, err = handler.performRequests_error_([request], None)
    if not ok:
        detail = err.localizedDescription() if err else "Unknown error"
        raise RuntimeError(f"OCR failed: {detail}")

    observations = request.results()
    if not observations:
        return ""

    # Flip Vision's bottom-left-origin Y coordinate so lines sort into
    # natural reading order (top-to-bottom, then left-to-right).
    def reading_order(obs: objc.objc_object) -> tuple[float, float]:
        box = obs.boundingBox()
        return (1 - box.origin.y - box.size.height, box.origin.x)

    texts = []
    for obs in sorted(observations, key=reading_order):
        candidates = obs.topCandidates_(1)
        if candidates:
            texts.append(candidates[0].string())
    return "\n".join(texts)