"""MCP server for PDF OCR using macOS Vision Framework or OwlOCR CLI."""
from __future__ import annotations
import asyncio
from enum import Enum
from pathlib import Path
from typing import Literal
from mcp.server.fastmcp import FastMCP
from .ocr_owlocr import OwlOCRConfig, is_owlocr_available, ocr_pdf_owlocr, ocr_image_owlocr
from .pdf import PDFOCRConfig, ocr_pdf
from .ocr import ocr_image
# Module-level FastMCP server instance, named "owlocr-mcp". The tool
# functions in this file register themselves on it via @mcp.tool(), and
# main() starts it with mcp.run().
mcp = FastMCP("owlocr-mcp")
class OCRBackend(str, Enum):
    """Names of the OCR engines a caller can request.

    Because the class also derives from ``str``, every member compares
    equal to its plain string value (``OCRBackend.VISION == "vision"``).
    """

    # macOS Vision Framework, accessed through PyObjC.
    VISION = "vision"
    # OwlOCR command-line interface (more accurate).
    OWLOCR = "owlocr"
    # Automatic choice: OwlOCR when it is available, otherwise Vision.
    AUTO = "auto"
def _select_backend(backend: str) -> str:
"""Select the actual backend to use."""
if backend == "auto":
return "owlocr" if is_owlocr_available() else "vision"
return backend
@mcp.tool()
async def ocr_pdf_to_text(
    pdf_path: str,
    pages: list[int] | None = None,
    dpi: int = 200,
    backend: Literal["auto", "vision", "owlocr"] = "auto",
    languages: list[str] | None = None,
) -> str:
    """
    Extract text from a PDF file using OCR.

    Args:
        pdf_path: Absolute path to the PDF file
        pages: Optional list of 1-based page numbers to process.
            If not provided, all pages are processed.
        dpi: Resolution for rendering PDF pages (default: 200).
            Higher values give better OCR but are slower.
            Must be a positive integer.
        backend: OCR backend to use:
            - "auto": Use OwlOCR if available, else Vision Framework (default)
            - "owlocr": OwlOCR CLI (more accurate, requires OwlOCR.app)
            - "vision": macOS Vision Framework (no external dependencies)
        languages: List of language codes for OCR (e.g., ["ko-KR", "en-US"]).
            Only used with Vision backend. Default: Korean + English.

    Returns:
        Extracted text from the PDF with page separators, followed by a
        one-line summary naming the page count and the backend used.

    Raises:
        ValueError: If ``pdf_path`` is not an existing regular file, does
            not have a ``.pdf`` suffix, or ``dpi`` is not positive.
        RuntimeError: If the OwlOCR backend is selected but OwlOCR is not
            available.
    """
    path = Path(pdf_path)
    # is_file() rather than exists(): a directory that happens to be named
    # "something.pdf" must be rejected here, not deep inside the OCR backend.
    if not path.is_file():
        raise ValueError(f"PDF file not found: {pdf_path}")
    if path.suffix.lower() != ".pdf":
        raise ValueError(f"Not a PDF file: {pdf_path}")
    # A non-positive resolution cannot be rendered; fail fast with a clear
    # message before any work is handed to the executor.
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive integer, got {dpi}")

    selected_backend = _select_backend(backend)
    # Inside a coroutine the running loop is the correct one to use;
    # get_event_loop() is discouraged in this context.
    loop = asyncio.get_running_loop()

    if selected_backend == "owlocr":
        # Checked again here because an explicit backend="owlocr" request
        # bypasses the availability probe done for "auto".
        if not is_owlocr_available():
            raise RuntimeError(
                "OwlOCR.app not found at /Applications/OwlOCR.app. "
                "Install OwlOCR or use backend='vision'."
            )
        config = OwlOCRConfig(dpi=dpi)
        # The OCR call is blocking, so run it in the default executor to
        # keep the event loop responsive.
        combined_text, results = await loop.run_in_executor(
            None, ocr_pdf_owlocr, path, config, pages
        )
        backend_name = "OwlOCR CLI"
    else:
        config = PDFOCRConfig(dpi=dpi, languages=languages)
        combined_text, results = await loop.run_in_executor(
            None, ocr_pdf, path, config, pages
        )
        backend_name = "Vision Framework"

    # Add summary at the end
    page_count = len(results)
    summary = f"\n\n--- OCR Complete: {page_count} page(s) processed using {backend_name} ---"
    return combined_text + summary
@mcp.tool()
async def ocr_image_to_text(
    image_path: str,
    backend: Literal["auto", "vision", "owlocr"] = "auto",
    languages: list[str] | None = None,
) -> str:
    """
    Extract text from an image file using OCR.

    Args:
        image_path: Absolute path to the image file (PNG, JPEG, etc.)
        backend: OCR backend to use:
            - "auto": Use OwlOCR if available, else Vision Framework (default)
            - "owlocr": OwlOCR CLI (more accurate, requires OwlOCR.app)
            - "vision": macOS Vision Framework (no external dependencies)
        languages: List of language codes for OCR (e.g., ["ko-KR", "en-US"]).
            Only used with Vision backend. Default: Korean + English.

    Returns:
        Extracted text from the image.

    Raises:
        ValueError: If ``image_path`` is not an existing regular file.
        RuntimeError: If the OwlOCR backend is selected but OwlOCR is not
            available.
    """
    path = Path(image_path)
    # is_file() rather than exists(): a directory path must be rejected
    # here instead of failing later inside the OCR backend.
    if not path.is_file():
        raise ValueError(f"Image file not found: {image_path}")

    selected_backend = _select_backend(backend)
    # Inside a coroutine the running loop is the correct one to use;
    # get_event_loop() is discouraged in this context.
    loop = asyncio.get_running_loop()

    if selected_backend == "owlocr":
        # Checked again here because an explicit backend="owlocr" request
        # bypasses the availability probe done for "auto".
        if not is_owlocr_available():
            raise RuntimeError(
                "OwlOCR.app not found at /Applications/OwlOCR.app. "
                "Install OwlOCR or use backend='vision'."
            )
        # The OCR call is blocking, so run it in the default executor.
        # `languages` is not forwarded on this path.
        return await loop.run_in_executor(None, ocr_image_owlocr, path)

    return await loop.run_in_executor(None, ocr_image, path, languages)
@mcp.tool()
async def check_ocr_backends() -> str:
    """
    Report which OCR backends can be used on this system.

    Returns:
        A multi-line, human-readable status report: one line per backend,
        followed by a recommendation.
    """
    # The Vision Framework line is emitted unconditionally; only OwlOCR
    # is actually probed.
    report = [
        "OCR Backend Status:",
        "",
        "✅ Vision Framework: Available (macOS built-in)",
    ]

    if is_owlocr_available():
        owlocr_section = [
            "✅ OwlOCR CLI: Available (/Applications/OwlOCR.app)",
            "",
            "Recommendation: Use backend='owlocr' for best accuracy",
        ]
    else:
        owlocr_section = [
            "❌ OwlOCR CLI: Not found",
            " Install from: https://owlocr.com",
            "",
            "Recommendation: Install OwlOCR for better accuracy",
        ]

    return "\n".join(report + owlocr_section)
def main():
    """Run the module-level MCP server (``mcp``)."""
    mcp.run()
# Start the server only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()