"""
DeepSeek-OCR Backend Implementation
Integrates DeepSeek-OCR model for high-accuracy OCR processing
"""
import logging
from pathlib import Path
from typing import Any
import torch
from PIL import Image
from ..core.backend_manager import OCRBackend
from ..core.config import OCRConfig
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
logger = logging.getLogger(__name__)
class DeepSeekOCRBackend(OCRBackend):
"""DeepSeek-OCR backend for high-accuracy document processing"""
def __init__(self, config: OCRConfig):
super().__init__("deepseek-ocr", config)
self.model = None
self.tokenizer = None
self.device = getattr(config, "ocr_device", None) or (
"cuda" if torch.cuda.is_available() else "cpu"
)
self.model_name = "deepseek-ai/DeepSeek-OCR"
self.cache_dir = Path(
getattr(config, "ocr_cache_dir", None) or Path.home() / ".cache" / "ocr_mcp"
)
self.cache_dir.mkdir(parents=True, exist_ok=True)
def is_available(self) -> bool:
"""Check if DeepSeek-OCR is available"""
if not TRANSFORMERS_AVAILABLE:
return False
try:
# Check if model can be loaded
from huggingface_hub import model_info
model_info(self.model_name)
return True
except Exception as e:
logger.warning(f"DeepSeek-OCR model check failed: {e}")
return False
async def load_model(self) -> bool:
"""Load the DeepSeek-OCR model"""
if not TRANSFORMERS_AVAILABLE:
logger.error("Transformers not available for DeepSeek-OCR")
return False
try:
logger.info(f"Loading DeepSeek-OCR model on {self.device}")
# Load tokenizer and model
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name, cache_dir=str(self.cache_dir), trust_remote_code=True
)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
cache_dir=str(self.cache_dir),
trust_remote_code=True,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map="auto" if self.device == "cuda" else None,
)
if self.device == "cpu":
self.model = self.model.to(self.device)
logger.info("DeepSeek-OCR model loaded successfully")
return True
except Exception as e:
logger.error(f"Failed to load DeepSeek-OCR model: {e}")
return False
async def process_document(
self,
image_path: str,
ocr_mode: str = "text",
region: list[int] | None = None,
) -> dict[str, Any]:
"""Process document with DeepSeek-OCR"""
if not self.model or not self.tokenizer:
raise RuntimeError("DeepSeek-OCR model not loaded")
try:
# Load and preprocess image
image = Image.open(image_path).convert("RGB")
# Apply region cropping if specified
if region and len(region) == 4:
x1, y1, x2, y2 = region
image = image.crop((x1, y1, x2, y2))
# Convert to tensor
inputs = self.tokenizer(image, return_tensors="pt")
if self.device == "cuda":
inputs = {k: v.cuda() for k, v in inputs.items()}
# Generate OCR results
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_length=1024,
num_beams=4,
early_stopping=True,
do_sample=False,
)
# Decode results
text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Format results based on mode
if ocr_mode == "text":
result = {
"text": text.strip(),
"backend": "deepseek",
"confidence": 0.95, # DeepSeek typically has high confidence
"regions": [],
}
elif ocr_mode == "format":
# Parse structured output if available
result = {
"text": text.strip(),
"backend": "deepseek",
"confidence": 0.95,
"structured": self._parse_structured_output(text),
"regions": [],
}
else: # fine-grained
result = {
"text": text.strip(),
"backend": "deepseek",
"confidence": 0.95,
"regions": self._extract_regions(text, image.size),
"structured": {},
}
return result
except Exception as e:
logger.error(f"DeepSeek-OCR processing failed: {e}")
raise RuntimeError(f"OCR processing failed: {str(e)}")
def _parse_structured_output(self, text: str) -> dict[str, Any]:
"""Parse structured output from DeepSeek-OCR"""
# DeepSeek-OCR may provide structured output
# This is a placeholder for actual parsing logic
return {
"title": "",
"paragraphs": text.split("\n\n"),
"tables": [],
"figures": [],
}
def _extract_regions(self, text: str, image_size: tuple) -> list[dict[str, Any]]:
"""Extract text regions from DeepSeek-OCR output"""
# This would parse region information if available
# For now, return a single region covering the whole image
width, height = image_size
return [{"bbox": [0, 0, width, height], "text": text.strip(), "confidence": 0.95}]
def get_capabilities(self) -> dict[str, Any]:
"""Get backend capabilities"""
return {
"name": "DeepSeek-OCR",
"available": self.is_available(),
"modes": ["text", "format", "fine-grained"],
"languages": ["en", "zh", "multilingual"],
"gpu_support": True,
"strengths": ["high_accuracy", "complex_layouts", "mathematical_formulas"],
"limitations": ["gpu_required", "large_model_size"],
"model_size": "~7GB",
}