OCR-MCP

deepseek_backend.py•6.36 KiB

""" DeepSeek-OCR Backend Implementation Integrates DeepSeek-OCR model for high-accuracy OCR processing """ import logging from pathlib import Path from typing import Any import torch from PIL import Image from ..core.backend_manager import OCRBackend from ..core.config import OCRConfig try: from transformers import AutoModelForCausalLM, AutoTokenizer TRANSFORMERS_AVAILABLE = True except ImportError: TRANSFORMERS_AVAILABLE = False logger = logging.getLogger(__name__) class DeepSeekOCRBackend(OCRBackend): """DeepSeek-OCR backend for high-accuracy document processing""" def __init__(self, config: OCRConfig): super().__init__("deepseek-ocr", config) self.model = None self.tokenizer = None self.device = getattr(config, "ocr_device", None) or ( "cuda" if torch.cuda.is_available() else "cpu" ) self.model_name = "deepseek-ai/DeepSeek-OCR" self.cache_dir = Path( getattr(config, "ocr_cache_dir", None) or Path.home() / ".cache" / "ocr_mcp" ) self.cache_dir.mkdir(parents=True, exist_ok=True) def is_available(self) -> bool: """Check if DeepSeek-OCR is available""" if not TRANSFORMERS_AVAILABLE: return False try: # Check if model can be loaded from huggingface_hub import model_info model_info(self.model_name) return True except Exception as e: logger.warning(f"DeepSeek-OCR model check failed: {e}") return False async def load_model(self) -> bool: """Load the DeepSeek-OCR model""" if not TRANSFORMERS_AVAILABLE: logger.error("Transformers not available for DeepSeek-OCR") return False try: logger.info(f"Loading DeepSeek-OCR model on {self.device}") # Load tokenizer and model self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, cache_dir=str(self.cache_dir), trust_remote_code=True ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, cache_dir=str(self.cache_dir), trust_remote_code=True, torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, device_map="auto" if self.device == "cuda" else None, ) if self.device == "cpu": self.model = 
self.model.to(self.device) logger.info("DeepSeek-OCR model loaded successfully") return True except Exception as e: logger.error(f"Failed to load DeepSeek-OCR model: {e}") return False async def process_document( self, image_path: str, ocr_mode: str = "text", region: list[int] | None = None, ) -> dict[str, Any]: """Process document with DeepSeek-OCR""" if not self.model or not self.tokenizer: raise RuntimeError("DeepSeek-OCR model not loaded") try: # Load and preprocess image image = Image.open(image_path).convert("RGB") # Apply region cropping if specified if region and len(region) == 4: x1, y1, x2, y2 = region image = image.crop((x1, y1, x2, y2)) # Convert to tensor inputs = self.tokenizer(image, return_tensors="pt") if self.device == "cuda": inputs = {k: v.cuda() for k, v in inputs.items()} # Generate OCR results with torch.no_grad(): outputs = self.model.generate( **inputs, max_length=1024, num_beams=4, early_stopping=True, do_sample=False, ) # Decode results text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) # Format results based on mode if ocr_mode == "text": result = { "text": text.strip(), "backend": "deepseek", "confidence": 0.95, # DeepSeek typically has high confidence "regions": [], } elif ocr_mode == "format": # Parse structured output if available result = { "text": text.strip(), "backend": "deepseek", "confidence": 0.95, "structured": self._parse_structured_output(text), "regions": [], } else: # fine-grained result = { "text": text.strip(), "backend": "deepseek", "confidence": 0.95, "regions": self._extract_regions(text, image.size), "structured": {}, } return result except Exception as e: logger.error(f"DeepSeek-OCR processing failed: {e}") raise RuntimeError(f"OCR processing failed: {str(e)}") def _parse_structured_output(self, text: str) -> dict[str, Any]: """Parse structured output from DeepSeek-OCR""" # DeepSeek-OCR may provide structured output # This is a placeholder for actual parsing logic return { "title": "", "paragraphs": 
text.split("\n\n"), "tables": [], "figures": [], } def _extract_regions(self, text: str, image_size: tuple) -> list[dict[str, Any]]: """Extract text regions from DeepSeek-OCR output""" # This would parse region information if available # For now, return a single region covering the whole image width, height = image_size return [{"bbox": [0, 0, width, height], "text": text.strip(), "confidence": 0.95}] def get_capabilities(self) -> dict[str, Any]: """Get backend capabilities""" return { "name": "DeepSeek-OCR", "available": self.is_available(), "modes": ["text", "format", "fine-grained"], "languages": ["en", "zh", "multilingual"], "gpu_support": True, "strengths": ["high_accuracy", "complex_layouts", "mathematical_formulas"], "limitations": ["gpu_required", "large_model_size"], "model_size": "~7GB", }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sandraschi/ocr-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

deepseek_backend.py•6.36 KiB