OCR-MCP

ocr-mcp
scripts

install_models.py•31.2 KiB

#!/usr/bin/env python3 """ OCR-MCP Model Installation Script Downloads and installs all OCR models and dependencies for the OCR-MCP server. This script handles model downloads, cache setup, and verification for all supported OCR backends. Usage: python scripts/install_models.py [options] Options: --backends BACKEND [BACKEND ...] Install specific backends (default: all) --skip-verification Skip model verification after download --force-redownload Force redownload of existing models --cache-dir DIR Custom cache directory for models --gpu-only Only install GPU-compatible models --cpu-only Only install CPU-compatible models --dry-run Show what would be installed without doing it --verbose, -v Increase verbosity Supported backends: - mistral-ocr: Mistral OCR 3 (state-of-the-art, API-based) - deepseek-ocr: DeepSeek-OCR (4.7M+ downloads) - florence-2: Microsoft Florence-2 vision model - dots-ocr: DOTS.OCR document understanding - pp-ocrv5: PaddlePaddle PP-OCRv5 - qwen-image-layered: Qwen image decomposition - got-ocr: GOT-OCR2.0 legacy - tesseract: Tesseract OCR - easyocr: EasyOCR Examples: # Install all models python scripts/install_models.py # Install only specific backends python scripts/install_models.py --backends mistral-ocr deepseek-ocr florence-2 # Force redownload and use custom cache python scripts/install_models.py --force-redownload --cache-dir ./models # Dry run to see what would be installed python scripts/install_models.py --dry-run --verbose """ import argparse import asyncio import json import logging import os import sys import time from pathlib import Path # Configure logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") logger = logging.getLogger(__name__) class ModelInstaller: """Handles downloading and installation of OCR models.""" def __init__(self, cache_dir: Path | None = None, force_redownload: bool = False): self.cache_dir = cache_dir or Path.home() / ".cache" / "ocr-mcp" self.cache_dir.mkdir(parents=True, 
exist_ok=True) self.force_redownload = force_redownload self.installed_models = set() # Track installation status self.status_file = self.cache_dir / "installation_status.json" self.load_status() def load_status(self): """Load previous installation status.""" if self.status_file.exists(): try: with open(self.status_file) as f: data = json.load(f) self.installed_models = set(data.get("installed_models", [])) except Exception as e: logger.warning(f"Could not load status file: {e}") async def _install_python_package(self, package_name: str, dry_run: bool = False) -> bool: """Install a Python package using pip.""" logger.info(f"Installing Python package: {package_name}...") if dry_run: logger.info(f"DRY RUN: Would install package: {package_name}") return True try: result = await asyncio.create_subprocess_exec( sys.executable, "-m", "pip", "install", package_name, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await result.communicate() if result.returncode == 0: logger.info(f"SUCCESS: Package {package_name} installed") return True else: logger.error(f"FAILED: Package {package_name} installation failed") logger.error(f"STDOUT: {stdout.decode().strip()}") logger.error(f"STDERR: {stderr.decode().strip()}") return False except Exception as e: logger.error(f"Exception during package installation: {e}") return False def save_status(self): """Save installation status.""" try: data = {"installed_models": list(self.installed_models), "last_updated": time.time()} with open(self.status_file, "w") as f: json.dump(data, f, indent=2) except Exception as e: logger.warning(f"Could not save status file: {e}") async def install_deepseek_ocr(self, dry_run: bool = False) -> bool: """Install DeepSeek-OCR model.""" logger.info("Installing DeepSeek-OCR...") if not dry_run: try: # Install required dependencies required_deps = [ "transformers", "torch", "accelerate", "pillow", "easydict", "einops", ] for dep in required_deps: try: __import__(dep) except 
ImportError: logger.info(f"Installing missing dependency: {dep}") if not await self._install_python_package(dep): logger.error(f"Failed to install dependency {dep}") return False import torch from transformers import pipeline # Check GPU availability device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") # Download model model_path = self.cache_dir / "deepseek-ocr" if self.force_redownload and model_path.exists(): import shutil shutil.rmtree(model_path) logger.info("Downloading DeepSeek-OCR model (this may take a while)...") pipeline( "image-to-text", model="deepseek-ai/DeepSeek-OCR", cache_dir=str(self.cache_dir), device=device, torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, trust_remote_code=True, ) self.installed_models.add("deepseek-ocr") logger.info("SUCCESS: DeepSeek-OCR installed") except Exception as e: logger.error(f"FAILED: DeepSeek-OCR installation failed: {e}") return False return True async def install_mistral_ocr(self, dry_run: bool = False) -> bool: """Install Mistral OCR 3 (API-based).""" logger.info("Installing Mistral OCR 3...") if not dry_run: try: # Check for API key api_key = os.getenv("MISTRAL_API_KEY") if not api_key: logger.error( "MISTRAL_API_KEY environment variable not set. Get your key from https://mistral.ai/" ) return False # Test API connection import httpx response = httpx.get( "https://api.mistral.ai/v1/models", headers={"Authorization": f"Bearer {api_key}"}, timeout=10, ) if response.status_code == 200: logger.info("Successfully connected to Mistral AI API") self.installed_models.add("mistral-ocr") return True else: logger.error( f"Failed to connect to Mistral API: {response.status_code} - {response.text}" ) return False except ImportError: logger.error("httpx not available. 
Install with: pip install httpx") return False except Exception as e: logger.error(f"Mistral OCR setup failed: {e}") return False return True async def install_florence_2(self, dry_run: bool = False) -> bool: """Install Florence-2 model.""" logger.info("Installing Florence-2...") if not dry_run: try: import torch from transformers import AutoModelForCausalLM, AutoProcessor device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") logger.info("Downloading Florence-2 model (this may take a while)...") processor = AutoProcessor.from_pretrained( "microsoft/Florence-2-base", cache_dir=str(self.cache_dir), trust_remote_code=True, ) model = AutoModelForCausalLM.from_pretrained( "microsoft/Florence-2-base", cache_dir=str(self.cache_dir), trust_remote_code=True, ) # Test model loading model.to(device) logger.info("Testing Florence-2 model...") # Quick test with dummy input dummy_input = processor( text="<OCR>", images=[[[[0] * 3] * 64] * 64], return_tensors="pt" ).to(device) with torch.no_grad(): _ = model.generate(**dummy_input, max_new_tokens=1) self.installed_models.add("florence-2") logger.info("SUCCESS: Florence-2 installed") except Exception as e: logger.error(f"FAILED: Florence-2 installation failed: {e}") return False return True async def install_dots_ocr(self, dry_run: bool = False) -> bool: """Install DOTS.OCR model.""" logger.info("Installing DOTS.OCR...") if not dry_run: try: # DOTS.OCR uses a custom model, we'll use a placeholder for now # In a real implementation, this would download from the official repository logger.info("DOTS.OCR requires manual installation from their repository") logger.info("Please visit: https://github.com/dots-ocr/dots-ocr") logger.warning("DOTS.OCR installation skipped - requires manual setup") # For now, we'll mark it as "installed" but note it's manual self.installed_models.add("dots-ocr-manual") return True except Exception as e: logger.error(f"FAILED: DOTS.OCR installation failed: {e}") 
return False return True async def install_pp_ocrv5(self, dry_run: bool = False) -> bool: """Install PP-OCRv5 (PaddlePaddle).""" logger.info("Installing PP-OCRv5...") if not dry_run: try: # Install required dependencies required_deps = ["paddlepaddle", "paddleocr"] for dep in required_deps: try: __import__(dep.replace("-", "").replace("_", "")) except ImportError: logger.info(f"Installing missing dependency: {dep}") if not await self._install_python_package(dep): logger.error(f"Failed to install dependency {dep}") return False logger.info("PP-OCRv5 dependencies installed successfully") logger.info( "Note: PaddleOCR API has changed, models will be downloaded on first use" ) # Due to changing PaddleOCR APIs, we'll mark as installed since dependencies are ready # Real initialization will happen when the OCR backend is used except Exception as e: logger.error(f"FAILED: PP-OCRv5 installation failed: {e}") return False return True async def install_qwen_image_layered(self, dry_run: bool = False) -> bool: """Install Qwen-Image-Layered model.""" logger.info("Installing Qwen-Image-Layered...") if not dry_run: try: import torch from transformers import pipeline device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") logger.info("Downloading Qwen-Image-Layered model (this may take a while)...") # Note: This is a placeholder - the actual model path may differ # In practice, you'd need to check the official repository pipeline( "image-to-image", model="Qwen/Qwen-Image-Layered", # Placeholder model path cache_dir=str(self.cache_dir), device=device, ) self.installed_models.add("qwen-image-layered") logger.info("SUCCESS: Qwen-Image-Layered installed") except Exception as e: logger.error(f"FAILED: Qwen-Image-Layered installation failed: {e}") logger.warning("Qwen-Image-Layered may require manual installation") return False return True async def install_got_ocr(self, dry_run: bool = False) -> bool: """Install GOT-OCR2.0.""" 
logger.info("Installing GOT-OCR2.0...") if not dry_run: try: import torch from transformers import pipeline device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") logger.info("Downloading GOT-OCR2.0 model (this may take a while)...") pipeline( "image-to-text", model="Ucas-HaoranWei/GOT-OCR2.0", # Updated model path cache_dir=str(self.cache_dir), device=device, ) self.installed_models.add("got-ocr") logger.info("SUCCESS: GOT-OCR2.0 installed") except Exception as e: logger.error(f"FAILED: GOT-OCR2.0 installation failed: {e}") return False return True async def install_tesseract(self, dry_run: bool = False) -> bool: """Install Tesseract OCR.""" logger.info("Installing Tesseract OCR...") if not dry_run: try: import pytesseract from PIL import Image # Try to get tesseract version try: version = pytesseract.get_tesseract_version() logger.info(f"Tesseract version: {version}") except Exception: logger.warning("Tesseract binary not found in PATH") logger.info( "Please install Tesseract OCR from: https://github.com/UB-Mannheim/tesseract/wiki" ) logger.info( "Or run: choco install tesseract (Windows) / brew install tesseract (macOS)" ) return False # Test with a simple image logger.info("Testing Tesseract...") test_img = Image.new("RGB", (100, 50), color="white") from PIL import ImageDraw draw = ImageDraw.Draw(test_img) draw.text((10, 10), "TEST", fill="black") test_img.save(self.cache_dir / "test_tesseract.png") text = pytesseract.image_to_string(str(self.cache_dir / "test_tesseract.png")) if text.strip(): logger.info("SUCCESS: Tesseract installed and working") self.installed_models.add("tesseract") return True else: logger.warning("Tesseract installed but test failed") return False except Exception as e: logger.error(f"FAILED: Tesseract installation failed: {e}") return False return True async def install_easyocr(self, dry_run: bool = False) -> bool: """Install EasyOCR.""" logger.info("Installing EasyOCR...") if not dry_run: try: 
import easyocr logger.info("Downloading EasyOCR models (this may take a while)...") reader = easyocr.Reader(["en"], gpu=False) # Start with CPU # Test with simple text logger.info("Testing EasyOCR...") test_img = [[0] * 100 for _ in range(50)] # Simple test array results = reader.readtext(test_img) logger.info("SUCCESS: EasyOCR installed and working") self.installed_models.add("easyocr") return True except Exception as e: logger.error(f"FAILED: EasyOCR installation failed: {e}") return False return True async def install_system_dependencies(self, dry_run: bool = False) -> bool: """Install system-level dependencies.""" logger.info("Checking system dependencies...") import platform system = platform.system().lower() if system == "windows": logger.info("Windows detected - checking for required packages...") # Check if required Windows packages are available import importlib.util if importlib.util.find_spec("comtypes") is not None: logger.info("SUCCESS: comtypes available for WIA scanner support") else: logger.warning("comtypes not available - scanner support limited") elif system == "linux": logger.info("Linux detected - checking for system packages...") # Could check for tesseract, etc. 
elif system == "darwin": # macOS logger.info("macOS detected - checking for system packages...") return True async def verify_installation(self, backends: list[str]) -> bool: """Verify that all requested backends are properly installed.""" logger.info("Verifying installations...") success = True for backend in backends: if backend == "deepseek-ocr": try: from transformers import pipeline pipeline( "image-to-text", model="deepseek-ai/DeepSeek-OCR", cache_dir=str(self.cache_dir), ) logger.info("✓ DeepSeek-OCR: Verified") except Exception as e: logger.error(f"✗ DeepSeek-OCR: Verification failed: {e}") success = False elif backend == "florence-2": try: from transformers import AutoModelForCausalLM, AutoProcessor AutoProcessor.from_pretrained( "microsoft/Florence-2-base", cache_dir=str(self.cache_dir) ) AutoModelForCausalLM.from_pretrained( "microsoft/Florence-2-base", cache_dir=str(self.cache_dir) ) logger.info("✓ Florence-2: Verified") except Exception as e: logger.error(f"✗ Florence-2: Verification failed: {e}") success = False elif backend == "mistral-ocr": try: import os import httpx api_key = os.getenv("MISTRAL_API_KEY") if not api_key: logger.error("✗ Mistral OCR: MISTRAL_API_KEY not set") success = False else: response = httpx.get( "https://api.mistral.ai/v1/models", headers={"Authorization": f"Bearer {api_key}"}, timeout=10, ) if response.status_code == 200: logger.info("✓ Mistral OCR: API connection verified") else: logger.error(f"✗ Mistral OCR: API test failed: {response.status_code}") success = False except Exception as e: logger.error(f"✗ Mistral OCR: Verification failed: {e}") success = False elif backend == "pp-ocrv5": try: # Just verify the import works - actual initialization may vary by PaddleOCR version logger.info("✓ PP-OCRv5: Import verified (models download on first use)") except Exception as e: logger.error(f"✗ PP-OCRv5: Verification failed: {e}") success = False elif backend == "tesseract": try: import pytesseract 
pytesseract.get_tesseract_version() logger.info("✓ Tesseract: Verified") except Exception as e: logger.error(f"✗ Tesseract: Verification failed: {e}") success = False elif backend == "easyocr": try: import easyocr easyocr.Reader(["en"], gpu=False) logger.info("✓ EasyOCR: Verified") except Exception as e: logger.error(f"✗ EasyOCR: Verification failed: {e}") success = False # For other backends, just check if they're marked as installed elif backend in self.installed_models: logger.info(f"✓ {backend}: Marked as installed") else: logger.warning(f"? {backend}: Installation status unknown") success = False return success async def install_all( self, backends: list[str], dry_run: bool = False, skip_verification: bool = False ) -> bool: """Install all specified backends.""" logger.info(f"Installing backends: {', '.join(backends)}") if dry_run: logger.info("DRY RUN MODE - No actual installation will occur") # Install system dependencies first await self.install_system_dependencies(dry_run) # Backend installation mapping backend_installers = { "mistral-ocr": self.install_mistral_ocr, "deepseek-ocr": self.install_deepseek_ocr, "florence-2": self.install_florence_2, "dots-ocr": self.install_dots_ocr, "pp-ocrv5": self.install_pp_ocrv5, "qwen-image-layered": self.install_qwen_image_layered, "got-ocr": self.install_got_ocr, "tesseract": self.install_tesseract, "easyocr": self.install_easyocr, } success = True installed_backends = [] for backend in backends: if backend not in backend_installers: logger.error(f"Unknown backend: {backend}") success = False continue logger.info(f"\n{'=' * 50}") logger.info(f"Installing {backend}") logger.info("=" * 50) try: installer = backend_installers[backend] if await installer(dry_run): installed_backends.append(backend) logger.info(f"SUCCESS: {backend} installation completed") else: logger.error(f"FAILED: {backend} installation failed") success = False except Exception as e: logger.error(f"ERROR: {backend} installation crashed: {e}") success = 
False # Save installation status if not dry_run: self.save_status() # Verify installations if not skip_verification and not dry_run and installed_backends: logger.info(f"\n{'=' * 50}") logger.info("Verifying installations") logger.info("=" * 50) if await self.verify_installation(installed_backends): logger.info("SUCCESS: All installations verified") else: logger.warning("WARNING: Some installations failed verification") success = False # Summary logger.info(f"\n{'=' * 50}") logger.info("Installation Summary") logger.info("=" * 50) logger.info(f"Cache directory: {self.cache_dir}") logger.info(f"Backends requested: {len(backends)}") logger.info(f"Backends installed: {len(installed_backends)}") logger.info(f"Overall success: {'YES' if success else 'NO'}") if installed_backends: logger.info("Installed backends:") for backend in installed_backends: logger.info(f" ✓ {backend}") else: logger.info("No backends were installed") return success async def main(): """Main installation function.""" parser = argparse.ArgumentParser( description="OCR-MCP Model Installation Script", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) parser.add_argument( "--backends", nargs="+", choices=[ "mistral-ocr", "deepseek-ocr", "florence-2", "dots-ocr", "pp-ocrv5", "qwen-image-layered", "got-ocr", "tesseract", "easyocr", "all", ], default=["all"], help="Backends to install (default: all)", ) parser.add_argument( "--skip-verification", action="store_true", help="Skip verification after installation" ) parser.add_argument( "--force-redownload", action="store_true", help="Force redownload of existing models" ) parser.add_argument("--cache-dir", type=Path, help="Custom cache directory for models") parser.add_argument( "--gpu-only", action="store_true", help="Only install GPU-compatible models" ) parser.add_argument( "--cpu-only", action="store_true", help="Only install CPU-compatible models" ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be 
installed without doing it" ) parser.add_argument("--verbose", "-v", action="store_true", help="Increase verbosity") args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # Handle "all" backends if "all" in args.backends: all_backends = [ "mistral-ocr", "deepseek-ocr", "florence-2", "dots-ocr", "pp-ocrv5", "qwen-image-layered", "got-ocr", "tesseract", "easyocr", ] else: all_backends = args.backends # Filter by GPU/CPU preferences (simplified) if args.gpu_only: # Only install backends that support GPU gpu_backends = [ "mistral-ocr", "deepseek-ocr", "florence-2", "pp-ocrv5", "qwen-image-layered", "got-ocr", ] all_backends = [b for b in all_backends if b in gpu_backends] logger.info("GPU-only mode: Only installing GPU-compatible backends") if args.cpu_only: # All backends can run on CPU, but prefer CPU-optimized ones cpu_backends = ["tesseract", "easyocr", "pp-ocrv5"] all_backends = [b for b in all_backends if b in cpu_backends] logger.info("CPU-only mode: Only installing CPU-optimized backends") # Initialize installer installer = ModelInstaller(cache_dir=args.cache_dir, force_redownload=args.force_redownload) # Run installation success = await installer.install_all( all_backends, dry_run=args.dry_run, skip_verification=args.skip_verification ) return success async def main(): """Main installation function (renamed to avoid conflict).""" parser = argparse.ArgumentParser( description="OCR-MCP Model Installation Script", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) parser.add_argument( "--backends", nargs="+", choices=[ "mistral-ocr", "deepseek-ocr", "florence-2", "dots-ocr", "pp-ocrv5", "qwen-image-layered", "got-ocr", "tesseract", "easyocr", "all", ], default=["all"], help="Backends to install (default: all)", ) parser.add_argument( "--skip-verification", action="store_true", help="Skip verification after installation" ) parser.add_argument( "--force-redownload", action="store_true", help="Force redownload 
of existing models" ) parser.add_argument("--cache-dir", type=Path, help="Custom cache directory for models") parser.add_argument( "--gpu-only", action="store_true", help="Only install GPU-compatible models" ) parser.add_argument( "--cpu-only", action="store_true", help="Only install CPU-compatible models" ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be installed without doing it" ) parser.add_argument("--verbose", "-v", action="store_true", help="Increase verbosity") args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # Handle "all" backends if "all" in args.backends: all_backends = [ "mistral-ocr", "deepseek-ocr", "florence-2", "dots-ocr", "pp-ocrv5", "qwen-image-layered", "got-ocr", "tesseract", "easyocr", ] else: all_backends = args.backends # Filter by GPU/CPU preferences (simplified) if args.gpu_only: # Only install backends that support GPU gpu_backends = [ "mistral-ocr", "deepseek-ocr", "florence-2", "pp-ocrv5", "qwen-image-layered", "got-ocr", ] all_backends = [b for b in all_backends if b in gpu_backends] logger.info("GPU-only mode: Only installing GPU-compatible backends") if args.cpu_only: # All backends can run on CPU, but prefer CPU-optimized ones cpu_backends = ["tesseract", "easyocr", "pp-ocrv5"] all_backends = [b for b in all_backends if b in cpu_backends] logger.info("CPU-only mode: Only installing CPU-optimized backends") # Initialize installer installer = ModelInstaller(cache_dir=args.cache_dir, force_redownload=args.force_redownload) # Run installation success = await installer.install_all( all_backends, dry_run=args.dry_run, skip_verification=args.skip_verification ) return success if __name__ == "__main__": try: success = asyncio.run(main()) sys.exit(0 if success else 1) except KeyboardInterrupt: logger.info("Installation interrupted by user") sys.exit(1) except Exception as e: logger.error(f"Installation failed with error: {e}") sys.exit(1)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sandraschi/ocr-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

install_models.py•31.2 KiB