OCR-MCP

florence_backend.py•7.25 KiB

"""
Florence-2 Backend Implementation

Integrates Microsoft's Florence-2 vision foundation model for OCR.
"""

import logging
from pathlib import Path
from typing import Any

import torch
from PIL import Image

from ..core.backend_manager import OCRBackend
from ..core.config import OCRConfig

try:
    from transformers import AutoModelForCausalLM, AutoProcessor

    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False

logger = logging.getLogger(__name__)

# Fixed score attached to every result. It is NOT computed from the model
# output; it is a placeholder until real confidence estimation exists.
_PLACEHOLDER_CONFIDENCE = 0.92

# Sequence-control tokens that survive decoding because process_document
# decodes with skip_special_tokens=False.
_CONTROL_TOKENS = ("<pad>", "<s>", "</s>")


class FlorenceBackend(OCRBackend):
    """Florence-2 backend for comprehensive vision-language OCR."""

    def __init__(self, config: OCRConfig):
        """Record device/cache settings; the model itself is loaded lazily.

        Args:
            config: Backend configuration. ``config.device`` overrides the
                automatic CUDA/CPU choice and ``config.cache_dir`` overrides
                the default ``~/.cache/ocr_mcp`` download directory.
        """
        super().__init__("florence-2", config)
        self.model = None
        self.processor = None
        self.device = config.device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = "microsoft/Florence-2-base"
        self.cache_dir = Path(config.cache_dir or Path.home() / ".cache" / "ocr_mcp")
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def is_available(self) -> bool:
        """Check if Florence-2 is available.

        Returns:
            ``True`` when ``transformers`` imported successfully and the
            ``huggingface_hub.model_info`` lookup for the model succeeds;
            ``False`` otherwise (the failure is logged as a warning).
        """
        if not TRANSFORMERS_AVAILABLE:
            return False

        try:
            from huggingface_hub import model_info

            model_info(self.model_name)
            return True
        except Exception as e:
            # Best-effort probe: any failure just means "not available".
            logger.warning(f"Florence-2 model check failed: {e}")
            return False

    async def load_model(self) -> bool:
        """Load the Florence-2 processor and model.

        Returns:
            ``True`` on success, ``False`` if ``transformers`` is missing or
            loading raised (the error is logged, not propagated).
        """
        if not TRANSFORMERS_AVAILABLE:
            logger.error("Transformers not available for Florence-2")
            return False

        try:
            logger.info(f"Loading Florence-2 model on {self.device}")

            # Treat indexed devices such as "cuda:0" as CUDA as well; an
            # exact == "cuda" test would leave them on CPU in float32.
            on_cuda = str(self.device).startswith("cuda")
            # Automatic placement is only used for the bare "cuda" request;
            # an explicit device is honoured with .to() below.
            device_map = "auto" if self.device == "cuda" else None

            self.processor = AutoProcessor.from_pretrained(
                self.model_name,
                cache_dir=str(self.cache_dir),
                trust_remote_code=True,
            )

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                cache_dir=str(self.cache_dir),
                trust_remote_code=True,
                torch_dtype=torch.float16 if on_cuda else torch.float32,
                device_map=device_map,
            )

            if device_map is None:
                self.model = self.model.to(self.device)

            logger.info("Florence-2 model loaded successfully")
            return True

        except Exception as e:
            logger.exception("Failed to load Florence-2 model: %s", e)
            return False

    async def process_document(
        self,
        image_path: str,
        ocr_mode: str = "text",
        region: list[int] | None = None,
    ) -> dict[str, Any]:
        """Process a document image with Florence-2.

        Args:
            image_path: Path of the image to read.
            ocr_mode: ``"text"`` selects the ``<OCR>`` task; any other value
                selects ``<OCR_WITH_REGION>``.
            region: Optional ``[x1, y1, x2, y2]`` crop box. It is applied
                only when it has exactly four entries; otherwise it is
                ignored.

        Returns:
            The result dictionary built by ``_parse_florence_output``.

        Raises:
            RuntimeError: If the model is not loaded, or if any step of
                processing fails (the original exception is chained).
        """
        if self.model is None or self.processor is None:
            raise RuntimeError("Florence-2 model not loaded")

        try:
            # Close the file handle deterministically; convert() returns an
            # independent in-memory image.
            with Image.open(image_path) as source:
                image = source.convert("RGB")

            if region and len(region) == 4:
                image = image.crop(tuple(region))

            task = "<OCR>" if ocr_mode == "text" else "<OCR_WITH_REGION>"

            inputs = self.processor(text=task, images=image, return_tensors="pt")

            # Follow the model's actual device and dtype so float16 weights
            # never receive float32 pixel values; integer tensors such as
            # input_ids are only moved, not cast.
            reference = next(self.model.parameters())
            prepared = {}
            for key, value in inputs.items():
                if not isinstance(value, torch.Tensor):
                    prepared[key] = value
                elif torch.is_floating_point(value):
                    prepared[key] = value.to(reference.device, reference.dtype)
                else:
                    prepared[key] = value.to(reference.device)

            with torch.no_grad():
                generated_ids = self.model.generate(
                    input_ids=prepared["input_ids"],
                    pixel_values=prepared["pixel_values"],
                    max_length=1024,
                    num_beams=4,
                    early_stopping=True,
                    do_sample=False,
                )

            generated_text = self.processor.batch_decode(
                generated_ids, skip_special_tokens=False
            )[0]

            return self._parse_florence_output(generated_text, ocr_mode, image.size)

        except Exception as e:
            logger.exception("Florence-2 processing failed: %s", e)
            raise RuntimeError(f"OCR processing failed: {e}") from e

    def _parse_florence_output(
        self, raw_output: str, ocr_mode: str, image_size: tuple
    ) -> dict[str, Any]:
        """Build the result dictionary for one decoded generation.

        This is a simplified parser: text is the cleaned raw output and the
        confidence is a fixed placeholder.

        Args:
            raw_output: Decoded model output, special tokens included.
            ocr_mode: ``"text"``, ``"format"``, or anything else
                (treated as fine-grained).
            image_size: ``(width, height)`` of the processed image.

        Returns:
            A dict with ``text``, ``backend``, ``confidence`` and
            ``regions``; non-text modes also carry ``structured``.
        """
        text_content = self._extract_text_from_florence_output(raw_output)

        if ocr_mode == "text":
            return {
                "text": text_content,
                "backend": "florence",
                "confidence": _PLACEHOLDER_CONFIDENCE,
                "regions": [],
            }

        if ocr_mode == "format":
            return {
                "text": text_content,
                "backend": "florence",
                "confidence": _PLACEHOLDER_CONFIDENCE,
                "structured": self._parse_structured_florence_output(raw_output),
                "regions": [],
            }

        # fine-grained
        return {
            "text": text_content,
            "backend": "florence",
            "confidence": _PLACEHOLDER_CONFIDENCE,
            "regions": self._extract_florence_regions(raw_output, image_size),
            "structured": {},
        }

    def _extract_text_from_florence_output(self, raw_output: str) -> str:
        """Return *raw_output* with sequence-control tokens removed.

        Strips ``<pad>``, ``<s>`` and ``</s>`` and surrounding whitespace.
        NOTE(review): any other markup the model emits (e.g. location
        tokens in region mode) is left untouched - confirm desired handling.
        """
        cleaned = raw_output
        for token in _CONTROL_TOKENS:
            cleaned = cleaned.replace(token, "")
        return cleaned.strip()

    def _parse_structured_florence_output(self, raw_output: str) -> dict[str, Any]:
        """Return a fixed placeholder layout; *raw_output* is not parsed yet."""
        return {"layout": "document", "sections": [], "reading_order": []}

    def _extract_florence_regions(self, raw_output: str, image_size: tuple) -> list[dict[str, Any]]:
        """Return a single placeholder region covering the whole image.

        Args:
            raw_output: Decoded model output; only its cleaned text is used.
            image_size: ``(width, height)`` used as the bounding-box extent.

        Returns:
            A one-element list whose ``bbox`` is ``[0, 0, width, height]``.
        """
        width, height = image_size
        return [
            {
                "bbox": [0, 0, width, height],
                "text": self._extract_text_from_florence_output(raw_output),
                "confidence": _PLACEHOLDER_CONFIDENCE,
            }
        ]

    def get_capabilities(self) -> dict[str, Any]:
        """Get backend capabilities.

        ``available`` is evaluated on every call via ``is_available()``;
        all other entries are static.
        """
        return {
            "name": "Florence-2",
            "available": self.is_available(),
            "modes": ["text", "format", "fine-grained"],
            "languages": ["en", "multilingual"],
            "gpu_support": True,
            "strengths": ["layout_understanding", "multi_task", "region_detection"],
            "limitations": ["model_size", "complexity"],
            "model_size": "~2.5GB",
        }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sandraschi/ocr-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

florence_backend.py•7.25 KiB