"""
Florence-2 Backend Implementation
Integrates Microsoft's Florence-2 vision foundation model for OCR
"""
import logging
from pathlib import Path
from typing import Any
import torch
from PIL import Image
from ..core.backend_manager import OCRBackend
from ..core.config import OCRConfig
try:
from transformers import AutoModelForCausalLM, AutoProcessor
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
logger = logging.getLogger(__name__)
class FlorenceBackend(OCRBackend):
"""Florence-2 backend for comprehensive vision-language OCR"""
def __init__(self, config: OCRConfig):
super().__init__("florence-2", config)
self.model = None
self.processor = None
self.device = config.device or ("cuda" if torch.cuda.is_available() else "cpu")
self.model_name = "microsoft/Florence-2-base"
self.cache_dir = Path(config.cache_dir or Path.home() / ".cache" / "ocr_mcp")
self.cache_dir.mkdir(parents=True, exist_ok=True)
def is_available(self) -> bool:
"""Check if Florence-2 is available"""
if not TRANSFORMERS_AVAILABLE:
return False
try:
from huggingface_hub import model_info
model_info(self.model_name)
return True
except Exception as e:
logger.warning(f"Florence-2 model check failed: {e}")
return False
async def load_model(self) -> bool:
"""Load the Florence-2 model"""
if not TRANSFORMERS_AVAILABLE:
logger.error("Transformers not available for Florence-2")
return False
try:
logger.info(f"Loading Florence-2 model on {self.device}")
# Load processor and model
self.processor = AutoProcessor.from_pretrained(
self.model_name, cache_dir=str(self.cache_dir), trust_remote_code=True
)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
cache_dir=str(self.cache_dir),
trust_remote_code=True,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
device_map="auto" if self.device == "cuda" else None,
)
if self.device == "cpu":
self.model = self.model.to(self.device)
logger.info("Florence-2 model loaded successfully")
return True
except Exception as e:
logger.error(f"Failed to load Florence-2 model: {e}")
return False
async def process_document(
self,
image_path: str,
ocr_mode: str = "text",
region: list[int] | None = None,
) -> dict[str, Any]:
"""Process document with Florence-2"""
if not self.model or not self.processor:
raise RuntimeError("Florence-2 model not loaded")
try:
# Load and preprocess image
image = Image.open(image_path).convert("RGB")
# Apply region cropping if specified
if region and len(region) == 4:
x1, y1, x2, y2 = region
image = image.crop((x1, y1, x2, y2))
# Prepare inputs based on task
if ocr_mode == "text":
task = "<OCR>"
elif ocr_mode == "format":
task = "<OCR_WITH_REGION>"
else: # fine-grained
task = "<OCR_WITH_REGION>"
inputs = self.processor(text=task, images=image, return_tensors="pt")
if self.device == "cuda":
inputs = {k: v.cuda() for k, v in inputs.items()}
# Generate OCR results
with torch.no_grad():
generated_ids = self.model.generate(
input_ids=inputs["input_ids"],
pixel_values=inputs["pixel_values"],
max_length=1024,
num_beams=4,
early_stopping=True,
do_sample=False,
)
# Decode results
generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[
0
]
# Parse Florence-2 output format
result = self._parse_florence_output(generated_text, ocr_mode, image.size)
return result
except Exception as e:
logger.error(f"Florence-2 processing failed: {e}")
raise RuntimeError(f"OCR processing failed: {str(e)}")
def _parse_florence_output(
self, raw_output: str, ocr_mode: str, image_size: tuple
) -> dict[str, Any]:
"""Parse Florence-2 output format"""
# Florence-2 returns structured output with regions
# This is a simplified parser - actual implementation would handle
# the specific output format of Florence-2
# Extract text content
text_content = self._extract_text_from_florence_output(raw_output)
if ocr_mode == "text":
return {
"text": text_content,
"backend": "florence",
"confidence": 0.92,
"regions": [],
}
elif ocr_mode == "format":
return {
"text": text_content,
"backend": "florence",
"confidence": 0.92,
"structured": self._parse_structured_florence_output(raw_output),
"regions": [],
}
else: # fine-grained
return {
"text": text_content,
"backend": "florence",
"confidence": 0.92,
"regions": self._extract_florence_regions(raw_output, image_size),
"structured": {},
}
def _extract_text_from_florence_output(self, raw_output: str) -> str:
"""Extract text content from Florence-2 output"""
# This would parse the actual Florence-2 output format
# For now, return a cleaned version
return raw_output.replace("<pad>", "").replace("</s>", "").strip()
def _parse_structured_florence_output(self, raw_output: str) -> dict[str, Any]:
"""Parse structured output from Florence-2"""
# Florence-2 can provide layout information
return {"layout": "document", "sections": [], "reading_order": []}
def _extract_florence_regions(self, raw_output: str, image_size: tuple) -> list[dict[str, Any]]:
"""Extract region information from Florence-2 output"""
# Florence-2 provides bounding box information
# This would parse actual region data
width, height = image_size
return [
{
"bbox": [0, 0, width, height],
"text": self._extract_text_from_florence_output(raw_output),
"confidence": 0.92,
}
]
def get_capabilities(self) -> dict[str, Any]:
"""Get backend capabilities"""
return {
"name": "Florence-2",
"available": self.is_available(),
"modes": ["text", "format", "fine-grained"],
"languages": ["en", "multilingual"],
"gpu_support": True,
"strengths": ["layout_understanding", "multi_task", "region_detection"],
"limitations": ["model_size", "complexity"],
"model_size": "~2.5GB",
}