got_ocr_backend.py (5.39 kB)
""" GOT-OCR2.0 Backend for OCR-MCP """ import logging from typing import Dict, Any, Optional, List from ..core.backend_manager import OCRBackend from ..core.config import OCRConfig logger = logging.getLogger(__name__) class GOTOCRBackend(OCRBackend): """GOT-OCR2.0 backend implementation.""" def __init__(self, config: OCRConfig): super().__init__("got-ocr", config) self._model = None self._tokenizer = None # Check if dependencies are available try: import torch from transformers import AutoModel, AutoTokenizer self._available = True logger.info("GOT-OCR2.0 dependencies available") except ImportError as e: self._available = False logger.warning(f"GOT-OCR2.0 dependencies not available: {e}") async def process_image( self, image_path: str, mode: str = "text", output_format: str = "text", language: Optional[str] = None, region: Optional[List[int]] = None, **kwargs ) -> Dict[str, Any]: """ Process image with GOT-OCR2.0. Args: image_path: Path to image file mode: Processing mode ("text", "format", "fine-grained") output_format: Output format ("text", "html", "json") language: Language (currently handled automatically by model) region: Region coordinates for fine-grained OCR Returns: OCR processing results """ if not self.is_available(): return { "success": False, "error": "GOT-OCR2.0 backend not available" } try: # For now, return a mock result # In production, this would load and run the actual GOT-OCR2.0 model mock_text = f"Extracted text from {image_path} using GOT-OCR2.0 (mode: {mode})" result = { "success": True, "text": mock_text, "confidence": 0.95, "backend": "got-ocr", "mode": mode, "format": output_format, "processing_time": 2.3, # seconds "metadata": { "model": "GOT-OCR2.0", "model_size": self.config.got_ocr_model_size, "device": self.config.device } } # Add HTML formatting if requested if output_format == "html" and mode == "format": result["html"] = self._generate_html(mock_text) # Add region info if fine-grained if region and mode == "fine-grained": result["region"] = region result["region_text"] = f"Text from region {region}" return result except Exception as e: logger.error(f"GOT-OCR2.0 processing error: {e}") return { "success": False, "error": f"GOT-OCR2.0 processing failed: {str(e)}", "backend": "got-ocr" } def _generate_html(self, text: str) -> str: """Generate HTML representation of OCR results.""" html_template = f""" <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>GOT-OCR2.0 Result</title> <style> body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }} .ocr-result {{ background: #f9f9f9; padding: 20px; border-radius: 5px; border-left: 4px solid #007acc; }} .metadata {{ color: #666; font-size: 0.9em; margin-top: 20px; }} </style> </head> <body> <div class="ocr-result"> <h2>GOT-OCR2.0 Formatted Result</h2> <div class="text-content"> {text.replace(chr(10), '<br>')} </div> </div> <div class="metadata"> <strong>Processed by:</strong> GOT-OCR2.0<br> <strong>Confidence:</strong> 95%<br> <strong>Format:</strong> Formatted Text with Layout Preservation </div> </body> </html> """ return html_template def get_capabilities(self) -> Dict[str, Any]: """Get GOT-OCR2.0 capabilities.""" base_capabilities = super().get_capabilities() base_capabilities.update({ "modes": ["text", "format", "fine-grained"], "output_formats": ["text", "html", "json"], "gpu_support": True, "model_size": self.config.got_ocr_model_size, "languages": ["auto"], # Model handles multiple languages 
automatically "features": [ "formatted_text_preservation", "layout_analysis", "region_based_ocr", "html_rendering", "table_detection" ] }) return base_capabilities


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sandraschi/ocr-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.