"""MCP Image Recognition Server.

by mario-andreschak

Package location: src/image_recognition_server
"""
import base64
import io
import logging
import os
from typing import Union
from dotenv import load_dotenv
from mcp.server.fastmcp import FastMCP
from PIL import Image
from .utils.image import image_to_base64, validate_base64_image
from .utils.ocr import OCRError, extract_text_from_image
from .vision.anthropic import AnthropicVision
from .vision.openai import OpenAIVision
# Load environment variables from a .env file, if one is present.
load_dotenv()

# Output encoding, defaulting to UTF-8.
DEFAULT_ENCODING = "utf-8"
# MCP_OUTPUT_ENCODING overrides the encoding used when sanitizing tool output.
ENCODING = os.getenv("MCP_OUTPUT_ENCODING", DEFAULT_ENCODING)

# Configure logging to a file stored next to this module.
log_file_path = os.path.join(os.path.dirname(__file__), "mcp_server.log")
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),  # LOG_LEVEL env var, default INFO
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    filename=log_file_path,
    filemode="a",  # Append to log file
)
logger = logging.getLogger(__name__)
logger.info(f"Using encoding: {ENCODING}")
def sanitize_output(text: str) -> str:
    """Replace characters that cannot be represented in the configured encoding.

    Round-trips *text* through ``ENCODING`` with the "replace" error handler so
    downstream consumers never see un-encodable characters.

    Args:
        text: String to sanitize; ``None`` is tolerated.

    Returns:
        str: The sanitized string; an empty string when *text* is ``None``;
        the original string unchanged if the round-trip itself fails.
    """
    if text is None:
        return ""  # Return empty string for None
    try:
        encoded = text.encode(ENCODING, "replace")
        return encoded.decode(ENCODING)
    except Exception as e:
        # Best-effort: log and fall back to the unmodified input.
        logger.error(f"Error during sanitization: {str(e)}", exc_info=True)
        return text
# Create the MCP server instance; tools are registered below via @mcp.tool().
mcp = FastMCP(
    "mcp-image-recognition",
    description="MCP server for image recognition using Anthropic and OpenAI vision APIs",
)
# Initialize vision clients
def get_vision_client() -> Union[AnthropicVision, OpenAIVision]:
    """Construct a vision client for the configured provider.

    Reads VISION_PROVIDER (default "anthropic"). If the primary client cannot
    be constructed, FALLBACK_PROVIDER is tried before re-raising the original
    error.

    Returns:
        Union[AnthropicVision, OpenAIVision]: The constructed client.

    Raises:
        ValueError: If the configured provider name is not recognized.
        Exception: Whatever the underlying client constructor raises, when no
            usable fallback is configured.
    """
    provider = os.getenv("VISION_PROVIDER", "anthropic").lower()

    def build(name: str):
        # Map a provider name to a freshly-constructed client; None if unknown.
        if name == "anthropic":
            return AnthropicVision()
        if name == "openai":
            return OpenAIVision()
        return None

    try:
        client = build(provider)
        if client is None:
            raise ValueError(f"Invalid vision provider: {provider}")
        return client
    except Exception as e:
        # Try fallback provider if configured
        fallback = os.getenv("FALLBACK_PROVIDER")
        if fallback and fallback.lower() != provider:
            logger.warning(
                f"Primary provider failed: {str(e)}. Trying fallback: {fallback}"
            )
            fallback_client = build(fallback.lower())
            if fallback_client is not None:
                return fallback_client
        raise
async def process_image_with_ocr(image_data: str, prompt: str) -> str:
    """Describe an image with vision AI, optionally appending OCR output.

    Args:
        image_data: Base64 encoded image data
        prompt: Prompt for vision AI

    Returns:
        str: Sanitized description, with tesseract-ocr text appended when the
        ENABLE_OCR environment variable is "true".

    Raises:
        ValueError: If the vision API yields nothing useful, or if OCR fails
            while OCR is enabled.
    """
    client = get_vision_client()

    # OpenAI's client exposes an async API; Anthropic's is synchronous.
    if isinstance(client, OpenAIVision):
        description = await client.describe_image(image_data, prompt)
    else:
        description = client.describe_image(image_data, prompt)

    # Treat an empty or placeholder answer as a failure.
    if not description or description == "No description available.":
        raise ValueError("Vision API returned empty or default response")

    if os.getenv("ENABLE_OCR", "false").lower() == "true":
        try:
            # Decode the base64 payload into a PIL image for tesseract.
            pil_image = Image.open(io.BytesIO(base64.b64decode(image_data)))
            ocr_text = extract_text_from_image(pil_image, ocr_required=True)
            if ocr_text:
                description += (
                    f"\n\nAdditionally, this is the output of tesseract-ocr: {ocr_text}"
                )
        except OCRError as e:
            # OCR failures are fatal when OCR is explicitly enabled.
            logger.error(f"OCR processing failed: {str(e)}")
            raise ValueError(f"OCR Error: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error during OCR: {str(e)}")
            raise

    return sanitize_output(description)
@mcp.tool()
async def describe_image(
    image: str, prompt: str = "Please describe this image in detail."
) -> str:
    """Describe the contents of a base64-encoded image using vision AI.

    Args:
        image: Image data and MIME type
        prompt: Optional prompt to use for the description.

    Returns:
        str: Detailed description of the image

    Raises:
        ValueError: For invalid base64 input or an empty pipeline result.
    """
    try:
        logger.info(f"Processing image description request with prompt: {prompt}")
        logger.debug(f"Image data length: {len(image)}")

        # Reject anything that is not valid base64 image data up front.
        if not validate_base64_image(image):
            raise ValueError("Invalid base64 image data")

        result = await process_image_with_ocr(image, prompt)
        if not result:
            raise ValueError("Received empty response from processing")
    except ValueError as e:
        logger.error(f"Input error: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error describing image: {str(e)}", exc_info=True)
        raise
    else:
        logger.info("Successfully processed image")
        return sanitize_output(result)
@mcp.tool()
async def describe_image_from_file(
    filepath: str, prompt: str = "Please describe this image in detail."
) -> str:
    """Describe the contents of an image file using vision AI.

    Args:
        filepath: Path to the image file
        prompt: Optional prompt to use for the description.

    Returns:
        str: Detailed description of the image

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        ValueError: For invalid image data or an empty pipeline result.
    """
    try:
        logger.info(f"Processing image file: {filepath}")

        # Read the file and encode it as base64 for the vision pipeline.
        image_data, mime_type = image_to_base64(filepath)
        logger.info(f"Successfully converted image to base64. MIME type: {mime_type}")
        logger.debug(f"Base64 data length: {len(image_data)}")

        # Delegate validation and description to the base64 tool.
        result = await describe_image(image=image_data, prompt=prompt)
        if not result:
            raise ValueError("Received empty response from processing")
        return sanitize_output(result)
    except ValueError as e:
        logger.error(f"Input error: {str(e)}")
        raise
    except FileNotFoundError:
        logger.error(f"Image file not found: {filepath}")
        raise
    except Exception as e:
        logger.error(f"Error processing image file: {str(e)}", exc_info=True)
        raise
if __name__ == "__main__":
    # Start the MCP server when this module is executed directly.
    mcp.run()