# image-description-mcp_server.py • 9.5 kB
#!/usr/bin/env python3
"""
Simple image-description-mcp MCP Server - AI-powered image analysis using Grok API
"""
import os
import sys
import logging
import base64
import json
from datetime import datetime, timezone
import httpx
from PIL import Image
import io
import pytesseract
import cv2
import numpy as np
from mcp.server.fastmcp import FastMCP
# Configure logging to stderr. The server is started with the stdio transport
# (see the __main__ section), so stdout is kept free of log output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stderr,
)
logger = logging.getLogger("image-description-mcp-server")

# Initialize MCP server - NO PROMPT PARAMETER!
mcp = FastMCP("image-description-mcp")

# Configuration
# The API key is read once at import time; an empty string means "not set".
GROK_API_KEY = os.environ.get("GROK_API_KEY", "")
# Image *analysis* goes through the chat-completions endpoint (the URL that
# call_grok_api posts to). This constant previously named the image
# *generation* endpoint, which this server never uses.
GROK_API_URL = "https://api.x.ai/v1/chat/completions"
# === UTILITY FUNCTIONS ===
def encode_image_to_base64(image_path_or_url: str, is_url: bool = False) -> str:
    """Return the raw bytes of an image as a base64-encoded string.

    Args:
        image_path_or_url: Local file path, or a URL when ``is_url`` is true.
        is_url: When true, download the image with ``httpx``; otherwise read
            it from the local filesystem.

    Returns:
        Base64 text of the unmodified image bytes (no ``data:`` prefix).

    Raises:
        Exception: Any download or file error is logged and re-raised
            unchanged, so the caller decides how to report it.
    """
    try:
        if is_url:
            # httpx does not follow redirects unless asked to, and
            # raise_for_status() rejects a 3xx response. Image hosts and CDNs
            # redirect routinely, so opt in explicitly.
            response = httpx.get(
                image_path_or_url,
                timeout=30,
                follow_redirects=True,
            )
            response.raise_for_status()
            image_data = response.content
        else:
            # Binary mode: the bytes must reach the encoder untouched.
            with open(image_path_or_url, 'rb') as f:
                image_data = f.read()
        return base64.b64encode(image_data).decode('utf-8')
    except Exception as e:
        logger.error(f"Error encoding image: {e}")
        raise
def get_image_metadata(image_path_or_url: str, is_url: bool = False) -> dict:
    """Return basic technical metadata for an image.

    Args:
        image_path_or_url: Local file path, or a URL when ``is_url`` is true.
        is_url: When true, download the image with ``httpx`` first.

    Returns:
        A dict with ``format``, ``size`` (as reported by PIL), ``mode`` and
        ``file_size_bytes``. On any failure the error is logged and
        ``{"error": "<message>"}`` is returned instead of raising.
    """
    try:
        if is_url:
            # Follow redirects: httpx does not by default, and
            # raise_for_status() would reject the 3xx response.
            response = httpx.get(
                image_path_or_url,
                timeout=30,
                follow_redirects=True,
            )
            response.raise_for_status()
            downloaded = response.content
            source = io.BytesIO(downloaded)
        else:
            downloaded = None
            source = image_path_or_url

        # The context manager releases the underlying file handle, which a
        # bare Image.open() on a path would leave open.
        with Image.open(source) as img:
            if downloaded is not None:
                file_size = len(downloaded)
            else:
                file_size = os.path.getsize(image_path_or_url)
            return {
                "format": img.format,
                "size": img.size,
                "mode": img.mode,
                "file_size_bytes": file_size,
            }
    except Exception as e:
        logger.error(f"Error getting metadata: {e}")
        return {"error": str(e)}
def extract_text_from_image_ocr(image_path_or_url: str, is_url: bool = False) -> str:
    """Run Tesseract OCR on an image and return the recognized text.

    Args:
        image_path_or_url: Local file path, or a URL when ``is_url`` is true.
        is_url: When true, download the image with ``httpx`` first.

    Returns:
        The recognized text with surrounding whitespace stripped. Best-effort:
        any failure (download, decode, OCR) is logged and yields ``""``.
    """
    try:
        if is_url:
            # Follow redirects: httpx does not by default, and
            # raise_for_status() would reject the 3xx response.
            response = httpx.get(
                image_path_or_url,
                timeout=30,
                follow_redirects=True,
            )
            response.raise_for_status()
            source = io.BytesIO(response.content)
        else:
            source = image_path_or_url

        with Image.open(source) as img:
            # Let PIL produce the grayscale image. Converting the raw array
            # by shape mishandles some modes: a palette ("P") image becomes a
            # 2-D array of palette indices rather than intensities, and a
            # 2-channel "LA" image is rejected by an RGB->gray conversion.
            gray = np.array(img.convert("L"))

        text = pytesseract.image_to_string(gray)
        return text.strip()
    except Exception as e:
        logger.error(f"OCR error: {e}")
        return ""
# Leading "magic" bytes of the image formats that are labeled explicitly.
_IMAGE_SIGNATURES = (
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
)


def _detect_image_mime(image_base64: str) -> str:
    """Best-effort MIME type of a base64-encoded image; defaults to JPEG."""
    try:
        # 32 base64 characters decode to the first 24 bytes, enough for every
        # signature checked below without decoding the whole image.
        head = base64.b64decode(image_base64[:32])
    except (ValueError, TypeError):
        return "image/jpeg"
    for signature, mime in _IMAGE_SIGNATURES:
        if head.startswith(signature):
            return mime
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


async def call_grok_api(image_base64: str, prompt: str) -> str:
    """Send an image plus a text prompt to the Grok chat-completions API.

    Args:
        image_base64: Base64-encoded image bytes (no ``data:`` prefix).
        prompt: Instruction text sent alongside the image.

    Returns:
        The text content of the first choice in the API response. Failures
        are not raised: a missing API key, an HTTP error status, or any other
        exception is returned as a string starting with "❌".
    """
    if not GROK_API_KEY:
        return "❌ Error: GROK_API_KEY environment variable not set"
    try:
        headers = {
            "Authorization": f"Bearer {GROK_API_KEY}",
            "Content-Type": "application/json",
        }
        # Label the data URL with the image's real type rather than always
        # claiming JPEG; unknown content falls back to image/jpeg.
        mime_type = _detect_image_mime(image_base64)
        payload = {
            "model": "grok-4-0709",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 1000,
        }
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.x.ai/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"]
    except httpx.HTTPStatusError as e:
        return f"❌ API Error: {e.response.status_code} - {e.response.text}"
    except Exception as e:
        # Also covers an unexpected response shape (KeyError/IndexError).
        return f"❌ Error calling Grok API: {str(e)}"
# === MCP TOOLS ===
@mcp.tool()
async def describe_image_url(url: str = "", detail_level: str = "basic") -> str:
    """Analyze image from URL and provide AI-generated description using Grok."""
    # NOTE: the docstring above is left as-is on purpose; FastMCP exposes a
    # tool's docstring to clients as its description.
    #
    # url:          image URL; surrounding whitespace is ignored.
    # detail_level: "basic", "detailed" or "comprehensive" (case-insensitive);
    #               anything else is treated as "basic".
    # Returns a human-readable string: "✅ ..." followed by a JSON document on
    # success, or a "❌ ..." message on failure. Never raises.
    logger.info(f"Executing describe_image_url with {url}, detail_level: {detail_level}")
    if not url.strip():
        return "❌ Error: URL is required"
    if not GROK_API_KEY:
        return "❌ Error: GROK_API_KEY environment variable not set"

    # Whitespace passed validation above but would break the HTTP request.
    url = url.strip()

    prompts = {
        "comprehensive": "Provide a comprehensive analysis of this image including: main subjects, colors, composition, mood, technical details, and any notable features. Be detailed and thorough.",
        "detailed": "Provide a detailed description of this image including: main subjects, setting, colors, composition, and key features.",
        "basic": "Provide a clear, concise description of what you see in this image.",
    }
    level = (detail_level or "").strip().lower()
    if level not in prompts:
        level = "basic"

    try:
        # NOTE(review): both helpers download the image with blocking calls.
        metadata = get_image_metadata(url, is_url=True)
        image_base64 = encode_image_to_base64(url, is_url=True)

        description = await call_grok_api(image_base64, prompts[level])
        # call_grok_api reports failures as a "❌ ..." string instead of
        # raising; pass that through rather than wrapping it as a success.
        if description.startswith("❌"):
            return description

        result = {
            "description": description,
            "metadata": metadata,
            "source": url,
            # Report the level actually applied, not the raw argument.
            "analysis_level": level,
        }
        return f"✅ Image Analysis Complete:\n{json.dumps(result, indent=2)}"
    except Exception as e:
        logger.error(f"Error: {e}")
        return f"❌ Error analyzing image: {str(e)}"
@mcp.tool()
async def describe_image_file(file_path: str = "", detail_level: str = "basic") -> str:
    """Analyze local image file and provide AI-generated description using Grok."""
    # NOTE: the docstring above is left as-is on purpose; FastMCP exposes a
    # tool's docstring to clients as its description.
    #
    # file_path:    path of an image on the server's filesystem.
    # detail_level: "basic", "detailed" or "comprehensive" (case-insensitive);
    #               anything else is treated as "basic".
    # Returns a human-readable string: "✅ ..." followed by a JSON document on
    # success, or a "❌ ..." message on failure. Never raises.
    logger.info(f"Executing describe_image_file with {file_path}, detail_level: {detail_level}")
    if not file_path.strip():
        return "❌ Error: File path is required"
    if not os.path.exists(file_path):
        return f"❌ Error: File not found: {file_path}"
    if not GROK_API_KEY:
        return "❌ Error: GROK_API_KEY environment variable not set"

    prompts = {
        "comprehensive": "Provide a comprehensive analysis of this image including: main subjects, colors, composition, mood, technical details, and any notable features. Be detailed and thorough.",
        "detailed": "Provide a detailed description of this image including: main subjects, setting, colors, composition, and key features.",
        "basic": "Provide a clear, concise description of what you see in this image.",
    }
    level = (detail_level or "").strip().lower()
    if level not in prompts:
        level = "basic"

    try:
        metadata = get_image_metadata(file_path, is_url=False)
        image_base64 = encode_image_to_base64(file_path, is_url=False)

        description = await call_grok_api(image_base64, prompts[level])
        # call_grok_api reports failures as a "❌ ..." string instead of
        # raising; pass that through rather than wrapping it as a success.
        if description.startswith("❌"):
            return description

        result = {
            "description": description,
            "metadata": metadata,
            "source": file_path,
            # Report the level actually applied, not the raw argument.
            "analysis_level": level,
        }
        return f"✅ Image Analysis Complete:\n{json.dumps(result, indent=2)}"
    except Exception as e:
        logger.error(f"Error: {e}")
        return f"❌ Error analyzing image file: {str(e)}"
@mcp.tool()
async def extract_text_from_image(url: str = "") -> str:
    """Extract readable text from image using OCR."""
    # Returns a human-readable status string; the success case embeds a JSON
    # document holding the OCR text, the image metadata and the source URL.
    logger.info(f"Executing extract_text_from_image with {url}")
    if url.strip() == "":
        return "❌ Error: URL is required"

    try:
        ocr_text = extract_text_from_image_ocr(url, is_url=True)
        if ocr_text:
            # Metadata is only fetched once there is text worth reporting.
            report = {
                "extracted_text": ocr_text,
                "metadata": get_image_metadata(url, is_url=True),
                "source": url,
                "method": "OCR",
            }
            return "✅ Text Extraction Complete:\n" + json.dumps(report, indent=2)
        # The OCR helper yields an empty string when nothing was recognized.
        return "⚠️ No text found in the image"
    except Exception as exc:
        logger.error(f"Error: {exc}")
        return f"❌ Error extracting text: {str(exc)}"
# === SERVER STARTUP ===
def _run_server() -> None:
    """Start the MCP server on the stdio transport; exit with status 1 on failure."""
    logger.info("Starting image-description-mcp MCP server...")

    # A missing key is only warned about: the server still starts, but the
    # Grok-backed tools will answer with an error.
    api_key_present = bool(GROK_API_KEY)
    if not api_key_present:
        logger.warning("GROK_API_KEY not set - image analysis tools will not work")

    try:
        mcp.run(transport='stdio')
    except Exception as err:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"Server error: {err}")
        sys.exit(1)


if __name__ == "__main__":
    _run_server()