#!/usr/bin/env python3
"""
豆包图片描述 MCP 服务器
使用豆包大模型的视觉理解能力来描述图片。
"""
import os
import base64
import logging
import json
import hashlib
import time
import io
from typing import Optional
from pathlib import Path
# Route all log output to stderr so it cannot interfere with the stdio
# transport the MCP server communicates over.
import sys

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stderr,
)
logger = logging.getLogger(__name__)
# Pillow is optional: without it images are sent as-is instead of being
# resized/re-compressed first.
try:
    from PIL import Image
except ImportError:
    PIL_AVAILABLE = False
    logger.warning("PIL/Pillow 未安装,图片优化功能不可用。安装: pip install Pillow")
else:
    PIL_AVAILABLE = True
    logger.info("PIL/Pillow 已安装,图片优化功能可用")
# Credentials and model selection are read from the environment.
# IMPORTANT: supply your own Doubao API key via VOLCENGINE_API_KEY; the
# fallback value below is only a placeholder.
# Keys can be obtained from https://console.volcengine.com/
API_KEY = os.getenv("VOLCENGINE_API_KEY", "YOUR_DOUBAO_API_KEY_HERE")
MODEL_ID = os.getenv("DOUBAO_MODEL_ID", "doubao-seed-1-6-251015")
# Third-party dependencies. A missing package is logged together with an
# installation hint and then re-raised, because the server cannot run
# without them.
try:
    from mcp.server.fastmcp import FastMCP
    from volcenginesdkarkruntime import Ark
    # NOTE(review): httpx is not referenced in the visible code; presumably
    # imported here so a missing install fails fast - confirm before removing.
    import httpx
except ImportError as e:
    logger.error(f"导入依赖失败: {e}")
    logger.error("请运行: pip install mcp[cli] volcengine-python-sdk[ark] httpx")
    raise

# MCP server instance; the tool functions register on it via @mcp.tool().
mcp = FastMCP("doubao-image-describer")

# File extensions accepted by check_image_format().
SUPPORTED_IMAGE_FORMATS = {
    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp',
    '.tiff', '.tif', '.ico', '.svg', '.heic', '.heif',
    '.raw', '.cr2', '.nef', '.arw', '.dng'
}

# Doubao (Ark) client. `client` stays None when no usable key is configured
# or construction fails; call_doubao_vision_api() turns that into a clear
# configuration error instead of an opaque authentication failure.
client = None
if not API_KEY or API_KEY == "YOUR_DOUBAO_API_KEY_HERE":
    logger.error("豆包客户端初始化失败: 未配置 API Key,请设置环境变量 VOLCENGINE_API_KEY")
else:
    try:
        client = Ark(api_key=API_KEY)
    except Exception as e:
        logger.error(f"豆包客户端初始化失败: {e}")
    else:
        logger.info("豆包客户端初始化成功")
def check_image_format(file_path_or_url: str, is_url: bool = False) -> tuple[bool, str]:
    """Check whether a file path or URL carries a supported image extension.

    Args:
        file_path_or_url: Local file path or URL to inspect.
        is_url: True when the input is a URL. A URL whose path has no
            extension is accepted, because some image URLs carry none.

    Returns:
        (is_valid, error_message): ``error_message`` is empty when valid.
    """
    if is_url:
        # Only the URL *path* may contribute an extension. Parsing the URL
        # drops the query string and fragment and keeps the host name out of
        # the picture (otherwise "https://example.com/" looks like ".com").
        from urllib.parse import urlsplit
        try:
            path_part = urlsplit(file_path_or_url).path
        except ValueError:
            # Malformed URL: fall back to plain query/fragment stripping.
            path_part = file_path_or_url.split('?')[0].split('#')[0]
    else:
        # '?' and '#' can be ordinary characters in local file names
        # (e.g. "photo #1.jpg"), so a local path is used verbatim.
        path_part = file_path_or_url

    ext = Path(path_part).suffix.lower()

    if not ext:
        if is_url:
            return True, ""
        return False, "错误: 无法识别文件扩展名。请提供有效的图片文件路径"

    if ext not in SUPPORTED_IMAGE_FORMATS:
        supported_list = ', '.join(sorted(SUPPORTED_IMAGE_FORMATS))
        return False, (
            f"错误: 不支持的文件格式 '{ext}'\n"
            f"支持的图片格式: {supported_list}\n"
            f"请确保输入的是图片文件(如 .jpg, .png, .gif 等)"
        )

    return True, ""
# 图片描述缓存
class ImageDescriptionCache:
"""图片描述缓存类"""
def __init__(self, cache_dir: str = None):
if cache_dir is None:
cache_dir = os.path.expanduser("~/.iflow/cache/doubao-image-mcp")
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"缓存目录: {self.cache_dir}")
def _get_file_hash(self, file_path: str) -> str:
"""获取文件内容哈希(用于检测文件变化)"""
try:
with open(file_path, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()[:8]
except Exception:
return hashlib.md5(str(file_path).encode()).hexdigest()[:8]
def get_cache_key(self, image_path: str, model: str, prompt: str) -> str:
"""生成缓存键"""
file_hash = self._get_file_hash(image_path)
key_str = f"{model}_{prompt}_{file_hash}"
return hashlib.md5(key_str.encode()).hexdigest()
def get(self, image_path: str, model: str, prompt: str) -> str | None:
"""获取缓存结果"""
key = self.get_cache_key(image_path, model, prompt)
cache_file = self.cache_dir / f"{key}.json"
if cache_file.exists():
try:
with open(cache_file, 'r', encoding='utf-8') as f:
data = json.load(f)
logger.info(f"缓存命中: {image_path}")
return data.get('description')
except Exception as e:
logger.warning(f"读取缓存失败: {e}")
return None
def set(self, image_path: str, model: str, prompt: str, description: str):
"""保存结果到缓存"""
key = self.get_cache_key(image_path, model, prompt)
cache_file = self.cache_dir / f"{key}.json"
try:
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump({
'description': description,
'timestamp': time.time(),
'image_path': image_path,
'model': model,
'prompt': prompt
}, f, ensure_ascii=False, indent=2)
logger.info(f"结果已缓存: {image_path}")
except Exception as e:
logger.warning(f"保存缓存失败: {e}")
# Module-level cache instance (default cache directory); used by describe()
cache = ImageDescriptionCache()
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB


def check_file_size(file_path: str) -> tuple[bool, str]:
    """Verify that a file is non-empty and no larger than MAX_FILE_SIZE.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        (is_valid, error_message): ``error_message`` is empty when valid.
    """
    bytes_per_mb = 1024 * 1024

    try:
        size = Path(file_path).stat().st_size
    except Exception as e:
        # Missing file, permission problem, etc.
        return False, f"错误: 无法检查文件大小: {e}"

    if size > MAX_FILE_SIZE:
        size_mb = size / bytes_per_mb
        limit_mb = MAX_FILE_SIZE / bytes_per_mb
        return False, (
            f"错误: 文件过大 ({size_mb:.1f}MB)\n"
            f"最大支持 {limit_mb:.0f}MB\n"
            f"建议: 压缩图片或使用较小的文件"
        )

    if size == 0:
        return False, "错误: 文件为空"

    return True, ""
# Image optimisation settings
OPTIMIZE_SIZE = (1920, 1080)  # bounding box passed to Image.thumbnail
OPTIMIZE_QUALITY = 85  # JPEG quality
OPTIMIZE_MAX_SIZE_MB = 2  # files larger than this (in MB) are optimised


def optimize_image(image_path: str) -> tuple[bytes, str]:
    """Shrink an image: convert to RGB, downscale if needed, re-encode as JPEG.

    Falls back to the unmodified file bytes when Pillow is unavailable or
    when any step of the optimisation fails.

    Args:
        image_path: Path of the image file.

    Returns:
        (image_bytes, status_message): the bytes to send and a status text.
    """
    if not PIL_AVAILABLE:
        # Pillow missing: hand back the original bytes untouched.
        with open(image_path, 'rb') as f:
            return f.read(), "未优化(PIL未安装)"

    try:
        start_time = time.time()

        # The baseline for the compression statistics is the size of the file
        # on disk (what would be sent without optimisation), not the decoded
        # pixel buffer.
        original_size = os.path.getsize(image_path)
        original_mb = original_size / (1024 * 1024)

        with Image.open(image_path) as img:
            # JPEG output requires RGB (input may be RGBA, P, ...).
            if img.mode != 'RGB':
                img = img.convert('RGB')

            # Downscale only when a side exceeds the bounding box.
            if img.size[0] > OPTIMIZE_SIZE[0] or img.size[1] > OPTIMIZE_SIZE[1]:
                img.thumbnail(OPTIMIZE_SIZE, Image.LANCZOS)
                logger.info(f"图片已调整: {img.size}")

            # Re-encode as JPEG into memory.
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=OPTIMIZE_QUALITY, optimize=True)

        optimized = buffer.getvalue()
        optimized_size = len(optimized)
        optimized_mb = optimized_size / (1024 * 1024)
        compress_time = time.time() - start_time
        # Guard the ratio against a zero-byte baseline.
        reduction = (1 - optimized_size / original_size) * 100 if original_size else 0.0

        status = (
            f"优化完成: {original_mb:.1f}MB → {optimized_mb:.1f}MB "
            f"({reduction:.0f}% 压缩), 耗时 {compress_time:.2f}秒"
        )
        logger.info(status)
        return optimized, status
    except Exception as e:
        # Best effort: any failure falls back to the original file bytes.
        logger.warning(f"图片优化失败: {e},使用原始图片")
        with open(image_path, 'rb') as f:
            return f.read(), f"优化失败: {e}"
def encode_image_to_base64(image_path: str) -> str:
    """Read a local image and return it Base64-encoded.

    Files larger than OPTIMIZE_MAX_SIZE_MB are passed through
    optimize_image() first; smaller files are encoded unchanged.

    Raises:
        ValueError: if the file does not exist or cannot be read/encoded.
    """
    try:
        size_in_mb = Path(image_path).stat().st_size / (1024 * 1024)

        if not size_in_mb > OPTIMIZE_MAX_SIZE_MB:
            with open(image_path, 'rb') as source:
                payload = source.read()
            logger.info(f"文件大小适中 ({size_in_mb:.1f}MB),无需优化")
        else:
            logger.info(f"文件较大 ({size_in_mb:.1f}MB),启用自动优化...")
            payload, note = optimize_image(image_path)
            logger.info(note)

        text = base64.b64encode(payload).decode('utf-8')
        logger.info(f"成功编码图片: {image_path} (Base64: {len(text):,} 字符)")
        return text
    except FileNotFoundError:
        raise ValueError(f"文件不存在: {image_path}")
    except Exception as e:
        raise ValueError(f"编码图片失败: {str(e)}")
def call_doubao_vision_api(image_input: dict, prompt: str = "请详细描述这张图片的内容,包括主要物体、场景、颜色、布局等细节。") -> str:
    """Send one image plus a text prompt to the Doubao vision model.

    Args:
        image_input: Message content part describing the image, as built by
            the callers (``{"type": "image_url", "image_url": {"url": ...}}``).
        prompt: Instruction text sent alongside the image.

    Returns:
        The model's reply text, or an error message string. Failures are
        reported through the return value rather than raised.
    """
    if not client:
        return "错误: 豆包客户端未正确初始化,请检查 API Key 配置"

    try:
        content = [
            {"type": "text", "text": prompt},
            image_input
        ]

        logger.info(f"调用豆包 API,模型: {MODEL_ID}")
        response = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": content}],
        )

        if not response.choices:
            return "错误: 未收到有效的响应"

        result = response.choices[0].message.content
        if not result:
            # A choice without text must not be returned (or cached by the
            # caller) as if it were a successful description.
            logger.warning("豆包 API 返回了空内容")
            return "错误: 未收到有效的响应"

        logger.info("豆包 API 调用成功")
        return result
    except Exception as e:
        error_msg = f"调用豆包 API 失败: {str(e)}"
        logger.error(error_msg)
        return error_msg
# NOTE: the docstrings of the @mcp.tool() functions below are kept verbatim in
# their original language. FastMCP is documented to expose a tool's docstring
# as its description to clients, so they are treated as runtime text;
# maintainer notes are written as comments instead.

# Prefixes of the error strings produced by the local-file pipeline. Used to
# decide whether a result is a real description worth caching.
_ERROR_PREFIXES = ("错误", "调用豆包 API 失败", "处理图片失败")

# Leading-byte signatures used to label Base64 image data with its real type.
_IMAGE_SIGNATURES = (
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"BM", "image/bmp"),
    (b"II*\x00", "image/tiff"),
    (b"MM\x00*", "image/tiff"),
)


def _sniff_image_mime(b64_data: str) -> Optional[str]:
    """Return the MIME type implied by the first bytes of Base64 image data.

    Returns None when the data is not decodable or the signature is unknown.
    """
    try:
        # 24 Base64 characters decode to 18 bytes - enough for every signature.
        head = base64.b64decode(b64_data[:24])
    except ValueError:
        return None
    for signature, mime in _IMAGE_SIGNATURES:
        if head.startswith(signature):
            return mime
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return "image/webp"
    return None


# Tool: describe a local image file.
#   file_path: path of the image on the local filesystem.
#   prompt:    optional instruction for the model; when omitted, the default
#              prompt of call_doubao_vision_api() is used.
# Returns the description, or an error message string (errors are not raised).
@mcp.tool()
async def describe_image_from_file(file_path: str, prompt: Optional[str] = None) -> str:
    """从本地文件路径描述图片"""
    try:
        path = Path(file_path)

        # The path must exist ...
        if not path.exists():
            return f"错误: 文件不存在: {file_path}"

        # ... and be a regular file.
        if not path.is_file():
            return f"错误: 路径不是文件: {file_path}"

        # Extension check.
        is_valid, error_msg = check_image_format(file_path)
        if not is_valid:
            logger.warning(f"图片格式检查失败: {file_path}")
            return error_msg

        base64_data = encode_image_to_base64(file_path)

        # Small files are sent unmodified, so the payload is not necessarily
        # JPEG; label the data URL with the detected type (JPEG as fallback).
        mime_type = _sniff_image_mime(base64_data) or "image/jpeg"
        image_input = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{base64_data}"}
        }

        if prompt is None:
            return call_doubao_vision_api(image_input)
        return call_doubao_vision_api(image_input, prompt)

    except Exception as e:
        error_msg = f"处理图片失败: {str(e)}"
        logger.error(error_msg)
        return error_msg


# Tool: describe an image supplied as Base64 text (optionally a full
# "data:image/...;base64,..." URL).
# Returns the description, or an error message string (errors are not raised).
@mcp.tool()
async def describe_image_from_base64(base64_data: str, prompt: str = "请详细描述这张图片的内容") -> str:
    """从 Base64 编码描述图片"""
    try:
        clean_base64 = base64_data.strip()

        # Split off a data-URL header, remembering the type it declares.
        declared_mime = None
        if clean_base64.startswith('data:image') and ',' in clean_base64:
            header, clean_base64 = clean_base64.split(',', 1)
            candidate = header[len('data:'):].split(';', 1)[0].strip().lower()
            if candidate.startswith('image/') and len(candidate) > len('image/'):
                declared_mime = candidate

        # Base64 is frequently line-wrapped; whitespace carries no data.
        clean_base64 = "".join(clean_base64.split())
        if not clean_base64:
            return "错误: 无效的 Base64 编码"

        # Strict validation of the payload.
        try:
            base64.b64decode(clean_base64, validate=True)
        except Exception:
            return "错误: 无效的 Base64 编码"

        # Prefer the type detected from the bytes, then the caller's declared
        # type, then JPEG.
        mime_type = _sniff_image_mime(clean_base64) or declared_mime or "image/jpeg"
        image_input = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{clean_base64}"}
        }

        return call_doubao_vision_api(image_input, prompt)

    except Exception as e:
        error_msg = f"处理 Base64 图片失败: {str(e)}"
        logger.error(error_msg)
        return error_msg


# Tool: describe an image reachable over http(s). The URL is passed to the
# API as-is; nothing is downloaded in this function.
# Returns the description, or an error message string (errors are not raised).
@mcp.tool()
async def describe_image_from_url(url: str, prompt: str = "请详细描述这张图片的内容") -> str:
    """从网络 URL 描述图片"""
    try:
        # Only http/https URLs are accepted.
        if not url.startswith(('http://', 'https://')):
            return "错误: URL 必须以 http:// 或 https:// 开头"

        # Extension check on the URL (URLs without an extension pass).
        is_valid, error_msg = check_image_format(url, is_url=True)
        if not is_valid:
            logger.warning(f"图片格式检查失败: {url}")
            return error_msg

        image_input = {
            "type": "image_url",
            "image_url": {"url": url}
        }

        return call_doubao_vision_api(image_input, prompt)

    except Exception as e:
        error_msg = f"处理 URL 图片失败: {str(e)}"
        logger.error(error_msg)
        return error_msg


# Tool: convenience entry point that accepts either an http(s) URL or a local
# file path and dispatches to the matching tool above.
#   - The caller's prompt is forwarded for BOTH input kinds.
#   - Only local-file results are cached (keyed by file content, model and
#     prompt); cached answers are returned with a "[缓存] " prefix.
#   - Local files are size-checked before any API call.
# Returns the description, or an error message string (errors are not raised).
@mcp.tool()
async def describe(image_input: str, prompt: str = "请详细描述这张图片的内容") -> str:
    """智能描述图片(自动识别 URL 或本地文件路径)
    这是推荐的简化接口,会自动判断输入类型并调用相应的方法。
    Args:
        image_input: 图片输入,可以是:
            - 网络 URL(以 http:// 或 https:// 开头)
            - 本地文件路径(如 D:\\download\\photo.jpg)
        prompt: 描述提示词(可选,默认为详细描述)
    Returns:
        豆包视觉理解模型返回的图片描述
    Examples:
        描述网络图片:
            describe("https://example.com/image.jpg")
        描述本地文件:
            describe("D:\\\\download\\\\photo.jpg")
            describe("D:/download/photo.jpg")
        使用自定义提示词:
            describe("image.jpg", "请描述图片中的颜色和构图")
    """
    try:
        # Step 1: classify the input.
        is_url = image_input.startswith(('http://', 'https://'))

        # Step 2: extension check (rules differ for URLs and file paths).
        is_valid, error_msg = check_image_format(image_input, is_url=is_url)
        if not is_valid:
            logger.warning(f"图片格式检查失败: {image_input}")
            return error_msg

        # Remote image: delegate directly; URL results are not cached.
        if is_url:
            logger.info("检测到网络 URL,使用 describe_image_from_url")
            return await describe_image_from_url(image_input, prompt)

        # Step 3 (local files only): cache lookup, then size check.
        cached_result = cache.get(image_input, MODEL_ID, prompt)
        if cached_result:
            logger.info(f"使用缓存结果: {image_input}")
            return f"[缓存] {cached_result}"

        size_valid, size_error = check_file_size(image_input)
        if not size_valid:
            logger.warning(f"文件大小检查失败: {image_input}")
            return size_error

        # Step 4: describe the local file with the caller's prompt, so the
        # result matches the prompt the cache entry is keyed on.
        logger.info("检测到本地文件路径,使用 describe_image_from_file")
        result = await describe_image_from_file(image_input, prompt)

        # Cache successful descriptions only. Errors are recognised by their
        # known prefixes, so a description that merely mentions an error is
        # still cached.
        if result and not result.startswith(_ERROR_PREFIXES):
            cache.set(image_input, MODEL_ID, prompt, result)
        return result

    except Exception as e:
        error_msg = f"处理图片失败: {str(e)}"
        logger.error(error_msg)
        return error_msg
def main():
    """Start the MCP server on the stdio transport."""
    logger.info("启动豆包图片描述 MCP 服务器")
    # Never write key material to the log: report only whether a key is
    # configured, plus its length as a sanity hint.
    if API_KEY and API_KEY != "YOUR_DOUBAO_API_KEY_HERE":
        logger.info(f"API Key: 已配置 (长度 {len(API_KEY)})")
    else:
        logger.warning("API Key: 未配置,请设置环境变量 VOLCENGINE_API_KEY")
    logger.info(f"Model ID: {MODEL_ID}")
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()