image
Generate images from text prompts or edit existing images using AI models, supporting multiple aspect ratios and resolutions, with results saved as files.
Instructions
Generate images via OpenRouter-compatible or OpenAI-compatible endpoints.
CAPABILITIES:
Text-to-image generation with multiple providers
Image editing and transformation with reference images
Multiple aspect ratios and resolutions (1K/2K/4K)
RESPONSE FORMAT:
Returns XML with file paths to generated images
Images saved to disk (no base64 in response)
Includes text descriptions when available
BEST PRACTICES:
Be descriptive: describe scenes, lighting, style, composition
Use negative constraints in prompt: "no text", "no watermark", "no blur"
For editing: provide reference image and specify what to keep
Supports: reference images for editing.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| prompt | Yes | Image generation prompt. Structure: <goal>what you want to generate (can be a statement)</goal> <context>detailed background info - the more the better</context> <hope>desired visual outcome, can be abstract</hope>. Example: <goal>Create a 4-panel comic about debugging</goal> <context>Developer finds a bug at 3am, tries multiple fixes, finally discovers it was a typo, comedic relief for tech blog</context> <hope>simple black-white line art, speech bubbles, exaggerated tired expressions</hope> | |
| images | No | Reference images for editing or style transfer. | |
| model | No | Model to use (default: from IMAGE_MODEL env). | |
| aspect_ratio | No | Output image aspect ratio. Default: 1:1 (square). | 1:1 |
| resolution | No | Output resolution. 1K (1024px), 2K (2048px), 4K (4096px). Default: 1K. | 1K |
| quality | No | Image quality (OpenAI generations API). Options: standard, hd. | standard |
| save_path | Yes | Base directory for saving images. Files saved to {save_path}/{task_note}/. | |
| api_type | No | API type to use. Default: from IMAGE_API_TYPE env var (openrouter_chat). | openrouter_chat |
| task_note | Yes | Subdirectory name for saving images (English recommended, e.g., 'hero-banner', 'product-shot'). Also shown in GUI. | |
Implementation Reference
# The ImageHandler class implements the core logic for the 'image' tool as a
# ToolHandler subclass. It defines the tool name, description, input schema,
# validation, and the async handle method that pushes prompts to GUI, creates
# ImageInvoker, executes image generation, formats the XML response with debug
# info, and pushes results to GUI.
class ImageHandler(ToolHandler):
    """Tool handler for the ``image`` image-generation tool."""

    @property
    def name(self) -> str:
        # Tool identifier used for MCP registration and GUI routing.
        return "image"

    @property
    def description(self) -> str:
        # Human/LLM-facing tool description surfaced through MCP list_tools.
        return """Generate images via OpenRouter-compatible or OpenAI-compatible endpoints.

CAPABILITIES:
- Text-to-image generation with multiple providers
- Image editing and transformation with reference images
- Multiple aspect ratios and resolutions (1K/2K/4K)

RESPONSE FORMAT:
- Returns XML with file paths to generated images
- Images saved to disk (no base64 in response)
- Includes text descriptions when available

BEST PRACTICES:
- Be descriptive: describe scenes, lighting, style, composition
- Use negative constraints in prompt: "no text", "no watermark", "no blur"
- For editing: provide reference image and specify what to keep

Supports: reference images for editing."""

    def get_input_schema(self) -> dict[str, Any]:
        """Return the JSON schema describing this tool's arguments."""
        return {
            "type": "object",
            "properties": {
                "prompt": {"type": "string", "description": "Image generation prompt"},
                "save_path": {"type": "string", "description": "Base directory for saving images"},
                "task_note": {"type": "string", "description": "Subdirectory name for saving images"},
                "model": {"type": "string", "default": ""},
                "images": {"type": "array", "items": {"type": "object"}, "default": []},
                "aspect_ratio": {"type": "string", "default": "1:1"},
                "resolution": {"type": "string", "default": "1K", "enum": ["1K", "2K", "4K"]},
                "quality": {"type": "string", "default": "standard"},
                "api_type": {"type": "string", "default": ""},
                "debug": {"type": "boolean", "description": "Enable debug output"},
            },
            "required": ["prompt", "save_path", "task_note"],
        }

    def validate(self, arguments: dict[str, Any]) -> str | None:
        """Return an error message if a required argument is missing, else None."""
        for required in ("prompt", "save_path", "task_note"):
            if not arguments.get(required):
                return f"Missing required argument: '{required}'"
        return None

    async def handle(
        self,
        arguments: dict[str, Any],
        ctx: ToolContext,
    ) -> list[TextContent]:
        """Execute image generation and return the XML response as text content.

        Pushes the user prompt and final result to the GUI, relays invoker
        events while generation runs, and optionally appends debug info.
        """
        prompt = arguments.get("prompt", "")
        task_note = arguments.get("task_note", "")

        # Surface the user's prompt in the GUI before any work starts.
        ctx.push_user_prompt("image", prompt, task_note)

        def relay_event(event: Any) -> None:
            # Forward invoker events to the GUI while it is running.
            if ctx.gui_manager and ctx.gui_manager.is_running:
                payload = event.model_dump() if hasattr(event, "model_dump") else dict(event.__dict__)
                payload["source"] = "image"
                ctx.gui_manager.push_event(payload)

        # Build the invoker and its parameter object from the raw arguments.
        invoker = ImageInvoker(event_callback=relay_event)
        params = ImageParams(
            prompt=prompt,
            model=arguments.get("model", ""),
            images=arguments.get("images", []),
            save_path=arguments.get("save_path", ""),
            task_note=task_note,
            aspect_ratio=arguments.get("aspect_ratio", "1:1"),
            resolution=arguments.get("resolution", "1K"),
            quality=arguments.get("quality", "standard"),
            api_type=arguments.get("api_type", ""),
        )

        try:
            result = await invoker.execute(params)

            # Guard clause: failed generations short-circuit to an error response.
            if not result.success:
                return format_error_response(result.error or "Unknown error")

            response = result.response_xml

            # Append a <debug_info> section only when debug output is enabled.
            debug_enabled = ctx.resolve_debug(arguments)
            if debug_enabled:
                response += (
                    f"\n<debug_info>"
                    f"\n <image_count>{len(result.artifacts) if result.artifacts else 0}</image_count>"
                    f"\n <duration_sec>{result.duration_sec:.3f}</duration_sec>"
                    f"\n <model>{params.model or 'env:IMAGE_MODEL'}</model>"
                    f"\n <api_type>{params.api_type or 'env:IMAGE_API_TYPE'}</api_type>"
                    f"\n</debug_info>"
                )

            # Mirror the outcome to the GUI, including debug metadata if enabled.
            gui_metadata: dict[str, Any] = {
                "artifacts": result.artifacts,
                "task_note": task_note,
            }
            if debug_enabled:
                gui_metadata["debug"] = {
                    "image_count": len(result.artifacts) if result.artifacts else 0,
                    "duration_sec": result.duration_sec,
                    "model": params.model or "env:IMAGE_MODEL",
                    "api_type": params.api_type or "env:IMAGE_API_TYPE",
                }
            ctx.push_to_gui({
                "category": "operation",
                "operation_type": "tool_call",
                "source": "image",
                "session_id": f"image_{result.request_id}",
                "name": "image",
                "status": "success",
                "output": response,
                "metadata": gui_metadata,
            })
            return [TextContent(type="text", text=response)]
        except asyncio.CancelledError:
            # Cancellation must propagate so the caller can unwind cleanly.
            raise
        except Exception as e:
            logger.exception(f"Image tool error: {e}")
            return format_error_response(str(e))
- The JSON schema for the 'image' tool is defined in create_tool_schema(cli_type='image'), incorporating IMAGE_PROPERTIES (lines 385-433) and a structured prompt description. This schema is used in the MCP Tool's inputSchema.if cli_type == "image": properties: dict[str, Any] = { "prompt": { "type": "string", "description": ( "Image generation prompt. Structure: " "<goal>what you want to generate (can be a statement)</goal> " "<context>detailed background info - the more the better</context> " "<hope>desired visual outcome, can be abstract</hope>. " "Example: <goal>Create a 4-panel comic about debugging</goal> " "<context>Developer finds a bug at 3am, tries multiple fixes, finally discovers it was a typo, comedic relief for tech blog</context> " "<hope>simple black-white line art, speech bubbles, exaggerated tired expressions</hope>" ), }, } properties.update(IMAGE_PROPERTIES) properties["task_note"] = { "type": "string", "description": "Subdirectory name for saving images (English recommended, e.g., 'hero-banner', 'product-shot'). Also shown in GUI.", } return { "type": "object", "properties": properties, "required": ["prompt", "save_path", "task_note"], }
- src/cli_agent_mcp/server.py:104-112 (registration)In the MCP server's list_tools() method, the 'image' tool is registered by adding it to the list of available tools if allowed by config, using name='image', description from TOOL_DESCRIPTIONS, and schema from create_tool_schema.for cli_type in ["codex", "gemini", "claude", "opencode", "banana", "image"]: if config.is_tool_allowed(cli_type): tools.append( Tool( name=cli_type, description=TOOL_DESCRIPTIONS[cli_type], inputSchema=create_tool_schema(cli_type), ) )
- src/cli_agent_mcp/server.py:211-213 (registration)In the MCP server's call_tool() method, when base_name=='image', it instantiates ImageHandler and calls its handle method to execute the tool.if base_name == "image": handler = ImageHandler() return await handler.handle(arguments, tool_ctx)
# The ImageInvoker class provides the supporting logic for image generation,
# executing ImageClient.generate(), processing results into XML format with
# file paths, emitting events, and returning ImageExecutionResult used by the
# handler.
class ImageInvoker:
    """Invoker wrapping the Image API behind a CLI-invoker-style interface.

    Example:
        invoker = ImageInvoker()
        result = await invoker.execute(ImageParams(
            prompt="A beautiful sunset",
            task_note="sunset-wallpaper",
        ))
    """

    def __init__(
        self,
        event_callback: EventCallback | None = None,
    ) -> None:
        self._event_callback = event_callback
        # Lazily constructed client; see _get_client().
        self._client: ImageClient | None = None

    @property
    def cli_type(self) -> str:
        return "image"

    @property
    def cli_name(self) -> str:
        return "image"

    def _get_client(self) -> ImageClient:
        """Create the client on first use and cache it."""
        if self._client is None:
            self._client = ImageClient(
                event_callback=self._on_client_event,
            )
        return self._client

    def _on_client_event(self, event: dict[str, Any]) -> None:
        """Translate raw client events into unified events for the callback."""
        if not self._event_callback:
            return

        kind = event.get("type", "")
        if kind == "generation_started":
            mapped = make_fallback_event(
                CLISource.UNKNOWN,
                {
                    "type": "system",
                    "subtype": "info",
                    "message": f"Processing: {event.get('prompt', '')[:50]}...",
                    "source": "image",
                },
            )
        elif kind == "generation_completed":
            mapped = make_fallback_event(
                CLISource.UNKNOWN,
                {
                    "type": "system",
                    "subtype": "info",
                    "message": f"Generated {event.get('artifact_count', 0)} image(s)",
                    "source": "image",
                },
            )
        elif kind == "generation_failed":
            mapped = make_fallback_event(
                CLISource.UNKNOWN,
                {
                    "type": "system",
                    "subtype": "error",
                    "severity": "error",
                    "message": f"Failed: {event.get('error', 'Unknown error')}",
                    "source": "image",
                },
            )
        elif kind == "api_retry":
            mapped = make_fallback_event(
                CLISource.UNKNOWN,
                {
                    "type": "system",
                    "subtype": "warning",
                    "severity": "warning",
                    "message": f"API error {event.get('status_code')}, retrying in {event.get('delay')}s...",
                    "source": "image",
                },
            )
        else:
            # Unknown event types are deliberately ignored.
            return
        self._event_callback(mapped)

    def _parse_images(self, images: list[dict[str, Any]]) -> list[ImageInput]:
        """Parse the reference-image list, keeping only valid entries.

        Paths must be absolute and must exist; anything else is skipped with
        a warning rather than failing the whole request.
        """
        accepted: list[ImageInput] = []
        for entry in images:
            source = entry.get("source", "")
            if not source:
                continue
            candidate = Path(source)
            if not candidate.is_absolute():
                logger.warning(f"Skipping non-absolute image path: {source}")
                continue
            # Resolve to real path (follows symlinks).
            try:
                real_path = candidate.resolve()
            except (OSError, ValueError) as e:
                logger.warning(f"Skipping invalid image path {source}: {e}")
                continue
            if not real_path.exists():
                logger.warning(f"Skipping non-existent image: {real_path}")
                continue
            accepted.append(ImageInput(source=str(real_path)))
        return accepted

    def _build_response_xml(self, response: ImageResponse) -> str:
        """Render the response as an <image-response> XML string."""
        request_id = html.escape(response.request_id, quote=True)
        model = html.escape(response.model, quote=True)
        parts = [
            f'<image-response request_id="{request_id}" model="{model}">'
        ]
        if response.text_content:
            parts.append(f' <text>{_escape_xml(response.text_content)}</text>')
        for artifact in response.artifacts:
            artifact_id = html.escape(artifact.id, quote=True)
            kind = html.escape(artifact.kind, quote=True)
            mime_type = html.escape(artifact.mime_type, quote=True)
            path = html.escape(artifact.path, quote=True)
            sha256 = html.escape(artifact.sha256, quote=True)
            parts.append(
                f' <artifact id="{artifact_id}" kind="{kind}" '
                f'mime_type="{mime_type}" path="{path}" '
                f'sha256="{sha256}"/>'
            )
        parts.append('</image-response>')
        return '\n'.join(parts)

    async def execute(self, params: ImageParams) -> ImageExecutionResult:
        """Run one image-generation request and return an execution result.

        Never raises on API/runtime errors (they are folded into the result);
        only asyncio.CancelledError is re-raised. The client is closed after
        every execution.
        """
        started = time.time()

        if not params.prompt:
            return ImageExecutionResult(
                success=False,
                error="prompt is required",
            )

        # Use save_path as-is (no subdirectory is created here).
        output_dir = params.save_path
        # Sanitize task_note for use as a filename prefix.
        task_note = sanitize_task_note(params.task_note)

        request = ImageRequest(
            prompt=params.prompt,
            model=params.model,
            images=self._parse_images(params.images),
            output_dir=output_dir,
            task_note=task_note,
            aspect_ratio=params.aspect_ratio,
            resolution=params.resolution,
            quality=params.quality,
            api_type=params.api_type,
        )

        client = self._get_client()
        try:
            response = await client.generate(request)
            elapsed = time.time() - started
            if not response.success:
                return ImageExecutionResult(
                    success=False,
                    request_id=response.request_id,
                    error=response.error,
                    duration_sec=elapsed,
                )
            return ImageExecutionResult(
                success=True,
                request_id=response.request_id,
                response_xml=self._build_response_xml(response),
                artifacts=[a.path for a in response.artifacts],
                duration_sec=elapsed,
            )
        except asyncio.CancelledError:
            raise
        except Exception as e:
            logger.exception(f"Image execution failed: {e}")
            return ImageExecutionResult(
                success=False,
                error=str(e),
                duration_sec=time.time() - started,
            )
        finally:
            # Always release the client so the next execute() starts fresh.
            if self._client:
                await self._client.close()
                self._client = None