generate_with_viewpoint
Generate game assets with precise camera viewpoint control using depth maps for consistent angles like top-down, side, or front views.
Instructions
Generate a game asset with precise camera viewpoint control using ControlNet.
This tool uses depth maps to guide the generation, ensuring consistent camera angles
like top-down, side view, front view, etc.
Args:
prompt: Description of the asset to generate (e.g., "a wooden barrel")
view_type: Camera angle - "topdown", "side", "front", "3/4"
shape: Object shape hint - "flat", "sphere", "cylinder", "box"
preset: Style preset to use (default: topdown_prop)
controlnet_model: ControlNet model (default: diffusers_xl_depth_full.safetensors)
control_strength: How strongly to follow viewpoint (0.0-1.0, default: 0.95)
width: Output width in pixels
height: Output height in pixels
seed: Random seed for reproducibility
save_to_file: Whether to save the image to disk (note: ControlNet results are always written to a file regardless of this flag)
Returns:
JSON with metadata and file paths (base64 image data is omitted; the generated image and depth map are saved to disk)
Note:
Requires ControlNet models installed in ComfyUI. Common depth models:
- diffusers_xl_depth_full.safetensors (SDXL)
- control_v11f1p_sd15_depth.pth (SD1.5)
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| prompt | Yes | Description of the asset to generate | |
| view_type | No | Camera angle - "topdown", "side", "front", "3/4" | topdown |
| shape | No | Object shape hint - "flat", "sphere", "cylinder", "box" | flat |
| preset | No | Style preset to use | topdown_prop |
| controlnet_model | No | ControlNet model | diffusers_xl_depth_full.safetensors |
| control_strength | No | How strongly to follow viewpoint (0.0-1.0) | 0.95 |
| width | No | Output width in pixels | 1024 |
| height | No | Output height in pixels | 1024 |
| seed | No | Random seed for reproducibility | |
| save_to_file | No | Whether to save the image to disk | false |
Implementation Reference
# server/main.py:602-742 (handler)
# Primary handler for the 'generate_with_viewpoint' MCP tool. Decorated with
# @mcp.tool() for automatic registration and schema inference. Implements
# viewpoint-controlled asset generation using ControlNet depth maps.
@mcp.tool()
async def generate_with_viewpoint(
    prompt: str,
    view_type: str = "topdown",
    shape: str = "flat",
    preset: str = "topdown_prop",
    controlnet_model: str = "diffusers_xl_depth_full.safetensors",
    control_strength: float = 0.95,
    width: int = 1024,
    height: int = 1024,
    seed: Optional[int] = None,
    save_to_file: bool = False
) -> str:
    """Generate a game asset with precise camera viewpoint control using ControlNet.

    This tool uses depth maps to guide the generation, ensuring consistent camera angles
    like top-down, side view, front view, etc.

    Args:
        prompt: Description of the asset to generate (e.g., "a wooden barrel")
        view_type: Camera angle - "topdown", "side", "front", "3/4"
        shape: Object shape hint - "flat", "sphere", "cylinder", "box"
        preset: Style preset to use (default: topdown_prop)
        controlnet_model: ControlNet model (default: diffusers_xl_depth_full.safetensors)
        control_strength: How strongly to follow viewpoint (0.0-1.0, default: 0.95)
        width: Output width in pixels
        height: Output height in pixels
        seed: Random seed for reproducibility
        save_to_file: Accepted for interface compatibility; ControlNet output is
            always written to disk regardless (see note at the end of the body)

    Returns:
        JSON with generation metadata and file paths. Base64 image data is always
        omitted (large payloads can break the MCP stdio transport).

    Note:
        Requires ControlNet models installed in ComfyUI. Common depth models:
        - diffusers_xl_depth_full.safetensors (SDXL)
        - control_v11f1p_sd15_depth.pth (SD1.5)
    """
    preset_config = get_preset(preset)

    # Build full prompt with preset
    full_prompt = f"{preset_config.prompt_prefix}{prompt}{preset_config.prompt_suffix}"
    full_negative = preset_config.negative_prompt

    img_width = width
    img_height = height
    render_width = img_width
    render_height = img_height

    # If the requested output is smaller than the preset's native resolution,
    # render at native size and downscale afterwards for better detail.
    should_downscale = (img_width < preset_config.default_width) or (img_height < preset_config.default_height)
    if should_downscale:
        scale = max(
            preset_config.default_width / max(1, img_width),
            preset_config.default_height / max(1, img_height)
        )
        render_width = int(round(img_width * scale))
        render_height = int(round(img_height * scale))

    # Clamp render dimensions to match backend constraints (SDXL-safe):
    # multiples of 8, within [512, 2048].
    render_width = max(512, min(2048, (render_width // 8) * 8))
    render_height = max(512, min(2048, (render_height // 8) * 8))

    # Create depth map for the specified viewpoint
    depth_map = create_depth_map(render_width, render_height, view_type=view_type, shape=shape)

    try:
        # Generate with timeout to prevent hanging
        image_bytes = await asyncio.wait_for(
            backend.generate_with_controlnet(
                prompt=full_prompt,
                control_image=depth_map,
                controlnet_model=controlnet_model,
                control_strength=control_strength,
                negative_prompt=full_negative,
                width=render_width,
                height=render_height,
                seed=seed,
                steps=preset_config.steps,
                cfg_scale=preset_config.cfg_scale,
                sampler=preset_config.sampler,
                scheduler=preset_config.scheduler
            ),
            timeout=300.0  # 5 minute timeout
        )
    except asyncio.TimeoutError:
        return json.dumps({
            "success": False,
            "error": "Generation timed out after 5 minutes",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)
    except NotImplementedError as e:
        return json.dumps({
            "success": False,
            "error": str(e),
            "hint": "ControlNet requires ComfyUI backend with ControlNet models installed",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)
    except Exception as e:
        # Boundary handler: this is an MCP tool, so errors are reported as a
        # structured JSON payload rather than re-raised.
        # FIX: include backend_type for consistency with the other error paths.
        return json.dumps({
            "success": False,
            "error": str(e),
            "hint": "Check if ControlNet model exists in ComfyUI/models/controlnet/",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)

    if should_downscale:
        # Pixel-art presets need NEAREST to keep hard edges; everything else
        # downscales more smoothly with LANCZOS.
        resample = Image.Resampling.NEAREST if preset.startswith("pixel") else Image.Resampling.LANCZOS
        image_bytes = resize_image(image_bytes, img_width, img_height, resample=resample)

    # FIX: removed unused `image_b64 = image_to_base64(image_bytes)` — the
    # base64 payload is never included in the result (see omission note below),
    # so encoding it was pure wasted work on a potentially large image.

    result = {
        "success": True,
        "backend": backend.get_name(),
        "width": img_width,
        "height": img_height,
        "view_type": view_type,
        "shape": shape,
        "control_strength": control_strength,
        "preset": preset,
        "prompt": full_prompt,
        "hash": hash_image(image_bytes)
    }

    # ControlNet images are always saved to file to ensure reliable MCP response
    # (large base64 payloads can cause MCP stdio transport issues)
    output_dir = ensure_directory(OUTPUT_DIR / "controlnet")
    fname = generate_filename(prefix=f"cn_{view_type}", suffix=shape)
    file_path = output_dir / fname
    file_path.write_bytes(image_bytes)
    result["file_path"] = str(file_path)

    depth_path = output_dir / f"depth_{fname}"
    depth_path.write_bytes(depth_map)
    result["depth_map_path"] = str(depth_path)

    # Never include base64 for ControlNet - always use file_path
    # This prevents MCP stdio blocking and ensures agent receives response
    result["image_base64_omitted"] = True
    result["image_base64_omitted_reason"] = "controlnet_always_saves_to_file"

    return json.dumps(result, indent=2)
# server/main.py:744-813 (helper)
# Convenience wrapper tool 'generate_topdown_asset' that invokes
# 'generate_with_viewpoint' with top-down specific parameters and per-asset-type
# prompt/strength adjustments.
@mcp.tool()
async def generate_topdown_asset(
    prompt: str,
    asset_type: str = "prop",
    size: int = 512,
    control_strength: float = 0.65,
    seed: Optional[int] = None,
    save_to_file: bool = True
) -> str:
    """Simplified tool to generate top-down 2D game assets with guaranteed viewpoint.

    This is a convenience wrapper around generate_with_viewpoint specifically
    for top-down games (RPG, strategy, etc.).

    Args:
        prompt: Description of the asset (e.g., "wooden treasure chest", "stone well")
        asset_type: Type of asset - "prop", "character", "creature", "tile", "effect"
        size: Output size in pixels (square)
        control_strength: How strictly to enforce top-down view (0.5-1.0)
        seed: Random seed for reproducibility
        save_to_file: Whether to save the image to disk (default: True for reliability)

    Returns:
        JSON with file_path to generated image
    """
    # Each asset type maps to a (preset, depth-map shape hint) pair; anything
    # unrecognized falls back to a flat top-down prop.
    type_config = {
        "prop": ("topdown_prop", "box"),
        "character": ("topdown_character", "humanoid"),
        "creature": ("topdown_creature", "humanoid"),
        "tile": ("topdown_tile", "flat"),
        "effect": ("effect", "sphere"),
    }
    chosen_preset, depth_shape = type_config.get(asset_type, ("topdown_prop", "flat"))

    # Prompt suffixes appended per asset type to suppress common generation
    # artifacts (duplicated limbs, floating parts, faces on top-down sprites).
    prompt_suffixes = {
        "character": (
            ", single character, one body, one head, full body, "
            "no visible face, no eyes, no mouth, helmet top view, "
            "no duplicated weapons, no duplicated armor, no floating parts, no separate objects"
        ),
        "creature": (
            ", single creature, one body, full body, "
            "no duplicated limbs, no floating parts, no separate objects"
        ),
        "prop": ", single object",
        "effect": ", centered effect, radial glow",
    }

    # Upper bounds on control strength per type. Organic subjects get more
    # freedom (0.70); props are capped lower (0.60) for a Stardew Valley style
    # with more artistic freedom. Other types use the caller's value as-is.
    strength_caps = {"character": 0.70, "creature": 0.70, "prop": 0.60}

    final_prompt = prompt + prompt_suffixes.get(asset_type, "")
    if asset_type in strength_caps:
        final_strength = min(control_strength, strength_caps[asset_type])
    else:
        final_strength = control_strength

    # Delegate the actual generation to the viewpoint-controlled tool.
    return await generate_with_viewpoint(
        prompt=final_prompt,
        view_type="topdown",
        shape=depth_shape,
        preset=chosen_preset,
        control_strength=final_strength,
        width=size,
        height=size,
        seed=seed,
        save_to_file=save_to_file
    )