generate_with_viewpoint
Generate game assets with precise camera viewpoint control using depth maps for consistent angles like top-down, side, or front views.
Instructions
Generate a game asset with precise camera viewpoint control using ControlNet.
This tool uses depth maps to guide the generation, ensuring consistent camera angles
like top-down, side view, front view, etc.
Args:
prompt: Description of the asset to generate (e.g., "a wooden barrel")
view_type: Camera angle - "topdown", "side", "front", "3/4"
shape: Object shape hint - "flat", "sphere", "cylinder", "box"
preset: Style preset to use (default: topdown_prop)
controlnet_model: ControlNet model (default: diffusers_xl_depth_full.safetensors)
control_strength: How strongly to follow viewpoint (0.0-1.0, default: 0.95)
width: Output width in pixels
height: Output height in pixels
seed: Random seed for reproducibility
save_to_file: Whether to save the image to disk (note: ControlNet results are always written to a file regardless of this flag)
Returns:
JSON with metadata and file paths (base64 image data is omitted; the generated image and depth map are saved to disk)
Note:
Requires ControlNet models installed in ComfyUI. Common depth models:
- diffusers_xl_depth_full.safetensors (SDXL)
- control_v11f1p_sd15_depth.pth (SD1.5)
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| prompt | Yes | Description of the asset to generate | |
| view_type | No | Camera angle - "topdown", "side", "front", "3/4" | topdown |
| shape | No | Object shape hint - "flat", "sphere", "cylinder", "box" | flat |
| preset | No | Style preset to use | topdown_prop |
| controlnet_model | No | ControlNet model | diffusers_xl_depth_full.safetensors |
| control_strength | No | How strongly to follow viewpoint (0.0-1.0) | 0.95 |
| width | No | Output width in pixels | 1024 |
| height | No | Output height in pixels | 1024 |
| seed | No | Random seed for reproducibility | |
| save_to_file | No | Whether to save the image to disk | false |
Implementation Reference
# server/main.py:602-742 (handler)
# Primary handler for the 'generate_with_viewpoint' MCP tool. Decorated with
# @mcp.tool() for automatic registration and schema inference. Implements
# viewpoint-controlled asset generation using ControlNet depth maps.
@mcp.tool()
async def generate_with_viewpoint(
    prompt: str,
    view_type: str = "topdown",
    shape: str = "flat",
    preset: str = "topdown_prop",
    controlnet_model: str = "diffusers_xl_depth_full.safetensors",
    control_strength: float = 0.95,
    width: int = 1024,
    height: int = 1024,
    seed: Optional[int] = None,
    save_to_file: bool = False
) -> str:
    """Generate a game asset with precise camera viewpoint control using ControlNet.

    This tool uses depth maps to guide the generation, ensuring consistent camera angles
    like top-down, side view, front view, etc.

    Args:
        prompt: Description of the asset to generate (e.g., "a wooden barrel")
        view_type: Camera angle - "topdown", "side", "front", "3/4"
        shape: Object shape hint - "flat", "sphere", "cylinder", "box"
        preset: Style preset to use (default: topdown_prop)
        controlnet_model: ControlNet model (default: diffusers_xl_depth_full.safetensors)
        control_strength: How strongly to follow viewpoint (0.0-1.0, default: 0.95)
        width: Output width in pixels
        height: Output height in pixels
        seed: Random seed for reproducibility
        save_to_file: Accepted for interface compatibility; ControlNet output is
            always written to disk regardless (see note at the end of the body)

    Returns:
        JSON with generation metadata and file paths. Base64 image data is always
        omitted (large payloads can break the MCP stdio transport).

    Note:
        Requires ControlNet models installed in ComfyUI. Common depth models:
        - diffusers_xl_depth_full.safetensors (SDXL)
        - control_v11f1p_sd15_depth.pth (SD1.5)
    """
    preset_config = get_preset(preset)

    # Build full prompt with preset
    full_prompt = f"{preset_config.prompt_prefix}{prompt}{preset_config.prompt_suffix}"
    full_negative = preset_config.negative_prompt

    img_width = width
    img_height = height
    render_width = img_width
    render_height = img_height

    # If the requested output is smaller than the preset's native resolution,
    # render at native size and downscale afterwards for better detail.
    should_downscale = (img_width < preset_config.default_width) or (img_height < preset_config.default_height)
    if should_downscale:
        scale = max(
            preset_config.default_width / max(1, img_width),
            preset_config.default_height / max(1, img_height)
        )
        render_width = int(round(img_width * scale))
        render_height = int(round(img_height * scale))

    # Clamp render dimensions to match backend constraints (SDXL-safe):
    # multiples of 8, within [512, 2048].
    render_width = max(512, min(2048, (render_width // 8) * 8))
    render_height = max(512, min(2048, (render_height // 8) * 8))

    # Create depth map for the specified viewpoint
    depth_map = create_depth_map(render_width, render_height, view_type=view_type, shape=shape)

    try:
        # Generate with timeout to prevent hanging
        image_bytes = await asyncio.wait_for(
            backend.generate_with_controlnet(
                prompt=full_prompt,
                control_image=depth_map,
                controlnet_model=controlnet_model,
                control_strength=control_strength,
                negative_prompt=full_negative,
                width=render_width,
                height=render_height,
                seed=seed,
                steps=preset_config.steps,
                cfg_scale=preset_config.cfg_scale,
                sampler=preset_config.sampler,
                scheduler=preset_config.scheduler
            ),
            timeout=300.0  # 5 minute timeout
        )
    except asyncio.TimeoutError:
        return json.dumps({
            "success": False,
            "error": "Generation timed out after 5 minutes",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)
    except NotImplementedError as e:
        return json.dumps({
            "success": False,
            "error": str(e),
            "hint": "ControlNet requires ComfyUI backend with ControlNet models installed",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)
    except Exception as e:
        # Boundary handler: this is an MCP tool, so errors are reported as a
        # structured JSON payload rather than re-raised.
        # FIX: include backend_type for consistency with the other error paths.
        return json.dumps({
            "success": False,
            "error": str(e),
            "hint": "Check if ControlNet model exists in ComfyUI/models/controlnet/",
            "backend": backend.get_name(),
            "backend_type": BACKEND_TYPE
        }, indent=2)

    if should_downscale:
        # Pixel-art presets need NEAREST to keep hard edges; everything else
        # downscales more smoothly with LANCZOS.
        resample = Image.Resampling.NEAREST if preset.startswith("pixel") else Image.Resampling.LANCZOS
        image_bytes = resize_image(image_bytes, img_width, img_height, resample=resample)

    # FIX: removed unused `image_b64 = image_to_base64(image_bytes)` — the
    # base64 payload is never included in the result (see omission note below),
    # so encoding it was pure wasted work on a potentially large image.

    result = {
        "success": True,
        "backend": backend.get_name(),
        "width": img_width,
        "height": img_height,
        "view_type": view_type,
        "shape": shape,
        "control_strength": control_strength,
        "preset": preset,
        "prompt": full_prompt,
        "hash": hash_image(image_bytes)
    }

    # ControlNet images are always saved to file to ensure reliable MCP response
    # (large base64 payloads can cause MCP stdio transport issues)
    output_dir = ensure_directory(OUTPUT_DIR / "controlnet")
    fname = generate_filename(prefix=f"cn_{view_type}", suffix=shape)
    file_path = output_dir / fname
    file_path.write_bytes(image_bytes)
    result["file_path"] = str(file_path)

    depth_path = output_dir / f"depth_{fname}"
    depth_path.write_bytes(depth_map)
    result["depth_map_path"] = str(depth_path)

    # Never include base64 for ControlNet - always use file_path
    # This prevents MCP stdio blocking and ensures agent receives response
    result["image_base64_omitted"] = True
    result["image_base64_omitted_reason"] = "controlnet_always_saves_to_file"

    return json.dumps(result, indent=2)
# server/main.py:744-813 (helper)
# Convenience wrapper tool 'generate_topdown_asset' that invokes
# 'generate_with_viewpoint' with top-down specific parameters and per-asset-type
# prompt/strength adjustments.
@mcp.tool()
async def generate_topdown_asset(
    prompt: str,
    asset_type: str = "prop",
    size: int = 512,
    control_strength: float = 0.65,
    seed: Optional[int] = None,
    save_to_file: bool = True
) -> str:
    """Simplified tool to generate top-down 2D game assets with guaranteed viewpoint.

    This is a convenience wrapper around generate_with_viewpoint specifically
    for top-down games (RPG, strategy, etc.).

    Args:
        prompt: Description of the asset (e.g., "wooden treasure chest", "stone well")
        asset_type: Type of asset - "prop", "character", "creature", "tile", "effect"
        size: Output size in pixels (square)
        control_strength: How strictly to enforce top-down view (0.5-1.0)
        seed: Random seed for reproducibility
        save_to_file: Whether to save the image to disk (default: True for reliability)

    Returns:
        JSON with file_path to generated image
    """
    # Each asset type maps to a (preset, depth-map shape hint) pair; anything
    # unrecognized falls back to a flat top-down prop.
    type_config = {
        "prop": ("topdown_prop", "box"),
        "character": ("topdown_character", "humanoid"),
        "creature": ("topdown_creature", "humanoid"),
        "tile": ("topdown_tile", "flat"),
        "effect": ("effect", "sphere"),
    }
    chosen_preset, depth_shape = type_config.get(asset_type, ("topdown_prop", "flat"))

    # Prompt suffixes appended per asset type to suppress common generation
    # artifacts (duplicated limbs, floating parts, faces on top-down sprites).
    prompt_suffixes = {
        "character": (
            ", single character, one body, one head, full body, "
            "no visible face, no eyes, no mouth, helmet top view, "
            "no duplicated weapons, no duplicated armor, no floating parts, no separate objects"
        ),
        "creature": (
            ", single creature, one body, full body, "
            "no duplicated limbs, no floating parts, no separate objects"
        ),
        "prop": ", single object",
        "effect": ", centered effect, radial glow",
    }

    # Upper bounds on control strength per type. Organic subjects get more
    # freedom (0.70); props are capped lower (0.60) for a Stardew Valley style
    # with more artistic freedom. Other types use the caller's value as-is.
    strength_caps = {"character": 0.70, "creature": 0.70, "prop": 0.60}

    final_prompt = prompt + prompt_suffixes.get(asset_type, "")
    if asset_type in strength_caps:
        final_strength = min(control_strength, strength_caps[asset_type])
    else:
        final_strength = control_strength

    # Delegate the actual generation to the viewpoint-controlled tool.
    return await generate_with_viewpoint(
        prompt=final_prompt,
        view_type="topdown",
        shape=depth_shape,
        preset=chosen_preset,
        control_strength=final_strength,
        width=size,
        height=size,
        seed=seed,
        save_to_file=save_to_file
    )