health_overview
Get a complete health snapshot of your deployment, including gateway, resources, skill registry, upgrade status, cron, and disk usage. Identifies critical issues with per-component breakdown.
Instructions
Full deployment health snapshot — gateway + resources + skill registry + upgrade status + cron + disk in one call. Returns overall HealthLevel (healthy/degraded/critical/unknown) plus per-component breakdown plus ranked critical_findings list. Use this first for a single-pane summary.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| No arguments | — | — | — |
Implementation Reference
- src/openclaw_health_mcp/server.py:39-49 (registration) — Tool registration for 'health_overview' in list_tools() — defines name, description, and empty inputSchema.
return [ Tool( name="health_overview", description=( "Full deployment health snapshot — gateway + resources + skill registry + " "upgrade status + cron + disk in one call. Returns overall HealthLevel " "(healthy/degraded/critical/unknown) plus per-component breakdown plus " "ranked critical_findings list. Use this first for a single-pane summary." ), inputSchema={"type": "object", "properties": {}, "required": []}, ), - Handler for 'health_overview' in call_tool() — gathers 6 component metrics from backend, calls build_snapshot(), and serializes the result.
@server.call_tool() # type: ignore[untyped-decorator] async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]: logger.debug("call_tool name=%s args=%s", name, arguments) if name == "health_overview": gateway = await backend.get_gateway_status() resources = await backend.get_resource_metrics() skill_registry = await backend.get_skill_registry_check() upgrade = await backend.get_upgrade_status() cron = await backend.get_cron_health() disk = await backend.get_disk_usage() snapshot = build_snapshot(gateway, resources, skill_registry, upgrade, cron, disk) return _serialize(snapshot) - build_snapshot() — composes HealthSnapshot from per-component data, classifies overall health, and extracts critical findings.
def build_snapshot( gateway: GatewayStatus, resources: ResourceMetrics, skill_registry: SkillRegistryCheck, upgrade: UpgradeStatus, cron: CronHealthSummary, disk: DiskUsage, ) -> HealthSnapshot: """Compose a HealthSnapshot from per-component results.""" component_summary: dict[str, HealthLevel] = { "gateway": gateway.health, "resources": resources.overall_health, "skill_registry": skill_registry.health, "upgrade": upgrade.health, "cron": cron.health, "disk": disk.health, } overall = classify_overall(list(component_summary.values())) findings = extract_critical_findings(gateway, resources, skill_registry, upgrade, cron, disk) return HealthSnapshot( captured_at=datetime.now(UTC), overall_health=overall, gateway=gateway, resources=resources, skill_registry=skill_registry, upgrade=upgrade, cron=cron, disk=disk, component_summary=component_summary, critical_findings=findings, ) - classify_overall() — reduces a list of component HealthLevels to an overall classification (CRITICAL > DEGRADED > HEALTHY > UNKNOWN).
def classify_overall(component_levels: list[HealthLevel]) -> HealthLevel: """Reduce a list of per-component HealthLevels to one overall classification.""" if any(level == HealthLevel.CRITICAL for level in component_levels): return HealthLevel.CRITICAL if any(level == HealthLevel.DEGRADED for level in component_levels): return HealthLevel.DEGRADED if any(level == HealthLevel.HEALTHY for level in component_levels): return HealthLevel.HEALTHY return HealthLevel.UNKNOWN - extract_critical_findings() — builds flat human-readable list of critical/DEGRADED findings ordered by severity from all 6 components.
def extract_critical_findings(
    gateway: GatewayStatus,
    resources: ResourceMetrics,
    skill_registry: SkillRegistryCheck,
    upgrade: UpgradeStatus,
    cron: CronHealthSummary,
    disk: DiskUsage,
) -> list[str]:
    """Build a flat human-readable list of critical findings, ordered by severity.

    A finding is "critical" if (a) the component reports CRITICAL, or (b) a
    specific pattern triggers regardless of component classification (e.g.,
    gateway down, OOM events, disk >95%). DEGRADED components contribute
    findings only if they have notes worth surfacing.

    Returns:
        Finding strings prefixed with "[CRITICAL]" or "[DEGRADED]". All
        CRITICAL findings rank ahead of DEGRADED ones; within a severity,
        component order (gateway, resources, skills, upgrade, disk, cron)
        is preserved.
    """
    findings: list[str] = []

    # Gateway — a dead gateway outranks any per-note detail, hence the elif chain.
    if not gateway.is_alive:
        findings.append("[CRITICAL] Gateway is not alive — agents cannot receive or dispatch requests.")
    elif gateway.health == HealthLevel.CRITICAL:
        findings.extend(f"[CRITICAL] Gateway: {note}" for note in gateway.notes)
    elif gateway.crashes_24h is not None and gateway.crashes_24h >= 3:
        findings.append(
            f"[CRITICAL] Gateway crashed {gateway.crashes_24h} times in 24h — investigate crash-loop pattern."
        )

    # Resources — each pattern is independent, so no elif chain here.
    if resources.oom_events_24h > 0:
        findings.append(
            f"[CRITICAL] {resources.oom_events_24h} OOM kill(s) in 24h "
            f"— kernel killed processes due to memory pressure."
        )
    if resources.memory_percent is not None and resources.memory_percent >= 95.0:
        findings.append(
            f"[CRITICAL] Memory at {resources.memory_percent:.1f}% — OOM imminent."
        )
    if resources.swap_percent is not None and resources.swap_percent >= 50.0:
        findings.append(
            f"[CRITICAL] Swap at {resources.swap_percent:.1f}% — system thrashing, expect severe latency."
        )

    # Skill registry — name each flagged skill; when the component is CRITICAL
    # without naming specific skills, surface its raw notes instead.
    if skill_registry.health == HealthLevel.CRITICAL:
        for skill in skill_registry.skills_flagged_suspicious:
            findings.append(
                f"[CRITICAL] Skill flagged suspicious: '{skill}'. Disable until vetted (see openclaw-skill-vetter-mcp)."
            )
        if not skill_registry.skills_flagged_suspicious:
            for note in skill_registry.notes:
                findings.append(f"[CRITICAL] Skill registry: {note}")

    # Upgrade — failure is critical; a rollback is only degraded.
    if upgrade.last_upgrade_outcome == "failure":
        findings.append(
            f"[CRITICAL] Last upgrade ({upgrade.last_upgrade_from_version}→{upgrade.last_upgrade_to_version}) "
            f"failed. System may be in inconsistent state."
        )
    elif upgrade.last_upgrade_outcome == "rollback":
        markers = (
            ", ".join(upgrade.regression_markers)
            if upgrade.regression_markers
            else "no markers recorded"
        )
        findings.append(
            f"[DEGRADED] Last upgrade rolled back "
            f"({upgrade.last_upgrade_from_version}→{upgrade.last_upgrade_to_version}). "
            f"Regression markers: {markers}."
        )

    # Disk — escalate at 95%, warn at 85%.
    if disk.root_used_percent is not None and disk.root_used_percent >= 95.0:
        findings.append(f"[CRITICAL] Root disk at {disk.root_used_percent:.1f}% — risk of write failure imminent.")
    elif disk.root_used_percent is not None and disk.root_used_percent >= 85.0:
        findings.append(
            f"[DEGRADED] Root disk at {disk.root_used_percent:.1f}% — set up log rotation before reaching 95%."
        )

    # Cron — basic; richer findings come from silentwatch-mcp
    if cron.overdue_jobs > 0:
        findings.append(
            f"[DEGRADED] {cron.overdue_jobs} cron job(s) overdue. "
            f"Install silentwatch-mcp for silent-failure detection."
        )

    # Fix: the contract promises a severity-ranked list, but the appends above
    # run in component order, so a [DEGRADED] upgrade/disk finding could precede
    # a [CRITICAL] disk/cron one. A stable sort on the severity tag restores the
    # promised ranking while keeping component order within each severity.
    findings.sort(key=lambda finding: 0 if finding.startswith("[CRITICAL]") else 1)
    return findings