"""
Health Domain Models
This module contains all models related to Ceph cluster health,
extracted from the original ceph_models.py and enhanced.
"""
from datetime import datetime
from enum import Enum
from pydantic import BaseModel, Field
from ceph_mcp.models.base import BaseComponentInfo
class HealthStatus(str, Enum):
    """
    Overall health states reported for a Ceph cluster.

    Members are also ``str`` instances, so each one compares equal to its
    raw status string (for example ``HealthStatus.OK == "HEALTH_OK"``) and
    a raw string can be converted back with ``HealthStatus("HEALTH_OK")``.
    Restricting the status to this enum keeps invalid values out of the
    models and simplifies status-specific branching.
    """

    # Cluster reports no problems.
    OK = "HEALTH_OK"
    # Cluster reports at least one warning.
    WARN = "HEALTH_WARN"
    # Cluster reports at least one error.
    ERR = "HEALTH_ERR"
class HealthCheckSeverity(str, Enum):
    """
    Severity attached to a single health check.

    Members are also ``str`` instances, so they compare equal to their raw
    string values (for example ``HealthCheckSeverity.ERR == "HEALTH_ERR"``).
    """

    # Informational only.
    INFO = "HEALTH_INFO"
    # Warning-level check.
    WARN = "HEALTH_WARN"
    # Error-level check.
    ERR = "HEALTH_ERR"
class HealthCheck(BaseModel):
    """
    A single health check reported for the cluster.

    Holds the check's type, severity, a short summary, optional details and
    the number of affected components, plus helpers to classify and rank
    the check by severity.
    """

    type: str = Field(..., description="Health check type")
    severity: HealthCheckSeverity = Field(
        description="Severity level of this check"
    )
    summary: str = Field(description="Human-readable summary of the issue")
    details: str = Field(
        default="", description="Detailed information about the issue"
    )
    count: int = Field(default=0, description="Number of affected components")

    def is_critical(self) -> bool:
        """Return True when the severity of this check is ``ERR``."""
        error_level = HealthCheckSeverity.ERR
        return self.severity == error_level

    def is_warning(self) -> bool:
        """Return True when the severity of this check is ``WARN``."""
        warning_level = HealthCheckSeverity.WARN
        return self.severity == warning_level

    def get_priority_score(self) -> int:
        """
        Return a numeric urgency score for sorting (higher = more urgent).

        Returns:
            100 for ``ERR``, 50 for ``WARN``, 10 for ``INFO`` and 0 for any
            severity that is not in the table.
        """
        urgency_by_severity = {
            HealthCheckSeverity.ERR: 100,
            HealthCheckSeverity.WARN: 50,
            HealthCheckSeverity.INFO: 10,
        }
        if self.severity in urgency_by_severity:
            return urgency_by_severity[self.severity]
        return 0
class ClusterHealth(BaseComponentInfo):
    """
    Represents the overall health status of a Ceph cluster.

    Combines the cluster-wide status with the list of individual health
    checks and provides helpers to filter, rank, score and summarise them.
    """

    name: str = Field(
        default="cluster_health", description="Identifier for the Component"
    )
    cluster_fsid: str = Field(description="FSID of Ceph cluster")
    status: HealthStatus = Field(description="Overall cluster health status")
    checks: list[HealthCheck] = Field(
        default_factory=list, description="Individual health checks"
    )
    overall_status_description: str | None = Field(
        None, description="Detailed explanation of current status"
    )
    # NOTE(review): datetime.now produces a naive (timezone-less) local
    # timestamp. Confirm whether consumers expect UTC / aware datetimes
    # before changing this default.
    collected_at: datetime = Field(
        default_factory=datetime.now,
        description="Timestamp when metrics were collected",
    )

    def is_healthy(self) -> bool:
        """Return True when the overall status is ``HEALTH_OK``."""
        return self.status == HealthStatus.OK

    def has_warnings(self) -> bool:
        """
        Check if cluster has warnings that need attention.

        True when the overall status is ``HEALTH_WARN`` or when at least
        one individual check is a warning.
        """
        return self.status == HealthStatus.WARN or any(
            check.is_warning() for check in self.checks
        )

    def has_errors(self) -> bool:
        """
        Check if cluster has errors requiring immediate attention.

        True when the overall status is ``HEALTH_ERR`` or when at least one
        individual check is critical.
        """
        return self.status == HealthStatus.ERR or any(
            check.is_critical() for check in self.checks
        )

    def get_critical_checks(self) -> list[HealthCheck]:
        """Get all critical health checks, in their original order."""
        return [check for check in self.checks if check.is_critical()]

    def get_warning_checks(self) -> list[HealthCheck]:
        """Get all warning health checks, in their original order."""
        return [check for check in self.checks if check.is_warning()]

    def get_checks_by_priority(self) -> list[HealthCheck]:
        """
        Get health checks sorted by priority (most urgent first).

        Returns a new list; ``self.checks`` is left untouched. Checks with
        equal priority keep their relative order because ``sorted`` is
        stable.
        """
        return sorted(
            self.checks, key=lambda check: check.get_priority_score(), reverse=True
        )

    def get_health_score(self) -> int:
        """
        Get a numeric health score (0-100, where 100 is perfect health).

        This provides a single number that can be used for monitoring
        and trend analysis.

        A ``HEALTH_OK`` cluster always scores 100. Otherwise each critical
        check deducts 30 points and each warning check deducts 10; checks
        of any other severity do not affect the score. The result is
        additionally capped at 70 for ``HEALTH_ERR`` and 90 for
        ``HEALTH_WARN`` so that a degraded cluster never reports a perfect
        score, even when no warning/critical checks are attached.

        Returns:
            An integer between 0 and 100 inclusive.
        """
        if self.status == HealthStatus.OK:
            return 100

        critical_penalty = 30  # Critical issues heavily impact score
        warning_penalty = 10  # Warnings have moderate impact

        # Deduct points based on check severity
        score = 100
        for check in self.checks:
            if check.is_critical():
                score -= critical_penalty
            elif check.is_warning():
                score -= warning_penalty

        # The overall status alone already signals a problem: treat it as at
        # least one issue of the matching severity so the score cannot stay
        # at 100 when the check list is empty or informational only.
        if self.status == HealthStatus.ERR:
            ceiling = 100 - critical_penalty
        else:
            ceiling = 100 - warning_penalty

        return max(0, min(score, ceiling))

    def get_recommendations(self) -> list[str]:
        """
        Get actionable recommendations based on current health status.

        For a healthy cluster a single "continue monitoring" line is
        returned. Otherwise the list contains a headline plus up to three
        critical summaries, a headline plus up to two warning summaries
        (each section only when such checks exist), and a closing
        monitoring reminder.
        """
        if self.is_healthy():
            return ["Cluster is healthy - continue regular monitoring"]

        recommendations: list[str] = []
        critical_checks = self.get_critical_checks()
        warning_checks = self.get_warning_checks()

        if critical_checks:
            recommendations.append(
                f"🔴 Address {len(critical_checks)} critical issue(s) immediately"
            )
            # Show top 3 critical issues
            recommendations.extend(
                f" - {check.summary}" for check in critical_checks[:3]
            )

        if warning_checks:
            recommendations.append(
                f"🟡 Investigate {len(warning_checks)} warning(s) when possible"
            )
            # Show top 2 warnings
            recommendations.extend(
                f" - {check.summary}" for check in warning_checks[:2]
            )

        recommendations.append("📊 Monitor cluster status regularly for changes")
        return recommendations

    def get_executive_summary(self) -> str:
        """
        Get a one-line executive summary suitable for dashboards.

        The line starts with a status emoji and ends with the health score.
        For a non-healthy cluster the issue count is the total number of
        attached checks, regardless of their severity.
        """
        status_emoji = {
            HealthStatus.OK: "🟢",
            HealthStatus.WARN: "🟡",
            HealthStatus.ERR: "🔴",
        }
        emoji = status_emoji.get(self.status, "⚪")
        score = self.get_health_score()

        if self.is_healthy():
            return f"{emoji} Cluster is healthy (Score: {score}/100)"

        issues = len(self.checks)
        return (
            f"{emoji} Cluster has {issues} issue(s) requiring attention "
            f"(Score: {score}/100)"
        )
class ClusterCapacity(BaseModel):
    """
    Storage capacity and usage figures for the whole Ceph cluster.

    All stored values are raw byte/object counts. The ``*_gb`` helpers
    divide by ``1024**3`` and the ``*_kb`` helper divides by ``1024``.
    """

    total_avail_bytes: int = Field(
        ..., description="Total available bytes in the cluster"
    )
    total_bytes: int = Field(..., description="Total bytes in the cluster")
    total_used_raw_bytes: int = Field(
        ..., description="Total raw bytes used in the cluster"
    )
    total_objects: int = Field(
        ..., description="Total number of objects in the cluster"
    )
    total_pool_bytes_used: int = Field(
        ..., description="Total bytes used across all pools"
    )
    average_object_size: int = Field(
        ..., description="Average object size in bytes"
    )

    def get_total_capacity_gb(self) -> float:
        """Return ``total_bytes`` in GB, rounded to two decimals."""
        gigabytes = self.total_bytes / (1024**3)
        return round(gigabytes, 2)

    def get_used_capacity_gb(self) -> float:
        """Return ``total_used_raw_bytes`` in GB, rounded to two decimals."""
        gigabytes = self.total_used_raw_bytes / (1024**3)
        return round(gigabytes, 2)

    def get_available_capacity_gb(self) -> float:
        """Return ``total_avail_bytes`` in GB, rounded to two decimals."""
        gigabytes = self.total_avail_bytes / (1024**3)
        return round(gigabytes, 2)

    def get_pool_bytes_used_gb(self) -> float:
        """Return ``total_pool_bytes_used`` in GB, rounded to two decimals."""
        gigabytes = self.total_pool_bytes_used / (1024**3)
        return round(gigabytes, 2)

    def get_usage_percentage(self) -> float:
        """
        Return raw used bytes as a percentage of total bytes.

        Rounded to one decimal place. Returns 0.0 when ``total_bytes`` is
        zero, which avoids a division by zero.
        """
        if self.total_bytes != 0:
            used_fraction = self.total_used_raw_bytes / self.total_bytes
            return round(used_fraction * 100, 1)
        return 0.0

    def get_average_object_size_kb(self) -> float:
        """Return ``average_object_size`` in KB, rounded to two decimals."""
        kilobytes = self.average_object_size / 1024
        return round(kilobytes, 2)

    def get_capacity_summary(self) -> str:
        """Return a one-line summary of used GB, total GB and usage percent."""
        used_gb = self.get_used_capacity_gb()
        total_gb = self.get_total_capacity_gb()
        used_pct = self.get_usage_percentage()
        return f"{used_gb}GB used of {total_gb}GB total ({used_pct}% used)"