Skip to main content
Glama
health_calculation.go18 kB
package kiali import ( "encoding/json" "fmt" "strings" "time" ) // computeMeshHealthSummary processes the health JSON and creates an aggregated summary. // The health data corresponds to the type specified in queryParams (app, workload, or service). func computeMeshHealthSummary(healthData json.RawMessage, requestedNamespaces []string, queryParams map[string]string) *MeshHealthSummary { // Determine the health type from queryParams (defaults to "app") healthType := "app" if gt, ok := queryParams["graphType"]; ok && strings.TrimSpace(gt) != "" { v := strings.TrimSpace(gt) if strings.EqualFold(v, "versionedApp") { healthType = "app" } else if v == "workload" || v == "service" { healthType = v } } rateInterval := queryParams["rateInterval"] if rateInterval == "" { rateInterval = DefaultRateInterval } // Parse the health JSON into ClustersNamespaceHealth structure var clustersHealth ClustersNamespaceHealth if err := json.Unmarshal(healthData, &clustersHealth); err != nil { // If parsing fails, return empty summary return &MeshHealthSummary{ EntityCounts: EntityHealthCounts{}, NamespaceSummary: make(map[string]NamespaceSummary), TopUnhealthy: []UnhealthyEntity{}, Timestamp: time.Now().UTC().Format(time.RFC3339), RateInterval: rateInterval, } } // Create empty health structures for types we don't have emptyHealth := ClustersNamespaceHealth{ AppHealth: make(map[string]NamespaceAppHealth), ServiceHealth: make(map[string]NamespaceServiceHealth), WorkloadHealth: make(map[string]NamespaceWorkloadHealth), } // Use the appropriate health data based on type var appHealth, svcHealth, wlHealth ClustersNamespaceHealth switch healthType { case "app": appHealth = clustersHealth svcHealth = emptyHealth wlHealth = emptyHealth case "service": appHealth = emptyHealth svcHealth = clustersHealth wlHealth = emptyHealth case "workload": appHealth = emptyHealth svcHealth = emptyHealth wlHealth = clustersHealth default: appHealth = clustersHealth svcHealth = emptyHealth wlHealth = emptyHealth } // Compute summary using the same logic as the old branch summary := computeHealthSummary(appHealth, svcHealth, wlHealth, rateInterval) return &summary } // computeHealthSummary aggregates health data (same logic as old branch) func computeHealthSummary( appHealth ClustersNamespaceHealth, svcHealth ClustersNamespaceHealth, wlHealth ClustersNamespaceHealth, rateInterval string, ) MeshHealthSummary { summary := MeshHealthSummary{ EntityCounts: EntityHealthCounts{}, NamespaceSummary: make(map[string]NamespaceSummary), TopUnhealthy: []UnhealthyEntity{}, Timestamp: time.Now().UTC().Format(time.RFC3339), RateInterval: rateInterval, } // Collect all namespace names nsSet := make(map[string]bool) for ns := range appHealth.AppHealth { nsSet[ns] = true } for ns := range svcHealth.ServiceHealth { nsSet[ns] = true } for ns := range wlHealth.WorkloadHealth { nsSet[ns] = true } summary.NamespaceCount = len(nsSet) // Aggregate per namespace for ns := range nsSet { nsSummary := NamespaceSummary{} // Process apps if nsApps, ok := appHealth.AppHealth[ns]; ok { for appName, app := range nsApps { summary.EntityCounts.Apps.Total++ nsSummary.Apps.Total++ status, issue := evaluateAppHealth(app) switch status { case "HEALTHY": summary.EntityCounts.Apps.Healthy++ nsSummary.Apps.Healthy++ case "NOT_READY": summary.EntityCounts.Apps.NotReady++ nsSummary.Apps.NotReady++ case "DEGRADED": summary.EntityCounts.Apps.Degraded++ nsSummary.Apps.Degraded++ case "UNHEALTHY": summary.EntityCounts.Apps.Unhealthy++ nsSummary.Apps.Unhealthy++ summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ Type: "app", Namespace: ns, Name: appName, Status: status, Issue: issue, ErrorRate: calculateErrorRate(app.Requests), }) } nsSummary.ErrorRate += calculateErrorRate(app.Requests) } } // Process services if nsSvcs, ok := svcHealth.ServiceHealth[ns]; ok { for svcName, svc := range nsSvcs { summary.EntityCounts.Services.Total++ nsSummary.Services.Total++ status, issue := evaluateServiceHealth(svc) switch status { case "HEALTHY": summary.EntityCounts.Services.Healthy++ nsSummary.Services.Healthy++ case "NOT_READY": summary.EntityCounts.Services.NotReady++ nsSummary.Services.NotReady++ case "DEGRADED": summary.EntityCounts.Services.Degraded++ nsSummary.Services.Degraded++ case "UNHEALTHY": summary.EntityCounts.Services.Unhealthy++ nsSummary.Services.Unhealthy++ summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ Type: "service", Namespace: ns, Name: svcName, Status: status, Issue: issue, ErrorRate: calculateErrorRate(svc.Requests), }) } nsSummary.ErrorRate += calculateErrorRate(svc.Requests) } } // Process workloads if nsWls, ok := wlHealth.WorkloadHealth[ns]; ok { for wlName, wl := range nsWls { summary.EntityCounts.Workloads.Total++ nsSummary.Workloads.Total++ status, issue := evaluateWorkloadHealth(wl) switch status { case "HEALTHY": summary.EntityCounts.Workloads.Healthy++ nsSummary.Workloads.Healthy++ case "NOT_READY": summary.EntityCounts.Workloads.NotReady++ nsSummary.Workloads.NotReady++ case "DEGRADED": summary.EntityCounts.Workloads.Degraded++ nsSummary.Workloads.Degraded++ case "UNHEALTHY": summary.EntityCounts.Workloads.Unhealthy++ nsSummary.Workloads.Unhealthy++ summary.TopUnhealthy = append(summary.TopUnhealthy, UnhealthyEntity{ Type: "workload", Namespace: ns, Name: wlName, Status: status, Issue: issue, ErrorRate: calculateErrorRate(wl.Requests), }) } nsSummary.ErrorRate += calculateErrorRate(wl.Requests) } } // Compute namespace status and availability nsSummary.Status = computeNamespaceStatus(nsSummary) nsSummary.Availability = computeAvailability(nsSummary) summary.NamespaceSummary[ns] = nsSummary } // Compute overall stats summary.OverallStatus = computeOverallStatus(summary.EntityCounts) summary.Availability = computeOverallAvailability(summary.EntityCounts) summary.TotalErrorRate = computeTotalErrorRate(summary.NamespaceSummary) // Sort and limit top unhealthy sortUnhealthyByImpact(summary.TopUnhealthy) if len(summary.TopUnhealthy) > 10 { summary.TopUnhealthy = summary.TopUnhealthy[:10] } return summary } // evaluateAppHealth determines app health status func evaluateAppHealth(app AppHealth) (status string, issue string) { // Check workload statuses totalWorkloads := len(app.WorkloadStatuses) if totalWorkloads == 0 { return "UNKNOWN", "no workloads found" } workloadStatus := "HEALTHY" unhealthyCount := 0 for _, ws := range app.WorkloadStatuses { // User has scaled down a workload, then desired replicas will be 0 and it's not an error condition // This matches Kiali frontend logic: return NOT_READY when desiredReplicas === 0 if ws.DesiredReplicas == 0 { workloadStatus = "NOT_READY" issue = "scaled to 0 replicas" continue } if ws.AvailableReplicas < ws.DesiredReplicas { unhealthyCount++ issue = fmt.Sprintf("%d/%d replicas available", ws.AvailableReplicas, ws.DesiredReplicas) if ws.AvailableReplicas == 0 { workloadStatus = "UNHEALTHY" } else if workloadStatus != "UNHEALTHY" { workloadStatus = "DEGRADED" } } if ws.SyncedProxies >= 0 && ws.SyncedProxies < ws.AvailableReplicas { if issue == "" { issue = fmt.Sprintf("%d/%d proxies synced", ws.SyncedProxies, ws.AvailableReplicas) } if workloadStatus == "HEALTHY" { workloadStatus = "DEGRADED" } } } // Evaluate request health using tolerance-based logic (Kiali tolerances) requestStatus, errorRate := evaluateRequestHealth(app.Requests) if errorRate > 0 && issue == "" { issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) } // Merge workload and request statuses (worst wins) finalStatus := mergeHealthStatus(workloadStatus, requestStatus) return finalStatus, issue } // evaluateServiceHealth determines service health status func evaluateServiceHealth(svc ServiceHealth) (status string, issue string) { // If there is no inbound or outbound traffic data, service health is UNKNOWN if !hasAnyRequests(svc.Requests) { return "UNKNOWN", "" } // Evaluate request health using tolerance-based logic (Kiali tolerances) status, errorRate := evaluateRequestHealth(svc.Requests) if errorRate > 0 && issue == "" { issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) } return status, issue } // hasAnyRequests returns true if there is any non-zero request count in inbound or outbound func hasAnyRequests(req RequestHealth) bool { // Check inbound for _, codes := range req.Inbound { for _, count := range codes { if count > 0 { return true } } } // Check outbound for _, codes := range req.Outbound { for _, count := range codes { if count > 0 { return true } } } return false } // evaluateWorkloadHealth determines workload health status func evaluateWorkloadHealth(wl WorkloadHealth) (status string, issue string) { workloadStatus := "HEALTHY" if wl.WorkloadStatus != nil { ws := wl.WorkloadStatus // User has scaled down a workload, then desired replicas will be 0 and it's not an error condition // This matches Kiali frontend logic: return NOT_READY when desiredReplicas === 0 if ws.DesiredReplicas == 0 { workloadStatus = "NOT_READY" issue = "scaled to 0 replicas" } else if ws.AvailableReplicas < ws.DesiredReplicas { issue = fmt.Sprintf("%d/%d replicas available", ws.AvailableReplicas, ws.DesiredReplicas) if ws.AvailableReplicas == 0 { workloadStatus = "UNHEALTHY" } else { workloadStatus = "DEGRADED" } } if ws.SyncedProxies >= 0 && ws.SyncedProxies < ws.AvailableReplicas { if issue == "" { issue = fmt.Sprintf("%d/%d proxies synced", ws.SyncedProxies, ws.AvailableReplicas) } if workloadStatus == "HEALTHY" { workloadStatus = "DEGRADED" } } } // Evaluate request health using tolerance-based logic (Kiali tolerances) requestStatus, errorRate := evaluateRequestHealth(wl.Requests) // If there is no inbound or outbound traffic data and no workload status info, mark UNKNOWN if !hasAnyRequests(wl.Requests) && wl.WorkloadStatus == nil { return "UNKNOWN", "" } if errorRate > 0 && issue == "" { issue = fmt.Sprintf("error rate: %.2f%%", errorRate*100) } // Merge workload and request statuses (worst wins) finalStatus := mergeHealthStatus(workloadStatus, requestStatus) return finalStatus, issue } // mergeHealthStatus returns the worst of two health statuses // Priority matches Kiali frontend: UNHEALTHY(4) > DEGRADED(3) > NOT_READY(2) > HEALTHY(1) > UNKNOWN(0) func mergeHealthStatus(s1, s2 string) string { priority := map[string]int{ "UNHEALTHY": 4, "DEGRADED": 3, "NOT_READY": 2, "HEALTHY": 1, "UNKNOWN": 0, } if priority[s1] > priority[s2] { return s1 } return s2 } // calculateErrorRate computes error percentage from request health // This uses a simplified approach - for each protocol/code combination, // it checks against tolerance thresholds to determine if it's an error func calculateErrorRate(req RequestHealth) float64 { totalRequests := 0.0 errorRequests := 0.0 // Count inbound for protocol, codes := range req.Inbound { for code, count := range codes { totalRequests += count if isErrorCode(protocol, code) { errorRequests += count } } } // Count outbound for protocol, codes := range req.Outbound { for code, count := range codes { totalRequests += count if isErrorCode(protocol, code) { errorRequests += count } } } if totalRequests == 0 { return 0.0 } return errorRequests / totalRequests } // isErrorCode checks if a status code represents an error // Based on Kiali's default tolerance configuration func isErrorCode(protocol, code string) bool { switch protocol { case "http": // "-" represents aborted/fault-injected requests (always an error) if code == "-" { return true } // 4xx client errors if len(code) == 3 && code[0] == '4' { return true } // 5xx server errors if len(code) == 3 && code[0] == '5' { return true } case "grpc": // "-" represents aborted requests if code == "-" { return true } // gRPC error codes (1-16, non-zero) if code != "0" { return true } } return false } // evaluateRequestHealth evaluates health status based on request metrics // Returns status and worst error ratio found func evaluateRequestHealth(req RequestHealth) (status string, worstRatio float64) { status = "HEALTHY" worstRatio = 0.0 // Helper to process requests (inbound or outbound) processRequests := func(requests map[string]map[string]float64) { for protocol, codes := range requests { totalForProtocol := 0.0 // Calculate totals for _, count := range codes { totalForProtocol += count } if totalForProtocol == 0 { continue } // Calculate error ratios for each code for code, count := range codes { if isErrorCode(protocol, code) { ratio := count / totalForProtocol // Track worst ratio if ratio > worstRatio { worstRatio = ratio } // Evaluate against tolerance thresholds // Based on Kiali defaults: // - Code "-": degraded=0%, failure=10% // - 5xx: degraded=0%, failure=10% // - 4xx: degraded=10%, failure=20% // - grpc errors: degraded=0%, failure=10% codeStatus := getStatusForCodeRatio(protocol, code, ratio) if codeStatus == "UNHEALTHY" { status = "UNHEALTHY" } else if codeStatus == "DEGRADED" && status == "HEALTHY" { status = "DEGRADED" } } } } } processRequests(req.Inbound) processRequests(req.Outbound) return status, worstRatio } // getStatusForCodeRatio determines health status based on error code and ratio // Implements Kiali's default tolerance configuration func getStatusForCodeRatio(protocol, code string, ratio float64) string { percentage := ratio * 100 switch protocol { case "http": if code == "-" { // Aborted/fault-injected: degraded=0%, failure=10% if percentage >= 10 { return "UNHEALTHY" } else if percentage > 0 { return "DEGRADED" } } else if len(code) == 3 && code[0] == '5' { // 5xx errors: degraded=0%, failure=10% if percentage >= 10 { return "UNHEALTHY" } else if percentage > 0 { return "DEGRADED" } } else if len(code) == 3 && code[0] == '4' { // 4xx errors: degraded=10%, failure=20% if percentage >= 20 { return "UNHEALTHY" } else if percentage >= 10 { return "DEGRADED" } } case "grpc": // gRPC errors (including "-"): degraded=0%, failure=10% if code != "0" { if percentage >= 10 { return "UNHEALTHY" } else if percentage > 0 { return "DEGRADED" } } } return "HEALTHY" } // computeNamespaceStatus determines namespace overall status func computeNamespaceStatus(ns NamespaceSummary) string { totalUnhealthy := ns.Apps.Unhealthy + ns.Services.Unhealthy + ns.Workloads.Unhealthy totalEntities := ns.Apps.Total + ns.Services.Total + ns.Workloads.Total if totalEntities == 0 { return "UNKNOWN" } if totalUnhealthy == 0 && ns.ErrorRate < 0.01 { return "HEALTHY" } else if totalUnhealthy > totalEntities/2 || ns.ErrorRate > 0.05 { return "UNHEALTHY" } return "DEGRADED" } // computeAvailability computes availability percentage for a namespace func computeAvailability(ns NamespaceSummary) float64 { total := ns.Apps.Total + ns.Services.Total + ns.Workloads.Total if total == 0 { return 100.0 } healthy := ns.Apps.Healthy + ns.Services.Healthy + ns.Workloads.Healthy degraded := ns.Apps.Degraded + ns.Services.Degraded + ns.Workloads.Degraded return (float64(healthy) + float64(degraded)*0.5) / float64(total) * 100.0 } // computeOverallStatus determines overall mesh status func computeOverallStatus(counts EntityHealthCounts) string { total := counts.Apps.Total + counts.Services.Total + counts.Workloads.Total unhealthy := counts.Apps.Unhealthy + counts.Services.Unhealthy + counts.Workloads.Unhealthy degraded := counts.Apps.Degraded + counts.Services.Degraded + counts.Workloads.Degraded if total == 0 { return "UNKNOWN" } // If there are any unhealthy entities if unhealthy > 0 { if unhealthy > total/2 { return "UNHEALTHY" } return "DEGRADED" } // If there are degraded entities but no unhealthy if degraded > 0 { return "DEGRADED" } return "HEALTHY" } // computeOverallAvailability computes overall mesh availability func computeOverallAvailability(counts EntityHealthCounts) float64 { total := counts.Apps.Total + counts.Services.Total + counts.Workloads.Total if total == 0 { return 100.0 } healthy := counts.Apps.Healthy + counts.Services.Healthy + counts.Workloads.Healthy degraded := counts.Apps.Degraded + counts.Services.Degraded + counts.Workloads.Degraded return (float64(healthy) + float64(degraded)*0.5) / float64(total) * 100.0 } // computeTotalErrorRate sums error rates across namespaces func computeTotalErrorRate(nsSummaries map[string]NamespaceSummary) float64 { total := 0.0 for _, ns := range nsSummaries { total += ns.ErrorRate } return total } // sortUnhealthyByImpact sorts unhealthy entities by error rate func sortUnhealthyByImpact(unhealthy []UnhealthyEntity) { // Simple bubble sort by error rate descending for i := 0; i < len(unhealthy); i++ { for j := i + 1; j < len(unhealthy); j++ { if unhealthy[j].ErrorRate > unhealthy[i].ErrorRate { unhealthy[i], unhealthy[j] = unhealthy[j], unhealthy[i] } } } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/containers/kubernetes-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server