cluster-health.ts
/**
 * Cluster-wide health diagnostics
 *
 * @author zerry
 */
import * as k8s from '@kubernetes/client-node';
import type { ClusterHealth, DiagnosticIssue } from '../types.js';

/** True when any container is in CrashLoopBackOff or has restarted more than 5 times */
function isCrashLooping(pod: any): boolean {
  return !!pod.status?.containerStatuses?.some(
    (c: any) => c.state?.waiting?.reason === 'CrashLoopBackOff' || (c.restartCount || 0) > 5
  );
}

/**
 * Diagnose overall cluster health
 *
 * Comprehensively analyzes nodes, pods, and resource utilization
 */
export async function diagnoseClusterHealth(
  coreApi: k8s.CoreV1Api,
  namespace?: string
): Promise<ClusterHealth> {
  const issues: DiagnosticIssue[] = [];

  // 1. Node health
  const nodesResponse = await coreApi.listNode();
  const nodes = nodesResponse.items;
  const readyNodes = nodes.filter((n: any) =>
    n.status?.conditions?.some((c: any) => c.type === 'Ready' && c.status === 'True')
  );
  const notReadyNodes = nodes.filter((n: any) =>
    !n.status?.conditions?.some((c: any) => c.type === 'Ready' && c.status === 'True')
  );

  // Report NotReady nodes
  for (const node of notReadyNodes) {
    issues.push({
      type: 'Node Not Ready',
      severity: 'critical',
      message: `Node "${node.metadata?.name}" is not in Ready state`,
      rootCause: 'Node has encountered a problem',
      solution: 'Check the detailed cause with: kubectl describe node <node-name>',
      resource: {
        kind: 'Node',
        name: node.metadata?.name || 'unknown',
        namespace: '',
      },
      timestamp: new Date().toISOString(),
    });
  }

  // 2. Pod health
  const podsResponse = namespace
    ? await coreApi.listNamespacedPod({ namespace })
    : await coreApi.listPodForAllNamespaces();
  const pods = podsResponse.items;

  const podStats = {
    total: pods.length,
    running: pods.filter((p: any) => p.status?.phase === 'Running').length,
    pending: pods.filter((p: any) => p.status?.phase === 'Pending').length,
    failed: pods.filter((p: any) => p.status?.phase === 'Failed').length,
    crashLooping: pods.filter(isCrashLooping).length,
  };

  // Report Pending pods (max 5)
  const pendingPods = pods.filter((p: any) => p.status?.phase === 'Pending');
  for (const pod of pendingPods.slice(0, 5)) {
    issues.push({
      type: 'Pod Pending',
      severity: 'high',
      message: `Pod "${pod.metadata?.name}" is in Pending state`,
      rootCause: 'Pod cannot be scheduled or its image cannot be pulled',
      solution: `kubectl describe pod ${pod.metadata?.name} -n ${pod.metadata?.namespace}`,
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name || 'unknown',
        namespace: pod.metadata?.namespace || 'default',
      },
      timestamp: new Date().toISOString(),
    });
  }

  // Report CrashLooping pods (max 5)
  const crashLoopPods = pods.filter(isCrashLooping);
  for (const pod of crashLoopPods.slice(0, 5)) {
    issues.push({
      type: 'CrashLoopBackOff',
      severity: 'critical',
      message: `Pod "${pod.metadata?.name}" is in CrashLoop state`,
      rootCause: 'Container is repeatedly failing',
      solution: `kubectl logs ${pod.metadata?.name} -n ${pod.metadata?.namespace} --previous`,
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name || 'unknown',
        namespace: pod.metadata?.namespace || 'default',
      },
      timestamp: new Date().toISOString(),
    });
  }

  // 3. Resource utilization (placeholder values; a real calculation would need metrics data)
  const resourceUtilization = {
    cpu: 0,
    memory: 0,
    storage: 0,
  };

  // 4. Overall health score
  const overallScore = calculateOverallScore(nodes.length, readyNodes.length, podStats, issues);

  // 5. Critical issues only
  const criticalIssues = issues.filter(i => i.severity === 'critical');

  // 6. Recommendations
  const recommendations = generateClusterRecommendations(issues, podStats, nodes.length);

  // 7. Summary
  const summary = generateClusterSummary(nodes.length, readyNodes.length, podStats, overallScore);

  return {
    overallScore,
    nodeHealth: {
      total: nodes.length,
      ready: readyNodes.length,
      notReady: notReadyNodes.length,
      issues: issues.filter(i => i.resource?.kind === 'Node'),
    },
    podHealth: {
      ...podStats,
      issues: issues.filter(i => i.resource?.kind === 'Pod'),
    },
    resourceUtilization,
    criticalIssues,
    recommendations,
    summary,
  };
}

/**
 * Calculate overall health score
 */
function calculateOverallScore(
  totalNodes: number,
  readyNodes: number,
  podStats: any,
  issues: DiagnosticIssue[]
): number {
  let score = 100;

  // Deduct for node status (guard against division by zero on an empty node list)
  const nodeHealthPercent = totalNodes > 0 ? (readyNodes / totalNodes) * 100 : 100;
  score -= (100 - nodeHealthPercent) * 0.5;

  // Deduct for pod status (guard against division by zero when no pods exist)
  const healthyPodPercent = podStats.total > 0 ? (podStats.running / podStats.total) * 100 : 100;
  score -= (100 - healthyPodPercent) * 0.3;

  // CrashLoop is severe
  score -= podStats.crashLooping * 5;

  // Deduct per reported issue
  for (const issue of issues) {
    if (issue.severity === 'critical') score -= 10;
    else if (issue.severity === 'high') score -= 5;
    else if (issue.severity === 'medium') score -= 2;
  }

  return Math.max(0, Math.min(100, score));
}

/**
 * Generate cluster recommendations
 */
function generateClusterRecommendations(
  issues: DiagnosticIssue[],
  podStats: any,
  totalNodes: number
): string[] {
  const recommendations: string[] = [];

  const critical = issues.filter(i => i.severity === 'critical');
  if (critical.length > 0) {
    recommendations.push(`🔴 Resolve ${critical.length} critical issue(s) as top priority`);
  }
  if (podStats.crashLooping > 0) {
    recommendations.push(`⚠️ ${podStats.crashLooping} pod(s) in CrashLoop state. Check logs immediately`);
  }
  if (podStats.pending > 5) {
    recommendations.push(`⚠️ ${podStats.pending} pod(s) in Pending state. Check for insufficient cluster resources`);
  }
  if (totalNodes < 3) {
    recommendations.push('💡 For high availability, running at least 3 nodes is recommended');
  }
  if (recommendations.length === 0) {
    recommendations.push('✅ Cluster is healthy!');
  }

  return recommendations;
}

/**
 * Generate cluster summary
 */
function generateClusterSummary(
  totalNodes: number,
  readyNodes: number,
  podStats: any,
  score: number
): string {
  let summary = `Cluster Health Score: ${score.toFixed(1)}/100\n\n`;
  summary += `Nodes: ${readyNodes}/${totalNodes} Ready\n`;
  summary += `Pods: ${podStats.running}/${podStats.total} Running\n`;

  if (podStats.crashLooping > 0) {
    summary += `⚠️ CrashLoop: ${podStats.crashLooping}\n`;
  }
  if (podStats.pending > 0) {
    summary += `⚠️ Pending: ${podStats.pending}\n`;
  }
  if (podStats.failed > 0) {
    summary += `⚠️ Failed: ${podStats.failed}\n`;
  }

  return summary;
}
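To make the scoring concrete: in a 3-node cluster where one node is NotReady and 9 of 10 pods are Running, calculateOverallScore deducts (100 - 66.7) * 0.5 = 16.7 for node health, (100 - 90) * 0.3 = 3 for pod health, and 10 for the critical "Node Not Ready" issue that the NotReady node generated, leaving a score of roughly 70/100.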

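For reference, a minimal caller might look like the sketch below. The import path and the "production" namespace are assumptions for illustration (the file's location in the repository is not shown on this page); KubeConfig.loadFromDefault() and makeApiClient() are the standard @kubernetes/client-node entry points.

// Usage sketch (hypothetical wiring; import path is an assumption)
import * as k8s from '@kubernetes/client-node';
import { diagnoseClusterHealth } from './tools/cluster-health.js';

const kc = new k8s.KubeConfig();
kc.loadFromDefault(); // reads KUBECONFIG / ~/.kube/config, or in-cluster config

const coreApi = kc.makeApiClient(k8s.CoreV1Api);

// Pass a namespace to scope the pod scan, or omit it to scan all namespaces
const health = await diagnoseClusterHealth(coreApi, 'production');
console.log(health.summary);
console.log(health.recommendations.join('\n'));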