pod-diagnostics.ts
/**
 * Pod diagnostics module
 *
 * Comprehensively analyzes pod status, containers, and events
 * to identify the root cause of problems
 *
 * @author zerry
 */
import * as k8s from '@kubernetes/client-node';
import { Writable } from 'node:stream';
import type {
  PodDiagnostics,
  DiagnosticIssue,
  ContainerStatus,
  ResourceUsage,
  K8sEvent,
  PodPhase,
} from '../types.js';
import { withRetry } from '../utils/retry.js';

/**
 * Comprehensive pod diagnostics
 *
 * This is the core feature. Analyzes all pod states
 * to clearly explain "why it's not working".
 */
export async function diagnosePod(
  coreApi: k8s.CoreV1Api,
  namespace: string,
  podName: string,
  metricsApi?: k8s.Metrics
): Promise<PodDiagnostics> {
  try {
    console.error(`[diagnosePod] Starting diagnostics for pod ${podName} in namespace ${namespace}`);

    // Parallel API calls for better performance
    const [pod, eventsResponse] = await Promise.all([
      // 1. Get pod information with retry
      withRetry(() => coreApi.readNamespacedPod({ name: podName, namespace }), {
        maxAttempts: 3,
        initialDelay: 500,
      }).catch((error) => {
        console.error(`[diagnosePod] Failed to get pod info:`, error.message);
        throw new Error(`Cannot read pod ${podName}: ${error.message}`);
      }),
      // 2. Get pod events with retry
      withRetry(
        () =>
          coreApi.listNamespacedEvent({
            namespace,
            fieldSelector: `involvedObject.name=${podName}`,
          }),
        {
          maxAttempts: 2, // Events are less critical, fewer retries
          initialDelay: 500,
        }
      ).catch((error) => {
        console.error(`[diagnosePod] Failed to get events (non-fatal):`, error.message);
        // Return empty events instead of failing
        return { items: [] };
      }),
    ]);

    const events = parseEvents(eventsResponse.items);

    // 3. Analyze container status
    const containers = parseContainerStatuses(pod.status?.containerStatuses || []);

    // 4. Detect issues
    const issues: DiagnosticIssue[] = [];

    // Container-related issues
    issues.push(...detectContainerIssues(pod, containers, events));

    // Image pull issues
    issues.push(...detectImagePullIssues(pod, events));

    // Resource-related issues
    const resources = await analyzeResourceUsage(pod, namespace, podName, metricsApi);
    issues.push(...detectResourceIssues(pod, resources));

    // Volume mount issues
    issues.push(...detectVolumeIssues(pod, events));

    // Network issues
    issues.push(...detectNetworkIssues(pod, events));

    // 5. Calculate health score
    const healthScore = calculateHealthScore(pod, issues);

    // 6. Generate summary
    const summary = generatePodSummary(pod, issues, healthScore);

    return {
      podInfo: {
        name: pod.metadata?.name || podName,
        namespace: pod.metadata?.namespace || namespace,
        phase: (pod.status?.phase as PodPhase) || 'Unknown',
        startTime: pod.status?.startTime?.toISOString(),
        nodeName: pod.spec?.nodeName,
        hostIP: pod.status?.hostIP,
        podIP: pod.status?.podIP,
      },
      containers,
      issues,
      resources,
      events,
      summary,
      healthScore,
    };
  } catch (error: any) {
    console.error(`[diagnosePod] Fatal error:`, error);
    throw new Error(`Pod diagnosis failed: ${error.message}`);
  }
}
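
/**
 * Usage sketch (illustrative only, not part of this module): one way
 * diagnosePod might be wired up. Assumes a default kubeconfig is available;
 * the namespace, pod name, and import path below are hypothetical.
 *
 * @example
 * import * as k8s from '@kubernetes/client-node';
 * import { diagnosePod } from './pod-diagnostics.js';
 *
 * const kc = new k8s.KubeConfig();
 * kc.loadFromDefault();
 * const coreApi = kc.makeApiClient(k8s.CoreV1Api);
 * const metricsApi = new k8s.Metrics(kc);
 *
 * const report = await diagnosePod(coreApi, 'default', 'my-app-7d4f8b9c6-x2k4p', metricsApi);
 * console.log(report.summary);
 */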
/**
 * Specialized CrashLoopBackOff diagnostics
 *
 * CrashLoop is really tricky; this function accurately identifies the cause.
 */
export async function diagnoseCrashLoop(
  coreApi: k8s.CoreV1Api,
  logApi: k8s.Log,
  namespace: string,
  podName: string,
  containerName?: string
): Promise<DiagnosticIssue[]> {
  const issues: DiagnosticIssue[] = [];

  try {
    console.error(`[diagnoseCrashLoop] Analyzing pod ${podName} in namespace ${namespace}`);

    const pod = await withRetry(
      () => coreApi.readNamespacedPod({ name: podName, namespace }),
      { maxAttempts: 3 }
    );

    const containerStatuses = pod.status?.containerStatuses || [];

    for (const status of containerStatuses) {
      // If containerName is specified, only check that container
      if (containerName && status.name !== containerName) continue;

      const restartCount = status.restartCount || 0;

      // Detect CrashLoop
      if (restartCount > 3 || status.state?.waiting?.reason === 'CrashLoopBackOff') {
        // Check termination reason from the previous state
        const lastTerminated = status.lastState?.terminated;
        let rootCause = 'unknown';
        let solution = '';

        if (lastTerminated) {
          const exitCode = lastTerminated.exitCode;

          // Analyze exit code
          if (exitCode === 0) {
            rootCause = 'Container exited normally but keeps restarting due to restart policy';
            solution = 'Change spec.restartPolicy to "Never" or "OnFailure"\n```yaml\nspec:\n  restartPolicy: OnFailure\n```';
          } else if (exitCode === 1) {
            rootCause = 'Application error caused termination';
            solution =
              'Check logs to fix application errors\n```bash\nkubectl logs ' +
              podName + ' -n ' + namespace + ' -c ' + status.name + ' --previous\n```';
          } else if (exitCode === 137) {
            rootCause = 'OOM (Out Of Memory) - Container was killed due to insufficient memory';
            solution = 'Increase memory limit or optimize application memory usage\n```yaml\nresources:\n  limits:\n    memory: "512Mi" # Set higher than current\n```';
          } else if (exitCode === 143) {
            rootCause = 'Terminated by SIGTERM - Received normal termination signal';
            solution = 'Graceful shutdown may not be properly implemented. Try increasing terminationGracePeriodSeconds';
          } else if (exitCode === 126) {
            rootCause = 'Permission denied - Executable file lacks execute permission';
            solution = 'Grant execute permission with chmod +x in the Dockerfile';
          } else if (exitCode === 127) {
            rootCause = 'Command not found - CMD/ENTRYPOINT command does not exist';
            solution = 'Verify the CMD/ENTRYPOINT path in the Dockerfile';
          } else {
            rootCause = `Unknown error (exit code ${exitCode})`;
            solution = 'Check logs to identify the detailed cause';
          }
        }

        // Look for additional clues in the previous container's logs
        try {
          let logData = '';
          const stream = new Writable({
            write(chunk, _encoding, next) {
              logData += chunk.toString();
              next();
            },
          });

          await withRetry(
            () =>
              logApi.log(namespace, podName, status.name, stream, {
                previous: true,
                tailLines: 50,
              }),
            { maxAttempts: 2 }
          );

          // Find error patterns in the logs
          const relevantLogs: string[] = [];
          const lines = logData.split('\n');
          for (const line of lines) {
            if (
              line.toLowerCase().includes('error') ||
              line.toLowerCase().includes('exception') ||
              line.toLowerCase().includes('fatal') ||
              line.toLowerCase().includes('panic')
            ) {
              relevantLogs.push(line.trim());
            }
          }

          issues.push({
            type: 'CrashLoopBackOff',
            severity: 'critical',
            message: `Container "${status.name}" has restarted ${restartCount} times`,
            rootCause,
            solution,
            resource: {
              kind: 'Pod',
              name: podName,
              namespace,
            },
            relevantLogs: relevantLogs.slice(0, 10), // Max 10 lines
            timestamp: new Date().toISOString(),
          });
        } catch (logError: any) {
          console.error(`[diagnoseCrashLoop] Failed to retrieve logs for ${status.name}:`, logError.message);
          // Add the issue even if logs cannot be retrieved
          issues.push({
            type: 'CrashLoopBackOff',
            severity: 'critical',
            message: `Container "${status.name}" has restarted ${restartCount} times`,
            rootCause,
            solution,
            resource: {
              kind: 'Pod',
              name: podName,
              namespace,
            },
            timestamp: new Date().toISOString(),
          });
        }
      }
    }
  } catch (error: any) {
    throw new Error(`CrashLoop diagnostics failed: ${error.message}`);
  }

  return issues;
}
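
/**
 * Usage sketch (illustrative only): running the CrashLoop analysis on its
 * own. Assumes a default kubeconfig; the pod name is hypothetical. k8s.Log
 * is the streaming log helper from @kubernetes/client-node.
 *
 * @example
 * const kc = new k8s.KubeConfig();
 * kc.loadFromDefault();
 * const coreApi = kc.makeApiClient(k8s.CoreV1Api);
 * const logApi = new k8s.Log(kc);
 *
 * const issues = await diagnoseCrashLoop(coreApi, logApi, 'default', 'my-app-7d4f8b9c6-x2k4p');
 * for (const issue of issues) {
 *   console.log(`${issue.type}: ${issue.rootCause}`);
 * }
 */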
/**
 * Parse container statuses
 */
function parseContainerStatuses(statuses: any[]): ContainerStatus[] {
  return statuses.map(s => ({
    name: s.name,
    ready: s.ready || false,
    restartCount: s.restartCount || 0,
    state: s.state || {},
    lastState: s.lastState,
    image: s.image,
    imageID: s.imageID,
  }));
}

/**
 * Parse events
 */
function parseEvents(items: any[]): K8sEvent[] {
  return items
    .map(e => ({
      type: e.type,
      reason: e.reason,
      message: e.message,
      count: e.count || 1,
      firstTimestamp: e.firstTimestamp,
      lastTimestamp: e.lastTimestamp,
      source: e.source?.component,
    }))
    .sort(
      (a, b) => new Date(b.lastTimestamp).getTime() - new Date(a.lastTimestamp).getTime()
    );
}

/**
 * Detect container issues
 */
function detectContainerIssues(
  pod: any,
  containers: ContainerStatus[],
  _events: K8sEvent[]
): DiagnosticIssue[] {
  const issues: DiagnosticIssue[] = [];

  for (const container of containers) {
    // Check Waiting state
    if (container.state.waiting) {
      const reason = container.state.waiting.reason;
      const message = container.state.waiting.message;

      if (reason === 'ErrImagePull' || reason === 'ImagePullBackOff') {
        // Image pull issues are handled in a separate function
        continue;
      }

      issues.push({
        type: `Container Waiting: ${reason}`,
        severity: 'high',
        message: `Container "${container.name}" is in ${reason} state`,
        rootCause: message || 'Unknown reason',
        solution: getWaitingSolution(reason),
        resource: {
          kind: 'Pod',
          name: pod.metadata?.name,
          namespace: pod.metadata?.namespace,
        },
        timestamp: new Date().toISOString(),
      });
    }

    // Check Terminated state
    if (container.state.terminated && container.state.terminated.exitCode !== 0) {
      issues.push({
        type: 'Container Terminated',
        severity: 'high',
        message: `Container "${container.name}" terminated with exit code ${container.state.terminated.exitCode}`,
        rootCause: container.state.terminated.reason || 'Unknown reason',
        solution: getTerminatedSolution(container.state.terminated.exitCode),
        resource: {
          kind: 'Pod',
          name: pod.metadata?.name,
          namespace: pod.metadata?.namespace,
        },
        timestamp: new Date().toISOString(),
      });
    }
  }

  return issues;
}
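
/**
 * Shape sketch (hypothetical values, for illustration): a DiagnosticIssue as
 * produced by detectContainerIssues for a container stuck in
 * CreateContainerConfigError. The pod name and rootCause message are made up.
 *
 * {
 *   type: 'Container Waiting: CreateContainerConfigError',
 *   severity: 'high',
 *   message: 'Container "app" is in CreateContainerConfigError state',
 *   rootCause: 'configmap "app-config" not found',
 *   solution: 'Check container configuration (ConfigMap, Secret, etc.)',
 *   resource: { kind: 'Pod', name: 'my-app-7d4f8b9c6-x2k4p', namespace: 'default' },
 *   timestamp: '2024-01-01T00:00:00.000Z',
 * }
 */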
/**
 * Detect image pull issues
 */
function detectImagePullIssues(pod: any, events: K8sEvent[]): DiagnosticIssue[] {
  const issues: DiagnosticIssue[] = [];

  const imagePullEvents = events.filter(e =>
    e.reason === 'Failed' && e.message.includes('pull')
  );

  if (imagePullEvents.length > 0) {
    const event = imagePullEvents[0];
    let rootCause = 'Cannot download image';
    let solution = '';

    if (event.message.includes('not found') || event.message.includes('manifest unknown')) {
      rootCause = 'Image or tag does not exist';
      solution = '1. Verify image name and tag\n2. Test locally with docker pull <image>';
    } else if (event.message.includes('unauthorized') || event.message.includes('authentication')) {
      rootCause = 'Image registry authentication failed';
      solution = '```bash\nkubectl create secret docker-registry regcred \\\n  --docker-server=<registry> \\\n  --docker-username=<username> \\\n  --docker-password=<password>\n\n# Add to Pod spec:\nspec:\n  imagePullSecrets:\n    - name: regcred\n```';
    } else if (event.message.includes('timeout')) {
      rootCause = 'Network timeout - Cannot access registry';
      solution = '1. Check cluster network connectivity\n2. Verify firewall/proxy settings\n3. Verify the registry URL is correct';
    }

    issues.push({
      type: 'ImagePullBackOff',
      severity: 'critical',
      message: 'Cannot pull container image',
      rootCause,
      solution,
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name,
        namespace: pod.metadata?.namespace,
      },
      relatedEvents: [event],
      timestamp: new Date().toISOString(),
    });
  }

  return issues;
}

/**
 * Analyze resource usage
 *
 * Collects real-time metrics from the Metrics Server if available
 */
async function analyzeResourceUsage(
  pod: any,
  namespace: string,
  podName: string,
  metricsApi?: k8s.Metrics
): Promise<ResourceUsage> {
  const containers = pod.spec?.containers || [];

  let totalCpuRequest = 0;
  let totalCpuLimit = 0;
  let totalMemRequest = 0;
  let totalMemLimit = 0;

  for (const container of containers) {
    const requests = container.resources?.requests || {};
    const limits = container.resources?.limits || {};

    totalCpuRequest += parseCPU(requests.cpu || '0');
    totalCpuLimit += parseCPU(limits.cpu || '0');
    totalMemRequest += parseMemory(requests.memory || '0');
    totalMemLimit += parseMemory(limits.memory || '0');
  }

  // Try to get real-time metrics from the Metrics Server
  let currentCpu: number | undefined;
  let currentMem: number | undefined;
  let cpuUsagePercent: number | undefined;
  let memUsagePercent: number | undefined;

  if (metricsApi) {
    try {
      const metrics = await withRetry(
        () => metricsApi.getPodMetrics(namespace),
        {
          maxAttempts: 2,
          initialDelay: 500,
          shouldRetry: (error) => {
            // Don't retry if the Metrics Server is not installed
            if (error.statusCode === 404) return false;
            return true;
          },
        }
      );

      // Find the specific pod in the metrics list
      const podMetric = metrics.items?.find((item: any) => item.metadata?.name === podName);

      if (podMetric) {
        // Sum up all container metrics
        let totalCpuUsage = 0;
        let totalMemUsage = 0;

        for (const container of podMetric.containers || []) {
          // CPU is in nanocores; convert to millicores
          if (container.usage?.cpu) {
            totalCpuUsage += parseMetricCPU(container.usage.cpu);
          }
          // Memory is in Ki; convert to bytes
          if (container.usage?.memory) {
            totalMemUsage += parseMetricMemory(container.usage.memory);
          }
        }

        currentCpu = totalCpuUsage;
        currentMem = totalMemUsage;

        // Calculate usage percentages
        if (totalCpuLimit > 0) {
          cpuUsagePercent = (currentCpu / totalCpuLimit) * 100;
        }
        if (totalMemLimit > 0) {
          memUsagePercent = (currentMem / totalMemLimit) * 100;
        }
      }
    } catch (error: any) {
      // Metrics Server not available or pod metrics not ready.
      // This is fine; we'll just show spec values.
      if (error.statusCode !== 404) {
        console.error(`[analyzeResourceUsage] Failed to get metrics (non-fatal):`, error.message);
      }
    }
  }

  return {
    cpu: {
      current: currentCpu,
      requested: totalCpuRequest,
      limit: totalCpuLimit,
      usagePercent: cpuUsagePercent,
      isThrottled: cpuUsagePercent !== undefined && cpuUsagePercent >= 80,
    },
    memory: {
      current: currentMem,
      requested: totalMemRequest,
      limit: totalMemLimit,
      usagePercent: memUsagePercent,
      isOOMRisk: memUsagePercent !== undefined && memUsagePercent >= 90,
    },
  };
}
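
/**
 * Shape sketch (hypothetical values, for illustration): what
 * analyzeResourceUsage might return for a pod requesting 100m CPU / 128Mi
 * memory with limits of 500m / 256Mi, while the Metrics Server reports live
 * usage of 450m CPU and 245Mi memory:
 *
 * {
 *   cpu:    { current: 450, requested: 100, limit: 500,
 *             usagePercent: 90, isThrottled: true },    // 450/500 >= 80%
 *   memory: { current: 256901120, requested: 134217728, limit: 268435456,
 *             usagePercent: 95.7, isOOMRisk: true },    // 245Mi/256Mi >= 90%
 * }
 */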
/**
 * Detect resource issues
 */
function detectResourceIssues(pod: any, resources: ResourceUsage): DiagnosticIssue[] {
  const issues: DiagnosticIssue[] = [];

  // Check for high CPU usage (throttling)
  if (resources.cpu.isThrottled && resources.cpu.usagePercent !== undefined) {
    issues.push({
      type: 'High CPU Usage',
      severity: 'high',
      message: `CPU usage is high (${resources.cpu.usagePercent.toFixed(1)}%)`,
      rootCause: 'CPU limit may be too low for the current workload',
      solution: `Increase the CPU limit or optimize the application:\n\`\`\`yaml\nresources:\n  limits:\n    cpu: "${Math.ceil((resources.cpu.limit || 1000) * 1.5)}m" # Increased by 50%\n\`\`\``,
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name,
        namespace: pod.metadata?.namespace,
      },
      timestamp: new Date().toISOString(),
    });
  }

  // Check for OOM risk
  if (resources.memory.isOOMRisk && resources.memory.usagePercent !== undefined) {
    issues.push({
      type: 'OOM Risk',
      severity: 'critical',
      message: `Memory usage is critically high (${resources.memory.usagePercent.toFixed(1)}%)`,
      rootCause: 'Pod is at risk of an OOM kill - memory usage exceeds 90% of the limit',
      solution: `Increase the memory limit immediately:\n\`\`\`yaml\nresources:\n  limits:\n    memory: "${Math.ceil((resources.memory.limit || 512 * 1024 * 1024) / (1024 * 1024) * 1.5)}Mi" # Increased by 50%\n\`\`\``,
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name,
        namespace: pod.metadata?.namespace,
      },
      timestamp: new Date().toISOString(),
    });
  }

  // When resource limits are not set
  if (!resources.cpu.limit) {
    issues.push({
      type: 'Missing CPU Limit',
      severity: 'medium',
      message: 'CPU limit is not set',
      rootCause: 'CPU usage can grow without limit',
      solution: '```yaml\nresources:\n  limits:\n    cpu: "1000m"\n  requests:\n    cpu: "100m"\n```',
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name,
        namespace: pod.metadata?.namespace,
      },
      timestamp: new Date().toISOString(),
    });
  }

  if (!resources.memory.limit) {
    issues.push({
      type: 'Missing Memory Limit',
      severity: 'high',
      message: 'Memory limit is not set',
      rootCause: 'A memory leak can affect the entire node',
      solution: '```yaml\nresources:\n  limits:\n    memory: "512Mi"\n  requests:\n    memory: "128Mi"\n```',
      resource: {
        kind: 'Pod',
        name: pod.metadata?.name,
        namespace: pod.metadata?.namespace,
      },
      timestamp: new Date().toISOString(),
    });
  }

  return issues;
}

/**
 * Detect volume issues
 */
function detectVolumeIssues(pod: any, events: K8sEvent[]): DiagnosticIssue[] {
  const issues: DiagnosticIssue[] = [];

  const volumeEvents = events.filter(e =>
    e.message.includes('volume') || e.message.includes('mount')
  );

  for (const event of volumeEvents) {
    if (event.type === 'Warning') {
      issues.push({
        type: 'Volume Mount Issue',
        severity: 'high',
        message: 'Volume mount failed',
        rootCause: event.message,
        solution: '1. Verify the PVC is in Bound state\n2. Verify the storage class is correct\n3. Check status with kubectl describe pvc <pvc-name>',
        resource: {
          kind: 'Pod',
          name: pod.metadata?.name,
          namespace: pod.metadata?.namespace,
        },
        relatedEvents: [event],
        timestamp: new Date().toISOString(),
      });
    }
  }

  return issues;
}
/**
 * Detect network issues
 */
function detectNetworkIssues(pod: any, events: K8sEvent[]): DiagnosticIssue[] {
  const issues: DiagnosticIssue[] = [];

  const networkEvents = events.filter(e =>
    e.message.includes('network') || e.message.includes('CNI')
  );

  for (const event of networkEvents) {
    if (event.type === 'Warning') {
      issues.push({
        type: 'Network Configuration Issue',
        severity: 'high',
        message: 'Network configuration problem',
        rootCause: event.message,
        solution: '1. Check CNI plugin status\n2. Check network policies\n3. Verify the Pod CIDR range',
        resource: {
          kind: 'Pod',
          name: pod.metadata?.name,
          namespace: pod.metadata?.namespace,
        },
        relatedEvents: [event],
        timestamp: new Date().toISOString(),
      });
    }
  }

  return issues;
}

/**
 * Calculate health score
 *
 * Starts at 100 and deducts points per phase and per issue; for example,
 * a Pending pod (-30) with one critical issue (-30) scores 40.
 */
function calculateHealthScore(pod: any, issues: DiagnosticIssue[]): number {
  let score = 100;

  // Deductions based on pod phase
  const phase = pod.status?.phase;
  if (phase === 'Failed') score -= 100;
  else if (phase === 'Pending') score -= 30;
  else if (phase === 'Unknown') score -= 50;

  // Deductions based on issues
  for (const issue of issues) {
    if (issue.severity === 'critical') score -= 30;
    else if (issue.severity === 'high') score -= 20;
    else if (issue.severity === 'medium') score -= 10;
    else if (issue.severity === 'low') score -= 5;
  }

  return Math.max(0, Math.min(100, score));
}

/**
 * Generate pod summary
 */
function generatePodSummary(pod: any, issues: DiagnosticIssue[], healthScore: number): string {
  const phase = pod.status?.phase || 'Unknown';
  const containerCount = pod.spec?.containers?.length || 0;
  const readyContainers = pod.status?.containerStatuses?.filter((c: any) => c.ready).length || 0;

  let summary = `Pod "${pod.metadata?.name}" is currently in ${phase} state.\n`;
  summary += `Containers: ${readyContainers}/${containerCount} ready\n`;
  summary += `Health: ${healthScore}/100\n\n`;

  if (issues.length === 0) {
    summary += '✅ No issues found!';
  } else {
    summary += `⚠️ ${issues.length} issue(s) detected.\n`;
    const critical = issues.filter(i => i.severity === 'critical').length;
    const high = issues.filter(i => i.severity === 'high').length;
    if (critical > 0) summary += ` - Critical: ${critical}\n`;
    if (high > 0) summary += ` - High: ${high}\n`;
  }

  return summary;
}

// ===== Helper functions =====

function getWaitingSolution(reason: string): string {
  const solutions: Record<string, string> = {
    'CreateContainerConfigError': 'Check container configuration (ConfigMap, Secret, etc.)',
    'InvalidImageName': 'Verify the image name format',
    'CreateContainerError': 'Check container creation settings',
  };
  return solutions[reason] || 'Check logs and events to identify the cause';
}

function getTerminatedSolution(exitCode: number): string {
  const solutions: Record<number, string> = {
    1: 'Check application logs to fix errors',
    137: 'Increase the memory limit (OOM killed)',
    143: 'Verify graceful shutdown implementation',
    126: 'Check executable permissions (chmod +x)',
    127: 'Verify the CMD/ENTRYPOINT path',
  };
  return solutions[exitCode] || `Check logs for exit code ${exitCode}`;
}
function parseCPU(cpu: string): number {
  // Returns millicores: "250m" -> 250, "0.5" -> 500
  if (cpu.endsWith('m')) {
    return parseInt(cpu.slice(0, -1));
  }
  return parseFloat(cpu) * 1000;
}

function parseMemory(mem: string): number {
  // Returns bytes; supports binary (Ki/Mi/Gi) and decimal (K/M/G) suffixes
  const units: Record<string, number> = {
    'Ki': 1024,
    'Mi': 1024 * 1024,
    'Gi': 1024 * 1024 * 1024,
    'K': 1000,
    'M': 1000 * 1000,
    'G': 1000 * 1000 * 1000,
  };

  for (const [unit, multiplier] of Object.entries(units)) {
    if (mem.endsWith(unit)) {
      return parseFloat(mem.slice(0, -unit.length)) * multiplier;
    }
  }
  return parseFloat(mem);
}

/**
 * Parse CPU from the Metrics API format
 * The Metrics API returns nanocores (e.g., "123456789n") or millicores (e.g., "123m")
 */
function parseMetricCPU(cpu: string): number {
  if (cpu.endsWith('n')) {
    // Nanocores to millicores: divide by 1,000,000
    return parseInt(cpu.slice(0, -1)) / 1_000_000;
  } else if (cpu.endsWith('m')) {
    // Already in millicores
    return parseInt(cpu.slice(0, -1));
  } else {
    // Cores to millicores: multiply by 1000
    return parseFloat(cpu) * 1000;
  }
}

/**
 * Parse memory from the Metrics API format
 * The Metrics API typically returns Ki (e.g., "123456Ki")
 */
function parseMetricMemory(mem: string): number {
  if (mem.endsWith('Ki')) {
    return parseInt(mem.slice(0, -2)) * 1024;
  } else if (mem.endsWith('Mi')) {
    return parseInt(mem.slice(0, -2)) * 1024 * 1024;
  } else if (mem.endsWith('Gi')) {
    return parseInt(mem.slice(0, -2)) * 1024 * 1024 * 1024;
  }
  // Assume bytes
  return parseInt(mem);
}
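
/**
 * Worked examples for the quantity parsers (values chosen for illustration):
 *
 * @example
 * parseCPU('250m');             // 250 (millicores)
 * parseCPU('0.5');              // 500 (cores -> millicores)
 * parseMemory('512Mi');         // 536870912 (bytes)
 * parseMetricCPU('123456789n'); // 123.456789 (nanocores -> millicores)
 * parseMetricMemory('2048Ki');  // 2097152 (bytes)
 */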
