MCP Console Automation Server

Overview Schema Related Servers Score Discussions

HealthMonitor.ts•77.1 KiB

import { EventEmitter } from 'events'; import { Logger } from '../utils/logger.js'; import { SessionState, SystemMetrics, ProcessMetrics, HealthCheckResult, } from '../types/index.js'; import * as os from 'os'; import * as fs from 'fs/promises'; import * as path from 'path'; import { spawn, exec } from 'child_process'; import { promisify } from 'util'; const execAsync = promisify(exec); export interface HealthMonitorConfig { enabledChecks: { session: boolean; system: boolean; network: boolean; process: boolean; disk: boolean; memory: boolean; ssh: boolean; }; thresholds: { cpu: number; memory: number; disk: number; networkLatency: number; processResponseTime: number; sshConnectionLatency: number; sshHealthScore: number; }; checkInterval: number; sshHealthCheckInterval: number; alertThresholds: { consecutive_failures: number; response_time_threshold: number; memory_usage_threshold: number; error_rate_threshold: number; ssh_connection_timeout: number; ssh_failure_rate_threshold: number; }; recovery: { enabled: boolean; maxAttempts: number; backoffMultiplier: number; baseDelay: number; sshProactiveReconnect: boolean; }; monitoring: { enableMetrics: boolean; enableAlerting: boolean; enablePredictiveAnalysis: boolean; enableSSHHealthPrediction: boolean; }; } // HealthCheckResult is imported from types/index.ts export interface SSHHealthMetrics { connectionLatency: number; lastKeepAliveTimestamp?: number; // Changed from Date to number timestamp lastKeepAlive?: Date; // Keep for compatibility consecutiveFailures: number; connectionUptime: number; throughput: number; errorRate: number; authenticationType?: 'password' | 'publickey' | 'keyboard-interactive'; encryptionAlgorithm?: string; compressionEnabled?: number; // Changed to number (0 or 1) serverVersion?: string; clientVersion?: string; } export interface SSHHealthResult extends HealthCheckResult { sshMetrics?: SSHHealthMetrics; connectionId?: string; hostInfo?: { hostname: string; port: number; username: string; }; predictiveScores?: { connectionStability: number; performanceDegradation: number; authenticationRisk: number; networkQuality: number; overallFailureRisk: number; }; } export interface SSHSessionHealth { sessionId: string; connectionId: string; hostname: string; port: number; username: string; connectedAt: Date; lastActivity: Date; healthScore: number; connectionLatencyHistory: number[]; throughputHistory: number[]; errorHistory: { timestamp: Date; error: string; recoverable: boolean }[]; keepAliveStatus: 'active' | 'failed' | 'degraded'; connectionStability: number; predictiveFailureScore: number; authenticationMethod: string; encryptionInfo: { algorithm: string; keySize: number; compressionEnabled: boolean; }; serverInfo: { version: string; protocol: string; supportedAlgorithms: string[]; }; performanceMetrics: { averageLatency: number; maxLatency: number; minLatency: number; throughputBytesPerSecond: number; commandExecutionTime: number; dataTransferTime: number; }; } export interface SSHPredictionModel { sessionId: string; connectionStabilityTrend: number[]; latencyTrend: number[]; throughputTrend: number[]; errorRateTrend: number[]; keepAliveSuccessRate: number; lastUpdated: Date; confidenceScore: number; predictedFailureTime?: Date; riskFactors: { networkInstability: number; performanceDegradation: number; authenticationIssues: number; serverOverload: number; connectionAging: number; }; } export interface HealthReport { overall: 'healthy' | 'warning' | 'unhealthy' | 'critical'; timestamp: Date; checks: HealthCheckResult[]; metrics: { totalChecks: number; healthyChecks: number; warningChecks: number; unhealthyChecks: number; criticalChecks: number; averageResponseTime: number; }; trends: { healthScore: number; healthScoreTrend: 'improving' | 'stable' | 'degrading'; predictionNextHour?: 'healthy' | 'warning' | 'unhealthy' | 'critical'; }; actionItems: string[]; sshHealthSummary?: { totalSSHSessions: number; healthySSHSessions: number; unhealthySSHSessions: number; averageSSHLatency: number; predictedFailures: number; proactiveReconnections: number; }; } /** * Comprehensive Health Monitor for session and system health * Provides real-time health checks, predictive analysis, and automatic recovery */ export class HealthMonitor extends EventEmitter { private logger: Logger; private config: HealthMonitorConfig; private healthHistory: Map<string, HealthCheckResult[]> = new Map(); private activeChecks: Map<string, NodeJS.Timeout> = new Map(); private healthScoreHistory: number[] = []; private lastSystemMetrics?: SystemMetrics; private monitoringInterval?: NodeJS.Timeout; private sshHealthInterval?: NodeJS.Timeout; private isRunning = false; // SSH-specific monitoring data private sshSessions: Map<string, SSHSessionHealth> = new Map(); private sshHealthHistory: Map<string, SSHHealthResult[]> = new Map(); private sshPredictionModels: Map<string, SSHPredictionModel> = new Map(); // Health check statistics private stats = { totalChecks: 0, successfulChecks: 0, failedChecks: 0, averageResponseTime: 0, consecutiveFailures: new Map<string, number>(), lastCheckTimes: new Map<string, Date>(), errorRates: new Map<string, number>(), }; constructor(config?: Partial<HealthMonitorConfig>) { super(); this.logger = new Logger('HealthMonitor'); this.config = { enabledChecks: { session: true, system: true, network: true, process: true, disk: true, memory: true, ssh: true, ...config?.enabledChecks, }, thresholds: { cpu: 80, memory: 85, disk: 90, networkLatency: 1000, processResponseTime: 5000, sshConnectionLatency: 2000, sshHealthScore: 70, ...config?.thresholds, }, checkInterval: config?.checkInterval || 30000, sshHealthCheckInterval: config?.sshHealthCheckInterval || 15000, alertThresholds: { consecutive_failures: 3, response_time_threshold: 10000, memory_usage_threshold: 95, error_rate_threshold: 0.1, ssh_connection_timeout: 30000, ssh_failure_rate_threshold: 0.05, ...config?.alertThresholds, }, recovery: { enabled: true, maxAttempts: 3, backoffMultiplier: 2, baseDelay: 5000, sshProactiveReconnect: true, ...config?.recovery, }, monitoring: { enableMetrics: true, enableAlerting: true, enablePredictiveAnalysis: true, enableSSHHealthPrediction: true, ...config?.monitoring, }, }; this.logger.info('HealthMonitor initialized with config:', this.config); } /** * Start the health monitoring system */ async start(): Promise<void> { if (this.isRunning) { this.logger.warn('HealthMonitor is already running'); return; } this.logger.info('Starting HealthMonitor...'); this.isRunning = true; // Start periodic health checks this.monitoringInterval = setInterval(async () => { await this.performHealthChecks(); }, this.config.checkInterval); // Start SSH-specific health monitoring if enabled if (this.config.enabledChecks.ssh) { this.sshHealthInterval = setInterval(async () => { await this.performSSHHealthChecks(); }, this.config.sshHealthCheckInterval); } // Perform initial health checks await this.performHealthChecks(); if (this.config.enabledChecks.ssh) { await this.performSSHHealthChecks(); } this.emit('started'); this.logger.info('HealthMonitor started successfully'); } /** * Stop the health monitoring system */ async stop(): Promise<void> { if (!this.isRunning) { this.logger.warn('HealthMonitor is not running'); return; } this.logger.info('Stopping HealthMonitor...'); this.isRunning = false; // Clear monitoring interval if (this.monitoringInterval) { clearInterval(this.monitoringInterval); this.monitoringInterval = undefined; } // Clear SSH health monitoring interval if (this.sshHealthInterval) { clearInterval(this.sshHealthInterval); this.sshHealthInterval = undefined; } // Clear active checks this.activeChecks.forEach((timeout, checkId) => { clearTimeout(timeout); }); this.activeChecks.clear(); this.emit('stopped'); this.logger.info('HealthMonitor stopped'); } /** * Perform comprehensive health checks */ private async performHealthChecks(): Promise<void> { const startTime = Date.now(); const checks: HealthCheckResult[] = []; try { // System health checks if (this.config.enabledChecks.system) { checks.push(await this.performSystemHealthCheck()); } // Memory health check if (this.config.enabledChecks.memory) { checks.push(await this.performMemoryHealthCheck()); } // Disk health check if (this.config.enabledChecks.disk) { checks.push(await this.performDiskHealthCheck()); } // Network health check if (this.config.enabledChecks.network) { checks.push(await this.performNetworkHealthCheck()); } // Update statistics this.updateHealthStatistics(checks); // Generate health report const report = await this.generateHealthReport(checks); // Emit health report this.emit('health-report', report); // Check for critical issues requiring immediate action await this.handleCriticalIssues(report); // Update health score history for trend analysis this.updateHealthTrends(report); } catch (error) { this.logger.error('Error during health checks:', error); this.emit('health-check-error', { error: error instanceof Error ? error.message : String(error), }); } const duration = Date.now() - startTime; this.logger.debug(`Health checks completed in ${duration}ms`); } /** * Perform system-level health check */ private async performSystemHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `system-${Date.now()}`; try { const cpuUsage = await this.getCPUUsage(); const memoryUsage = process.memoryUsage(); const uptime = os.uptime(); const loadAvg = os.loadavg(); const memoryPercent = (memoryUsage.heapUsed / memoryUsage.heapTotal) * 100; const isHealthy = cpuUsage < this.config.thresholds.cpu && memoryPercent < this.config.thresholds.memory; const status: HealthCheckResult['status'] = cpuUsage > 90 || memoryPercent > 95 ? 'critical' : cpuUsage > this.config.thresholds.cpu || memoryPercent > this.config.thresholds.memory ? 'unhealthy' : cpuUsage > 60 || memoryPercent > 70 ? 'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { cpuUsage, memoryPercent, uptime, loadAvg1: loadAvg[0], loadAvg5: loadAvg[1], loadAvg15: loadAvg[2], }, details: { message: `System health: CPU ${cpuUsage.toFixed(1)}%, Memory ${memoryPercent.toFixed(1)}%`, diagnosis: status !== 'healthy' ? `High resource usage detected. CPU: ${cpuUsage.toFixed(1)}% (threshold: ${this.config.thresholds.cpu}%), Memory: ${memoryPercent.toFixed(1)}% (threshold: ${this.config.thresholds.memory}%)` : 'System resources within normal parameters', recommendations: status !== 'healthy' ? [ 'Monitor system resource usage closely', 'Consider reducing concurrent operations', 'Check for memory leaks in running processes', 'Review system performance optimization', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { cpu: { checkStatus: cpuUsage > this.config.thresholds.cpu ? 'fail' : cpuUsage > 60 ? 'warn' : 'pass', value: cpuUsage, message: `CPU usage: ${cpuUsage.toFixed(1)}%`, duration: Date.now() - startTime, }, memory: { checkStatus: memoryPercent > this.config.thresholds.memory ? 'fail' : memoryPercent > 70 ? 'warn' : 'pass', value: memoryPercent, message: `Memory usage: ${memoryPercent.toFixed(1)}%`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `System health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve system metrics', recommendations: [ 'Check system monitoring tools', 'Verify system stability', ], recoverable: true, }, duration: Date.now() - startTime, checks: { system: { checkStatus: 'fail', message: `System health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform memory-specific health check */ private async performMemoryHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `memory-${Date.now()}`; try { const memInfo = await this.getMemoryInfo(); const processMemory = process.memoryUsage(); const systemMemoryPercent = ((memInfo.total - memInfo.free) / memInfo.total) * 100; const processMemoryMB = processMemory.heapUsed / 1024 / 1024; const status: HealthCheckResult['status'] = systemMemoryPercent > 95 ? 'critical' : systemMemoryPercent > this.config.thresholds.memory ? 'unhealthy' : systemMemoryPercent > 70 ? 'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { systemMemoryPercent, processMemoryMB, totalMemoryGB: memInfo.total / 1024 / 1024 / 1024, freeMemoryGB: memInfo.free / 1024 / 1024 / 1024, heapUsedMB: processMemory.heapUsed / 1024 / 1024, heapTotalMB: processMemory.heapTotal / 1024 / 1024, }, details: { message: `Memory usage: System ${systemMemoryPercent.toFixed(1)}%, Process ${processMemoryMB.toFixed(1)}MB`, diagnosis: status !== 'healthy' ? `High memory usage detected. System: ${systemMemoryPercent.toFixed(1)}% (threshold: ${this.config.thresholds.memory}%)` : 'Memory usage within normal parameters', recommendations: status !== 'healthy' ? [ 'Monitor for memory leaks', 'Consider garbage collection optimization', 'Review buffer sizes and data structures', 'Implement memory pressure relief mechanisms', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { system_memory: { checkStatus: systemMemoryPercent > 95 ? 'fail' : systemMemoryPercent > this.config.thresholds.memory ? 'fail' : systemMemoryPercent > 70 ? 'warn' : 'pass', value: systemMemoryPercent, message: `System memory usage: ${systemMemoryPercent.toFixed(1)}%`, duration: Date.now() - startTime, }, process_memory: { checkStatus: 'pass', value: processMemoryMB, message: `Process memory usage: ${processMemoryMB.toFixed(1)}MB`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Memory health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve memory metrics', recommendations: [ 'Check system memory monitoring', 'Verify system stability', ], recoverable: true, }, duration: Date.now() - startTime, checks: { memory: { checkStatus: 'fail', message: `Memory health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform disk health check */ private async performDiskHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `disk-${Date.now()}`; try { const diskInfo = await this.getDiskInfo(); const diskUsagePercent = (diskInfo.used / diskInfo.total) * 100; const status: HealthCheckResult['status'] = diskUsagePercent > 95 ? 'critical' : diskUsagePercent > this.config.thresholds.disk ? 'unhealthy' : diskUsagePercent > 80 ? 'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { diskUsagePercent, totalGB: diskInfo.total / 1024 / 1024 / 1024, usedGB: diskInfo.used / 1024 / 1024 / 1024, freeGB: diskInfo.free / 1024 / 1024 / 1024, }, details: { message: `Disk usage: ${diskUsagePercent.toFixed(1)}% (${(diskInfo.free / 1024 / 1024 / 1024).toFixed(1)}GB free)`, diagnosis: status !== 'healthy' ? `High disk usage detected: ${diskUsagePercent.toFixed(1)}% (threshold: ${this.config.thresholds.disk}%)` : 'Disk usage within normal parameters', recommendations: status !== 'healthy' ? [ 'Clean up temporary files and logs', 'Archive old session data', 'Monitor disk space regularly', 'Consider log rotation and cleanup policies', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { disk_usage: { checkStatus: diskUsagePercent > 95 ? 'fail' : diskUsagePercent > this.config.thresholds.disk ? 'fail' : diskUsagePercent > 80 ? 'warn' : 'pass', value: diskUsagePercent, message: `Disk usage: ${diskUsagePercent.toFixed(1)}%`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Disk health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve disk metrics', recommendations: [ 'Check disk monitoring tools', 'Verify filesystem health', ], recoverable: true, }, duration: Date.now() - startTime, checks: { disk: { checkStatus: 'fail', message: `Disk health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform network health check */ private async performNetworkHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `network-${Date.now()}`; try { // Test network connectivity with multiple targets const testTargets = ['8.8.8.8', '1.1.1.1', 'google.com']; const networkTests = await Promise.allSettled( testTargets.map((target) => this.pingHost(target)) ); const successfulTests = networkTests.filter( (result) => result.status === 'fulfilled' ); const avgLatency = successfulTests.length > 0 ? (successfulTests as PromiseFulfilledResult<number>[]).reduce( (sum, result) => sum + result.value, 0 ) / successfulTests.length : Infinity; const connectivityPercent = (successfulTests.length / testTargets.length) * 100; const status: HealthCheckResult['status'] = connectivityPercent === 0 ? 'critical' : connectivityPercent < 50 || avgLatency > this.config.thresholds.networkLatency ? 'unhealthy' : connectivityPercent < 100 || avgLatency > 500 ? 'warning' : 'healthy'; return { checkId, checkType: 'network', timestamp: new Date(), status, metrics: { connectivityPercent, avgLatency: avgLatency === Infinity ? -1 : avgLatency, successfulTests: successfulTests.length, totalTests: testTargets.length, }, details: { message: `Network connectivity: ${connectivityPercent.toFixed(0)}% (${successfulTests.length}/${testTargets.length} targets), avg latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms`, diagnosis: status !== 'healthy' ? `Network connectivity issues detected. Success rate: ${connectivityPercent.toFixed(0)}%, Average latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms` : 'Network connectivity within normal parameters', recommendations: status !== 'healthy' ? [ 'Check network connectivity and firewall settings', 'Verify DNS resolution', 'Test with different network targets', 'Monitor network interface statistics', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { connectivity: { checkStatus: connectivityPercent === 0 ? 'fail' : connectivityPercent < 50 ? 'fail' : connectivityPercent < 100 ? 'warn' : 'pass', value: connectivityPercent, message: `Network connectivity: ${connectivityPercent.toFixed(0)}%`, duration: Date.now() - startTime, }, latency: { checkStatus: avgLatency === Infinity ? 'fail' : avgLatency > this.config.thresholds.networkLatency ? 'fail' : avgLatency > 500 ? 'warn' : 'pass', value: avgLatency === Infinity ? -1 : avgLatency, message: `Average latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'network', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Network health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to perform network connectivity tests', recommendations: [ 'Check network configuration', 'Verify internet connectivity', ], recoverable: true, }, duration: Date.now() - startTime, checks: { network: { checkStatus: 'fail', message: `Network health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Check health of a specific session */ async checkSessionHealth( sessionState: SessionState ): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `session-${sessionState.id}-${Date.now()}`; try { const now = Date.now(); const timeSinceLastActivity = now - sessionState.lastActivity.getTime(); const sessionAge = now - sessionState.createdAt.getTime(); // Calculate session health score let healthScore = sessionState.healthScore || 100; // Deduct points based on various factors if ( sessionState.status === 'failed' || sessionState.status === 'recovering' ) { healthScore -= 30; } if (timeSinceLastActivity > 300000) { // 5 minutes of inactivity healthScore -= 20; } if (sessionState.recoveryAttempts > 0) { healthScore -= sessionState.recoveryAttempts * 10; } const status: HealthCheckResult['status'] = healthScore < 20 || sessionState.status === 'failed' ? 'critical' : healthScore < 50 || sessionState.status === 'recovering' ? 'unhealthy' : healthScore < 80 || timeSinceLastActivity > 180000 ? 'warning' : 'healthy'; return { checkId, checkType: 'session', sessionId: sessionState.id, timestamp: new Date(), status, metrics: { healthScore, timeSinceLastActivityMs: timeSinceLastActivity, sessionAgeMs: sessionAge, recoveryAttempts: sessionState.recoveryAttempts, }, details: { message: `Session health: Score ${healthScore.toFixed(0)}/100, Status: ${sessionState.status}`, diagnosis: status !== 'healthy' ? `Session health degraded. Score: ${healthScore.toFixed(0)}/100, Status: ${sessionState.status}, Last activity: ${Math.floor(timeSinceLastActivity / 1000)}s ago` : 'Session operating normally', recommendations: status !== 'healthy' ? [ 'Monitor session activity patterns', 'Check for underlying connection issues', 'Consider session restart if persistently unhealthy', 'Review session configuration and resource allocation', ] : [], recoverable: sessionState.status !== 'failed' || sessionState.recoveryAttempts < sessionState.maxRecoveryAttempts, }, duration: Date.now() - startTime, checks: { session_status: { checkStatus: sessionState.status === 'failed' ? 'fail' : sessionState.status === 'recovering' ? 'warn' : 'pass', value: sessionState.status, message: `Session status: ${sessionState.status}`, duration: Date.now() - startTime, }, health_score: { checkStatus: healthScore < 20 ? 'fail' : healthScore < 50 ? 'fail' : healthScore < 80 ? 'warn' : 'pass', value: healthScore, message: `Health score: ${healthScore.toFixed(0)}/100`, duration: Date.now() - startTime, }, activity: { checkStatus: timeSinceLastActivity > 300000 ? 'warn' : 'pass', value: timeSinceLastActivity, message: `Last activity: ${Math.floor(timeSinceLastActivity / 1000)}s ago`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'session', sessionId: sessionState.id, timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Session health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to assess session health', recommendations: [ 'Check session state consistency', 'Review session monitoring', ], recoverable: true, }, duration: Date.now() - startTime, checks: { session: { checkStatus: 'fail', message: `Session health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Register SSH session for health monitoring */ registerSSHSession( sessionId: string, connectionInfo: { hostname: string; port: number; username: string; authenticationMethod?: string; } ): void { const sshHealth: SSHSessionHealth = { sessionId, connectionId: `${connectionInfo.hostname}:${connectionInfo.port}`, hostname: connectionInfo.hostname, port: connectionInfo.port, username: connectionInfo.username, connectedAt: new Date(), lastActivity: new Date(), healthScore: 100, connectionLatencyHistory: [], throughputHistory: [], errorHistory: [], keepAliveStatus: 'active', connectionStability: 1.0, predictiveFailureScore: 0, authenticationMethod: connectionInfo.authenticationMethod || 'unknown', encryptionInfo: { algorithm: 'unknown', keySize: 0, compressionEnabled: false, }, serverInfo: { version: 'unknown', protocol: 'unknown', supportedAlgorithms: [], }, performanceMetrics: { averageLatency: 0, maxLatency: 0, minLatency: 0, throughputBytesPerSecond: 0, commandExecutionTime: 0, dataTransferTime: 0, }, }; this.sshSessions.set(sessionId, sshHealth); this.sshHealthHistory.set(sessionId, []); // Initialize prediction model const predictionModel: SSHPredictionModel = { sessionId, connectionStabilityTrend: [1.0], latencyTrend: [], throughputTrend: [], errorRateTrend: [0], keepAliveSuccessRate: 1.0, lastUpdated: new Date(), confidenceScore: 0.5, riskFactors: { networkInstability: 0, performanceDegradation: 0, authenticationIssues: 0, serverOverload: 0, connectionAging: 0, }, }; this.sshPredictionModels.set(sessionId, predictionModel); this.logger.info( `SSH session ${sessionId} registered for health monitoring: ${connectionInfo.hostname}:${connectionInfo.port}` ); this.emit('ssh-session-registered', { sessionId, connectionInfo, sshHealth, }); } /** * Unregister SSH session from health monitoring */ unregisterSSHSession(sessionId: string): void { this.sshSessions.delete(sessionId); this.sshHealthHistory.delete(sessionId); this.sshPredictionModels.delete(sessionId); this.logger.info( `SSH session ${sessionId} unregistered from health monitoring` ); this.emit('ssh-session-unregistered', { sessionId }); } /** * Update SSH session activity */ updateSSHSessionActivity( sessionId: string, activityData: { latency?: number; throughput?: number; commandExecutionTime?: number; dataTransferSize?: number; error?: { message: string; recoverable: boolean }; } ): void { const sshHealth = this.sshSessions.get(sessionId); if (!sshHealth) { return; } sshHealth.lastActivity = new Date(); if (activityData.latency !== undefined) { sshHealth.connectionLatencyHistory.push(activityData.latency); if (sshHealth.connectionLatencyHistory.length > 100) { sshHealth.connectionLatencyHistory.shift(); } // Update performance metrics const latencies = sshHealth.connectionLatencyHistory; sshHealth.performanceMetrics.averageLatency = latencies.reduce((a, b) => a + b, 0) / latencies.length; sshHealth.performanceMetrics.maxLatency = Math.max(...latencies); sshHealth.performanceMetrics.minLatency = Math.min(...latencies); } if (activityData.throughput !== undefined) { sshHealth.throughputHistory.push(activityData.throughput); if (sshHealth.throughputHistory.length > 100) { sshHealth.throughputHistory.shift(); } sshHealth.performanceMetrics.throughputBytesPerSecond = sshHealth.throughputHistory.reduce((a, b) => a + b, 0) / sshHealth.throughputHistory.length; } if (activityData.commandExecutionTime !== undefined) { sshHealth.performanceMetrics.commandExecutionTime = activityData.commandExecutionTime; } if (activityData.error) { sshHealth.errorHistory.push({ timestamp: new Date(), error: activityData.error.message, recoverable: activityData.error.recoverable, }); if (sshHealth.errorHistory.length > 50) { sshHealth.errorHistory.shift(); } } // Update health score and prediction model this.updateSSHHealthScore(sessionId); this.updateSSHPredictionModel(sessionId); } /** * Perform SSH-specific health checks */ private async performSSHHealthChecks(): Promise<void> { if (this.sshSessions.size === 0) { return; } const startTime = Date.now(); const sshResults: SSHHealthResult[] = []; try { const sessionEntries = Array.from(this.sshSessions.entries()); for (const [sessionId, sshHealth] of sessionEntries) { const healthResult = await this.checkSSHSessionHealth( sessionId, sshHealth ); sshResults.push(healthResult); // Store in history const history = this.sshHealthHistory.get(sessionId) || []; history.push(healthResult); // Keep only last 100 checks if (history.length > 100) { history.shift(); } this.sshHealthHistory.set(sessionId, history); // Check for critical issues requiring immediate action if ( healthResult.status === 'critical' || (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.8 ) { await this.handleCriticalSSHIssue(sessionId, healthResult); } else if ( healthResult.status === 'unhealthy' || (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.6 ) { this.emit('ssh-session-degraded', { sessionId, healthResult }); } // Check for proactive reconnection needs if ( this.config.recovery.sshProactiveReconnect && (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.7 ) { this.emit('ssh-proactive-reconnect-needed', { sessionId, healthResult, reason: 'predictive-failure-risk', urgency: 'medium', }); } } this.emit('ssh-health-checks-completed', { results: sshResults, duration: Date.now() - startTime, timestamp: new Date(), }); } catch (error) { this.logger.error('Error during SSH health checks:', error); this.emit('ssh-health-check-error', { error: error instanceof Error ? error.message : String(error), timestamp: new Date(), }); } const duration = Date.now() - startTime; this.logger.debug( `SSH health checks completed in ${duration}ms for ${this.sshSessions.size} sessions` ); } /** * Check health of a specific SSH session */ private async checkSSHSessionHealth( sessionId: string, sshHealth: SSHSessionHealth ): Promise<SSHHealthResult> { const startTime = Date.now(); const checkId = `ssh-${sessionId}-${Date.now()}`; try { const now = Date.now(); const timeSinceLastActivity = now - sshHealth.lastActivity.getTime(); const connectionAge = now - sshHealth.connectedAt.getTime(); // Perform actual SSH connection test const connectionTest = await this.testSSHConnection(sessionId, sshHealth); // Calculate predictive scores const predictiveScores = this.calculateSSHPredictiveScores( sessionId, sshHealth ); // Update health score based on test results and predictions let healthScore = sshHealth.healthScore; if (!connectionTest.success) { healthScore = Math.max(0, healthScore - 30); } if (timeSinceLastActivity > 300000) { // 5 minutes of inactivity healthScore = Math.max(0, healthScore - 10); } if (predictiveScores.overallFailureRisk > 0.5) { healthScore = Math.max( 0, healthScore - predictiveScores.overallFailureRisk * 20 ); } const status: SSHHealthResult['status'] = healthScore < 20 || predictiveScores.overallFailureRisk > 0.9 ? 'critical' : healthScore < 50 || predictiveScores.overallFailureRisk > 0.7 ? 'unhealthy' : healthScore < 80 || predictiveScores.overallFailureRisk > 0.5 ? 'warning' : 'healthy'; // Update SSH health object sshHealth.healthScore = healthScore; sshHealth.predictiveFailureScore = predictiveScores.overallFailureRisk; const sshMetrics: Record<string, number> = { connectionLatency: connectionTest.latency, lastKeepAliveTimestamp: Date.now(), consecutiveFailures: connectionTest.success ? 0 : sshHealth.errorHistory.filter( (e) => Date.now() - e.timestamp.getTime() < 300000 ).length, connectionUptime: connectionAge, throughput: sshHealth.performanceMetrics.throughputBytesPerSecond, errorRate: this.calculateSSHErrorRate(sshHealth), compressionEnabled: sshHealth.encryptionInfo.compressionEnabled ? 1 : 0, }; return { checkId, checkType: 'session', sessionId, timestamp: new Date(), status, metrics: { healthScore, connectionLatency: connectionTest.latency, timeSinceLastActivityMs: timeSinceLastActivity, connectionAgeMs: connectionAge, ...sshMetrics, }, details: { message: `SSH session health: Score ${healthScore.toFixed(0)}/100, Risk ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%`, diagnosis: status !== 'healthy' ? `SSH session health degraded. Score: ${healthScore.toFixed(0)}/100, Failure risk: ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%` : 'SSH session operating normally', recommendations: this.generateSSHRecommendations( status, predictiveScores, sshHealth ), recoverable: status !== 'critical' || connectionTest.success, }, duration: Date.now() - startTime, checks: { connection_test: { checkStatus: connectionTest.success ? 'pass' : 'fail', value: connectionTest.success, message: `Connection test: ${connectionTest.success ? 'SUCCESS' : 'FAILED'}`, duration: connectionTest.duration || 0, }, health_score: { checkStatus: healthScore < 50 ? 'fail' : healthScore < 80 ? 'warn' : 'pass', value: healthScore, message: `Health score: ${healthScore.toFixed(0)}/100`, duration: Date.now() - startTime, }, failure_risk: { checkStatus: predictiveScores.overallFailureRisk > 0.7 ? 'fail' : predictiveScores.overallFailureRisk > 0.5 ? 'warn' : 'pass', value: predictiveScores.overallFailureRisk, message: `Failure risk: ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, sshMetrics: sshMetrics as any, connectionId: sshHealth.connectionId, hostInfo: { hostname: sshHealth.hostname, port: sshHealth.port, username: sshHealth.username, }, predictiveScores, }; } catch (error) { this.logger.error( `SSH health check failed for session ${sessionId}:`, error ); return { checkId, checkType: 'session', sessionId, timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `SSH health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to assess SSH session health', recommendations: [ 'Check SSH connection', 'Verify network connectivity', 'Consider session restart', ], recoverable: true, }, duration: Date.now() - startTime, checks: { ssh_health: { checkStatus: 'fail', message: `SSH health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, connectionId: sshHealth.connectionId, hostInfo: { hostname: sshHealth.hostname, port: sshHealth.port, username: sshHealth.username, }, }; } } /** * Test SSH connection health */ private async testSSHConnection( sessionId: string, sshHealth: SSHSessionHealth ): Promise<{ success: boolean; latency: number; duration?: number; error?: string; }> { const startTime = Date.now(); return new Promise((resolve) => { // Emit SSH connection test request this.emit('ssh-connection-test-request', { sessionId, timeout: this.config.alertThresholds.ssh_connection_timeout, callback: (result: { success: boolean; latency?: number; error?: string; }) => { resolve({ success: result.success, latency: result.latency || Date.now() - startTime, duration: Date.now() - startTime, error: result.error, }); }, }); // Timeout mechanism setTimeout(() => { resolve({ success: false, latency: Date.now() - startTime, duration: Date.now() - startTime, error: `SSH connection test timeout after ${this.config.alertThresholds.ssh_connection_timeout}ms`, }); }, this.config.alertThresholds.ssh_connection_timeout); }); } /** * Calculate SSH predictive scores */ private calculateSSHPredictiveScores( sessionId: string, sshHealth: SSHSessionHealth ): { connectionStability: number; performanceDegradation: number; authenticationRisk: number; networkQuality: number; overallFailureRisk: number; } { const predictionModel = this.sshPredictionModels.get(sessionId); if (!predictionModel) { return { connectionStability: 1.0, performanceDegradation: 0, authenticationRisk: 0, networkQuality: 1.0, overallFailureRisk: 0, }; } // Calculate connection stability based on historical data let connectionStability = 1.0; if (predictionModel.connectionStabilityTrend.length > 5) { const recentStability = predictionModel.connectionStabilityTrend.slice(-5); const avgStability = recentStability.reduce((a, b) => a + b, 0) / recentStability.length; connectionStability = Math.max(0, avgStability); } // Calculate performance degradation let performanceDegradation = 0; if (predictionModel.latencyTrend.length > 10) { const recent = predictionModel.latencyTrend.slice(-5); const older = predictionModel.latencyTrend.slice(-10, -5); if (older.length > 0) { const recentAvg = recent.reduce((a, b) => a + b, 0) / recent.length; const olderAvg = older.reduce((a, b) => a + b, 0) / older.length; if (recentAvg > olderAvg * 1.5) { performanceDegradation = Math.min( 1.0, (recentAvg - olderAvg) / olderAvg ); } } } // Calculate authentication risk based on error history let authenticationRisk = 0; const authErrors = sshHealth.errorHistory.filter( (e) => e.error.toLowerCase().includes('auth') || e.error.toLowerCase().includes('permission') || e.error.toLowerCase().includes('denied') ); if (authErrors.length > 0) { authenticationRisk = Math.min(1.0, authErrors.length / 10); } // Calculate network quality based on latency and error patterns let networkQuality = 1.0; if (sshHealth.connectionLatencyHistory.length > 5) { const avgLatency = sshHealth.performanceMetrics.averageLatency; const maxLatency = sshHealth.performanceMetrics.maxLatency; if (avgLatency > this.config.thresholds.sshConnectionLatency) { networkQuality *= 0.7; } if (maxLatency > this.config.thresholds.sshConnectionLatency * 2) { networkQuality *= 0.5; } // Factor in latency variance const variance = sshHealth.connectionLatencyHistory .map((l) => Math.pow(l - avgLatency, 2)) .reduce((a, b) => a + b, 0) / sshHealth.connectionLatencyHistory.length; if (variance > avgLatency) { networkQuality *= 0.8; // High variance indicates unstable network } } // Calculate overall failure risk const weights = { connectionStability: 0.3, performanceDegradation: 0.25, authenticationRisk: 0.2, networkQuality: 0.25, }; const overallFailureRisk = (1 - connectionStability) * weights.connectionStability + performanceDegradation * weights.performanceDegradation + authenticationRisk * weights.authenticationRisk + (1 - networkQuality) * weights.networkQuality; return { connectionStability, performanceDegradation, authenticationRisk, networkQuality, overallFailureRisk: Math.min(1.0, overallFailureRisk), }; } /** * Update SSH health score */ private updateSSHHealthScore(sessionId: string): void { const sshHealth = this.sshSessions.get(sessionId); if (!sshHealth) { return; } let healthScore = 100; // Factor in recent errors const recentErrors = sshHealth.errorHistory.filter( (e) => Date.now() - e.timestamp.getTime() < 300000 // Last 5 minutes ); healthScore -= recentErrors.length * 5; // Factor in connection stability healthScore *= sshHealth.connectionStability; // Factor in performance if ( sshHealth.performanceMetrics.averageLatency > this.config.thresholds.sshConnectionLatency ) { const latencyPenalty = Math.min( 20, (sshHealth.performanceMetrics.averageLatency / this.config.thresholds.sshConnectionLatency - 1) * 20 ); healthScore -= latencyPenalty; } // Factor in keep-alive status if (sshHealth.keepAliveStatus === 'failed') { healthScore -= 30; } else if (sshHealth.keepAliveStatus === 'degraded') { healthScore -= 15; } sshHealth.healthScore = Math.max(0, Math.min(100, healthScore)); } /** * Update SSH prediction model */ private updateSSHPredictionModel(sessionId: string): void { const sshHealth = this.sshSessions.get(sessionId); const predictionModel = this.sshPredictionModels.get(sessionId); if (!sshHealth || !predictionModel) { return; } // Update trends predictionModel.connectionStabilityTrend.push( sshHealth.connectionStability ); if (predictionModel.connectionStabilityTrend.length > 50) { predictionModel.connectionStabilityTrend.shift(); } if (sshHealth.connectionLatencyHistory.length > 0) { const latestLatency = sshHealth.connectionLatencyHistory[ sshHealth.connectionLatencyHistory.length - 1 ]; predictionModel.latencyTrend.push(latestLatency); if (predictionModel.latencyTrend.length > 50) { predictionModel.latencyTrend.shift(); } } if (sshHealth.throughputHistory.length > 0) { const latestThroughput = sshHealth.throughputHistory[sshHealth.throughputHistory.length - 1]; predictionModel.throughputTrend.push(latestThroughput); if (predictionModel.throughputTrend.length > 50) { predictionModel.throughputTrend.shift(); } } // Update error rate const errorRate = this.calculateSSHErrorRate(sshHealth); predictionModel.errorRateTrend.push(errorRate); if (predictionModel.errorRateTrend.length > 50) { predictionModel.errorRateTrend.shift(); } // Update risk factors predictionModel.riskFactors.networkInstability = this.calculateNetworkInstability(sshHealth); predictionModel.riskFactors.performanceDegradation = this.calculatePerformanceDegradation(predictionModel); predictionModel.riskFactors.authenticationIssues = this.calculateAuthenticationRisk(sshHealth); predictionModel.riskFactors.connectionAging = this.calculateConnectionAging(sshHealth); // Update confidence score based on data availability const dataPoints = predictionModel.connectionStabilityTrend.length + predictionModel.latencyTrend.length + predictionModel.throughputTrend.length; predictionModel.confidenceScore = Math.min(1.0, dataPoints / 100); predictionModel.lastUpdated = new Date(); } /** * Calculate SSH error rate */ private calculateSSHErrorRate(sshHealth: SSHSessionHealth): number { const recentErrors = sshHealth.errorHistory.filter( (e) => Date.now() - e.timestamp.getTime() < 3600000 // Last hour ); const connectionAge = Date.now() - sshHealth.connectedAt.getTime(); const hoursConnected = Math.max(1, connectionAge / 3600000); return recentErrors.length / hoursConnected; } /** * Calculate network instability score */ private calculateNetworkInstability(sshHealth: SSHSessionHealth): number { if (sshHealth.connectionLatencyHistory.length < 10) { return 0; } const latencies = sshHealth.connectionLatencyHistory; const avg = latencies.reduce((a, b) => a + b, 0) / latencies.length; const variance = latencies.map((l) => Math.pow(l - avg, 2)).reduce((a, b) => a + b, 0) / latencies.length; const coefficient = Math.sqrt(variance) / avg; return Math.min(1.0, coefficient); } /** * Calculate performance degradation */ private calculatePerformanceDegradation( predictionModel: SSHPredictionModel ): number { if (predictionModel.latencyTrend.length < 20) { return 0; } const recent = predictionModel.latencyTrend.slice(-10); const older = predictionModel.latencyTrend.slice(-20, -10); const recentAvg = recent.reduce((a, b) => a + b, 0) / recent.length; const olderAvg = older.reduce((a, b) => a + b, 0) / older.length; if (recentAvg > olderAvg) { return Math.min(1.0, (recentAvg - olderAvg) / olderAvg); } return 0; } /** * Calculate authentication risk */ private calculateAuthenticationRisk(sshHealth: SSHSessionHealth): number { const authErrors = sshHealth.errorHistory.filter( (e) => e.error.toLowerCase().includes('auth') || e.error.toLowerCase().includes('permission') || e.error.toLowerCase().includes('denied') ); return Math.min(1.0, authErrors.length / 5); } /** * Calculate connection aging factor */ private calculateConnectionAging(sshHealth: SSHSessionHealth): number { const connectionAge = Date.now() - sshHealth.connectedAt.getTime(); const hoursConnected = connectionAge / 3600000; // Connections become more fragile after 24 hours if (hoursConnected > 24) { return Math.min(1.0, (hoursConnected - 24) / 48); } return 0; } /** * Generate SSH-specific recommendations */ private generateSSHRecommendations( status: HealthCheckResult['status'], predictiveScores: any, sshHealth: SSHSessionHealth ): string[] { const recommendations: string[] = []; if (status === 'critical' || predictiveScores.overallFailureRisk > 0.8) { recommendations.push('URGENT: Consider immediate SSH session restart'); recommendations.push( 'Verify network connectivity and SSH server availability' ); } if (predictiveScores.networkQuality < 0.5) { recommendations.push( 'Network quality degraded - check network stability' ); recommendations.push( 'Consider using compression or connection multiplexing' ); } if (predictiveScores.performanceDegradation > 0.3) { recommendations.push( 'Performance degradation detected - investigate server load' ); recommendations.push('Monitor SSH server resource utilization'); } if (predictiveScores.authenticationRisk > 0.2) { recommendations.push( 'Authentication issues detected - verify credentials' ); recommendations.push( 'Check SSH key validity and server authentication settings' ); } if ( sshHealth.performanceMetrics.averageLatency > this.config.thresholds.sshConnectionLatency ) { recommendations.push( `High latency detected (${sshHealth.performanceMetrics.averageLatency.toFixed(0)}ms)` ); recommendations.push( 'Consider using SSH connection multiplexing or keep-alive' ); } if (sshHealth.errorHistory.length > 10) { recommendations.push( 'Frequent errors detected - review connection stability' ); recommendations.push( 'Enable SSH debug logging for detailed troubleshooting' ); } return recommendations; } /** * Handle critical SSH issues */ private async handleCriticalSSHIssue( sessionId: string, healthResult: SSHHealthResult ): Promise<void> { this.logger.warn(`Handling critical SSH issue for session ${sessionId}`); // Emit critical SSH issue event this.emit('critical-ssh-issue', { sessionId, healthResult, timestamp: new Date(), autoRecoveryAttempted: false, }); // Attempt automatic recovery if enabled if (this.config.recovery.enabled && healthResult.details.recoverable) { await this.attemptSSHAutoRecovery(sessionId, healthResult); } } /** * Attempt SSH automatic recovery */ private async attemptSSHAutoRecovery( sessionId: string, healthResult: SSHHealthResult ): Promise<void> { const recoveryKey = `ssh-${sessionId}`; let attempts = this.stats.consecutiveFailures.get(recoveryKey) || 0; if (attempts >= this.config.recovery.maxAttempts) { this.logger.warn(`Max SSH recovery attempts reached for ${sessionId}`); return; } attempts++; this.stats.consecutiveFailures.set(recoveryKey, attempts); const delay = this.config.recovery.baseDelay * Math.pow(this.config.recovery.backoffMultiplier, attempts - 1); this.logger.info( `Attempting SSH auto-recovery for ${sessionId} (attempt ${attempts}/${this.config.recovery.maxAttempts}) in ${delay}ms` ); setTimeout(async () => { try { // Emit SSH recovery attempt event this.emit('ssh-auto-recovery-attempt', { sessionId, healthResult, attempt: attempts, maxAttempts: this.config.recovery.maxAttempts, timestamp: new Date(), }); // Request SSH session recovery let recoverySuccessful = false; // Emit SSH recovery request this.emit('ssh-recovery-request', { sessionId, healthResult, strategy: 'proactive-reconnect', timestamp: new Date(), callback: (success: boolean) => { recoverySuccessful = success; }, }); // Wait a moment for recovery to be attempted await new Promise((resolve) => setTimeout(resolve, 2000)); if (recoverySuccessful) { this.stats.consecutiveFailures.delete(recoveryKey); this.emit('ssh-auto-recovery-success', { sessionId, attempt: attempts, timestamp: new Date(), }); this.logger.info(`SSH auto-recovery successful for ${sessionId}`); } else { this.emit('ssh-auto-recovery-failed', { sessionId, attempt: attempts, timestamp: new Date(), }); this.logger.warn( `SSH auto-recovery failed for ${sessionId} (attempt ${attempts})` ); } } catch (error) { this.logger.error(`SSH auto-recovery error for ${sessionId}:`, error); this.emit('ssh-auto-recovery-error', { sessionId, attempt: attempts, error: error instanceof Error ? error.message : String(error), timestamp: new Date(), }); } }, delay); } /** * Generate comprehensive health report */ private async generateHealthReport( checks: HealthCheckResult[] ): Promise<HealthReport> { const healthyCount = checks.filter((c) => c.status === 'healthy').length; const warningCount = checks.filter((c) => c.status === 'warning').length; const unhealthyCount = checks.filter( (c) => c.status === 'unhealthy' ).length; const criticalCount = checks.filter((c) => c.status === 'critical').length; // Calculate overall health status const overall: HealthReport['overall'] = criticalCount > 0 ? 'critical' : unhealthyCount > 0 ? 'unhealthy' : warningCount > 0 ? 'warning' : 'healthy'; // Calculate health score (0-100) const healthScore = Math.max( 0, 100 - warningCount * 10 - unhealthyCount * 25 - criticalCount * 50 ); this.healthScoreHistory.push(healthScore); // Keep only last 100 scores for trend analysis if (this.healthScoreHistory.length > 100) { this.healthScoreHistory.shift(); } // Calculate trend const trend = this.calculateHealthTrend(); // Calculate average response time const avgResponseTime = checks.length > 0 ? checks.reduce((sum, check) => sum + check.duration, 0) / checks.length : 0; // Generate action items const actionItems = this.generateActionItems(checks); return { overall, timestamp: new Date(), checks, metrics: { totalChecks: checks.length, healthyChecks: healthyCount, warningChecks: warningCount, unhealthyChecks: unhealthyCount, criticalChecks: criticalCount, averageResponseTime: avgResponseTime, }, trends: { healthScore, healthScoreTrend: trend, predictionNextHour: this.config.monitoring.enablePredictiveAnalysis ? this.predictHealthStatus() : undefined, }, actionItems, sshHealthSummary: this.generateSSHHealthSummary(), }; } /** * Generate SSH health summary */ private generateSSHHealthSummary(): { totalSSHSessions: number; healthySSHSessions: number; unhealthySSHSessions: number; averageSSHLatency: number; predictedFailures: number; proactiveReconnections: number; } { const sshSessions = Array.from(this.sshSessions.values()); const healthySessions = sshSessions.filter( (s) => s.healthScore >= 80 ).length; const unhealthySessions = sshSessions.filter( (s) => s.healthScore < 50 ).length; const avgLatency = sshSessions.length > 0 ? sshSessions.reduce( (sum, s) => sum + s.performanceMetrics.averageLatency, 0 ) / sshSessions.length : 0; const predictedFailures = sshSessions.filter( (s) => s.predictiveFailureScore > 0.7 ).length; // Count proactive reconnections from stats (would be tracked elsewhere) const proactiveReconnections = 0; // This would be tracked in a separate counter return { totalSSHSessions: sshSessions.length, healthySSHSessions: healthySessions, unhealthySSHSessions: unhealthySessions, averageSSHLatency: avgLatency, predictedFailures, proactiveReconnections, }; } /** * Calculate health trend based on historical data */ private calculateHealthTrend(): 'improving' | 'stable' | 'degrading' { if (this.healthScoreHistory.length < 5) { return 'stable'; } const recent = this.healthScoreHistory.slice(-5); const older = this.healthScoreHistory.slice(-10, -5); if (older.length === 0) return 'stable'; const recentAvg = recent.reduce((a, b) => a + b, 0) / recent.length; const olderAvg = older.reduce((a, b) => a + b, 0) / older.length; const difference = recentAvg - olderAvg; if (difference > 5) return 'improving'; if (difference < -5) return 'degrading'; return 'stable'; } /** * Predict health status for the next hour using simple trend analysis */ private predictHealthStatus(): | 'healthy' | 'warning' | 'unhealthy' | 'critical' { if (this.healthScoreHistory.length < 10) { return 'healthy'; } // Simple linear regression on recent health scores const recentScores = this.healthScoreHistory.slice(-10); const x = Array.from({ length: recentScores.length }, (_, i) => i); const y = recentScores; const n = recentScores.length; const sumX = x.reduce((a, b) => a + b, 0); const sumY = y.reduce((a, b) => a + b, 0); const sumXY = x.reduce((acc, xi, i) => acc + xi * y[i], 0); const sumXX = x.reduce((acc, xi) => acc + xi * xi, 0); const slope = (n * sumXY - sumX * sumY) / (n * sumXX - sumX * sumX); const predictedScore = recentScores[recentScores.length - 1] + slope * 12; // 12 intervals ahead (assuming 5min intervals) if (predictedScore < 30) return 'critical'; if (predictedScore < 60) return 'unhealthy'; if (predictedScore < 80) return 'warning'; return 'healthy'; } /** * Generate actionable recommendations based on health checks */ private generateActionItems(checks: HealthCheckResult[]): string[] { const actionItems: string[] = []; const criticalChecks = checks.filter((c) => c.status === 'critical'); const unhealthyChecks = checks.filter((c) => c.status === 'unhealthy'); if (criticalChecks.length > 0) { actionItems.push( `CRITICAL: ${criticalChecks.length} critical issues require immediate attention` ); criticalChecks.forEach((check) => { if (check.details.recommendations) { actionItems.push( ...check.details.recommendations.map((r) => `- ${r}`) ); } }); } if (unhealthyChecks.length > 0) { actionItems.push( `WARNING: ${unhealthyChecks.length} unhealthy components need attention` ); } // Add general maintenance recommendations if (this.healthScoreHistory.length > 0) { const avgScore = this.healthScoreHistory.reduce((a, b) => a + b, 0) / this.healthScoreHistory.length; if (avgScore < 80) { actionItems.push('Consider system optimization and maintenance'); } } return actionItems; } /** * Handle critical health issues that require immediate action */ private async handleCriticalIssues(report: HealthReport): Promise<void> { const criticalChecks = report.checks.filter((c) => c.status === 'critical'); if (criticalChecks.length === 0) return; this.logger.warn( `Handling ${criticalChecks.length} critical health issues` ); for (const check of criticalChecks) { // Emit critical health event this.emit('critical-health-issue', { check, timestamp: new Date(), autoRecoveryAttempted: false, }); // Attempt automatic recovery if enabled and issue is recoverable if (this.config.recovery.enabled && check.details.recoverable) { await this.attemptAutoRecovery(check); } } // If overall health is critical, emit system-wide alert if (report.overall === 'critical') { this.emit('system-critical', { report, timestamp: new Date(), actionRequired: true, }); } } /** * Attempt automatic recovery for critical issues */ private async attemptAutoRecovery(check: HealthCheckResult): Promise<void> { const recoveryKey = `${check.checkType}-${check.sessionId || 'system'}`; let attempts = this.stats.consecutiveFailures.get(recoveryKey) || 0; if (attempts >= this.config.recovery.maxAttempts) { this.logger.warn(`Max recovery attempts reached for ${recoveryKey}`); return; } attempts++; this.stats.consecutiveFailures.set(recoveryKey, attempts); const delay = this.config.recovery.baseDelay * Math.pow(this.config.recovery.backoffMultiplier, attempts - 1); this.logger.info( `Attempting auto-recovery for ${recoveryKey} (attempt ${attempts}/${this.config.recovery.maxAttempts}) in ${delay}ms` ); setTimeout(async () => { try { // Emit recovery attempt event this.emit('auto-recovery-attempt', { check, attempt: attempts, maxAttempts: this.config.recovery.maxAttempts, timestamp: new Date(), }); // Recovery logic based on check type let recoverySuccessful = false; switch (check.checkType) { case 'system': recoverySuccessful = await this.recoverSystemIssue(check); break; case 'network': recoverySuccessful = await this.recoverNetworkIssue(check); break; case 'session': recoverySuccessful = await this.recoverSessionIssue(check); break; default: this.logger.warn( `No recovery strategy for check type: ${check.checkType}` ); } if (recoverySuccessful) { this.stats.consecutiveFailures.delete(recoveryKey); this.emit('auto-recovery-success', { check, attempt: attempts, timestamp: new Date(), }); this.logger.info(`Auto-recovery successful for ${recoveryKey}`); } else { this.emit('auto-recovery-failed', { check, attempt: attempts, timestamp: new Date(), }); this.logger.warn( `Auto-recovery failed for ${recoveryKey} (attempt ${attempts})` ); } } catch (error) { this.logger.error(`Auto-recovery error for ${recoveryKey}:`, error); this.emit('auto-recovery-error', { check, attempt: attempts, error: error instanceof Error ? error.message : String(error), timestamp: new Date(), }); } }, delay); } /** * Attempt to recover from system issues */ private async recoverSystemIssue(check: HealthCheckResult): Promise<boolean> { try { // Force garbage collection if available if (global.gc) { global.gc(); this.logger.info('Forced garbage collection for memory recovery'); return true; } // Clear caches, clean up temporary data, etc. // This is a placeholder for system recovery actions this.logger.info('Attempting system recovery actions'); return true; } catch (error) { this.logger.error('System recovery failed:', error); return false; } } /** * Attempt to recover from network issues */ private async recoverNetworkIssue( check: HealthCheckResult ): Promise<boolean> { try { // Flush DNS cache, reset network connections, etc. // This is a placeholder for network recovery actions this.logger.info('Attempting network recovery actions'); return true; } catch (error) { this.logger.error('Network recovery failed:', error); return false; } } /** * Attempt to recover from session issues */ private async recoverSessionIssue( check: HealthCheckResult ): Promise<boolean> { try { if (!check.sessionId) return false; // Emit session recovery request this.emit('session-recovery-request', { sessionId: check.sessionId, check, timestamp: new Date(), }); this.logger.info(`Requested session recovery for ${check.sessionId}`); return true; } catch (error) { this.logger.error('Session recovery failed:', error); return false; } } /** * Update health statistics */ private updateHealthStatistics(checks: HealthCheckResult[]): void { this.stats.totalChecks += checks.length; const successfulChecks = checks.filter( (c) => c.status === 'healthy' || c.status === 'warning' ).length; this.stats.successfulChecks += successfulChecks; this.stats.failedChecks += checks.length - successfulChecks; // Update average response time const totalResponseTime = checks.reduce( (sum, check) => sum + check.duration, 0 ); this.stats.averageResponseTime = (this.stats.averageResponseTime * (this.stats.totalChecks - checks.length) + totalResponseTime) / this.stats.totalChecks; // Update consecutive failure counts for (const check of checks) { const key = `${check.checkType}-${check.sessionId || 'system'}`; if (check.status === 'healthy' || check.status === 'warning') { this.stats.consecutiveFailures.delete(key); } else { const current = this.stats.consecutiveFailures.get(key) || 0; this.stats.consecutiveFailures.set(key, current + 1); } this.stats.lastCheckTimes.set(key, new Date()); } } /** * Update health trends */ private updateHealthTrends(report: HealthReport): void { // Store health reports for trend analysis const key = `health-${report.timestamp.toISOString().split('T')[0]}`; // Daily key if (!this.healthHistory.has(key)) { this.healthHistory.set(key, []); } // Convert report to a check result for storage const overallCheck: HealthCheckResult = { checkId: `overall-${Date.now()}`, checkType: 'system', timestamp: report.timestamp, status: report.overall, metrics: { healthScore: report.trends.healthScore, totalChecks: report.metrics.totalChecks, healthyChecks: report.metrics.healthyChecks, averageResponseTime: report.metrics.averageResponseTime, }, details: { message: `Overall health: ${report.overall}`, recoverable: true, }, duration: 0, checks: { overall_health: { checkStatus: report.overall === 'healthy' ? 'pass' : report.overall === 'warning' ? 'warn' : 'fail', value: report.trends.healthScore, message: `Overall health: ${report.overall}`, duration: 0, }, }, overallScore: report.overall === 'healthy' ? 100 : report.overall === 'warning' ? 75 : report.overall === 'unhealthy' ? 50 : 0, }; this.healthHistory.get(key)!.push(overallCheck); // Clean up old history (keep last 30 days) const cutoffDate = new Date(); cutoffDate.setDate(cutoffDate.getDate() - 30); this.healthHistory.forEach((history, historyKey) => { if (new Date(historyKey.replace('health-', '')) < cutoffDate) { this.healthHistory.delete(historyKey); } }); } /** * Get current health statistics */ getHealthStatistics() { return { ...this.stats, isRunning: this.isRunning, healthScoreHistory: [...this.healthScoreHistory], consecutiveFailures: Object.fromEntries(this.stats.consecutiveFailures), lastCheckTimes: Object.fromEntries( Array.from(this.stats.lastCheckTimes.entries()).map(([k, v]) => [ k, v.toISOString(), ]) ), errorRates: Object.fromEntries(this.stats.errorRates), }; } /** * Get historical health data */ getHealthHistory(days = 7): Record<string, HealthCheckResult[]> { const result: Record<string, HealthCheckResult[]> = {}; const cutoffDate = new Date(); cutoffDate.setDate(cutoffDate.getDate() - days); this.healthHistory.forEach((checks, key) => { const date = new Date(key.replace('health-', '')); if (date >= cutoffDate) { result[key] = checks; } }); return result; } // Utility methods for system metrics private async getCPUUsage(): Promise<number> { return new Promise((resolve) => { const startUsage = process.cpuUsage(); setTimeout(() => { const endUsage = process.cpuUsage(startUsage); const totalUsage = endUsage.user + endUsage.system; const percentage = totalUsage / 10000 / os.cpus().length; // Convert to percentage resolve(Math.min(100, percentage)); }, 100); }); } private async getMemoryInfo(): Promise<{ total: number; free: number; used: number; }> { const totalMem = os.totalmem(); const freeMem = os.freemem(); return { total: totalMem, free: freeMem, used: totalMem - freeMem, }; } private async getDiskInfo(): Promise<{ total: number; used: number; free: number; }> { try { // For cross-platform compatibility, we'll use a simple approach if (process.platform === 'win32') { const { stdout } = await execAsync( 'wmic logicaldisk where "Caption=\'C:\'" get Size,FreeSpace /value' ); const lines = stdout.split('\n').filter((line) => line.includes('=')); const freeSpace = parseInt( lines.find((line) => line.startsWith('FreeSpace='))?.split('=')[1] || '0' ); const totalSpace = parseInt( lines.find((line) => line.startsWith('Size='))?.split('=')[1] || '0' ); return { total: totalSpace, free: freeSpace, used: totalSpace - freeSpace, }; } else { const { stdout } = await execAsync('df -k . | tail -1'); const fields = stdout.trim().split(/\s+/); const total = parseInt(fields[1]) * 1024; const used = parseInt(fields[2]) * 1024; const free = parseInt(fields[3]) * 1024; return { total, used, free }; } } catch (error) { this.logger.error('Error getting disk info:', error); return { total: 0, used: 0, free: 0 }; } } private async pingHost(host: string): Promise<number> { return new Promise((resolve, reject) => { const startTime = Date.now(); const command = process.platform === 'win32' ? `ping -n 1 ${host}` : `ping -c 1 ${host}`; exec(command, (error, stdout) => { if (error) { reject(error); } else { const latency = Date.now() - startTime; resolve(latency); } }); }); } /** * Clean up resources */ async destroy(): Promise<void> { await this.stop(); this.healthHistory.clear(); this.healthScoreHistory.length = 0; // Clean up SSH-specific resources this.sshSessions.clear(); this.sshHealthHistory.clear(); this.sshPredictionModels.clear(); this.removeAllListeners(); this.logger.info('HealthMonitor destroyed'); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ooples/mcp-console-automation'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

HealthMonitor.ts•77.1 KiB