Skip to main content
Glama
ooples

MCP Console Automation Server

HealthMonitor.ts78.9 kB
import { EventEmitter } from 'events'; import { Logger } from '../utils/logger.js'; import { SessionState, SystemMetrics, ProcessMetrics, HealthCheckResult, } from '../types/index.js'; import * as os from 'os'; import * as fs from 'fs/promises'; import * as path from 'path'; import { spawn, exec } from 'child_process'; import { promisify } from 'util'; const execAsync = promisify(exec); export interface HealthMonitorConfig { enabledChecks: { session: boolean; system: boolean; network: boolean; process: boolean; disk: boolean; memory: boolean; ssh: boolean; }; thresholds: { cpu: number; memory: number; disk: number; networkLatency: number; processResponseTime: number; sshConnectionLatency: number; sshHealthScore: number; }; checkInterval: number; sshHealthCheckInterval: number; alertThresholds: { consecutive_failures: number; response_time_threshold: number; memory_usage_threshold: number; error_rate_threshold: number; ssh_connection_timeout: number; ssh_failure_rate_threshold: number; }; recovery: { enabled: boolean; maxAttempts: number; backoffMultiplier: number; baseDelay: number; sshProactiveReconnect: boolean; }; monitoring: { enableMetrics: boolean; enableAlerting: boolean; enablePredictiveAnalysis: boolean; enableSSHHealthPrediction: boolean; }; } // HealthCheckResult is imported from types/index.ts export interface SSHHealthMetrics { connectionLatency: number; lastKeepAliveTimestamp?: number; // Changed from Date to number timestamp lastKeepAlive?: Date; // Keep for compatibility consecutiveFailures: number; connectionUptime: number; throughput: number; errorRate: number; authenticationType?: 'password' | 'publickey' | 'keyboard-interactive'; encryptionAlgorithm?: string; compressionEnabled?: number; // Changed to number (0 or 1) serverVersion?: string; clientVersion?: string; } export interface SSHHealthResult extends HealthCheckResult { sshMetrics?: SSHHealthMetrics; connectionId?: string; hostInfo?: { hostname: string; port: number; username: string; }; 
predictiveScores?: { connectionStability: number; performanceDegradation: number; authenticationRisk: number; networkQuality: number; overallFailureRisk: number; }; } export interface SSHSessionHealth { sessionId: string; connectionId: string; hostname: string; port: number; username: string; connectedAt: Date; lastActivity: Date; healthScore: number; connectionLatencyHistory: number[]; throughputHistory: number[]; errorHistory: { timestamp: Date; error: string; recoverable: boolean }[]; keepAliveStatus: 'active' | 'failed' | 'degraded'; connectionStability: number; predictiveFailureScore: number; authenticationMethod: string; encryptionInfo: { algorithm: string; keySize: number; compressionEnabled: boolean; }; serverInfo: { version: string; protocol: string; supportedAlgorithms: string[]; }; performanceMetrics: { averageLatency: number; maxLatency: number; minLatency: number; throughputBytesPerSecond: number; commandExecutionTime: number; dataTransferTime: number; }; } export interface SSHPredictionModel { sessionId: string; connectionStabilityTrend: number[]; latencyTrend: number[]; throughputTrend: number[]; errorRateTrend: number[]; keepAliveSuccessRate: number; lastUpdated: Date; confidenceScore: number; predictedFailureTime?: Date; riskFactors: { networkInstability: number; performanceDegradation: number; authenticationIssues: number; serverOverload: number; connectionAging: number; }; } export interface HealthReport { overall: 'healthy' | 'warning' | 'unhealthy' | 'critical'; timestamp: Date; checks: HealthCheckResult[]; metrics: { totalChecks: number; healthyChecks: number; warningChecks: number; unhealthyChecks: number; criticalChecks: number; averageResponseTime: number; }; trends: { healthScore: number; healthScoreTrend: 'improving' | 'stable' | 'degrading'; predictionNextHour?: 'healthy' | 'warning' | 'unhealthy' | 'critical'; }; actionItems: string[]; sshHealthSummary?: { totalSSHSessions: number; healthySSHSessions: number; unhealthySSHSessions: 
number; averageSSHLatency: number; predictedFailures: number; proactiveReconnections: number; }; } /** * Comprehensive Health Monitor for session and system health * Provides real-time health checks, predictive analysis, and automatic recovery */ export class HealthMonitor extends EventEmitter { private logger: Logger; private config: HealthMonitorConfig; private healthHistory: Map<string, HealthCheckResult[]> = new Map(); private activeChecks: Map<string, NodeJS.Timeout> = new Map(); private healthScoreHistory: number[] = []; private lastSystemMetrics?: SystemMetrics; private monitoringInterval?: NodeJS.Timeout; private sshHealthInterval?: NodeJS.Timeout; private isRunning = false; // SSH-specific monitoring data private sshSessions: Map<string, SSHSessionHealth> = new Map(); private sshHealthHistory: Map<string, SSHHealthResult[]> = new Map(); private sshPredictionModels: Map<string, SSHPredictionModel> = new Map(); // Health check statistics private stats = { totalChecks: 0, successfulChecks: 0, failedChecks: 0, averageResponseTime: 0, consecutiveFailures: new Map<string, number>(), lastCheckTimes: new Map<string, Date>(), errorRates: new Map<string, number>(), }; constructor(config?: Partial<HealthMonitorConfig>) { super(); this.logger = new Logger('HealthMonitor'); this.config = { enabledChecks: { session: true, system: true, network: true, process: true, disk: true, memory: true, ssh: true, ...config?.enabledChecks, }, thresholds: { cpu: 80, memory: 85, disk: 90, networkLatency: 1000, processResponseTime: 5000, sshConnectionLatency: 2000, sshHealthScore: 70, ...config?.thresholds, }, checkInterval: config?.checkInterval || 30000, sshHealthCheckInterval: config?.sshHealthCheckInterval || 15000, alertThresholds: { consecutive_failures: 3, response_time_threshold: 10000, memory_usage_threshold: 95, error_rate_threshold: 0.1, ssh_connection_timeout: 30000, ssh_failure_rate_threshold: 0.05, ...config?.alertThresholds, }, recovery: { enabled: true, maxAttempts: 3, 
backoffMultiplier: 2,
        baseDelay: 5000,
        sshProactiveReconnect: true,
        ...config?.recovery,
      },
      monitoring: {
        enableMetrics: true,
        enableAlerting: true,
        enablePredictiveAnalysis: true,
        enableSSHHealthPrediction: true,
        ...config?.monitoring,
      },
    };

    this.logger.info('HealthMonitor initialized with config:', this.config);
  }

  /**
   * Start the health monitoring system.
   *
   * Schedules `performHealthChecks` every `config.checkInterval` ms and, when
   * `config.enabledChecks.ssh` is set, `performSSHHealthChecks` every
   * `config.sshHealthCheckInterval` ms. Both are also run once immediately,
   * after which `'started'` is emitted. Calling `start()` while already
   * running only logs a warning.
   *
   * Each kind of check is guarded so that a new pass is skipped while the
   * previous pass of the same kind is still in flight. Without the guard a
   * slow pass (for example an SSH connection test waiting on its timeout,
   * which by default is longer than the SSH check interval) would cause
   * passes to overlap and accumulate.
   *
   * @returns A promise that resolves once the initial checks have finished.
   */
  async start(): Promise<void> {
    if (this.isRunning) {
      this.logger.warn('HealthMonitor is already running');
      return;
    }

    this.logger.info('Starting HealthMonitor...');
    this.isRunning = true;

    // Wrap a task so that at most one invocation runs at a time. The flag
    // lives in the closure, so each wrapped task has its own guard.
    const runExclusive = (
      label: string,
      task: () => Promise<void>
    ): (() => Promise<void>) => {
      let inFlight = false;
      return async () => {
        if (inFlight) {
          this.logger.debug(
            `Skipping ${label}: previous run is still in progress`
          );
          return;
        }
        inFlight = true;
        try {
          await task();
        } finally {
          inFlight = false;
        }
      };
    };

    const runHealthChecks = runExclusive('health checks', () =>
      this.performHealthChecks()
    );
    const runSSHHealthChecks = runExclusive('SSH health checks', () =>
      this.performSSHHealthChecks()
    );

    // Interval callbacks have no caller to report to, so log instead of
    // letting a rejection go unhandled.
    const onScheduledCheckError = (error: unknown): void => {
      this.logger.error('Scheduled health check failed:', error);
    };

    // Start periodic health checks
    this.monitoringInterval = setInterval(() => {
      void runHealthChecks().catch(onScheduledCheckError);
    }, this.config.checkInterval);

    // Start SSH-specific health monitoring if enabled
    if (this.config.enabledChecks.ssh) {
      this.sshHealthInterval = setInterval(() => {
        void runSSHHealthChecks().catch(onScheduledCheckError);
      }, this.config.sshHealthCheckInterval);
    }

    // Perform initial health checks
    await runHealthChecks();
    if (this.config.enabledChecks.ssh) {
      await runSSHHealthChecks();
    }

    this.emit('started');
    this.logger.info('HealthMonitor started successfully');
  }

  /**
   * Stop the health monitoring system.
   *
   * Clears both periodic intervals and every timeout tracked in
   * `activeChecks`, then emits `'stopped'`. A check pass that is already in
   * progress is not cancelled here. Calling `stop()` while not running only
   * logs a warning.
   */
  async stop(): Promise<void> {
    if (!this.isRunning) {
      this.logger.warn('HealthMonitor is not running');
      return;
    }

    this.logger.info('Stopping HealthMonitor...');
    this.isRunning = false;

    // Clear monitoring interval
    if (this.monitoringInterval) {
      clearInterval(this.monitoringInterval);
      this.monitoringInterval = undefined;
    }

    // Clear SSH health monitoring interval
    if (this.sshHealthInterval) {
      clearInterval(this.sshHealthInterval);
      this.sshHealthInterval = undefined;
    }

    // Clear active checks
    this.activeChecks.forEach((timeout) => {
      clearTimeout(timeout);
    });
    this.activeChecks.clear();

    this.emit('stopped');
    this.logger.info('HealthMonitor stopped');
  }

  /**
   * Perform comprehensive health checks
   */
  private async performHealthChecks(): Promise<void> {
    const startTime = Date.now();
    const checks:
HealthCheckResult[] = []; try { // System health checks if (this.config.enabledChecks.system) { checks.push(await this.performSystemHealthCheck()); } // Memory health check if (this.config.enabledChecks.memory) { checks.push(await this.performMemoryHealthCheck()); } // Disk health check if (this.config.enabledChecks.disk) { checks.push(await this.performDiskHealthCheck()); } // Network health check if (this.config.enabledChecks.network) { checks.push(await this.performNetworkHealthCheck()); } // Update statistics this.updateHealthStatistics(checks); // Generate health report const report = await this.generateHealthReport(checks); // Emit health report this.emit('health-report', report); // Check for critical issues requiring immediate action await this.handleCriticalIssues(report); // Update health score history for trend analysis this.updateHealthTrends(report); } catch (error) { this.logger.error('Error during health checks:', error); this.emit('health-check-error', { error: error instanceof Error ? error.message : String(error), }); } const duration = Date.now() - startTime; this.logger.debug(`Health checks completed in ${duration}ms`); } /** * Perform system-level health check */ private async performSystemHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `system-${Date.now()}`; try { const cpuUsage = await this.getCPUUsage(); const memoryUsage = process.memoryUsage(); const uptime = os.uptime(); const loadAvg = os.loadavg(); const memoryPercent = (memoryUsage.heapUsed / memoryUsage.heapTotal) * 100; const isHealthy = cpuUsage < this.config.thresholds.cpu && memoryPercent < this.config.thresholds.memory; const status: HealthCheckResult['status'] = cpuUsage > 90 || memoryPercent > 95 ? 'critical' : cpuUsage > this.config.thresholds.cpu || memoryPercent > this.config.thresholds.memory ? 'unhealthy' : cpuUsage > 60 || memoryPercent > 70 ? 
'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { cpuUsage, memoryPercent, uptime, loadAvg1: loadAvg[0], loadAvg5: loadAvg[1], loadAvg15: loadAvg[2], }, details: { message: `System health: CPU ${cpuUsage.toFixed(1)}%, Memory ${memoryPercent.toFixed(1)}%`, diagnosis: status !== 'healthy' ? `High resource usage detected. CPU: ${cpuUsage.toFixed(1)}% (threshold: ${this.config.thresholds.cpu}%), Memory: ${memoryPercent.toFixed(1)}% (threshold: ${this.config.thresholds.memory}%)` : 'System resources within normal parameters', recommendations: status !== 'healthy' ? [ 'Monitor system resource usage closely', 'Consider reducing concurrent operations', 'Check for memory leaks in running processes', 'Review system performance optimization', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { cpu: { checkStatus: cpuUsage > this.config.thresholds.cpu ? 'fail' : cpuUsage > 60 ? 'warn' : 'pass', value: cpuUsage, message: `CPU usage: ${cpuUsage.toFixed(1)}%`, duration: Date.now() - startTime, }, memory: { checkStatus: memoryPercent > this.config.thresholds.memory ? 'fail' : memoryPercent > 70 ? 'warn' : 'pass', value: memoryPercent, message: `Memory usage: ${memoryPercent.toFixed(1)}%`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `System health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve system metrics', recommendations: [ 'Check system monitoring tools', 'Verify system stability', ], recoverable: true, }, duration: Date.now() - startTime, checks: { system: { checkStatus: 'fail', message: `System health check failed: ${error instanceof Error ? 
error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform memory-specific health check */ private async performMemoryHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `memory-${Date.now()}`; try { const memInfo = await this.getMemoryInfo(); const processMemory = process.memoryUsage(); const systemMemoryPercent = ((memInfo.total - memInfo.free) / memInfo.total) * 100; const processMemoryMB = processMemory.heapUsed / 1024 / 1024; const status: HealthCheckResult['status'] = systemMemoryPercent > 95 ? 'critical' : systemMemoryPercent > this.config.thresholds.memory ? 'unhealthy' : systemMemoryPercent > 70 ? 'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { systemMemoryPercent, processMemoryMB, totalMemoryGB: memInfo.total / 1024 / 1024 / 1024, freeMemoryGB: memInfo.free / 1024 / 1024 / 1024, heapUsedMB: processMemory.heapUsed / 1024 / 1024, heapTotalMB: processMemory.heapTotal / 1024 / 1024, }, details: { message: `Memory usage: System ${systemMemoryPercent.toFixed(1)}%, Process ${processMemoryMB.toFixed(1)}MB`, diagnosis: status !== 'healthy' ? `High memory usage detected. System: ${systemMemoryPercent.toFixed(1)}% (threshold: ${this.config.thresholds.memory}%)` : 'Memory usage within normal parameters', recommendations: status !== 'healthy' ? [ 'Monitor for memory leaks', 'Consider garbage collection optimization', 'Review buffer sizes and data structures', 'Implement memory pressure relief mechanisms', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { system_memory: { checkStatus: systemMemoryPercent > 95 ? 'fail' : systemMemoryPercent > this.config.thresholds.memory ? 'fail' : systemMemoryPercent > 70 ? 
'warn' : 'pass', value: systemMemoryPercent, message: `System memory usage: ${systemMemoryPercent.toFixed(1)}%`, duration: Date.now() - startTime, }, process_memory: { checkStatus: 'pass', value: processMemoryMB, message: `Process memory usage: ${processMemoryMB.toFixed(1)}MB`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Memory health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve memory metrics', recommendations: [ 'Check system memory monitoring', 'Verify system stability', ], recoverable: true, }, duration: Date.now() - startTime, checks: { memory: { checkStatus: 'fail', message: `Memory health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform disk health check */ private async performDiskHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `disk-${Date.now()}`; try { const diskInfo = await this.getDiskInfo(); const diskUsagePercent = (diskInfo.used / diskInfo.total) * 100; const status: HealthCheckResult['status'] = diskUsagePercent > 95 ? 'critical' : diskUsagePercent > this.config.thresholds.disk ? 'unhealthy' : diskUsagePercent > 80 ? 'warning' : 'healthy'; return { checkId, checkType: 'system', timestamp: new Date(), status, metrics: { diskUsagePercent, totalGB: diskInfo.total / 1024 / 1024 / 1024, usedGB: diskInfo.used / 1024 / 1024 / 1024, freeGB: diskInfo.free / 1024 / 1024 / 1024, }, details: { message: `Disk usage: ${diskUsagePercent.toFixed(1)}% (${(diskInfo.free / 1024 / 1024 / 1024).toFixed(1)}GB free)`, diagnosis: status !== 'healthy' ? 
`High disk usage detected: ${diskUsagePercent.toFixed(1)}% (threshold: ${this.config.thresholds.disk}%)` : 'Disk usage within normal parameters', recommendations: status !== 'healthy' ? [ 'Clean up temporary files and logs', 'Archive old session data', 'Monitor disk space regularly', 'Consider log rotation and cleanup policies', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { disk_usage: { checkStatus: diskUsagePercent > 95 ? 'fail' : diskUsagePercent > this.config.thresholds.disk ? 'fail' : diskUsagePercent > 80 ? 'warn' : 'pass', value: diskUsagePercent, message: `Disk usage: ${diskUsagePercent.toFixed(1)}%`, duration: Date.now() - startTime, }, }, overallScore: status === 'healthy' ? 100 : status === 'warning' ? 75 : status === 'unhealthy' ? 50 : 0, }; } catch (error) { return { checkId, checkType: 'system', timestamp: new Date(), status: 'critical', metrics: {}, details: { message: `Disk health check failed: ${error instanceof Error ? error.message : String(error)}`, diagnosis: 'Unable to retrieve disk metrics', recommendations: [ 'Check disk monitoring tools', 'Verify filesystem health', ], recoverable: true, }, duration: Date.now() - startTime, checks: { disk: { checkStatus: 'fail', message: `Disk health check failed: ${error instanceof Error ? error.message : String(error)}`, }, }, overallScore: 0, }; } } /** * Perform network health check */ private async performNetworkHealthCheck(): Promise<HealthCheckResult> { const startTime = Date.now(); const checkId = `network-${Date.now()}`; try { // Test network connectivity with multiple targets const testTargets = ['8.8.8.8', '1.1.1.1', 'google.com']; const networkTests = await Promise.allSettled( testTargets.map((target) => this.pingHost(target)) ); const successfulTests = networkTests.filter( (result) => result.status === 'fulfilled' ); const avgLatency = successfulTests.length > 0 ? 
(successfulTests as PromiseFulfilledResult<number>[]).reduce( (sum, result) => sum + result.value, 0 ) / successfulTests.length : Infinity; const connectivityPercent = (successfulTests.length / testTargets.length) * 100; const status: HealthCheckResult['status'] = connectivityPercent === 0 ? 'critical' : connectivityPercent < 50 || avgLatency > this.config.thresholds.networkLatency ? 'unhealthy' : connectivityPercent < 100 || avgLatency > 500 ? 'warning' : 'healthy'; return { checkId, checkType: 'network', timestamp: new Date(), status, metrics: { connectivityPercent, avgLatency: avgLatency === Infinity ? -1 : avgLatency, successfulTests: successfulTests.length, totalTests: testTargets.length, }, details: { message: `Network connectivity: ${connectivityPercent.toFixed(0)}% (${successfulTests.length}/${testTargets.length} targets), avg latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms`, diagnosis: status !== 'healthy' ? `Network connectivity issues detected. Success rate: ${connectivityPercent.toFixed(0)}%, Average latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms` : 'Network connectivity within normal parameters', recommendations: status !== 'healthy' ? [ 'Check network connectivity and firewall settings', 'Verify DNS resolution', 'Test with different network targets', 'Monitor network interface statistics', ] : [], recoverable: status !== 'critical', }, duration: Date.now() - startTime, checks: { connectivity: { checkStatus: connectivityPercent === 0 ? 'fail' : connectivityPercent < 50 ? 'fail' : connectivityPercent < 100 ? 'warn' : 'pass', value: connectivityPercent, message: `Network connectivity: ${connectivityPercent.toFixed(0)}%`, duration: Date.now() - startTime, }, latency: { checkStatus: avgLatency === Infinity ? 'fail' : avgLatency > this.config.thresholds.networkLatency ? 'fail' : avgLatency > 500 ? 'warn' : 'pass', value: avgLatency === Infinity ? 
-1 : avgLatency,
            message: `Average latency: ${avgLatency === Infinity ? 'N/A' : avgLatency.toFixed(0)}ms`,
            duration: Date.now() - startTime,
          },
        },
        overallScore:
          status === 'healthy'
            ? 100
            : status === 'warning'
              ? 75
              : status === 'unhealthy'
                ? 50
                : 0,
      };
    } catch (error) {
      return {
        checkId,
        checkType: 'network',
        timestamp: new Date(),
        status: 'critical',
        metrics: {},
        details: {
          message: `Network health check failed: ${error instanceof Error ? error.message : String(error)}`,
          diagnosis: 'Unable to perform network connectivity tests',
          recommendations: [
            'Check network configuration',
            'Verify internet connectivity',
          ],
          recoverable: true,
        },
        duration: Date.now() - startTime,
        checks: {
          network: {
            checkStatus: 'fail',
            message: `Network health check failed: ${error instanceof Error ? error.message : String(error)}`,
          },
        },
        overallScore: 0,
      };
    }
  }

  /**
   * Check health of a specific session.
   *
   * Starts from `sessionState.healthScore` (100 when the session reports no
   * score) and deducts:
   *  - 30 when the session status is `'failed'` or `'recovering'`,
   *  - 20 after more than 5 minutes without activity,
   *  - 10 per recorded recovery attempt.
   * The score is floored at 0 and mapped to a status: `critical` below 20 or
   * when the session has failed, `unhealthy` below 50 or while recovering,
   * `warning` below 80 or after more than 3 minutes of inactivity, otherwise
   * `healthy`.
   *
   * @param sessionState - Session to evaluate; its `id`, `status`,
   *   `lastActivity`, `createdAt`, `healthScore`, `recoveryAttempts` and
   *   `maxRecoveryAttempts` fields are read.
   * @returns The check result. Errors raised while evaluating are not thrown;
   *   they are reported as a `critical` result instead.
   */
  async checkSessionHealth(
    sessionState: SessionState
  ): Promise<HealthCheckResult> {
    const startTime = Date.now();
    const checkId = `session-${sessionState.id}-${Date.now()}`;

    try {
      const now = Date.now();
      const timeSinceLastActivity = now - sessionState.lastActivity.getTime();
      const sessionAge = now - sessionState.createdAt.getTime();

      // Calculate session health score. `??` rather than `||` so that a
      // reported score of 0 is kept instead of being replaced by 100.
      let healthScore = sessionState.healthScore ?? 100;

      // Deduct points based on various factors
      if (
        sessionState.status === 'failed' ||
        sessionState.status === 'recovering'
      ) {
        healthScore -= 30;
      }

      if (timeSinceLastActivity > 300000) {
        // 5 minutes of inactivity
        healthScore -= 20;
      }

      if (sessionState.recoveryAttempts > 0) {
        healthScore -= sessionState.recoveryAttempts * 10;
      }

      // Keep the score on the 0-100 scale used in the messages below; the
      // deductions above can otherwise push it negative.
      healthScore = Math.max(0, healthScore);

      const status: HealthCheckResult['status'] =
        healthScore < 20 || sessionState.status === 'failed'
          ? 'critical'
          : healthScore < 50 || sessionState.status === 'recovering'
            ? 'unhealthy'
            : healthScore < 80 || timeSinceLastActivity > 180000
              ? 'warning'
              : 'healthy';

      return {
        checkId,
        checkType: 'session',
        sessionId: sessionState.id,
        timestamp: new Date(),
        status,
        metrics: {
          healthScore,
          timeSinceLastActivityMs: timeSinceLastActivity,
          sessionAgeMs: sessionAge,
          recoveryAttempts: sessionState.recoveryAttempts,
        },
        details: {
          message: `Session health: Score ${healthScore.toFixed(0)}/100, Status: ${sessionState.status}`,
          diagnosis:
            status !== 'healthy'
              ? `Session health degraded. Score: ${healthScore.toFixed(0)}/100, Status: ${sessionState.status}, Last activity: ${Math.floor(timeSinceLastActivity / 1000)}s ago`
              : 'Session operating normally',
          recommendations:
            status !== 'healthy'
              ? [
                  'Monitor session activity patterns',
                  'Check for underlying connection issues',
                  'Consider session restart if persistently unhealthy',
                  'Review session configuration and resource allocation',
                ]
              : [],
          // A failed session is still recoverable while it has recovery
          // attempts left.
          recoverable:
            sessionState.status !== 'failed' ||
            sessionState.recoveryAttempts < sessionState.maxRecoveryAttempts,
        },
        duration: Date.now() - startTime,
        checks: {
          session_status: {
            checkStatus:
              sessionState.status === 'failed'
                ? 'fail'
                : sessionState.status === 'recovering'
                  ? 'warn'
                  : 'pass',
            value: sessionState.status,
            message: `Session status: ${sessionState.status}`,
            duration: Date.now() - startTime,
          },
          health_score: {
            checkStatus:
              healthScore < 50 ? 'fail' : healthScore < 80 ? 'warn' : 'pass',
            value: healthScore,
            message: `Health score: ${healthScore.toFixed(0)}/100`,
            duration: Date.now() - startTime,
          },
          activity: {
            checkStatus: timeSinceLastActivity > 300000 ? 'warn' : 'pass',
            value: timeSinceLastActivity,
            message: `Last activity: ${Math.floor(timeSinceLastActivity / 1000)}s ago`,
            duration: Date.now() - startTime,
          },
        },
        overallScore:
          status === 'healthy'
            ? 100
            : status === 'warning'
              ? 75
              : status === 'unhealthy'
                ? 50
                : 0,
      };
    } catch (error) {
      return {
        checkId,
        checkType: 'session',
        sessionId: sessionState.id,
        timestamp: new Date(),
        status: 'critical',
        metrics: {},
        details: {
          message: `Session health check failed: ${error instanceof Error ? error.message : String(error)}`,
          diagnosis: 'Unable to assess session health',
          recommendations: [
            'Check session state consistency',
            'Review session monitoring',
          ],
          recoverable: true,
        },
        duration: Date.now() - startTime,
        checks: {
          session: {
            checkStatus: 'fail',
            message: `Session health check failed: ${error instanceof Error ? error.message : String(error)}`,
          },
        },
        overallScore: 0,
      };
    }
  }

  /**
   * Register SSH session for health monitoring
   */
  registerSSHSession(
    sessionId: string,
    connectionInfo: {
      hostname: string;
      port: number;
      username: string;
      authenticationMethod?: string;
    }
  ): void {
    const sshHealth: SSHSessionHealth = {
      sessionId,
      connectionId: `${connectionInfo.hostname}:${connectionInfo.port}`,
      hostname: connectionInfo.hostname,
      port: connectionInfo.port,
      username: connectionInfo.username,
      connectedAt: new Date(),
      lastActivity: new Date(),
      healthScore: 100,
      connectionLatencyHistory: [],
      throughputHistory: [],
      errorHistory: [],
      keepAliveStatus: 'active',
      connectionStability: 1.0,
      predictiveFailureScore: 0,
      authenticationMethod: connectionInfo.authenticationMethod || 'unknown',
      encryptionInfo: {
        algorithm: 'unknown',
        keySize: 0,
        compressionEnabled: false,
      },
      serverInfo: {
        version: 'unknown',
        protocol: 'unknown',
        supportedAlgorithms: [],
      },
      performanceMetrics: {
        averageLatency: 0,
        maxLatency: 0,
        minLatency: 0,
        throughputBytesPerSecond: 0,
        commandExecutionTime: 0,
        dataTransferTime: 0,
      },
    };

    this.sshSessions.set(sessionId, sshHealth);
    this.sshHealthHistory.set(sessionId, []);

    // Initialize prediction model
    const predictionModel: SSHPredictionModel = {
      sessionId,
      connectionStabilityTrend: [1.0],
      latencyTrend: [],
      throughputTrend: [],
      errorRateTrend: [0],
      keepAliveSuccessRate: 1.0,
      lastUpdated: new Date(),
      confidenceScore: 0.5,
      riskFactors:
{ networkInstability: 0, performanceDegradation: 0, authenticationIssues: 0, serverOverload: 0, connectionAging: 0, }, }; this.sshPredictionModels.set(sessionId, predictionModel); this.logger.info( `SSH session ${sessionId} registered for health monitoring: ${connectionInfo.hostname}:${connectionInfo.port}` ); this.emit('ssh-session-registered', { sessionId, connectionInfo, sshHealth, }); } /** * Unregister SSH session from health monitoring */ unregisterSSHSession(sessionId: string): void { this.sshSessions.delete(sessionId); this.sshHealthHistory.delete(sessionId); this.sshPredictionModels.delete(sessionId); this.logger.info( `SSH session ${sessionId} unregistered from health monitoring` ); this.emit('ssh-session-unregistered', { sessionId }); } /** * Update SSH session activity */ updateSSHSessionActivity( sessionId: string, activityData: { latency?: number; throughput?: number; commandExecutionTime?: number; dataTransferSize?: number; error?: { message: string; recoverable: boolean }; } ): void { const sshHealth = this.sshSessions.get(sessionId); if (!sshHealth) { return; } sshHealth.lastActivity = new Date(); if (activityData.latency !== undefined) { sshHealth.connectionLatencyHistory.push(activityData.latency); if (sshHealth.connectionLatencyHistory.length > 100) { sshHealth.connectionLatencyHistory.shift(); } // Update performance metrics const latencies = sshHealth.connectionLatencyHistory; sshHealth.performanceMetrics.averageLatency = latencies.reduce((a, b) => a + b, 0) / latencies.length; sshHealth.performanceMetrics.maxLatency = Math.max(...latencies); sshHealth.performanceMetrics.minLatency = Math.min(...latencies); } if (activityData.throughput !== undefined) { sshHealth.throughputHistory.push(activityData.throughput); if (sshHealth.throughputHistory.length > 100) { sshHealth.throughputHistory.shift(); } sshHealth.performanceMetrics.throughputBytesPerSecond = sshHealth.throughputHistory.reduce((a, b) => a + b, 0) / sshHealth.throughputHistory.length; 
} if (activityData.commandExecutionTime !== undefined) { sshHealth.performanceMetrics.commandExecutionTime = activityData.commandExecutionTime; } if (activityData.error) { sshHealth.errorHistory.push({ timestamp: new Date(), error: activityData.error.message, recoverable: activityData.error.recoverable, }); if (sshHealth.errorHistory.length > 50) { sshHealth.errorHistory.shift(); } } // Update health score and prediction model this.updateSSHHealthScore(sessionId); this.updateSSHPredictionModel(sessionId); } /** * Perform SSH-specific health checks */ private async performSSHHealthChecks(): Promise<void> { if (this.sshSessions.size === 0) { return; } const startTime = Date.now(); const sshResults: SSHHealthResult[] = []; try { const sessionEntries = Array.from(this.sshSessions.entries()); for (const [sessionId, sshHealth] of sessionEntries) { const healthResult = await this.checkSSHSessionHealth( sessionId, sshHealth ); sshResults.push(healthResult); // Store in history const history = this.sshHealthHistory.get(sessionId) || []; history.push(healthResult); // Keep only last 100 checks if (history.length > 100) { history.shift(); } this.sshHealthHistory.set(sessionId, history); // Check for critical issues requiring immediate action if ( healthResult.status === 'critical' || (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.8 ) { await this.handleCriticalSSHIssue(sessionId, healthResult); } else if ( healthResult.status === 'unhealthy' || (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.6 ) { this.emit('ssh-session-degraded', { sessionId, healthResult }); } // Check for proactive reconnection needs if ( this.config.recovery.sshProactiveReconnect && (healthResult.predictiveScores?.overallFailureRisk || 0) > 0.7 ) { this.emit('ssh-proactive-reconnect-needed', { sessionId, healthResult, reason: 'predictive-failure-risk', urgency: 'medium', }); } } this.emit('ssh-health-checks-completed', { results: sshResults, duration: Date.now() - startTime, 
timestamp: new Date(),
      });
    } catch (error) {
      this.logger.error('Error during SSH health checks:', error);
      this.emit('ssh-health-check-error', {
        error: error instanceof Error ? error.message : String(error),
        timestamp: new Date(),
      });
    }

    const duration = Date.now() - startTime;
    this.logger.debug(
      `SSH health checks completed in ${duration}ms for ${this.sshSessions.size} sessions`
    );
  }

  /**
   * Check health of a specific SSH session.
   *
   * Runs a connection test, computes predictive scores, derives an adjusted
   * health score and a status from both, and writes the new `healthScore` and
   * `predictiveFailureScore` back onto `sshHealth`.
   *
   * @param sessionId - Identifier of the session being checked.
   * @param sshHealth - Mutable health record for the session; updated in place.
   * @returns A result describing the session. Errors thrown while checking are
   *   caught and reported as a `critical` result instead of being rethrown.
   */
  private async checkSSHSessionHealth(
    sessionId: string,
    sshHealth: SSHSessionHealth
  ): Promise<SSHHealthResult> {
    const startTime = Date.now();
    const checkId = `ssh-${sessionId}-${Date.now()}`;

    try {
      const now = Date.now();
      const timeSinceLastActivity = now - sshHealth.lastActivity.getTime();
      const connectionAge = now - sshHealth.connectedAt.getTime();

      // Perform actual SSH connection test
      const connectionTest = await this.testSSHConnection(sessionId, sshHealth);

      // Calculate predictive scores
      const predictiveScores = this.calculateSSHPredictiveScores(
        sessionId,
        sshHealth
      );

      // Update health score based on test results and predictions
      let healthScore = sshHealth.healthScore;

      if (!connectionTest.success) {
        healthScore = Math.max(0, healthScore - 30);
      }

      if (timeSinceLastActivity > 300000) {
        // 5 minutes of inactivity
        healthScore = Math.max(0, healthScore - 10);
      }

      if (predictiveScores.overallFailureRisk > 0.5) {
        healthScore = Math.max(
          0,
          healthScore - predictiveScores.overallFailureRisk * 20
        );
      }

      // Either a low score or a high predicted risk is enough to escalate.
      const status: SSHHealthResult['status'] =
        healthScore < 20 || predictiveScores.overallFailureRisk > 0.9
          ? 'critical'
          : healthScore < 50 || predictiveScores.overallFailureRisk > 0.7
            ? 'unhealthy'
            : healthScore < 80 || predictiveScores.overallFailureRisk > 0.5
              ? 'warning'
              : 'healthy';

      // Update SSH health object
      sshHealth.healthScore = healthScore;
      sshHealth.predictiveFailureScore = predictiveScores.overallFailureRisk;

      const sshMetrics: Record<string, number> = {
        connectionLatency: connectionTest.latency,
        lastKeepAliveTimestamp: Date.now(),
        // On failure, counts errors recorded within the last 5 minutes.
        consecutiveFailures: connectionTest.success
          ? 0
          : sshHealth.errorHistory.filter(
              (e) => Date.now() - e.timestamp.getTime() < 300000
            ).length,
        connectionUptime: connectionAge,
        throughput: sshHealth.performanceMetrics.throughputBytesPerSecond,
        errorRate: this.calculateSSHErrorRate(sshHealth),
        compressionEnabled: sshHealth.encryptionInfo.compressionEnabled ? 1 : 0,
      };

      return {
        checkId,
        checkType: 'session',
        sessionId,
        timestamp: new Date(),
        status,
        metrics: {
          healthScore,
          connectionLatency: connectionTest.latency,
          timeSinceLastActivityMs: timeSinceLastActivity,
          connectionAgeMs: connectionAge,
          ...sshMetrics,
        },
        details: {
          message: `SSH session health: Score ${healthScore.toFixed(0)}/100, Risk ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%`,
          diagnosis:
            status !== 'healthy'
              ? `SSH session health degraded. Score: ${healthScore.toFixed(0)}/100, Failure risk: ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%`
              : 'SSH session operating normally',
          recommendations: this.generateSSHRecommendations(
            status,
            predictiveScores,
            sshHealth
          ),
          recoverable: status !== 'critical' || connectionTest.success,
        },
        duration: Date.now() - startTime,
        checks: {
          connection_test: {
            checkStatus: connectionTest.success ? 'pass' : 'fail',
            value: connectionTest.success,
            message: `Connection test: ${connectionTest.success ? 'SUCCESS' : 'FAILED'}`,
            duration: connectionTest.duration || 0,
          },
          health_score: {
            checkStatus:
              healthScore < 50 ? 'fail' : healthScore < 80 ? 'warn' : 'pass',
            value: healthScore,
            message: `Health score: ${healthScore.toFixed(0)}/100`,
            duration: Date.now() - startTime,
          },
          failure_risk: {
            checkStatus:
              predictiveScores.overallFailureRisk > 0.7
                ? 'fail'
                : predictiveScores.overallFailureRisk > 0.5
                  ? 'warn'
                  : 'pass',
            value: predictiveScores.overallFailureRisk,
            message: `Failure risk: ${(predictiveScores.overallFailureRisk * 100).toFixed(0)}%`,
            duration: Date.now() - startTime,
          },
        },
        overallScore:
          status === 'healthy'
            ? 100
            : status === 'warning'
              ? 75
              : status === 'unhealthy'
                ? 50
                : 0,
        sshMetrics: sshMetrics as any,
        connectionId: sshHealth.connectionId,
        hostInfo: {
          hostname: sshHealth.hostname,
          port: sshHealth.port,
          username: sshHealth.username,
        },
        predictiveScores,
      };
    } catch (error) {
      this.logger.error(
        `SSH health check failed for session ${sessionId}:`,
        error
      );

      return {
        checkId,
        checkType: 'session',
        sessionId,
        timestamp: new Date(),
        status: 'critical',
        metrics: {},
        details: {
          message: `SSH health check failed: ${error instanceof Error ? error.message : String(error)}`,
          diagnosis: 'Unable to assess SSH session health',
          recommendations: [
            'Check SSH connection',
            'Verify network connectivity',
            'Consider session restart',
          ],
          recoverable: true,
        },
        duration: Date.now() - startTime,
        checks: {
          ssh_health: {
            checkStatus: 'fail',
            message: `SSH health check failed: ${error instanceof Error ? error.message : String(error)}`,
          },
        },
        overallScore: 0,
        connectionId: sshHealth.connectionId,
        hostInfo: {
          hostname: sshHealth.hostname,
          port: sshHealth.port,
          username: sshHealth.username,
        },
      };
    }
  }

  /**
   * Test SSH connection health.
   *
   * Emits `ssh-connection-test-request` with a `callback` and resolves with
   * whichever comes first: a listener invoking that callback, or the
   * `ssh_connection_timeout` timer firing. The timer is armed before the event
   * is emitted and is always cleared once the promise settles, so a completed
   * test leaves no pending timer behind (including when a listener answers
   * synchronously).
   *
   * @param sessionId - Session identifier forwarded in the event payload.
   * @param sshHealth - Not read by this method; kept so callers are unchanged.
   * @returns `success`, `latency` and `duration` in milliseconds, and an
   *   `error` message when the test failed or timed out.
   */
  private async testSSHConnection(
    sessionId: string,
    sshHealth: SSHSessionHealth
  ): Promise<{
    success: boolean;
    latency: number;
    duration?: number;
    error?: string;
  }> {
    const startTime = Date.now();
    const timeoutMs = this.config.alertThresholds.ssh_connection_timeout;

    return new Promise((resolve) => {
      let settled = false;

      // Single exit point: first caller wins, later calls are ignored.
      const settle = (outcome: {
        success: boolean;
        latency: number;
        duration?: number;
        error?: string;
      }): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeoutHandle);
        resolve(outcome);
      };

      // Timeout mechanism
      const timeoutHandle = setTimeout(() => {
        settle({
          success: false,
          latency: Date.now() - startTime,
          duration: Date.now() - startTime,
          error: `SSH connection test timeout after ${timeoutMs}ms`,
        });
      }, timeoutMs);

      try {
        // Emit SSH connection test request
        this.emit('ssh-connection-test-request', {
          sessionId,
          timeout: timeoutMs,
          callback: (result: {
            success: boolean;
            latency?: number;
            error?: string;
          }) => {
            settle({
              success: result.success,
              latency: result.latency || Date.now() - startTime,
              duration: Date.now() - startTime,
              error: result.error,
            });
          },
        });
      } catch (emitError) {
        // A throwing listener still rejects the promise (as before), but the
        // timer must not be left running.
        settled = true;
        clearTimeout(timeoutHandle);
        throw emitError;
      }
    });
  }

  /**
   * Calculate SSH predictive scores.
   *
   * Combines four factors into `overallFailureRisk` using fixed weights
   * (stability 0.3, degradation 0.25, authentication 0.2, network 0.25).
   * When no prediction model is registered for the session, a neutral result
   * (zero risk) is returned.
   *
   * @param sessionId - Key used to look up the session's prediction model.
   * @param sshHealth - Health record supplying error and latency history.
   * @returns The individual factor scores and the combined failure risk
   *   (capped at 1.0).
   */
  private calculateSSHPredictiveScores(
    sessionId: string,
    sshHealth: SSHSessionHealth
  ): {
    connectionStability: number;
    performanceDegradation: number;
    authenticationRisk: number;
    networkQuality: number;
    overallFailureRisk: number;
  } {
    const predictionModel = this.sshPredictionModels.get(sessionId);
    if (!predictionModel) {
      return {
        connectionStability: 1.0,
        performanceDegradation: 0,
        authenticationRisk: 0,
        networkQuality: 1.0,
        overallFailureRisk: 0,
      };
    }

    // Calculate connection stability based on historical data
    let connectionStability = 1.0;
    if (predictionModel.connectionStabilityTrend.length > 5) {
      const recentStability =
        predictionModel.connectionStabilityTrend.slice(-5);
      const avgStability =
        recentStability.reduce((a, b) => a + b, 0) / recentStability.length;
      connectionStability = Math.max(0, avgStability);
    }

    // Calculate performance degradation
    let performanceDegradation = 0;
    if (predictionModel.latencyTrend.length > 10) {
      const recent = predictionModel.latencyTrend.slice(-5);
      const older = predictionModel.latencyTrend.slice(-10, -5);

      if (older.length > 0) {
        const recentAvg = recent.reduce((a, b) => a + b, 0) / recent.length;
        const olderAvg = older.reduce((a, b) => a + b, 0) / older.length;

        // Only a >50% increase in average latency counts as degradation.
        if (recentAvg > olderAvg * 1.5) {
          performanceDegradation = Math.min(
            1.0,
            (recentAvg - olderAvg) / olderAvg
          );
        }
      }
    }

    // Calculate authentication risk based on error history
    let authenticationRisk = 0;
    const authErrors = sshHealth.errorHistory.filter(
      (e) =>
        e.error.toLowerCase().includes('auth') ||
        e.error.toLowerCase().includes('permission') ||
        e.error.toLowerCase().includes('denied')
    );
    if (authErrors.length > 0) {
      authenticationRisk = Math.min(1.0, authErrors.length / 10);
    }

    // Calculate network quality based on latency and error patterns
    let networkQuality = 1.0;
    if (sshHealth.connectionLatencyHistory.length > 5) {
      const avgLatency = sshHealth.performanceMetrics.averageLatency;
      const maxLatency = sshHealth.performanceMetrics.maxLatency;

      if (avgLatency > this.config.thresholds.sshConnectionLatency) {
        networkQuality *= 0.7;
      }
      if (maxLatency > this.config.thresholds.sshConnectionLatency * 2) {
        networkQuality *= 0.5;
      }

      // Factor in latency variance
      const variance =
        sshHealth.connectionLatencyHistory
          .map((l) => Math.pow(l - avgLatency, 2))
          .reduce((a, b) => a + b, 0) /
        sshHealth.connectionLatencyHistory.length;
      if (variance > avgLatency) {
        networkQuality *= 0.8; // High variance indicates unstable network
      }
    }

    // Calculate overall failure risk
    const weights = {
      connectionStability: 0.3,
      performanceDegradation: 0.25,
      authenticationRisk: 0.2,
      networkQuality: 0.25,
    };

    const overallFailureRisk =
      (1 - connectionStability) * weights.connectionStability +
      performanceDegradation * weights.performanceDegradation +
      authenticationRisk * weights.authenticationRisk +
      (1 -
networkQuality) * weights.networkQuality;

    return {
      connectionStability,
      performanceDegradation,
      authenticationRisk,
      networkQuality,
      overallFailureRisk: Math.min(1.0, overallFailureRisk),
    };
  }

  /**
   * Update SSH health score.
   *
   * Recomputes the session's `healthScore` from scratch (starting at 100) using
   * recent errors, connection stability, average latency and keep-alive status,
   * then clamps it to the 0-100 range. Does nothing for unknown sessions.
   *
   * @param sessionId - Key of the session in `sshSessions`.
   */
  private updateSSHHealthScore(sessionId: string): void {
    const sshHealth = this.sshSessions.get(sessionId);
    if (!sshHealth) {
      return;
    }

    let healthScore = 100;

    // Factor in recent errors
    const recentErrors = sshHealth.errorHistory.filter(
      (e) => Date.now() - e.timestamp.getTime() < 300000 // Last 5 minutes
    );
    healthScore -= recentErrors.length * 5;

    // Factor in connection stability
    healthScore *= sshHealth.connectionStability;

    // Factor in performance
    if (
      sshHealth.performanceMetrics.averageLatency >
      this.config.thresholds.sshConnectionLatency
    ) {
      // Penalty grows with the latency/threshold ratio and is capped at 20.
      const latencyPenalty = Math.min(
        20,
        (sshHealth.performanceMetrics.averageLatency /
          this.config.thresholds.sshConnectionLatency -
          1) *
          20
      );
      healthScore -= latencyPenalty;
    }

    // Factor in keep-alive status
    if (sshHealth.keepAliveStatus === 'failed') {
      healthScore -= 30;
    } else if (sshHealth.keepAliveStatus === 'degraded') {
      healthScore -= 15;
    }

    sshHealth.healthScore = Math.max(0, Math.min(100, healthScore));
  }

  /**
   * Update SSH prediction model.
   *
   * Appends the latest stability, latency, throughput and error-rate samples to
   * the model's trend arrays (each capped at 50 entries), refreshes the risk
   * factors and recomputes the confidence score. Does nothing when either the
   * session or its model is missing.
   *
   * @param sessionId - Key of the session and its prediction model.
   */
  private updateSSHPredictionModel(sessionId: string): void {
    const sshHealth = this.sshSessions.get(sessionId);
    const predictionModel = this.sshPredictionModels.get(sessionId);

    if (!sshHealth || !predictionModel) {
      return;
    }

    // Update trends
    predictionModel.connectionStabilityTrend.push(
      sshHealth.connectionStability
    );
    if (predictionModel.connectionStabilityTrend.length > 50) {
      predictionModel.connectionStabilityTrend.shift();
    }

    if (sshHealth.connectionLatencyHistory.length > 0) {
      const latestLatency =
        sshHealth.connectionLatencyHistory[
          sshHealth.connectionLatencyHistory.length - 1
        ];
      predictionModel.latencyTrend.push(latestLatency);
      if (predictionModel.latencyTrend.length > 50) {
        predictionModel.latencyTrend.shift();
      }
    }

    if (sshHealth.throughputHistory.length > 0) {
      const latestThroughput =
        sshHealth.throughputHistory[sshHealth.throughputHistory.length - 1];
      predictionModel.throughputTrend.push(latestThroughput);
      if (predictionModel.throughputTrend.length > 50) {
        predictionModel.throughputTrend.shift();
      }
    }

    // Update error rate
    const errorRate = this.calculateSSHErrorRate(sshHealth);
    predictionModel.errorRateTrend.push(errorRate);
    if (predictionModel.errorRateTrend.length > 50) {
      predictionModel.errorRateTrend.shift();
    }

    // Update risk factors
    predictionModel.riskFactors.networkInstability =
      this.calculateNetworkInstability(sshHealth);
    predictionModel.riskFactors.performanceDegradation =
      this.calculatePerformanceDegradation(predictionModel);
    predictionModel.riskFactors.authenticationIssues =
      this.calculateAuthenticationRisk(sshHealth);
    predictionModel.riskFactors.connectionAging =
      this.calculateConnectionAging(sshHealth);

    // Update confidence score based on data availability
    const dataPoints =
      predictionModel.connectionStabilityTrend.length +
      predictionModel.latencyTrend.length +
      predictionModel.throughputTrend.length;
    predictionModel.confidenceScore = Math.min(1.0, dataPoints / 100);

    predictionModel.lastUpdated = new Date();
  }

  /**
   * Calculate SSH error rate.
   *
   * @param sshHealth - Health record supplying error history and connect time.
   * @returns Errors recorded in the last hour divided by hours connected, where
   *   hours connected is floored at 1.
   */
  private calculateSSHErrorRate(sshHealth: SSHSessionHealth): number {
    const recentErrors = sshHealth.errorHistory.filter(
      (e) => Date.now() - e.timestamp.getTime() < 3600000 // Last hour
    );

    const connectionAge = Date.now() - sshHealth.connectedAt.getTime();
    const hoursConnected = Math.max(1, connectionAge / 3600000);

    return recentErrors.length / hoursConnected;
  }

  /**
   * Calculate network instability score.
   *
   * Uses the coefficient of variation (standard deviation / mean) of the
   * recorded connection latencies, capped at 1.0.
   *
   * @param sshHealth - Health record supplying `connectionLatencyHistory`.
   * @returns 0 when fewer than 10 samples exist or when the mean latency is not
   *   positive; otherwise a value in the range 0-1.
   */
  private calculateNetworkInstability(sshHealth: SSHSessionHealth): number {
    if (sshHealth.connectionLatencyHistory.length < 10) {
      return 0;
    }

    const latencies = sshHealth.connectionLatencyHistory;
    const avg = latencies.reduce((a, b) => a + b, 0) / latencies.length;

    // A zero mean would make the ratio below 0/0 = NaN, and Math.min(1, NaN)
    // is NaN, which would then be stored as a risk factor. Treat a
    // non-positive (or NaN) mean as "no measurable instability".
    if (!(avg > 0)) {
      return 0;
    }

    const variance =
      latencies.map((l) => Math.pow(l - avg, 2)).reduce((a, b) => a + b, 0) /
      latencies.length;
    const coefficient = Math.sqrt(variance) / avg;

    return Math.min(1.0, coefficient);
  }

  /**
   * Calculate performance degradation.
   *
   * Compares the mean of the latest 10 latency samples with the mean of the 10
   * before them.
   *
   * @param predictionModel - Model supplying `latencyTrend`.
   * @returns 0 with fewer than 20 samples or when latency did not increase;
   *   otherwise the relative increase, capped at 1.0.
   */
  private calculatePerformanceDegradation(
    predictionModel: SSHPredictionModel
  ): number {
    if (predictionModel.latencyTrend.length < 20) {
      return 0;
    }

    const recent = predictionModel.latencyTrend.slice(-10);
    const older = predictionModel.latencyTrend.slice(-20, -10);

    const recentAvg = recent.reduce((a, b) => a + b, 0) / recent.length;
    const olderAvg = older.reduce((a, b) => a + b, 0) / older.length;

    if (recentAvg > olderAvg) {
      return Math.min(1.0, (recentAvg - olderAvg) / olderAvg);
    }

    return 0;
  }

  /**
   * Calculate authentication risk.
   *
   * @param sshHealth - Health record supplying `errorHistory`.
   * @returns The number of errors whose text contains "auth", "permission" or
   *   "denied" (case-insensitive) divided by 5, capped at 1.0.
   */
  private calculateAuthenticationRisk(sshHealth: SSHSessionHealth): number {
    const authErrors = sshHealth.errorHistory.filter(
      (e) =>
        e.error.toLowerCase().includes('auth') ||
        e.error.toLowerCase().includes('permission') ||
        e.error.toLowerCase().includes('denied')
    );

    return Math.min(1.0, authErrors.length / 5);
  }

  /**
   * Calculate connection aging factor.
   *
   * @param sshHealth - Health record supplying `connectedAt`.
   * @returns 0 for connections up to 24 hours old, then a value rising linearly
   *   to 1.0 over the following 48 hours.
   */
  private calculateConnectionAging(sshHealth: SSHSessionHealth): number {
    const connectionAge = Date.now() - sshHealth.connectedAt.getTime();
    const hoursConnected = connectionAge / 3600000;

    // Connections become more fragile after 24 hours
    if (hoursConnected > 24) {
      return Math.min(1.0, (hoursConnected - 24) / 48);
    }

    return 0;
  }

  /**
   * Generate SSH-specific recommendations.
   *
   * @param status - Status computed for the session.
   * @param predictiveScores - Scores object; `overallFailureRisk`,
   *   `networkQuality`, `performanceDegradation` and `authenticationRisk` are read.
   * @param sshHealth - Health record supplying latency metrics and error history.
   * @returns Human-readable recommendations; empty when nothing is flagged.
   */
  private generateSSHRecommendations(
    status: HealthCheckResult['status'],
    predictiveScores: any,
    sshHealth: SSHSessionHealth
  ): string[] {
    const recommendations: string[] = [];

    if (status === 'critical' || predictiveScores.overallFailureRisk > 0.8) {
      recommendations.push('URGENT: Consider immediate SSH session restart');
      recommendations.push(
        'Verify network connectivity and SSH server availability'
      );
    }

    if (predictiveScores.networkQuality < 0.5) {
      recommendations.push(
        'Network quality degraded - check network stability'
      );
      recommendations.push(
        'Consider using compression or connection multiplexing'
      );
    }

    if (predictiveScores.performanceDegradation > 0.3) {
      recommendations.push(
        'Performance degradation detected - investigate server load'
      );
      recommendations.push('Monitor SSH server resource utilization');
    }

    if (predictiveScores.authenticationRisk > 0.2) {
      recommendations.push(
        'Authentication issues detected - verify credentials'
      );
      recommendations.push(
        'Check SSH key validity and server authentication settings'
      );
    }

    if (
      sshHealth.performanceMetrics.averageLatency >
      this.config.thresholds.sshConnectionLatency
    ) {
      recommendations.push(
        `High latency detected (${sshHealth.performanceMetrics.averageLatency.toFixed(0)}ms)`
      );
      recommendations.push(
        'Consider using SSH connection multiplexing or keep-alive'
      );
    }

    if (sshHealth.errorHistory.length > 10) {
      recommendations.push(
        'Frequent errors detected - review connection stability'
      );
      recommendations.push(
        'Enable SSH debug logging for detailed troubleshooting'
      );
    }

    return recommendations;
  }

  /**
   * Handle critical SSH issues.
   *
   * Emits `critical-ssh-issue` and, when recovery is enabled and the result is
   * marked recoverable, schedules an automatic recovery attempt.
   *
   * @param sessionId - Session the issue belongs to.
   * @param healthResult - The health result that triggered handling.
   */
  private async handleCriticalSSHIssue(
    sessionId: string,
    healthResult: SSHHealthResult
  ): Promise<void> {
    this.logger.warn(`Handling critical SSH issue for session ${sessionId}`);

    // Emit critical SSH issue event
    this.emit('critical-ssh-issue', {
      sessionId,
      healthResult,
      timestamp: new Date(),
      autoRecoveryAttempted: false,
    });

    // Attempt automatic recovery if enabled
    if (this.config.recovery.enabled && healthResult.details.recoverable) {
      await this.attemptSSHAutoRecovery(sessionId, healthResult);
    }
  }

  /**
   * Attempt SSH automatic recovery.
   *
   * Increments the attempt counter stored under `ssh-<sessionId>` and schedules
   * the recovery after an exponential backoff delay
   * (`baseDelay * backoffMultiplier^(attempt - 1)`). The returned promise
   * resolves once the attempt is scheduled, not when it finishes.
   *
   * @param sessionId - Session to recover.
   * @param healthResult - Result forwarded in the emitted events.
   */
  private async attemptSSHAutoRecovery(
    sessionId: string,
    healthResult: SSHHealthResult
  ): Promise<void> {
    const recoveryKey = `ssh-${sessionId}`;
    let attempts = this.stats.consecutiveFailures.get(recoveryKey) || 0;

    if (attempts >= this.config.recovery.maxAttempts) {
      this.logger.warn(`Max SSH recovery attempts reached for ${sessionId}`);
      return;
    }

    attempts++;
    this.stats.consecutiveFailures.set(recoveryKey, attempts);

    const delay =
      this.config.recovery.baseDelay *
      Math.pow(this.config.recovery.backoffMultiplier, attempts - 1);

    this.logger.info(
      `Attempting SSH auto-recovery for ${sessionId} (attempt ${attempts}/${this.config.recovery.maxAttempts}) in ${delay}ms`
    );

    setTimeout(async () => {
      try {
        // Emit SSH recovery attempt event
        this.emit('ssh-auto-recovery-attempt', {
          sessionId,
          healthResult,
          attempt: attempts,
          maxAttempts: this.config.recovery.maxAttempts,
          timestamp: new Date(),
        });

        // Request SSH session recovery
        let recoverySuccessful = false;

        // Emit SSH recovery request
        this.emit('ssh-recovery-request', {
          sessionId,
          healthResult,
          strategy: 'proactive-reconnect',
          timestamp: new Date(),
          callback: (success: boolean) => {
            recoverySuccessful = success;
          },
        });

        // Wait a moment for recovery to be attempted; a callback that arrives
        // after this fixed 2 s window is not observed.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        if (recoverySuccessful) {
          this.stats.consecutiveFailures.delete(recoveryKey);
          this.emit('ssh-auto-recovery-success', {
            sessionId,
            attempt: attempts,
            timestamp: new Date(),
          });
          this.logger.info(`SSH auto-recovery successful for ${sessionId}`);
        } else {
          this.emit('ssh-auto-recovery-failed', {
            sessionId,
            attempt: attempts,
            timestamp: new Date(),
          });
          this.logger.warn(
            `SSH auto-recovery failed for ${sessionId} (attempt ${attempts})`
          );
        }
      } catch (error) {
        this.logger.error(`SSH auto-recovery error for ${sessionId}:`, error);
        this.emit('ssh-auto-recovery-error', {
          sessionId,
          attempt: attempts,
          error: error instanceof Error ? error.message : String(error),
          timestamp: new Date(),
        });
      }
    }, delay);
  }

  /**
   * Generate comprehensive health report
   */
  private async generateHealthReport(
    checks: HealthCheckResult[]
  ): Promise<HealthReport> {
    const healthyCount = checks.filter((c) => c.status === 'healthy').length;
    const warningCount = checks.filter((c) => c.status === 'warning').length;
    const unhealthyCount = checks.filter(
      (c) => c.status === 'unhealthy'
    ).length;
    const criticalCount = checks.filter((c) => c.status === 'critical').length;

    // Calculate overall health status
    const overall: HealthReport['overall'] =
      criticalCount > 0
        ? 'critical'
        : unhealthyCount > 0
          ? 'unhealthy'
          : warningCount > 0
            ? 'warning'
            : 'healthy';

    // Calculate health score (0-100)
    const healthScore = Math.max(
      0,
      100 - warningCount * 10 - unhealthyCount * 25 - criticalCount * 50
    );
    this.healthScoreHistory.push(healthScore);

    // Keep only last 100 scores for trend analysis
    if (this.healthScoreHistory.length > 100) {
      this.healthScoreHistory.shift();
    }

    // Calculate trend
    const trend = this.calculateHealthTrend();

    // Calculate average response time
    const avgResponseTime =
      checks.length > 0
        ? checks.reduce((sum, check) => sum + check.duration, 0) / checks.length
        : 0;

    // Generate action items
    const actionItems = this.generateActionItems(checks);

    return {
      overall,
      timestamp: new Date(),
      checks,
      metrics: {
        totalChecks: checks.length,
        healthyChecks: healthyCount,
        warningChecks: warningCount,
        unhealthyChecks: unhealthyCount,
        criticalChecks: criticalCount,
        averageResponseTime: avgResponseTime,
      },
      trends: {
        healthScore,
        healthScoreTrend: trend,
        predictionNextHour: this.config.monitoring.enablePredictiveAnalysis
          ?
this.predictHealthStatus()
          : undefined,
      },
      actionItems,
      sshHealthSummary: this.generateSSHHealthSummary(),
    };
  }

  /**
   * Summarise the state of every tracked SSH session.
   *
   * A session counts as healthy at a score of 80 or more, unhealthy below 50,
   * and as a predicted failure when its predictive failure score exceeds 0.7.
   *
   * @returns Session counts, the mean of the sessions' average latencies (0
   *   when there are no sessions), and a reconnection count that is always 0.
   */
  private generateSSHHealthSummary(): {
    totalSSHSessions: number;
    healthySSHSessions: number;
    unhealthySSHSessions: number;
    averageSSHLatency: number;
    predictedFailures: number;
    proactiveReconnections: number;
  } {
    const sessions = Array.from(this.sshSessions.values());

    let healthyTotal = 0;
    let unhealthyTotal = 0;
    let atRiskTotal = 0;
    let latencySum = 0;

    // One pass instead of a separate filter/reduce per figure.
    for (const session of sessions) {
      if (session.healthScore >= 80) {
        healthyTotal += 1;
      }
      if (session.healthScore < 50) {
        unhealthyTotal += 1;
      }
      if (session.predictiveFailureScore > 0.7) {
        atRiskTotal += 1;
      }
      latencySum += session.performanceMetrics.averageLatency;
    }

    return {
      totalSSHSessions: sessions.length,
      healthySSHSessions: healthyTotal,
      unhealthySSHSessions: unhealthyTotal,
      averageSSHLatency: sessions.length > 0 ? latencySum / sessions.length : 0,
      predictedFailures: atRiskTotal,
      // Not tracked here; a separate counter would be needed to report it.
      proactiveReconnections: 0,
    };
  }

  /**
   * Classify the direction of the recorded health scores.
   *
   * Compares the mean of the latest five scores with the mean of the (up to)
   * five scores before them.
   *
   * @returns `'stable'` when fewer than five scores exist, when there is no
   *   earlier window, or when the means differ by 5 or less; otherwise
   *   `'improving'` or `'degrading'`.
   */
  private calculateHealthTrend(): 'improving' | 'stable' | 'degrading' {
    const scores = this.healthScoreHistory;
    if (scores.length < 5) {
      return 'stable';
    }

    const earlierWindow = scores.slice(-10, -5);
    if (earlierWindow.length === 0) {
      return 'stable';
    }
    const latestWindow = scores.slice(-5);

    const mean = (values: number[]): number => {
      let total = 0;
      for (const value of values) {
        total += value;
      }
      return total / values.length;
    };

    const delta = mean(latestWindow) - mean(earlierWindow);
    if (delta > 5) {
      return 'improving';
    }
    return delta < -5 ? 'degrading' : 'stable';
  }

  /**
   * Predict the health status by extrapolating the recent score trend.
   *
   * Fits a least-squares line through the last ten scores and projects the
   * latest score 12 steps ahead along its slope.
   *
   * @returns `'healthy'` when fewer than ten scores exist; otherwise the band
   *   of the projected score (<30 critical, <60 unhealthy, <80 warning).
   */
  private predictHealthStatus():
    | 'healthy'
    | 'warning'
    | 'unhealthy'
    | 'critical' {
    if (this.healthScoreHistory.length < 10) {
      return 'healthy';
    }

    const samples = this.healthScoreHistory.slice(-10);
    const n = samples.length;

    // Accumulate the regression sums with x = sample index, y = score.
    let sumX = 0;
    let sumY = 0;
    let sumXY = 0;
    let sumXX = 0;
    samples.forEach((score, index) => {
      sumX += index;
      sumY += score;
      sumXY += index * score;
      sumXX += index * index;
    });

    const slope = (n * sumXY - sumX * sumY) / (n * sumXX - sumX * sumX);
    // 12 intervals ahead (assuming 5min intervals)
    const projectedScore = samples[n - 1] + slope * 12;

    if (projectedScore < 30) {
      return 'critical';
    }
    if (projectedScore < 60) {
      return 'unhealthy';
    }
    return projectedScore < 80 ? 'warning' : 'healthy';
  }

  /**
   * Build the list of action items for a set of health checks.
   *
   * @param checks - Results of the current round of checks.
   * @returns A headline plus the recommendations of every critical check, a
   *   headline for unhealthy checks, and a maintenance hint when the mean of
   *   the recorded health scores is below 80.
   */
  private generateActionItems(checks: HealthCheckResult[]): string[] {
    const items: string[] = [];

    const critical = checks.filter((c) => c.status === 'critical');
    const unhealthyTotal = checks.filter(
      (c) => c.status === 'unhealthy'
    ).length;

    if (critical.length > 0) {
      items.push(
        `CRITICAL: ${critical.length} critical issues require immediate attention`
      );
      for (const check of critical) {
        const recommendations = check.details.recommendations;
        if (recommendations) {
          for (const recommendation of recommendations) {
            items.push(`- ${recommendation}`);
          }
        }
      }
    }

    if (unhealthyTotal > 0) {
      items.push(
        `WARNING: ${unhealthyTotal} unhealthy components need attention`
      );
    }

    // Add general maintenance recommendations
    const scores = this.healthScoreHistory;
    if (scores.length > 0) {
      let total = 0;
      for (const score of scores) {
        total += score;
      }
      if (total / scores.length < 80) {
        items.push('Consider system optimization and maintenance');
      }
    }

    return items;
  }

  /**
   * Handle critical health issues that require immediate action
   */
  private async handleCriticalIssues(report: HealthReport): Promise<void> {
    const criticalChecks =
report.checks.filter((c) => c.status === 'critical');

    if (criticalChecks.length === 0) return;

    this.logger.warn(
      `Handling ${criticalChecks.length} critical health issues`
    );

    for (const check of criticalChecks) {
      // Emit critical health event
      this.emit('critical-health-issue', {
        check,
        timestamp: new Date(),
        autoRecoveryAttempted: false,
      });

      // Attempt automatic recovery if enabled and issue is recoverable
      if (this.config.recovery.enabled && check.details.recoverable) {
        await this.attemptAutoRecovery(check);
      }
    }

    // If overall health is critical, emit system-wide alert
    if (report.overall === 'critical') {
      this.emit('system-critical', {
        report,
        timestamp: new Date(),
        actionRequired: true,
      });
    }
  }

  /**
   * Schedule an automatic recovery attempt for a failed check.
   *
   * The attempt counter lives in `stats.consecutiveFailures` under
   * `<checkType>-<sessionId|'system'>`. Once it reaches `maxAttempts` nothing
   * further is scheduled. Otherwise the recovery runs after an exponential
   * backoff delay and reports its outcome through `auto-recovery-*` events.
   * The returned promise resolves when the attempt is scheduled, not when it
   * completes.
   *
   * @param check - The check result to recover from.
   */
  private async attemptAutoRecovery(check: HealthCheckResult): Promise<void> {
    const recoveryKey = `${check.checkType}-${check.sessionId || 'system'}`;
    const priorAttempts = this.stats.consecutiveFailures.get(recoveryKey) || 0;

    if (priorAttempts >= this.config.recovery.maxAttempts) {
      this.logger.warn(`Max recovery attempts reached for ${recoveryKey}`);
      return;
    }

    const attempts = priorAttempts + 1;
    this.stats.consecutiveFailures.set(recoveryKey, attempts);

    const { baseDelay, backoffMultiplier } = this.config.recovery;
    const delay = baseDelay * Math.pow(backoffMultiplier, attempts - 1);

    this.logger.info(
      `Attempting auto-recovery for ${recoveryKey} (attempt ${attempts}/${this.config.recovery.maxAttempts}) in ${delay}ms`
    );

    const runRecovery = async (): Promise<void> => {
      try {
        // Emit recovery attempt event
        this.emit('auto-recovery-attempt', {
          check,
          attempt: attempts,
          maxAttempts: this.config.recovery.maxAttempts,
          timestamp: new Date(),
        });

        // Recovery logic based on check type
        let recovered = false;
        if (check.checkType === 'system') {
          recovered = await this.recoverSystemIssue(check);
        } else if (check.checkType === 'network') {
          recovered = await this.recoverNetworkIssue(check);
        } else if (check.checkType === 'session') {
          recovered = await this.recoverSessionIssue(check);
        } else {
          this.logger.warn(
            `No recovery strategy for check type: ${check.checkType}`
          );
        }

        if (!recovered) {
          this.emit('auto-recovery-failed', {
            check,
            attempt: attempts,
            timestamp: new Date(),
          });
          this.logger.warn(
            `Auto-recovery failed for ${recoveryKey} (attempt ${attempts})`
          );
          return;
        }

        // Success resets the attempt counter for this key.
        this.stats.consecutiveFailures.delete(recoveryKey);
        this.emit('auto-recovery-success', {
          check,
          attempt: attempts,
          timestamp: new Date(),
        });
        this.logger.info(`Auto-recovery successful for ${recoveryKey}`);
      } catch (error) {
        this.logger.error(`Auto-recovery error for ${recoveryKey}:`, error);
        this.emit('auto-recovery-error', {
          check,
          attempt: attempts,
          error: error instanceof Error ? error.message : String(error),
          timestamp: new Date(),
        });
      }
    };

    setTimeout(runRecovery, delay);
  }

  /**
   * Try to recover from a system-level issue.
   *
   * Triggers a garbage collection when `global.gc` is exposed; otherwise only
   * logs that recovery was attempted (no further action is implemented).
   *
   * @param check - The failing check; not inspected by this method.
   * @returns `true` unless an error was thrown, in which case `false`.
   */
  private async recoverSystemIssue(check: HealthCheckResult): Promise<boolean> {
    try {
      if (!global.gc) {
        // Clear caches, clean up temporary data, etc.
        // This is a placeholder for system recovery actions
        this.logger.info('Attempting system recovery actions');
        return true;
      }

      // Force garbage collection if available
      global.gc();
      this.logger.info('Forced garbage collection for memory recovery');
      return true;
    } catch (error) {
      this.logger.error('System recovery failed:', error);
      return false;
    }
  }

  /**
   * Attempt to recover from network issues
   */
  private async recoverNetworkIssue(
    check: HealthCheckResult
  ): Promise<boolean> {
    try {
      // Flush DNS cache, reset network connections, etc.
// This is a placeholder for network recovery actions
      this.logger.info('Attempting network recovery actions');
      return true;
    } catch (error) {
      this.logger.error('Network recovery failed:', error);
      return false;
    }
  }

  /**
   * Attempt to recover from session issues.
   *
   * Recovery itself is delegated: this only emits `session-recovery-request`
   * for listeners to act on.
   *
   * @param check - The failing check; must carry a `sessionId`.
   * @returns `false` when the check has no session id or emitting threw;
   *   `true` once the request has been emitted.
   */
  private async recoverSessionIssue(
    check: HealthCheckResult
  ): Promise<boolean> {
    try {
      if (!check.sessionId) return false;

      // Emit session recovery request
      this.emit('session-recovery-request', {
        sessionId: check.sessionId,
        check,
        timestamp: new Date(),
      });

      this.logger.info(`Requested session recovery for ${check.sessionId}`);
      return true;
    } catch (error) {
      this.logger.error('Session recovery failed:', error);
      return false;
    }
  }

  /**
   * Update health statistics.
   *
   * Folds a batch of check results into the running totals, the running
   * average response time, the per-key consecutive failure counters and the
   * last-check timestamps. `healthy` and `warning` both count as success.
   *
   * @param checks - Results to record; an empty batch leaves all stats untouched.
   */
  private updateHealthStatistics(checks: HealthCheckResult[]): void {
    // With no checks there is nothing to record. Returning early also avoids
    // 0 / 0 = NaN for the average when no checks have ever been recorded; that
    // NaN would otherwise poison every later average.
    if (checks.length === 0) {
      return;
    }

    this.stats.totalChecks += checks.length;

    const successfulChecks = checks.filter(
      (c) => c.status === 'healthy' || c.status === 'warning'
    ).length;
    this.stats.successfulChecks += successfulChecks;
    this.stats.failedChecks += checks.length - successfulChecks;

    // Update average response time (weighted by the number of earlier checks)
    const totalResponseTime = checks.reduce(
      (sum, check) => sum + check.duration,
      0
    );
    this.stats.averageResponseTime =
      (this.stats.averageResponseTime *
        (this.stats.totalChecks - checks.length) +
        totalResponseTime) /
      this.stats.totalChecks;

    // Update consecutive failure counts
    // NOTE(review): attemptAutoRecovery builds keys of the same form for its
    // attempt counter in this map — confirm the shared counter is intended.
    for (const check of checks) {
      const key = `${check.checkType}-${check.sessionId || 'system'}`;

      if (check.status === 'healthy' || check.status === 'warning') {
        this.stats.consecutiveFailures.delete(key);
      } else {
        const current = this.stats.consecutiveFailures.get(key) || 0;
        this.stats.consecutiveFailures.set(key, current + 1);
      }

      this.stats.lastCheckTimes.set(key, new Date());
    }
  }

  /**
   * Update health trends.
   *
   * Stores a condensed copy of the report under a per-day key
   * (`health-YYYY-MM-DD`, taken from the report's ISO timestamp) and drops
   * history entries whose day is more than 30 days old.
   *
   * @param report - The report to record.
   */
  private updateHealthTrends(report: HealthReport): void {
    // Store health reports for trend analysis
    const key = `health-${report.timestamp.toISOString().split('T')[0]}`; // Daily key

    if (!this.healthHistory.has(key)) {
      this.healthHistory.set(key, []);
    }

    // Convert report to a check result for storage
    const overallCheck: HealthCheckResult = {
      checkId: `overall-${Date.now()}`,
      checkType: 'system',
      timestamp: report.timestamp,
      status: report.overall,
      metrics: {
        healthScore: report.trends.healthScore,
        totalChecks: report.metrics.totalChecks,
        healthyChecks: report.metrics.healthyChecks,
        averageResponseTime: report.metrics.averageResponseTime,
      },
      details: {
        message: `Overall health: ${report.overall}`,
        recoverable: true,
      },
      duration: 0,
      checks: {
        overall_health: {
          checkStatus:
            report.overall === 'healthy'
              ? 'pass'
              : report.overall === 'warning'
                ? 'warn'
                : 'fail',
          value: report.trends.healthScore,
          message: `Overall health: ${report.overall}`,
          duration: 0,
        },
      },
      overallScore:
        report.overall === 'healthy'
          ? 100
          : report.overall === 'warning'
            ? 75
            : report.overall === 'unhealthy'
              ? 50
              : 0,
    };

    this.healthHistory.get(key)!.push(overallCheck);

    // Clean up old history (keep last 30 days)
    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - 30);

    this.healthHistory.forEach((history, historyKey) => {
      if (new Date(historyKey.replace('health-', '')) < cutoffDate) {
        this.healthHistory.delete(historyKey);
      }
    });
  }

  /**
   * Get current health statistics.
   *
   * @returns A snapshot of `stats` with the maps converted to plain objects
   *   (last-check times as ISO strings), plus the running flag and a copy of
   *   the health score history.
   */
  getHealthStatistics() {
    return {
      ...this.stats,
      isRunning: this.isRunning,
      healthScoreHistory: [...this.healthScoreHistory],
      consecutiveFailures: Object.fromEntries(this.stats.consecutiveFailures),
      lastCheckTimes: Object.fromEntries(
        Array.from(this.stats.lastCheckTimes.entries()).map(([k, v]) => [
          k,
          v.toISOString(),
        ])
      ),
      errorRates: Object.fromEntries(this.stats.errorRates),
    };
  }

  /**
   * Get historical health data
   */
  getHealthHistory(days = 7): Record<string, HealthCheckResult[]> {
    const result: Record<string, HealthCheckResult[]> = {};
    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - days);

    this.healthHistory.forEach((checks, key) => {
      const date = new Date(key.replace('health-', ''));
      if (date >=
cutoffDate) {
        result[key] = checks;
      }
    });

    return result;
  }

  // Utility methods for system metrics

  /**
   * Measure this process's CPU usage over a short sampling window.
   *
   * Samples `process.cpuUsage()` about 100 ms apart and divides the consumed
   * CPU time by the wall-clock time that actually elapsed, normalised by the
   * number of logical CPUs. The elapsed time is measured rather than assumed,
   * because timers can fire late.
   *
   * @returns CPU usage as a percentage, capped at 100.
   */
  private async getCPUUsage(): Promise<number> {
    return new Promise((resolve) => {
      const startUsage = process.cpuUsage();
      const startTime = process.hrtime();

      setTimeout(() => {
        const endUsage = process.cpuUsage(startUsage);
        const [elapsedSeconds, elapsedNanos] = process.hrtime(startTime);

        // cpuUsage() reports microseconds, so express the window in the same unit.
        const elapsedMicros = elapsedSeconds * 1e6 + elapsedNanos / 1e3;
        const cpuMicros = endUsage.user + endUsage.system;
        // os.cpus() can be empty on some platforms; avoid dividing by zero.
        const coreCount = Math.max(1, os.cpus().length);

        const percentage =
          elapsedMicros > 0
            ? (cpuMicros / elapsedMicros / coreCount) * 100
            : 0;
        resolve(Math.min(100, percentage));
      }, 100);
    });
  }

  /**
   * Read system memory figures from the `os` module.
   *
   * @returns Total, free and used (total minus free) memory in bytes.
   */
  private async getMemoryInfo(): Promise<{
    total: number;
    free: number;
    used: number;
  }> {
    const totalMem = os.totalmem();
    const freeMem = os.freemem();

    return {
      total: totalMem,
      free: freeMem,
      used: totalMem - freeMem,
    };
  }

  /**
   * Read disk usage by shelling out to a platform tool.
   *
   * On Windows this queries drive `C:` through `wmic`; elsewhere it parses the
   * last line of `df -k .` for the current directory.
   *
   * @returns Total, used and free space in bytes. All zeros when the command
   *   fails; fields that cannot be parsed are reported as 0 rather than NaN.
   */
  private async getDiskInfo(): Promise<{
    total: number;
    used: number;
    free: number;
  }> {
    // Parse a base-10 integer, mapping missing or unparsable input to 0 so
    // callers never receive NaN.
    const toNumber = (raw: string | undefined): number => {
      const parsed = parseInt(raw || '0', 10);
      return Number.isFinite(parsed) ? parsed : 0;
    };

    try {
      // For cross-platform compatibility, we'll use a simple approach
      if (process.platform === 'win32') {
        const { stdout } = await execAsync(
          'wmic logicaldisk where "Caption=\'C:\'" get Size,FreeSpace /value'
        );
        const lines = stdout.split('\n').filter((line) => line.includes('='));
        const freeSpace = toNumber(
          lines.find((line) => line.startsWith('FreeSpace='))?.split('=')[1]
        );
        const totalSpace = toNumber(
          lines.find((line) => line.startsWith('Size='))?.split('=')[1]
        );

        return {
          total: totalSpace,
          free: freeSpace,
          used: totalSpace - freeSpace,
        };
      } else {
        const { stdout } = await execAsync('df -k . | tail -1');
        const fields = stdout.trim().split(/\s+/);
        // df -k reports 1 KiB blocks; convert to bytes.
        const total = toNumber(fields[1]) * 1024;
        const used = toNumber(fields[2]) * 1024;
        const free = toNumber(fields[3]) * 1024;

        return { total, used, free };
      }
    } catch (error) {
      this.logger.error('Error getting disk info:', error);
      return { total: 0, used: 0, free: 0 };
    }
  }

  /**
   * Ping a host once and time how long the `ping` command takes.
   *
   * The host is passed to `ping` as a separate argument without a shell, so
   * shell metacharacters in `host` cannot inject commands. Hosts that are
   * empty, contain whitespace, or start with `-` (which `ping` would read as
   * an option) are rejected up front.
   *
   * @param host - Hostname or IP address to ping.
   * @returns Wall-clock duration of the `ping` process in milliseconds.
   * @throws Rejects when the host is invalid, `ping` cannot be started, or it
   *   exits with a non-zero status.
   */
  private async pingHost(host: string): Promise<number> {
    return new Promise((resolve, reject) => {
      if (!host || /\s/.test(host) || host.startsWith('-')) {
        reject(new Error(`Invalid ping host: ${JSON.stringify(host)}`));
        return;
      }

      const startTime = Date.now();
      const countFlag = process.platform === 'win32' ? '-n' : '-c';

      const child = spawn('ping', [countFlag, '1', host], {
        stdio: 'ignore',
        windowsHide: true,
      });

      // 'error' fires when the process cannot be spawned at all.
      child.once('error', reject);
      child.once('close', (code) => {
        if (code === 0) {
          resolve(Date.now() - startTime);
        } else {
          reject(new Error(`ping ${host} failed with exit code ${code}`));
        }
      });
    });
  }

  /**
   * Clean up resources.
   *
   * Stops monitoring, clears all recorded history and SSH tracking state, and
   * removes every event listener registered on this instance.
   */
  async destroy(): Promise<void> {
    await this.stop();
    this.healthHistory.clear();
    this.healthScoreHistory.length = 0;

    // Clean up SSH-specific resources
    this.sshSessions.clear();
    this.sshHealthHistory.clear();
    this.sshPredictionModels.clear();

    this.removeAllListeners();
    this.logger.info('HealthMonitor destroyed');
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ooples/mcp-console-automation'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.