auto-recovery.ts (14.8 kB)
import { EventEmitter } from 'events';
import { logger } from '../utils/logger.js';
import { metrics } from './metrics.js';
import { resourceManager } from '../utils/resource-manager.js';
import { config } from '../config/index.js';

/**
 * Auto-recovery system for handling failures and resource issues
 * Following WCGW: Assume everything will fail, build recovery for it
 */
export class AutoRecoverySystem extends EventEmitter {
  private static instance: AutoRecoverySystem;

  // State tracking
  private isRecovering = false;
  private lastRecoveryTime = 0;
  private recoveryAttempts = 0;
  private memorySnapshots: Array<{ time: number; usage: number }> = [];

  // Configuration
  private readonly config = {
    memoryLeakThreshold: 50 * 1024 * 1024, // 50MB increase
    memoryLeakWindow: 60000,               // 1 minute
    maxRecoveryAttempts: 3,
    recoveryBackoff: 30000,                // 30 seconds
    gcThreshold: 0.7,                      // 70% memory usage triggers GC
    restartThreshold: 0.95                 // 95% memory usage triggers restart
  };

  private constructor() {
    super();
    this.startMonitoring();
    this.setupEventHandlers();
  }

  static getInstance(): AutoRecoverySystem {
    if (!AutoRecoverySystem.instance) {
      AutoRecoverySystem.instance = new AutoRecoverySystem();
    }
    return AutoRecoverySystem.instance;
  }

  /**
   * Start monitoring for issues
   */
  private startMonitoring(): void {
    // Monitor memory every 5 seconds
    setInterval(() => {
      this.checkMemoryLeak();
      this.checkMemoryPressure();
    }, 5000);

    // Monitor system health every minute
    setInterval(() => {
      this.performHealthCheck();
    }, 60000);

    // Listen for metric alerts
    metrics.on('alert', (alert) => {
      this.handleMetricAlert(alert);
    });

    // Listen for resource manager events
    resourceManager.on('memory-critical', (usage) => {
      this.handleMemoryCritical(usage);
    });

    resourceManager.on('memory-warning', (usage) => {
      this.handleMemoryWarning(usage);
    });
  }

  /**
   * Setup process event handlers
   */
  private setupEventHandlers(): void {
    // Catch unhandled errors
    process.on('uncaughtException', (error) => {
      logger.error('Uncaught exception in auto-recovery', error);
      this.attemptRecovery('uncaught_exception', error);
    });

    process.on('unhandledRejection', (reason, promise) => {
      logger.error('Unhandled rejection in auto-recovery', { reason, promise });
      this.attemptRecovery('unhandled_rejection', reason);
    });

    // Monitor event loop
    let lastCheck = Date.now();
    setInterval(() => {
      const now = Date.now();
      const delay = now - lastCheck - 1000;

      if (delay > 100) {
        logger.warn('Event loop blocked', { delay });
        metrics.histogram('system.eventloop.delay', delay);

        if (delay > 5000) {
          this.handleEventLoopBlock(delay);
        }
      }

      lastCheck = now;
    }, 1000);
  }

  /**
   * Check for memory leaks
   */
  private checkMemoryLeak(): void {
    const currentUsage = process.memoryUsage().heapUsed;
    const currentTime = Date.now();

    // Add to snapshots
    this.memorySnapshots.push({ time: currentTime, usage: currentUsage });

    // Keep only recent snapshots
    this.memorySnapshots = this.memorySnapshots.filter(
      s => currentTime - s.time < this.config.memoryLeakWindow
    );

    // Need at least 2 snapshots
    if (this.memorySnapshots.length < 2) return;

    // Calculate memory growth
    const oldestSnapshot = this.memorySnapshots[0];
    const memoryGrowth = currentUsage - oldestSnapshot.usage;
    const timeElapsed = currentTime - oldestSnapshot.time;

    // Check if growth exceeds threshold
    if (memoryGrowth > this.config.memoryLeakThreshold) {
      const growthRate = memoryGrowth / timeElapsed * 1000 * 60; // bytes per minute

      logger.warn('Potential memory leak detected', {
        growth: `${(memoryGrowth / 1024 / 1024).toFixed(2)}MB`,
        rate: `${(growthRate / 1024 / 1024).toFixed(2)}MB/min`,
        current: `${(currentUsage / 1024 / 1024).toFixed(2)}MB`
      });

      this.emit('memory-leak', {
        growth: memoryGrowth,
        rate: growthRate,
        current: currentUsage
      });

      // Take action
      this.handleMemoryLeak();
    }
  }
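
  // A worked example of the rate math above (values hypothetical, thresholds
  // from this.config): a heap that grows by 60 MB (62,914,560 bytes) over the
  // full 60,000 ms window gives growthRate = 62914560 / 60000 * 1000 * 60
  // = 62,914,560 bytes/min, i.e. ~60 MB/min; the 60 MB growth itself already
  // exceeds the 50 MB memoryLeakThreshold, so the leak handler fires.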

  /**
   * Check memory pressure
   */
  private checkMemoryPressure(): void {
    const memUsage = process.memoryUsage();
    const maxMemory = config.getNumber('MAX_MEMORY_MB') * 1024 * 1024;
    const usageRatio = memUsage.heapUsed / maxMemory;

    metrics.gauge('system.memory.usage_ratio', usageRatio);

    if (usageRatio > this.config.restartThreshold) {
      logger.error('Critical memory usage, restart required', {
        usage: `${(memUsage.heapUsed / 1024 / 1024).toFixed(2)}MB`,
        max: `${(maxMemory / 1024 / 1024).toFixed(2)}MB`,
        ratio: `${(usageRatio * 100).toFixed(1)}%`
      });
      this.initiateGracefulRestart();
    } else if (usageRatio > this.config.gcThreshold) {
      this.triggerGarbageCollection();
    }
  }

  /**
   * Perform health check
   */
  private async performHealthCheck(): Promise<boolean> {
    const health = {
      memory: this.checkMemoryHealth(),
      resources: this.checkResourceHealth(),
      operations: this.checkOperationHealth()
    };

    const isHealthy = Object.values(health).every(h => h.status === 'healthy');

    if (!isHealthy) {
      logger.warn('Health check failed', health);
      this.emit('unhealthy', health);

      // Attempt recovery if critical
      const criticalIssues = Object.values(health).filter(h => h.status === 'critical');
      if (criticalIssues.length > 0) {
        this.attemptRecovery('health_check', health);
      }
    }

    return isHealthy;
  }

  /**
   * Check memory health
   */
  private checkMemoryHealth(): any {
    const memUsage = process.memoryUsage();
    const maxMemory = config.getNumber('MAX_MEMORY_MB') * 1024 * 1024;
    const usageRatio = memUsage.heapUsed / maxMemory;

    if (usageRatio > 0.9) {
      return { status: 'critical', message: 'Memory usage above 90%', usage: usageRatio };
    } else if (usageRatio > 0.7) {
      return { status: 'warning', message: 'Memory usage above 70%', usage: usageRatio };
    }

    return { status: 'healthy', usage: usageRatio };
  }

  /**
   * Check resource health
   */
  private checkResourceHealth(): any {
    const usage = resourceManager.getUsage();
    const issues: string[] = [];

    if (usage.processes > 8) issues.push('Too many processes');
    if (usage.connections > 80) issues.push('Too many connections');
    if (usage.fileHandles > 40) issues.push('Too many file handles');

    if (issues.length > 2) {
      return { status: 'critical', message: issues.join(', '), usage };
    } else if (issues.length > 0) {
      return { status: 'warning', message: issues.join(', '), usage };
    }

    return { status: 'healthy', usage };
  }

  /**
   * Check operation health
   */
  private checkOperationHealth(): any {
    const allMetrics = metrics.getAllMetrics();
    const issues: string[] = [];

    // Check error rates
    for (const [name, stats] of Object.entries(allMetrics.operations)) {
      const errorRate = (stats as any).errorRate;
      if (errorRate > 0.1) {
        issues.push(`High error rate for ${name}: ${(errorRate * 100).toFixed(1)}%`);
      }
    }

    if (issues.length > 3) {
      return { status: 'critical', message: issues.join(', ') };
    } else if (issues.length > 0) {
      return { status: 'warning', message: issues.join(', ') };
    }

    return { status: 'healthy' };
  }

  /**
   * Handle memory leak
   */
  private handleMemoryLeak(): void {
    logger.info('Handling memory leak');

    // First try garbage collection
    this.triggerGarbageCollection();

    // Clear caches and unused resources
    setTimeout(() => {
      resourceManager.cleanup().then(() => {
        logger.info('Resource cleanup completed');

        // Check if memory improved
        setTimeout(() => {
          const currentUsage = process.memoryUsage().heapUsed;
          const improved =
            this.memorySnapshots[this.memorySnapshots.length - 1].usage - currentUsage;

          if (improved > 10 * 1024 * 1024) {
            logger.info(`Memory recovered: ${(improved / 1024 / 1024).toFixed(2)}MB`);
          } else {
            logger.warn('Memory leak persists after cleanup');
            this.emit('persistent-memory-leak');
          }
        }, 5000);
      });
    }, 1000);
  }

  /**
   * Handle memory critical
   */
  private handleMemoryCritical(usage: number): void {
    logger.error('Memory critical handler triggered', { usage });

    // Immediate actions
    this.triggerGarbageCollection();

    // Stop accepting new requests
    this.emit('pause-operations');

    // Aggressive cleanup
    setTimeout(() => {
      this.performAggressiveCleanup();
    }, 1000);
  }

  /**
   * Handle memory warning
   */
  private handleMemoryWarning(usage: number): void {
    logger.warn('Memory warning handler triggered', { usage });
    this.triggerGarbageCollection();
  }

  /**
   * Handle metric alert
   */
  private handleMetricAlert(alert: any): void {
    logger.info('Handling metric alert', alert);

    switch (alert.type) {
      case 'memory':
        this.handleMemoryWarning(alert.value);
        break;
      case 'error_rate':
        // Log and monitor
        metrics.increment('recovery.error_rate_alerts');
        break;
      case 'response_time':
        // Check for system overload
        this.checkSystemOverload();
        break;
    }
  }

  /**
   * Handle event loop block
   */
  private handleEventLoopBlock(delay: number): void {
    logger.error('Event loop blocked', { delay });
    metrics.increment('recovery.eventloop_blocks');

    // If severe, trigger recovery
    if (delay > 10000) {
      this.attemptRecovery('eventloop_block', { delay });
    }
  }

  /**
   * Check system overload
   */
  private checkSystemOverload(): void {
    const usage = resourceManager.getUsage();
    // Use a distinct name so the imported `metrics` module is not shadowed
    const allMetrics = this.getAllMetrics();

    // Simple overload detection
    const overloaded =
      usage.processes > 8 ||
      usage.connections > 80 ||
      Object.values(allMetrics.operations).some((op: any) => op.avgTime > 5000);

    if (overloaded) {
      logger.warn('System overload detected');
      this.emit('system-overload');

      // Implement backpressure
      this.implementBackpressure();
    }
  }

  /**
   * Trigger garbage collection
   */
  private triggerGarbageCollection(): void {
    if (global.gc) {
      const before = process.memoryUsage().heapUsed;
      global.gc();
      const after = process.memoryUsage().heapUsed;
      const freed = before - after;

      logger.info('Garbage collection completed', {
        freed: `${(freed / 1024 / 1024).toFixed(2)}MB`,
        before: `${(before / 1024 / 1024).toFixed(2)}MB`,
        after: `${(after / 1024 / 1024).toFixed(2)}MB`
      });

      metrics.histogram('recovery.gc_freed', freed);
    } else {
      logger.warn('Garbage collection not available (run with --expose-gc)');
    }
  }

  /**
   * Perform aggressive cleanup
   */
  private async performAggressiveCleanup(): Promise<void> {
    logger.info('Performing aggressive cleanup');

    // Clear all caches
    this.emit('clear-caches');

    // Force close idle connections
    await resourceManager.cleanup();

    // Clear metrics history
    metrics.reset();

    // Force GC
    this.triggerGarbageCollection();

    logger.info('Aggressive cleanup completed');
  }

  /**
   * Implement backpressure
   */
  private implementBackpressure(): void {
    logger.info('Implementing backpressure');

    // Emit event for request limiting
    this.emit('enable-backpressure');

    // Schedule removal after 30 seconds
    setTimeout(() => {
      logger.info('Removing backpressure');
      this.emit('disable-backpressure');
    }, 30000);
  }
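
  // Rough backoff timeline implied by attemptRecovery below (times are
  // illustrative, thresholds come from this.config): a failure at t=0s runs
  // attempt 1 and sets lastRecoveryTime; a failure at t=10s is skipped because
  // it falls inside the 30s recoveryBackoff window; failures at t=40s and
  // t=80s run attempts 2 and 3; if attempt 3 also fails, maxRecoveryAttempts
  // (3) is reached and initiateGracefulRestart() is triggered.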

  /**
   * Attempt recovery
   */
  private async attemptRecovery(reason: string, details: any): Promise<void> {
    if (this.isRecovering) {
      logger.warn('Recovery already in progress');
      return;
    }

    // Check recovery attempts
    const now = Date.now();
    if (now - this.lastRecoveryTime < this.config.recoveryBackoff) {
      logger.warn('Recovery attempted too soon');
      return;
    }

    this.isRecovering = true;
    this.lastRecoveryTime = now;
    this.recoveryAttempts++;

    logger.info('Attempting recovery', {
      reason,
      attempt: this.recoveryAttempts,
      details
    });

    try {
      // Emit recovery start
      this.emit('recovery-start', { reason, attempt: this.recoveryAttempts });

      // Recovery steps
      await this.performAggressiveCleanup();

      // Wait for stabilization
      await new Promise(resolve => setTimeout(resolve, 5000));

      // Check if recovery successful
      const health = await this.performHealthCheck();

      if (health) {
        logger.info('Recovery successful');
        this.emit('recovery-success');
        this.recoveryAttempts = 0;
      } else {
        throw new Error('Recovery failed, system still unhealthy');
      }
    } catch (error) {
      logger.error('Recovery failed', error);
      this.emit('recovery-failed', error);

      // Check if we should restart
      if (this.recoveryAttempts >= this.config.maxRecoveryAttempts) {
        logger.error('Max recovery attempts reached, initiating restart');
        this.initiateGracefulRestart();
      }
    } finally {
      this.isRecovering = false;
    }
  }

  /**
   * Initiate graceful restart
   */
  private initiateGracefulRestart(): void {
    logger.info('Initiating graceful restart');
    this.emit('shutdown-requested');

    // Give time for cleanup
    setTimeout(() => {
      process.exit(1); // Exit with error code to trigger restart
    }, 5000);
  }

  private getAllMetrics(): any {
    return metrics.getAllMetrics();
  }
}

// Export singleton instance
export const autoRecovery = AutoRecoverySystem.getInstance();

// Log recovery events
autoRecovery.on('recovery-start', (details) => {
  metrics.increment('recovery.attempts', 1, { reason: details.reason });
});

autoRecovery.on('recovery-success', () => {
  metrics.increment('recovery.success');
});

autoRecovery.on('recovery-failed', () => {
  metrics.increment('recovery.failures');
});
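
The class only emits events; acting on them is left to the host application. Below is a minimal consumer sketch: the request guard (acceptingRequests, handleRequest) is assumed for illustration, while the import and event names come from auto-recovery.ts above.

// usage-sketch.ts -- illustrative consumer; the request-guard logic is assumed
import { autoRecovery } from './auto-recovery.js';

let acceptingRequests = true;

// Shed load while the recovery system signals pressure
autoRecovery.on('enable-backpressure', () => { acceptingRequests = false; });
autoRecovery.on('disable-backpressure', () => { acceptingRequests = true; });
autoRecovery.on('pause-operations', () => { acceptingRequests = false; });

// Stop taking work before the process exits; an external process manager
// (systemd, PM2, a Docker restart policy, etc.) is expected to restart it
autoRecovery.on('shutdown-requested', () => {
  acceptingRequests = false;
});

// Hypothetical guard wrapped around incoming work
export async function handleRequest(doWork: () => Promise<void>): Promise<void> {
  if (!acceptingRequests) {
    throw new Error('Service temporarily unavailable');
  }
  await doWork();
}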

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Rajawatrajat/mcp-software-engineer'
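
The same lookup from TypeScript, as a minimal sketch using the global fetch available in Node 18+; the response schema isn't documented here, so the body is treated as unknown JSON.

// query-directory.ts -- run as an ES module (top-level await)
const res = await fetch(
  'https://glama.ai/api/mcp/v1/servers/Rajawatrajat/mcp-software-engineer'
);
if (!res.ok) throw new Error(`Directory request failed: ${res.status}`);
const server: unknown = await res.json();
console.log(server);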

If you have feedback or need assistance with the MCP directory API, please join our Discord server.