MCP Software Engineer

auto-recovery.ts•14.4 KiB

import { EventEmitter } from 'events'; import { logger } from '../utils/logger.js'; import { metrics } from './metrics.js'; import { resourceManager } from '../utils/resource-manager.js'; import { config } from '../config/index.js'; /** * Auto-recovery system for handling failures and resource issues * Following WCGW: Assume everything will fail, build recovery for it */ export class AutoRecoverySystem extends EventEmitter { private static instance: AutoRecoverySystem; // State tracking private isRecovering = false; private lastRecoveryTime = 0; private recoveryAttempts = 0; private memorySnapshots: Array<{ time: number; usage: number }> = []; // Configuration private readonly config = { memoryLeakThreshold: 50 * 1024 * 1024, // 50MB increase memoryLeakWindow: 60000, // 1 minute maxRecoveryAttempts: 3, recoveryBackoff: 30000, // 30 seconds gcThreshold: 0.7, // 70% memory usage triggers GC restartThreshold: 0.95 // 95% memory usage triggers restart }; private constructor() { super(); this.startMonitoring(); this.setupEventHandlers(); } static getInstance(): AutoRecoverySystem { if (!AutoRecoverySystem.instance) { AutoRecoverySystem.instance = new AutoRecoverySystem(); } return AutoRecoverySystem.instance; } /** * Start monitoring for issues */ private startMonitoring(): void { // Monitor memory every 5 seconds setInterval(() => { this.checkMemoryLeak(); this.checkMemoryPressure(); }, 5000); // Monitor system health every minute setInterval(() => { this.performHealthCheck(); }, 60000); // Listen for metric alerts metrics.on('alert', (alert) => { this.handleMetricAlert(alert); }); // Listen for resource manager events resourceManager.on('memory-critical', (usage) => { this.handleMemoryCritical(usage); }); resourceManager.on('memory-warning', (usage) => { this.handleMemoryWarning(usage); }); } /** * Setup process event handlers */ private setupEventHandlers(): void { // Catch unhandled errors process.on('uncaughtException', (error) => { logger.error('Uncaught exception in auto-recovery', error); this.attemptRecovery('uncaught_exception', error); }); process.on('unhandledRejection', (reason, promise) => { logger.error('Unhandled rejection in auto-recovery', { reason, promise }); this.attemptRecovery('unhandled_rejection', reason); }); // Monitor event loop let lastCheck = Date.now(); setInterval(() => { const now = Date.now(); const delay = now - lastCheck - 1000; if (delay > 100) { logger.warn('Event loop blocked', { delay }); metrics.histogram('system.eventloop.delay', delay); if (delay > 5000) { this.handleEventLoopBlock(delay); } } lastCheck = now; }, 1000); } /** * Check for memory leaks */ private checkMemoryLeak(): void { const currentUsage = process.memoryUsage().heapUsed; const currentTime = Date.now(); // Add to snapshots this.memorySnapshots.push({ time: currentTime, usage: currentUsage }); // Keep only recent snapshots this.memorySnapshots = this.memorySnapshots.filter( s => currentTime - s.time < this.config.memoryLeakWindow ); // Need at least 2 snapshots if (this.memorySnapshots.length < 2) return; // Calculate memory growth const oldestSnapshot = this.memorySnapshots[0]; const memoryGrowth = currentUsage - oldestSnapshot.usage; const timeElapsed = currentTime - oldestSnapshot.time; // Check if growth exceeds threshold if (memoryGrowth > this.config.memoryLeakThreshold) { const growthRate = memoryGrowth / timeElapsed * 1000 * 60; // bytes per minute logger.warn('Potential memory leak detected', { growth: `${(memoryGrowth / 1024 / 1024).toFixed(2)}MB`, rate: `${(growthRate / 1024 / 1024).toFixed(2)}MB/min`, current: `${(currentUsage / 1024 / 1024).toFixed(2)}MB` }); this.emit('memory-leak', { growth: memoryGrowth, rate: growthRate, current: currentUsage }); // Take action this.handleMemoryLeak(); } } /** * Check memory pressure */ private checkMemoryPressure(): void { const memUsage = process.memoryUsage(); const maxMemory = config.getNumber('MAX_MEMORY_MB') * 1024 * 1024; const usageRatio = memUsage.heapUsed / maxMemory; metrics.gauge('system.memory.usage_ratio', usageRatio); if (usageRatio > this.config.restartThreshold) { logger.error('Critical memory usage, restart required', { usage: `${(memUsage.heapUsed / 1024 / 1024).toFixed(2)}MB`, max: `${(maxMemory / 1024 / 1024).toFixed(2)}MB`, ratio: `${(usageRatio * 100).toFixed(1)}%` }); this.initiateGracefulRestart(); } else if (usageRatio > this.config.gcThreshold) { this.triggerGarbageCollection(); } } /** * Perform health check */ private async performHealthCheck(): Promise<boolean> { const health = { memory: this.checkMemoryHealth(), resources: this.checkResourceHealth(), operations: this.checkOperationHealth() }; const isHealthy = Object.values(health).every(h => h.status === 'healthy'); if (!isHealthy) { logger.warn('Health check failed', health); this.emit('unhealthy', health); // Attempt recovery if critical const criticalIssues = Object.values(health).filter(h => h.status === 'critical'); if (criticalIssues.length > 0) { this.attemptRecovery('health_check', health); } } return isHealthy; } /** * Check memory health */ private checkMemoryHealth(): any { const memUsage = process.memoryUsage(); const maxMemory = config.getNumber('MAX_MEMORY_MB') * 1024 * 1024; const usageRatio = memUsage.heapUsed / maxMemory; if (usageRatio > 0.9) { return { status: 'critical', message: 'Memory usage above 90%', usage: usageRatio }; } else if (usageRatio > 0.7) { return { status: 'warning', message: 'Memory usage above 70%', usage: usageRatio }; } return { status: 'healthy', usage: usageRatio }; } /** * Check resource health */ private checkResourceHealth(): any { const usage = resourceManager.getUsage(); const issues = []; if (usage.processes > 8) issues.push('Too many processes'); if (usage.connections > 80) issues.push('Too many connections'); if (usage.fileHandles > 40) issues.push('Too many file handles'); if (issues.length > 2) { return { status: 'critical', message: issues.join(', '), usage }; } else if (issues.length > 0) { return { status: 'warning', message: issues.join(', '), usage }; } return { status: 'healthy', usage }; } /** * Check operation health */ private checkOperationHealth(): any { const allMetrics = metrics.getAllMetrics(); const issues = []; // Check error rates for (const [name, stats] of Object.entries(allMetrics.operations)) { const errorRate = (stats as any).errorRate; if (errorRate > 0.1) { issues.push(`High error rate for ${name}: ${(errorRate * 100).toFixed(1)}%`); } } if (issues.length > 3) { return { status: 'critical', message: issues.join(', ') }; } else if (issues.length > 0) { return { status: 'warning', message: issues.join(', ') }; } return { status: 'healthy' }; } /** * Handle memory leak */ private handleMemoryLeak(): void { logger.info('Handling memory leak'); // First try garbage collection this.triggerGarbageCollection(); // Clear caches and unused resources setTimeout(() => { resourceManager.cleanup().then(() => { logger.info('Resource cleanup completed'); // Check if memory improved setTimeout(() => { const currentUsage = process.memoryUsage().heapUsed; const improved = this.memorySnapshots[this.memorySnapshots.length - 1].usage - currentUsage; if (improved > 10 * 1024 * 1024) { logger.info(`Memory recovered: ${(improved / 1024 / 1024).toFixed(2)}MB`); } else { logger.warn('Memory leak persists after cleanup'); this.emit('persistent-memory-leak'); } }, 5000); }); }, 1000); } /** * Handle memory critical */ private handleMemoryCritical(usage: number): void { logger.error('Memory critical handler triggered', { usage }); // Immediate actions this.triggerGarbageCollection(); // Stop accepting new requests this.emit('pause-operations'); // Aggressive cleanup setTimeout(() => { this.performAggressiveCleanup(); }, 1000); } /** * Handle memory warning */ private handleMemoryWarning(usage: number): void { logger.warn('Memory warning handler triggered', { usage }); this.triggerGarbageCollection(); } /** * Handle metric alert */ private handleMetricAlert(alert: any): void { logger.info('Handling metric alert', alert); switch (alert.type) { case 'memory': this.handleMemoryWarning(alert.value); break; case 'error_rate': // Log and monitor metrics.increment('recovery.error_rate_alerts'); break; case 'response_time': // Check for system overload this.checkSystemOverload(); break; } } /** * Handle event loop block */ private handleEventLoopBlock(delay: number): void { logger.error('Event loop blocked', { delay }); metrics.increment('recovery.eventloop_blocks'); // If severe, trigger recovery if (delay > 10000) { this.attemptRecovery('eventloop_block', { delay }); } } /** * Check system overload */ private checkSystemOverload(): void { const usage = resourceManager.getUsage(); const metrics = this.getAllMetrics(); // Simple overload detection const overloaded = usage.processes > 8 || usage.connections > 80 || Object.values(metrics.operations).some((op: any) => op.avgTime > 5000); if (overloaded) { logger.warn('System overload detected'); this.emit('system-overload'); // Implement backpressure this.implementBackpressure(); } } /** * Trigger garbage collection */ private triggerGarbageCollection(): void { if (global.gc) { const before = process.memoryUsage().heapUsed; global.gc(); const after = process.memoryUsage().heapUsed; const freed = before - after; logger.info('Garbage collection completed', { freed: `${(freed / 1024 / 1024).toFixed(2)}MB`, before: `${(before / 1024 / 1024).toFixed(2)}MB`, after: `${(after / 1024 / 1024).toFixed(2)}MB` }); metrics.histogram('recovery.gc_freed', freed); } else { logger.warn('Garbage collection not available (run with --expose-gc)'); } } /** * Perform aggressive cleanup */ private async performAggressiveCleanup(): Promise<void> { logger.info('Performing aggressive cleanup'); // Clear all caches this.emit('clear-caches'); // Force close idle connections await resourceManager.cleanup(); // Clear metrics history metrics.reset(); // Force GC this.triggerGarbageCollection(); logger.info('Aggressive cleanup completed'); } /** * Implement backpressure */ private implementBackpressure(): void { logger.info('Implementing backpressure'); // Emit event for request limiting this.emit('enable-backpressure'); // Schedule removal after 30 seconds setTimeout(() => { logger.info('Removing backpressure'); this.emit('disable-backpressure'); }, 30000); } /** * Attempt recovery */ private async attemptRecovery(reason: string, details: any): Promise<void> { if (this.isRecovering) { logger.warn('Recovery already in progress'); return; } // Check recovery attempts const now = Date.now(); if (now - this.lastRecoveryTime < this.config.recoveryBackoff) { logger.warn('Recovery attempted too soon'); return; } this.isRecovering = true; this.lastRecoveryTime = now; this.recoveryAttempts++; logger.info('Attempting recovery', { reason, attempt: this.recoveryAttempts, details }); try { // Emit recovery start this.emit('recovery-start', { reason, attempt: this.recoveryAttempts }); // Recovery steps await this.performAggressiveCleanup(); // Wait for stabilization await new Promise(resolve => setTimeout(resolve, 5000)); // Check if recovery successful const health = await this.performHealthCheck(); if (health) { logger.info('Recovery successful'); this.emit('recovery-success'); this.recoveryAttempts = 0; } else { throw new Error('Recovery failed, system still unhealthy'); } } catch (error) { logger.error('Recovery failed', error); this.emit('recovery-failed', error); // Check if we should restart if (this.recoveryAttempts >= this.config.maxRecoveryAttempts) { logger.error('Max recovery attempts reached, initiating restart'); this.initiateGracefulRestart(); } } finally { this.isRecovering = false; } } /** * Initiate graceful restart */ private initiateGracefulRestart(): void { logger.info('Initiating graceful restart'); this.emit('shutdown-requested'); // Give time for cleanup setTimeout(() => { process.exit(1); // Exit with error code to trigger restart }, 5000); } private getAllMetrics(): any { return metrics.getAllMetrics(); } } // Export singleton instance export const autoRecovery = AutoRecoverySystem.getInstance(); // Log recovery events autoRecovery.on('recovery-start', (details) => { metrics.increment('recovery.attempts', 1, { reason: details.reason }); }); autoRecovery.on('recovery-success', () => { metrics.increment('recovery.success'); }); autoRecovery.on('recovery-failed', () => { metrics.increment('recovery.failures'); });

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Rajawatrajat/mcp-software-engineer'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

auto-recovery.ts•14.4 KiB