Skip to main content
Glama

Prompt Auto-Optimizer MCP

by sloth-wq
disaster-recovery-manager.ts29.1 kB
/**
 * Disaster Recovery Manager - Automatic Failover and Emergency Procedures
 *
 * Provides comprehensive disaster recovery mechanisms:
 * - Automatic failover systems
 * - Service health monitoring
 * - Emergency shutdown procedures
 * - Disaster detection and classification
 * - Recovery procedure orchestration
 */

import { EventEmitter } from 'events';
import { ResilienceSystem } from '../resilience/index';
import { ComponentStatus, type ComponentHealth } from './component-recovery-manager';

/** Categories of disaster a recovery procedure can be registered for. */
export enum DisasterType {
  MEMORY_EXHAUSTION = 'memory_exhaustion',
  SERVICE_FAILURE = 'service_failure',
  DATA_CORRUPTION = 'data_corruption',
  NETWORK_PARTITION = 'network_partition',
  RESOURCE_STARVATION = 'resource_starvation',
  CASCADING_FAILURE = 'cascading_failure',
  PERFORMANCE_DEGRADATION = 'performance_degradation',
  SECURITY_BREACH = 'security_breach',
  CONFIGURATION_ERROR = 'configuration_error',
  EXTERNAL_DEPENDENCY_FAILURE = 'external_dependency_failure'
}

/** Lifecycle states of a single recovery execution. */
export enum RecoveryStatus {
  IDLE = 'idle',
  DETECTING = 'detecting',
  ANALYZING = 'analyzing',
  EXECUTING = 'executing',
  MONITORING = 'monitoring',
  COMPLETED = 'completed',
  FAILED = 'failed',
  ABORTED = 'aborted'
}

/** Tunables for the manager; every field has a default in the constructor. */
export interface DisasterRecoveryConfig {
  monitoringInterval: number; // milliseconds
  healthCheckTimeout: number; // milliseconds
  failoverTimeout: number; // milliseconds
  emergencyShutdownTimeout: number; // milliseconds
  autoRecoveryEnabled: boolean;
  escalationThresholds: {
    warningLevel: number;
    criticalLevel: number;
    emergencyLevel: number;
  };
  retryAttempts: number;
  notificationEnabled: boolean;
  logLevel: 'debug' | 'info' | 'warn' | 'error';
}

/** Input for {@link DisasterRecoveryManager.performFailover}. */
export interface FailoverConfig {
  primaryService: string;
  backupServices: string[];
  healthCheckEndpoint?: string;
  switchoverTime: number; // milliseconds
  rollbackEnabled: boolean;
  dataConsistencyChecks: boolean;
}

/** An ordered list of steps to run in response to one disaster type. */
export interface RecoveryProcedure {
  id: string;
  name: string;
  disasterType: DisasterType;
  steps: RecoveryStep[];
  /** Lower numbers are preferred when several procedures match. */
  priority: number;
  estimatedDuration: number; // milliseconds
  prerequisites: string[];
  rollbackSteps?: RecoveryStep[];
}

/** A single action inside a recovery (or rollback) procedure. */
export interface RecoveryStep {
  id: string;
  name: string;
  description?: string;
  action: 'backup' | 'restore' | 'failover' | 'restart' | 'validate' | 'notify' | 'custom';
  parameters: Record<string, unknown>;
  timeout: number; // milliseconds
  rollbackPossible?: boolean;
  /** When true, a failure of this step aborts the whole procedure. */
  critical: boolean;
  rollbackAction?: string;
  dependencies?: string[];
}

/** Options for {@link DisasterRecoveryManager.emergencyShutdown}. */
export interface EmergencyShutdownOptions {
  reason: string;
  preserveState: boolean;
  notifyOperators: boolean;
  gracefulTimeout: number; // milliseconds
  forceKill: boolean;
}

/** A detected disaster, as recorded in the manager's history. */
export interface DisasterEvent {
  id: string;
  type: DisasterType;
  severity: 'low' | 'medium' | 'high' | 'critical';
  timestamp: Date;
  source: string;
  description: string;
  metrics?: Record<string, number>;
  affectedComponents?: string[];
  recoveryProcedure?: string;
}

/** Mutable record describing one run of a recovery procedure. */
export interface DisasterRecoveryExecution {
  id: string;
  disasterEvent: DisasterEvent;
  procedure: RecoveryProcedure;
  status: RecoveryStatus;
  startTime: Date;
  endTime?: Date;
  completedSteps: string[];
  failedSteps: string[];
  currentStep?: string;
  progress: number; // 0-100
  logs: string[];
  metrics: {
    totalDuration: number;
    stepsCompleted: number;
    stepsTotal: number;
    errorsEncountered: number;
  };
}

/** Payload of the `memoryExhaustion` event. */
export interface MemoryExhaustionEvent {
  metrics: Record<string, number>;
  components: string[];
}

/** Payload of the `serviceFailure` event. */
export interface ServiceFailureEvent {
  service: string;
  metrics: Record<string, number>;
}

/** Payload of the `dataCorruption` event. */
export interface DataCorruptionEvent {
  component: string;
  metrics: Record<string, number>;
}

/**
 * Disaster Recovery Manager Implementation
 *
 * Events emitted by this class (all payloads are plain objects unless noted):
 * `initialized`, `error`, `recoveryCompleted`, `recoveryFailed`,
 * `recoveryStepCompleted`, `recoveryStepFailed`, `failoverCompleted`,
 * `emergencyShutdownStarted`, `emergencyShutdownCompleted`,
 * `emergencyShutdownFailed`, `monitoringError`, `autoRecoveryFailed`,
 * `healthCheckPerformed`, plus one `*ActionPerformed` event per step action.
 *
 * Events consumed once {@link initialize} has run: `memoryExhaustion`,
 * `serviceFailure`, `dataCorruption`.
 */
export class DisasterRecoveryManager extends EventEmitter {
  /** Look-back window (one hour) used when counting "recent" disasters. */
  private static readonly RECENT_WINDOW_MS = 3600000;

  /** Cap on retained disaster events so a long-lived process cannot grow without bound. */
  private static readonly MAX_DISASTER_HISTORY = 1000;

  private config: DisasterRecoveryConfig;
  private recoveryProcedures: Map<DisasterType, RecoveryProcedure[]> = new Map();
  /** Executions that are currently running; entries are removed when a run settles. */
  private activeExecutions: Map<string, DisasterRecoveryExecution> = new Map();
  private monitoringTimer?: NodeJS.Timeout | undefined;
  private systemHealth: Map<string, ComponentHealth> = new Map();
  private disasterHistory: DisasterEvent[] = [];
  private resilience: ResilienceSystem;

  /** Construction time, used to report real uptime. */
  private readonly startedAt = Date.now();

  /** Total number of recovery executions started since construction. */
  private recoveriesStarted = 0;

  // Stable listener references so setupDisasterDetection() can be re-run
  // without stacking duplicate listeners. Each wrapper turns a rejected
  // auto-recovery into an event instead of an unhandled promise rejection.
  private readonly onMemoryExhaustion = (event: MemoryExhaustionEvent): void => {
    this.guardAutoRecovery('memoryExhaustion', this.handleMemoryExhaustion(event));
  };

  private readonly onServiceFailure = (event: ServiceFailureEvent): void => {
    this.guardAutoRecovery('serviceFailure', this.handleServiceFailure(event));
  };

  private readonly onDataCorruption = (event: DataCorruptionEvent): void => {
    this.guardAutoRecovery('dataCorruption', this.handleDataCorruption(event));
  };

  /**
   * @param config Overrides merged (shallowly) over the built-in defaults.
   */
  constructor(config: Partial<DisasterRecoveryConfig> = {}) {
    super();

    this.config = {
      monitoringInterval: 30000, // 30 seconds
      healthCheckTimeout: 10000, // 10 seconds
      failoverTimeout: 60000, // 1 minute
      emergencyShutdownTimeout: 30000, // 30 seconds
      autoRecoveryEnabled: true,
      escalationThresholds: {
        warningLevel: 0.3,
        criticalLevel: 0.7,
        emergencyLevel: 0.9
      },
      retryAttempts: 3,
      notificationEnabled: true,
      logLevel: 'info',
      ...config
    };

    this.resilience = ResilienceSystem.getInstance();
    this.initializeRecoveryProcedures();
  }

  /**
   * Initialize disaster recovery manager.
   *
   * Validates the registered procedures, starts periodic monitoring and
   * registers the disaster-detection listeners, then emits `initialized`.
   * Safe to call more than once: the monitor and listeners are replaced,
   * not duplicated.
   *
   * @throws The original validation/startup error. An `error` event is
   *   emitted first, but only when someone is listening for it.
   */
  async initialize(): Promise<void> {
    try {
      // Validate before starting the timer so a bad procedure cannot leave
      // an orphaned monitoring interval behind.
      await this.validateRecoveryProcedures();

      // Start system monitoring
      this.startSystemMonitoring();

      // Register disaster detection handlers
      this.setupDisasterDetection();

      this.emit('initialized', {
        procedureCount: this.getTotalProcedureCount(),
        monitoringEnabled: !!this.monitoringTimer
      });
    } catch (error) {
      // EventEmitter throws its own error when 'error' has no listener,
      // which would mask the real failure; only emit when it is observed.
      if (this.listenerCount('error') > 0) {
        this.emit('error', { operation: 'initialize', error });
      }
      throw error;
    }
  }

  /**
   * Execute disaster recovery procedure.
   *
   * @param disasterEvent The disaster being responded to.
   * @param procedureId Optional explicit procedure; when omitted the
   *   highest-priority procedure registered for the event's type is used.
   * @returns The execution record once every step has finished.
   * @throws If no procedure matches, a critical step fails, or the run is
   *   aborted. On failure `recoveryFailed` is emitted and, unless the run was
   *   aborted, the procedure's rollback steps are executed before rethrowing.
   */
  async executeRecovery(
    disasterEvent: DisasterEvent,
    procedureId?: string
  ): Promise<DisasterRecoveryExecution> {
    return this.resilience.executeWithFullProtection(
      async () => {
        // Find appropriate recovery procedure
        const procedure = procedureId
          ? this.findProcedureById(procedureId)
          : this.findBestProcedure(disasterEvent);

        if (!procedure) {
          throw new Error(`No recovery procedure found for disaster type: ${disasterEvent.type}`);
        }

        // Create recovery execution
        const execution: DisasterRecoveryExecution = {
          id: this.generateExecutionId(),
          disasterEvent,
          procedure,
          status: RecoveryStatus.EXECUTING,
          startTime: new Date(),
          completedSteps: [],
          failedSteps: [],
          progress: 0,
          logs: [],
          metrics: {
            totalDuration: 0,
            stepsCompleted: 0,
            stepsTotal: procedure.steps.length,
            errorsEncountered: 0
          }
        };

        this.activeExecutions.set(execution.id, execution);
        this.recoveriesStarted++;

        try {
          // Execute recovery steps
          await this.executeRecoverySteps(execution);

          execution.status = RecoveryStatus.COMPLETED;
          execution.endTime = new Date();
          execution.progress = 100;
          execution.metrics.totalDuration =
            execution.endTime.getTime() - execution.startTime.getTime();

          this.emit('recoveryCompleted', execution);
        } catch (error) {
          // An abort already stamped status/endTime; do not overwrite them.
          const aborted = execution.status === RecoveryStatus.ABORTED;
          if (!aborted) {
            execution.status = RecoveryStatus.FAILED;
            execution.endTime = new Date();
          }
          execution.logs.push(`Recovery failed: ${this.describeError(error)}`);
          execution.metrics.errorsEncountered++;
          execution.metrics.totalDuration =
            (execution.endTime ?? new Date()).getTime() - execution.startTime.getTime();

          this.emit('recoveryFailed', { execution, error });

          // Attempt rollback if configured. Skipped for aborted runs because
          // aborts come from shutdown/cleanup, where more work is unwanted.
          if (!aborted && procedure.rollbackSteps) {
            await this.executeRollback(execution);
          }

          throw error;
        } finally {
          // The run has settled either way; it is no longer "active".
          this.activeExecutions.delete(execution.id);
        }

        return execution;
      },
      {
        serviceName: 'disaster-recovery',
        context: { name: 'execute-recovery', priority: 'critical' }
      }
    );
  }

  /**
   * Perform automatic failover.
   *
   * Picks the first healthy backup service, switches the primary over to it
   * and optionally validates data consistency. Emits `failoverCompleted`.
   *
   * @throws If no backup service is healthy.
   */
  async performFailover(failoverConfig: FailoverConfig): Promise<{
    success: boolean;
    newPrimaryService: string;
    switchoverTime: number;
    validationResults: Record<string, boolean>;
  }> {
    return this.resilience.executeWithFullProtection(
      async () => {
        const startTime = Date.now();

        // Check backup services health
        const healthyBackups = await this.checkBackupServicesHealth(failoverConfig.backupServices);
        if (healthyBackups.length === 0) {
          throw new Error('No healthy backup services available for failover');
        }

        // Select best backup service
        const newPrimaryService = healthyBackups[0];
        if (!newPrimaryService) {
          throw new Error('Failed to select backup service');
        }

        // Perform failover
        await this.switchPrimaryService(failoverConfig.primaryService, newPrimaryService);

        // Validate data consistency if required
        const validationResults: Record<string, boolean> = {};
        if (failoverConfig.dataConsistencyChecks) {
          validationResults.dataConsistency = await this.validateDataConsistency(newPrimaryService);
        }

        const switchoverTime = Date.now() - startTime;

        this.emit('failoverCompleted', {
          oldPrimary: failoverConfig.primaryService,
          newPrimary: newPrimaryService,
          switchoverTime
        });

        return {
          success: true,
          newPrimaryService: newPrimaryService,
          switchoverTime,
          validationResults
        };
      },
      {
        serviceName: 'disaster-recovery',
        context: { name: 'perform-failover', priority: 'critical' }
      }
    );
  }

  /**
   * Execute emergency shutdown.
   *
   * Order of operations: optional emergency backup, optional operator
   * notification, abort of in-flight recoveries, then a graceful shutdown
   * bounded by `options.gracefulTimeout`. If the graceful path fails or times
   * out and `options.forceKill` is set, a forced shutdown is performed instead.
   *
   * @throws Rethrows any failure after emitting `emergencyShutdownFailed`.
   */
  async emergencyShutdown(options: EmergencyShutdownOptions): Promise<void> {
    return this.resilience.executeWithFullProtection(
      async () => {
        this.emit('emergencyShutdownStarted', options);

        try {
          // Create emergency backup if requested
          if (options.preserveState) {
            await this.createEmergencyBackup(options.reason);
          }

          // Notify operators if enabled
          if (options.notifyOperators) {
            await this.notifyOperators('emergency_shutdown', options.reason);
          }

          // Stop all active recovery executions
          await this.abortActiveRecoveries('emergency_shutdown');

          // Graceful shutdown with timeout
          try {
            await this.withTimeout(
              this.performGracefulShutdown(),
              options.gracefulTimeout,
              'Graceful shutdown timeout'
            );
          } catch (error) {
            if (options.forceKill) {
              await this.performForceShutdown();
            } else {
              throw error;
            }
          }

          this.emit('emergencyShutdownCompleted', options);
        } catch (error) {
          this.emit('emergencyShutdownFailed', { options, error });
          throw error;
        }
      },
      {
        serviceName: 'disaster-recovery',
        context: { name: 'emergency-shutdown', priority: 'critical' }
      }
    );
  }

  /**
   * Get disaster recovery status.
   *
   * @returns The number of running executions, up to the ten most recent
   *   disasters from the last hour, an aggregate of the tracked component
   *   health, and an estimate of the next monitoring tick.
   */
  getRecoveryStatus(): {
    activeExecutions: number;
    recentDisasters: DisasterEvent[];
    systemHealth: 'healthy' | 'degraded' | 'critical';
    nextMonitoringCheck: Date;
  } {
    const recentDisasters = this.recentDisasters().slice(-10);

    const healthValues = Array.from(this.systemHealth.values());
    let systemHealth: 'healthy' | 'degraded' | 'critical' = 'healthy';

    if (healthValues.some(h => h.status === 'critical')) {
      systemHealth = 'critical';
    } else if (healthValues.some(h => h.status === 'degraded')) {
      systemHealth = 'degraded';
    }

    const nextMonitoringCheck = new Date(Date.now() + this.config.monitoringInterval);

    return {
      activeExecutions: this.activeExecutions.size,
      recentDisasters,
      systemHealth,
      nextMonitoringCheck
    };
  }

  /**
   * Get health status.
   *
   * The manager reports `degraded` while any recovery is running, and
   * `critical` when more than three are running or more than five disasters
   * were recorded in the last hour.
   */
  async getHealthStatus(): Promise<ComponentHealth> {
    const activeExecutions = this.activeExecutions.size;
    const recentFailures = this.recentDisasters().length;

    let status: 'healthy' | 'degraded' | 'critical' = 'healthy';
    const recommendations: string[] = [];

    if (activeExecutions > 3) {
      status = 'critical';
      recommendations.push('Multiple active disaster recovery executions');
    } else if (activeExecutions > 0) {
      status = 'degraded';
      recommendations.push('Recovery operations in progress');
    }

    if (recentFailures > 5) {
      status = 'critical';
      recommendations.push('High frequency of disasters detected');
    }

    return {
      status: status as ComponentStatus,
      lastCheck: new Date(),
      metrics: {
        activeExecutions,
        recentFailures,
        procedureCount: this.getTotalProcedureCount()
      },
      recommendations,
      errors: [],
      warnings: [],
      uptime: Date.now() - this.startedAt,
      recoveryCount: this.recoveriesStarted
    };
  }

  /**
   * Private helper methods
   */

  /** Registers the built-in procedures for memory, service and data disasters. */
  private initializeRecoveryProcedures(): void {
    // Memory exhaustion recovery
    this.addRecoveryProcedure({
      id: 'memory_exhaustion_recovery',
      name: 'Memory Exhaustion Recovery',
      disasterType: DisasterType.MEMORY_EXHAUSTION,
      priority: 1,
      estimatedDuration: 120000, // 2 minutes
      prerequisites: [],
      steps: [
        {
          id: 'backup_critical_state',
          name: 'Backup Critical State',
          action: 'backup',
          parameters: { type: 'critical_only' },
          timeout: 30000,
          critical: true
        },
        {
          id: 'clear_caches',
          name: 'Clear Memory Caches',
          action: 'custom',
          parameters: { operation: 'clear_memory_caches' },
          timeout: 10000,
          critical: false
        },
        {
          id: 'gc_force',
          name: 'Force Garbage Collection',
          action: 'custom',
          parameters: { operation: 'force_gc' },
          timeout: 5000,
          critical: false
        },
        {
          id: 'validate_memory',
          name: 'Validate Memory Usage',
          action: 'validate',
          parameters: { check: 'memory_usage' },
          timeout: 10000,
          critical: true
        }
      ]
    });

    // Service failure recovery
    this.addRecoveryProcedure({
      id: 'service_failure_recovery',
      name: 'Service Failure Recovery',
      disasterType: DisasterType.SERVICE_FAILURE,
      priority: 1,
      estimatedDuration: 180000, // 3 minutes
      prerequisites: [],
      steps: [
        {
          id: 'identify_failed_service',
          name: 'Identify Failed Service',
          action: 'custom',
          parameters: { operation: 'service_diagnosis' },
          timeout: 30000,
          critical: true
        },
        {
          id: 'attempt_service_restart',
          name: 'Attempt Service Restart',
          action: 'restart',
          parameters: { graceful: true },
          timeout: 60000,
          critical: true
        },
        {
          id: 'validate_service_health',
          name: 'Validate Service Health',
          action: 'validate',
          parameters: { check: 'service_health' },
          timeout: 30000,
          critical: true
        },
        {
          id: 'restore_from_backup',
          name: 'Restore from Backup if Needed',
          action: 'restore',
          parameters: { type: 'service_specific' },
          timeout: 120000,
          critical: false
        }
      ]
    });

    // Data corruption recovery
    this.addRecoveryProcedure({
      id: 'data_corruption_recovery',
      name: 'Data Corruption Recovery',
      disasterType: DisasterType.DATA_CORRUPTION,
      priority: 1,
      estimatedDuration: 300000, // 5 minutes
      prerequisites: [],
      steps: [
        {
          id: 'isolate_corrupted_data',
          name: 'Isolate Corrupted Data',
          action: 'custom',
          parameters: { operation: 'isolate_corruption' },
          timeout: 60000,
          critical: true
        },
        {
          id: 'backup_current_state',
          name: 'Backup Current State',
          action: 'backup',
          parameters: { type: 'full_with_corruption_markers' },
          timeout: 90000,
          critical: true
        },
        {
          id: 'restore_clean_data',
          name: 'Restore Clean Data',
          action: 'restore',
          parameters: { type: 'validated_backup' },
          timeout: 180000,
          critical: true
        },
        {
          id: 'validate_data_integrity',
          name: 'Validate Data Integrity',
          action: 'validate',
          parameters: { check: 'comprehensive_integrity' },
          timeout: 120000,
          critical: true
        }
      ]
    });
  }

  /** Adds a procedure under its disaster type, keeping each list sorted by priority. */
  private addRecoveryProcedure(procedure: RecoveryProcedure): void {
    let procedures = this.recoveryProcedures.get(procedure.disasterType);
    if (!procedures) {
      procedures = [];
      this.recoveryProcedures.set(procedure.disasterType, procedures);
    }
    procedures.push(procedure);
    procedures.sort((a, b) => a.priority - b.priority);
  }

  /** Returns the highest-priority (lowest number) procedure for the event's type, or null. */
  private findBestProcedure(disasterEvent: DisasterEvent): RecoveryProcedure | null {
    const procedures = this.recoveryProcedures.get(disasterEvent.type);
    return procedures?.[0] ?? null;
  }

  /** Looks a procedure up by id across every disaster type. */
  private findProcedureById(procedureId: string): RecoveryProcedure | null {
    for (const procedures of this.recoveryProcedures.values()) {
      const procedure = procedures.find(p => p.id === procedureId);
      if (procedure) {
        return procedure;
      }
    }
    return null;
  }

  /**
   * Runs the procedure's steps in order, updating progress and logs.
   * A failing non-critical step is recorded and skipped; a failing critical
   * step, or an abort requested between steps, stops the run by throwing.
   */
  private async executeRecoverySteps(execution: DisasterRecoveryExecution): Promise<void> {
    for (const step of execution.procedure.steps) {
      if (!step) continue;

      // Honour abortActiveRecoveries(): stop before starting more work.
      this.throwIfAborted(execution);

      execution.currentStep = step.id;

      try {
        execution.logs.push(`Starting step: ${step.name}`);

        await this.executeRecoveryStep(step, execution);

        execution.completedSteps.push(step.id);
        execution.metrics.stepsCompleted++;
        execution.progress = Math.round(
          (execution.metrics.stepsCompleted / execution.metrics.stepsTotal) * 100
        );

        execution.logs.push(`Completed step: ${step.name}`);

        this.emit('recoveryStepCompleted', {
          execution: execution.id,
          step: step.id,
          progress: execution.progress
        });
      } catch (error) {
        execution.failedSteps.push(step.id);
        execution.metrics.errorsEncountered++;
        execution.logs.push(`Failed step: ${step.name} - ${this.describeError(error)}`);

        this.emit('recoveryStepFailed', {
          execution: execution.id,
          step: step.id,
          error
        });

        if (step.critical) {
          throw new Error(`Critical step failed: ${step.name}`);
        }
      }
    }

    // An abort during the final step must not be reported as a completion.
    this.throwIfAborted(execution);
  }

  /** Throws when the execution has been marked ABORTED by abortActiveRecoveries(). */
  private throwIfAborted(execution: DisasterRecoveryExecution): void {
    if (execution.status === RecoveryStatus.ABORTED) {
      throw new Error(`Recovery aborted: ${execution.id}`);
    }
  }

  /** Runs one step, rejecting with `Step timeout: <name>` if it exceeds `step.timeout`. */
  private async executeRecoveryStep(
    step: RecoveryStep,
    _execution: DisasterRecoveryExecution
  ): Promise<void> {
    await this.withTimeout(
      this.performStepAction(step, _execution),
      step.timeout,
      `Step timeout: ${step.name}`
    );
  }

  /**
   * Races `work` against a timer and always clears the timer afterwards, so a
   * finished operation does not leave a pending timeout behind.
   */
  private async withTimeout<T>(work: Promise<T>, timeoutMs: number, message: string): Promise<T> {
    let timer: NodeJS.Timeout | undefined;
    const deadline = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => reject(new Error(message)), timeoutMs);
    });

    try {
      return await Promise.race([work, deadline]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Dispatches a step to the handler for its `action`. */
  private async performStepAction(
    step: RecoveryStep,
    _execution: DisasterRecoveryExecution
  ): Promise<void> {
    switch (step.action) {
      case 'backup':
        await this.performBackupAction(step.parameters);
        break;
      case 'restore':
        await this.performRestoreAction(step.parameters);
        break;
      case 'failover':
        await this.performFailoverAction(step.parameters);
        break;
      case 'restart':
        await this.performRestartAction(step.parameters);
        break;
      case 'validate':
        await this.performValidateAction(step.parameters);
        break;
      case 'notify':
        await this.performNotifyAction(step.parameters);
        break;
      case 'custom':
        await this.performCustomAction(step.parameters);
        break;
      default: {
        // Compile-time exhaustiveness check; still guards untyped callers at runtime.
        const unsupported: never = step.action;
        throw new Error(`Unknown step action: ${String(unsupported)}`);
      }
    }
  }

  private async performBackupAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would interact with StateBackupManager
    this.emit('backupActionPerformed', parameters);
  }

  private async performRestoreAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would interact with StateBackupManager
    this.emit('restoreActionPerformed', parameters);
  }

  private async performFailoverAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would perform service failover
    this.emit('failoverActionPerformed', parameters);
  }

  private async performRestartAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would restart services/components
    this.emit('restartActionPerformed', parameters);
  }

  private async performValidateAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would perform validation checks
    this.emit('validateActionPerformed', parameters);
  }

  private async performNotifyAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would send notifications
    this.emit('notifyActionPerformed', parameters);
  }

  private async performCustomAction(parameters: Record<string, unknown>): Promise<void> {
    // Implementation would perform custom operations
    this.emit('customActionPerformed', parameters);
  }

  /**
   * Runs the procedure's rollback steps in reverse declaration order.
   * Failures are logged on the execution and do not stop the rollback.
   */
  private async executeRollback(execution: DisasterRecoveryExecution): Promise<void> {
    const rollbackSteps = execution.procedure.rollbackSteps;
    if (!rollbackSteps) {
      return;
    }

    execution.logs.push('Starting rollback procedure');

    // Copy before reversing: Array.prototype.reverse mutates in place, and the
    // procedure definition is shared by every execution of this procedure.
    for (const step of [...rollbackSteps].reverse()) {
      try {
        await this.executeRecoveryStep(step, execution);
        execution.logs.push(`Rollback step completed: ${step.name}`);
      } catch (error) {
        execution.logs.push(`Rollback step failed: ${step.name} - ${this.describeError(error)}`);
      }
    }
  }

  /** (Re)starts the periodic health check; failures surface as `monitoringError`. */
  private startSystemMonitoring(): void {
    // Replace any existing interval so repeated initialize() calls do not leak timers.
    if (this.monitoringTimer) {
      clearInterval(this.monitoringTimer);
    }

    this.monitoringTimer = setInterval(() => {
      void this.performSystemHealthCheck().catch((error: unknown) => {
        this.emit('monitoringError', error);
      });
    }, this.config.monitoringInterval);
  }

  private async performSystemHealthCheck(): Promise<void> {
    // Implementation would check system health and detect disasters
    this.emit('healthCheckPerformed', { timestamp: new Date() });
  }

  /** Registers the disaster listeners exactly once per event name. */
  private setupDisasterDetection(): void {
    // Setup handlers for different disaster types. off() before on() keeps
    // this idempotent; off() is a no-op when the listener is not registered.
    this.off('memoryExhaustion', this.onMemoryExhaustion);
    this.off('serviceFailure', this.onServiceFailure);
    this.off('dataCorruption', this.onDataCorruption);

    this.on('memoryExhaustion', this.onMemoryExhaustion);
    this.on('serviceFailure', this.onServiceFailure);
    this.on('dataCorruption', this.onDataCorruption);
  }

  /**
   * Observes an auto-recovery promise started from an event listener.
   * EventEmitter ignores listener return values, so without this a failed
   * recovery would be an unhandled rejection; it is reported as
   * `autoRecoveryFailed` instead.
   */
  private guardAutoRecovery(trigger: string, pending: Promise<void>): void {
    void pending.catch((error: unknown) => {
      this.emit('autoRecoveryFailed', { trigger, error });
    });
  }

  private async handleMemoryExhaustion(event: MemoryExhaustionEvent): Promise<void> {
    await this.respondToDisaster({
      id: this.generateEventId(),
      type: DisasterType.MEMORY_EXHAUSTION,
      severity: 'critical',
      timestamp: new Date(),
      source: 'memory_monitor',
      description: 'Memory usage exceeded critical threshold',
      metrics: event.metrics,
      affectedComponents: event.components
    });
  }

  private async handleServiceFailure(event: ServiceFailureEvent): Promise<void> {
    await this.respondToDisaster({
      id: this.generateEventId(),
      type: DisasterType.SERVICE_FAILURE,
      severity: 'high',
      timestamp: new Date(),
      source: 'service_monitor',
      description: `Service failure detected: ${event.service}`,
      metrics: event.metrics,
      affectedComponents: [event.service]
    });
  }

  private async handleDataCorruption(event: DataCorruptionEvent): Promise<void> {
    await this.respondToDisaster({
      id: this.generateEventId(),
      type: DisasterType.DATA_CORRUPTION,
      severity: 'critical',
      timestamp: new Date(),
      source: 'integrity_monitor',
      description: `Data corruption detected: ${event.component}`,
      metrics: event.metrics,
      affectedComponents: [event.component]
    });
  }

  /** Records the disaster and, when auto-recovery is enabled, runs the matching procedure. */
  private async respondToDisaster(disasterEvent: DisasterEvent): Promise<void> {
    this.recordDisaster(disasterEvent);

    if (this.config.autoRecoveryEnabled) {
      await this.executeRecovery(disasterEvent);
    }
  }

  /** Appends to the history, discarding the oldest entries beyond the cap. */
  private recordDisaster(disasterEvent: DisasterEvent): void {
    this.disasterHistory.push(disasterEvent);

    const overflow = this.disasterHistory.length - DisasterRecoveryManager.MAX_DISASTER_HISTORY;
    if (overflow > 0) {
      this.disasterHistory.splice(0, overflow);
    }
  }

  /** Disasters recorded within the last hour, oldest first. */
  private recentDisasters(): DisasterEvent[] {
    const now = Date.now();
    return this.disasterHistory.filter(
      d => now - d.timestamp.getTime() < DisasterRecoveryManager.RECENT_WINDOW_MS
    );
  }

  /** @throws If any registered procedure is structurally invalid. */
  private async validateRecoveryProcedures(): Promise<void> {
    for (const procedures of this.recoveryProcedures.values()) {
      for (const procedure of procedures) {
        if (!this.isValidProcedure(procedure)) {
          throw new Error(`Invalid recovery procedure: ${procedure.id}`);
        }
      }
    }
  }

  /** A procedure needs an id, a name, a disaster type and at least one step. */
  private isValidProcedure(procedure: RecoveryProcedure): boolean {
    return !!(
      procedure.id &&
      procedure.name &&
      procedure.disasterType &&
      procedure.steps &&
      Array.isArray(procedure.steps) &&
      procedure.steps.length > 0
    );
  }

  /** Returns the subset of `services` considered healthy, in input order. */
  private async checkBackupServicesHealth(services: string[]): Promise<string[]> {
    const healthyServices: string[] = [];

    for (const service of services) {
      try {
        // Implementation would check service health
        // For now, assume all services are healthy
        healthyServices.push(service);
      } catch (error) {
        // eslint-disable-next-line no-console
        console.warn(`Service health check failed: ${service}`, error);
      }
    }

    return healthyServices;
  }

  private async switchPrimaryService(oldPrimary: string, newPrimary: string): Promise<void> {
    // Implementation would perform the actual service switch
    this.emit('serviceSwitched', { from: oldPrimary, to: newPrimary });
  }

  private async validateDataConsistency(_service: string): Promise<boolean> {
    // Implementation would validate data consistency
    // For now, assume data is consistent
    return true;
  }

  private async createEmergencyBackup(reason: string): Promise<void> {
    // Implementation would create emergency backup
    this.emit('emergencyBackupCreated', { reason });
  }

  private async notifyOperators(type: string, message: string): Promise<void> {
    // Implementation would notify operators
    this.emit('operatorsNotified', { type, message });
  }

  /**
   * Marks every running execution as ABORTED. The step loop checks this flag
   * between steps, so a step that is already in flight is allowed to finish.
   */
  private async abortActiveRecoveries(reason: string): Promise<void> {
    for (const execution of this.activeExecutions.values()) {
      if (execution.status === RecoveryStatus.EXECUTING) {
        execution.status = RecoveryStatus.ABORTED;
        execution.logs.push(`Aborted: ${reason}`);
        execution.endTime = new Date();
      }
    }
  }

  private async performGracefulShutdown(): Promise<void> {
    // Implementation would perform graceful shutdown
    this.emit('gracefulShutdownPerformed');
  }

  private async performForceShutdown(): Promise<void> {
    // Implementation would perform force shutdown
    this.emit('forceShutdownPerformed');
  }

  /** Best-effort message for anything that can be thrown, not just Error instances. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  private generateExecutionId(): string {
    return `exec_${Date.now()}_${this.randomSuffix()}`;
  }

  private generateEventId(): string {
    return `event_${Date.now()}_${this.randomSuffix()}`;
  }

  /** Up to nine base-36 characters; not cryptographically secure. */
  private randomSuffix(): string {
    // slice(2, 11) is the non-deprecated equivalent of substr(2, 9).
    return Math.random().toString(36).slice(2, 11);
  }

  private getTotalProcedureCount(): number {
    let count = 0;
    for (const procedures of this.recoveryProcedures.values()) {
      count += procedures.length;
    }
    return count;
  }

  /**
   * Cleanup resources: stops monitoring, aborts running recoveries and
   * removes every listener registered on this emitter.
   */
  async cleanup(): Promise<void> {
    if (this.monitoringTimer) {
      clearInterval(this.monitoringTimer);
      this.monitoringTimer = undefined;
    }

    // Abort active executions
    await this.abortActiveRecoveries('cleanup');

    this.removeAllListeners();
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sloth-wq/prompt-auto-optimizer-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.