import { EventEmitter } from 'events';
import { Logger } from '../utils/logger.js';
export interface RetryContext {
sessionId: string;
operation: string;
attemptNumber: number;
maxAttempts: number;
error?: Error;
startTime: number;
metadata?: Record<string, any>;
}
export interface RetryStrategy {
maxAttempts: number;
baseDelay: number;
maxDelay: number;
backoffMultiplier: number;
jitterMax: number;
circuitBreakerThreshold: number;
circuitBreakerWindow: number;
shouldRetry: (error: Error, context: RetryContext) => boolean;
}
export interface CircuitBreakerState {
state: 'closed' | 'open' | 'half-open';
failures: number;
lastFailureTime: number;
nextAttemptTime: number;
successCount: number;
}
export class RetryManager extends EventEmitter {
private logger: Logger;
private circuitBreakers: Map<string, CircuitBreakerState>;
private retryStrategies: Map<string, RetryStrategy>;
private activeRetries: Map<string, RetryContext>;
// Default strategies for different error types
private static readonly DEFAULT_STRATEGIES: Record<string, RetryStrategy> = {
network: {
maxAttempts: 5,
baseDelay: 1000,
maxDelay: 16000,
backoffMultiplier: 2,
jitterMax: 1000,
circuitBreakerThreshold: 5,
circuitBreakerWindow: 60000,
shouldRetry: (error: Error) => {
const networkErrors = [
'ECONNREFUSED', 'ECONNRESET', 'ETIMEDOUT', 'ENETDOWN', 'ENETUNREACH',
'EHOSTDOWN', 'EHOSTUNREACH', 'ENOTFOUND', 'EAI_AGAIN'
];
return networkErrors.some(code => error.message.includes(code)) ||
error.message.toLowerCase().includes('network') ||
error.message.toLowerCase().includes('connection');
}
},
authentication: {
maxAttempts: 2,
baseDelay: 2000,
maxDelay: 8000,
backoffMultiplier: 2,
jitterMax: 500,
circuitBreakerThreshold: 3,
circuitBreakerWindow: 120000,
shouldRetry: (error: Error) => {
const authErrors = [
'authentication failed', 'auth failed', 'invalid credentials',
'access denied', 'permission denied', 'unauthorized'
];
const errorMsg = error.message.toLowerCase();
return authErrors.some(msg => errorMsg.includes(msg)) &&
!errorMsg.includes('invalid password'); // Don't retry bad passwords
}
},
resource: {
maxAttempts: 4,
baseDelay: 2000,
maxDelay: 32000,
backoffMultiplier: 2,
jitterMax: 2000,
circuitBreakerThreshold: 5,
circuitBreakerWindow: 180000,
shouldRetry: (error: Error) => {
const resourceErrors = [
'out of memory', 'resource temporarily unavailable',
'too many open files', 'no space left', 'quota exceeded',
'rate limit', 'throttled'
];
return resourceErrors.some(msg => error.message.toLowerCase().includes(msg));
}
},
ssh: {
maxAttempts: 3,
baseDelay: 1500,
maxDelay: 12000,
backoffMultiplier: 2,
jitterMax: 1000,
circuitBreakerThreshold: 4,
circuitBreakerWindow: 90000,
shouldRetry: (error: Error) => {
const sshErrors = [
'connection refused', 'connection reset', 'connection timeout',
'host unreachable', 'network unreachable', 'ssh connection lost'
];
const errorMsg = error.message.toLowerCase();
return sshErrors.some(msg => errorMsg.includes(msg)) &&
!errorMsg.includes('permission denied') &&
!errorMsg.includes('authentication failed');
}
},
generic: {
maxAttempts: 3,
baseDelay: 1000,
maxDelay: 8000,
backoffMultiplier: 2,
jitterMax: 500,
circuitBreakerThreshold: 5,
circuitBreakerWindow: 60000,
shouldRetry: (error: Error) => {
// Generic transient errors
const transientErrors = [
'timeout', 'temporary', 'busy', 'unavailable', 'try again'
];
return transientErrors.some(msg => error.message.toLowerCase().includes(msg));
}
}
};
constructor() {
super();
this.logger = new Logger('RetryManager');
this.circuitBreakers = new Map();
this.retryStrategies = new Map();
this.activeRetries = new Map();
// Initialize default strategies
Object.entries(RetryManager.DEFAULT_STRATEGIES).forEach(([name, strategy]) => {
this.retryStrategies.set(name, strategy);
});
}
/**
* Execute an operation with retry logic
*/
async executeWithRetry<T>(
operation: () => Promise<T>,
options: {
sessionId: string;
operationName: string;
strategyName?: string;
context?: Record<string, any>;
onRetry?: (context: RetryContext) => void;
}
): Promise<T> {
const strategyName = options.strategyName || this.classifyOperation(options.operationName);
const strategy = this.retryStrategies.get(strategyName) || this.retryStrategies.get('generic')!;
const circuitBreakerKey = `${options.sessionId}-${strategyName}`;
// Check circuit breaker
if (this.isCircuitOpen(circuitBreakerKey)) {
const breaker = this.circuitBreakers.get(circuitBreakerKey)!;
const timeUntilNextAttempt = Math.max(0, breaker.nextAttemptTime - Date.now());
throw new Error(
`Circuit breaker is OPEN for ${strategyName}. ` +
`Please wait ${Math.ceil(timeUntilNextAttempt / 1000)}s before retrying. ` +
`This typically indicates repeated failures that require manual intervention.`
);
}
const retryKey = `${options.sessionId}-${options.operationName}`;
const context: RetryContext = {
sessionId: options.sessionId,
operation: options.operationName,
attemptNumber: 0,
maxAttempts: strategy.maxAttempts,
startTime: Date.now(),
metadata: options.context
};
this.activeRetries.set(retryKey, context);
try {
return await this.attemptOperation(operation, strategy, context, circuitBreakerKey, options.onRetry);
} finally {
this.activeRetries.delete(retryKey);
}
}
private async attemptOperation<T>(
operation: () => Promise<T>,
strategy: RetryStrategy,
context: RetryContext,
circuitBreakerKey: string,
onRetry?: (context: RetryContext) => void
): Promise<T> {
while (context.attemptNumber < strategy.maxAttempts) {
context.attemptNumber++;
try {
this.logger.debug(
`Attempting ${context.operation} for session ${context.sessionId} ` +
`(attempt ${context.attemptNumber}/${context.maxAttempts})`
);
const result = await operation();
// Success - reset circuit breaker
this.recordSuccess(circuitBreakerKey);
if (context.attemptNumber > 1) {
this.logger.info(
`${context.operation} succeeded after ${context.attemptNumber} attempts ` +
`for session ${context.sessionId}`
);
this.emit('retry-success', {
...context,
totalDuration: Date.now() - context.startTime
});
}
return result;
} catch (error) {
context.error = error as Error;
// Record failure for circuit breaker
this.recordFailure(circuitBreakerKey);
// Check if we should retry this error
if (!strategy.shouldRetry(context.error, context)) {
this.logger.warn(
`${context.operation} failed with non-retryable error for session ${context.sessionId}: ${context.error.message}`
);
this.emit('retry-failed', {
...context,
reason: 'non-retryable-error',
totalDuration: Date.now() - context.startTime
});
throw this.enhanceError(context.error, context, 'This error type cannot be automatically retried');
}
// Check if we've exhausted retries
if (context.attemptNumber >= strategy.maxAttempts) {
this.logger.error(
`${context.operation} failed after ${context.attemptNumber} attempts ` +
`for session ${context.sessionId}: ${context.error.message}`
);
this.emit('retry-exhausted', {
...context,
totalDuration: Date.now() - context.startTime
});
throw this.enhanceError(
context.error,
context,
`Operation failed after ${context.maxAttempts} retry attempts. ` +
`Consider checking your configuration or waiting before retrying manually.`
);
}
// Calculate delay for next attempt
const delay = this.calculateDelay(context.attemptNumber, strategy);
this.logger.info(
`${context.operation} attempt ${context.attemptNumber} failed for session ${context.sessionId}. ` +
`Retrying in ${delay}ms. Error: ${context.error.message}`
);
this.emit('retry-attempt', {
...context,
delay,
nextAttempt: context.attemptNumber + 1
});
if (onRetry) {
onRetry(context);
}
// Wait before next attempt
await this.delay(delay);
}
}
// This should never be reached, but TypeScript needs it
throw context.error!;
}
private classifyOperation(operationName: string): string {
const networkOps = ['connect', 'send', 'receive', 'request', 'fetch'];
const authOps = ['authenticate', 'login', 'auth', 'credential'];
const sshOps = ['ssh', 'shell', 'terminal'];
const resourceOps = ['spawn', 'create', 'allocate', 'memory'];
const opLower = operationName.toLowerCase();
if (sshOps.some(op => opLower.includes(op))) return 'ssh';
if (networkOps.some(op => opLower.includes(op))) return 'network';
if (authOps.some(op => opLower.includes(op))) return 'authentication';
if (resourceOps.some(op => opLower.includes(op))) return 'resource';
return 'generic';
}
private calculateDelay(attemptNumber: number, strategy: RetryStrategy): number {
// Exponential backoff: baseDelay * (backoffMultiplier ^ (attemptNumber - 1))
const exponentialDelay = Math.min(
strategy.baseDelay * Math.pow(strategy.backoffMultiplier, attemptNumber - 1),
strategy.maxDelay
);
// Add jitter to avoid thundering herd
const jitter = Math.random() * strategy.jitterMax;
return Math.floor(exponentialDelay + jitter);
}
private delay(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms));
}
private isCircuitOpen(key: string): boolean {
const breaker = this.circuitBreakers.get(key);
if (!breaker) return false;
const now = Date.now();
if (breaker.state === 'open') {
if (now >= breaker.nextAttemptTime) {
// Transition to half-open
breaker.state = 'half-open';
breaker.successCount = 0;
this.logger.info(`Circuit breaker ${key} transitioning to HALF-OPEN`);
return false;
}
return true;
}
if (breaker.state === 'half-open') {
// Allow limited attempts in half-open state
return false;
}
return false;
}
private recordSuccess(key: string): void {
const breaker = this.circuitBreakers.get(key);
if (!breaker) return;
if (breaker.state === 'half-open') {
breaker.successCount++;
if (breaker.successCount >= 3) {
// Transition back to closed
breaker.state = 'closed';
breaker.failures = 0;
this.logger.info(`Circuit breaker ${key} transitioning to CLOSED`);
}
} else if (breaker.state === 'closed') {
// Reset failure count on success
breaker.failures = Math.max(0, breaker.failures - 1);
}
}
private recordFailure(key: string): void {
const strategy = this.getStrategyForKey(key);
let breaker = this.circuitBreakers.get(key);
if (!breaker) {
breaker = {
state: 'closed',
failures: 0,
lastFailureTime: Date.now(),
nextAttemptTime: 0,
successCount: 0
};
this.circuitBreakers.set(key, breaker);
}
breaker.failures++;
breaker.lastFailureTime = Date.now();
if (breaker.state === 'closed' && breaker.failures >= strategy.circuitBreakerThreshold) {
// Transition to open
breaker.state = 'open';
breaker.nextAttemptTime = Date.now() + strategy.circuitBreakerWindow;
this.logger.warn(
`Circuit breaker ${key} is now OPEN due to ${breaker.failures} consecutive failures. ` +
`Will remain open for ${strategy.circuitBreakerWindow / 1000}s`
);
this.emit('circuit-breaker-open', { key, failures: breaker.failures, windowMs: strategy.circuitBreakerWindow });
} else if (breaker.state === 'half-open') {
// Transition back to open
breaker.state = 'open';
breaker.nextAttemptTime = Date.now() + strategy.circuitBreakerWindow;
this.logger.warn(`Circuit breaker ${key} returning to OPEN state after failure in half-open`);
}
}
private getStrategyForKey(key: string): RetryStrategy {
// Extract strategy name from key format: sessionId-strategyName
const parts = key.split('-');
if (parts.length >= 2) {
const strategyName = parts[parts.length - 1];
return this.retryStrategies.get(strategyName) || this.retryStrategies.get('generic')!;
}
return this.retryStrategies.get('generic')!;
}
private enhanceError(originalError: Error, context: RetryContext, userGuidance: string): Error {
const enhancedError = new Error(
`${originalError.message}\n\n` +
`Operation: ${context.operation}\n` +
`Session: ${context.sessionId}\n` +
`Attempts: ${context.attemptNumber}/${context.maxAttempts}\n` +
`Duration: ${Date.now() - context.startTime}ms\n\n` +
`Guidance: ${userGuidance}\n\n` +
`To resolve this issue:\n` +
`1. Check your network connection and firewall settings\n` +
`2. Verify credentials and permissions\n` +
`3. Check if the target system is accessible and responsive\n` +
`4. Consider waiting a few minutes before retrying manually\n` +
`5. Review the system logs for additional error details`
);
enhancedError.name = originalError.name;
enhancedError.stack = originalError.stack;
return enhancedError;
}
/**
* Register a custom retry strategy
*/
registerStrategy(name: string, strategy: RetryStrategy): void {
this.retryStrategies.set(name, strategy);
this.logger.info(`Registered retry strategy: ${name}`);
}
/**
* Get current circuit breaker states
*/
getCircuitBreakerStates(): Record<string, CircuitBreakerState> {
const states: Record<string, CircuitBreakerState> = {};
this.circuitBreakers.forEach((state, key) => {
states[key] = { ...state };
});
return states;
}
/**
* Get active retry operations
*/
getActiveRetries(): RetryContext[] {
return Array.from(this.activeRetries.values());
}
/**
* Manually reset a circuit breaker
*/
resetCircuitBreaker(key: string): boolean {
const breaker = this.circuitBreakers.get(key);
if (breaker) {
breaker.state = 'closed';
breaker.failures = 0;
breaker.successCount = 0;
this.logger.info(`Manually reset circuit breaker: ${key}`);
return true;
}
return false;
}
/**
* Reset all circuit breakers
*/
resetAllCircuitBreakers(): void {
this.circuitBreakers.forEach((breaker, key) => {
breaker.state = 'closed';
breaker.failures = 0;
breaker.successCount = 0;
});
this.logger.info('Reset all circuit breakers');
}
/**
* Get retry statistics
*/
getRetryStats(): {
activeRetries: number;
circuitBreakers: { closed: number; open: number; halfOpen: number };
strategies: string[];
} {
const circuitBreakers = { closed: 0, open: 0, halfOpen: 0 };
this.circuitBreakers.forEach(breaker => {
if (breaker.state === 'closed') circuitBreakers.closed++;
else if (breaker.state === 'open') circuitBreakers.open++;
else circuitBreakers.halfOpen++;
});
return {
activeRetries: this.activeRetries.size,
circuitBreakers,
strategies: Array.from(this.retryStrategies.keys())
};
}
/**
* Clean up resources
*/
destroy(): void {
this.removeAllListeners();
this.circuitBreakers.clear();
this.activeRetries.clear();
this.logger.info('RetryManager destroyed');
}
}