// index.ts (9.41 kB)
/**
* GEPA Resilience System - Main Export
*
* Comprehensive resilience patterns for robust system operation:
* - Circuit Breaker Pattern
* - Intelligent Retry Mechanisms
* - Graceful Degradation
* - Timeout Management
*/
export {
CircuitBreaker,
CircuitBreakerState,
CircuitBreakerFactory
} from './circuit-breaker';
export type {
CircuitBreakerConfig,
CircuitBreakerMetrics
} from './circuit-breaker';
// Import classes for internal use
import { CircuitBreakerFactory } from './circuit-breaker';
import { RetryManager } from './retry-manager';
import { GracefulDegradationManager } from './graceful-degradation';
import { TimeoutManager } from './timeout-manager';
export {
RetryManager,
RetryHelper
} from './retry-manager';
export type {
RetryPolicy,
RetryResult,
RetryAttempt,
ContextualRule,
OperationContext as RetryOperationContext
} from './retry-manager';
export {
GracefulDegradationManager,
DegradationLevel,
ServicePriority,
UserFriendlyErrorHandler
} from './graceful-degradation';
export type {
FallbackConfig,
FallbackStrategy,
DegradationTrigger,
SystemHealth,
DegradationEvent
} from './graceful-degradation';
export {
TimeoutManager,
TimeoutHelper
} from './timeout-manager';
export type {
TimeoutConfig,
TimeoutEvent,
OperationContext as TimeoutOperationContext
} from './timeout-manager';
/**
 * Resilience System Coordinator
 *
 * Provides a unified interface for all resilience patterns by composing the
 * retry manager, graceful-degradation manager, timeout manager and the
 * circuit breaker factory behind a single object.
 */
export class ResilienceSystem {
  private static instance: ResilienceSystem;

  /**
   * Relative severity of each reported status. Used so that the overall
   * status can only get worse while findings are accumulated.
   */
  private static readonly STATUS_SEVERITY: Record<'healthy' | 'degraded' | 'critical', number> = {
    healthy: 0,
    degraded: 1,
    critical: 2
  };

  private retryManager: RetryManager;
  private degradationManager: GracefulDegradationManager;
  private timeoutManager: TimeoutManager;

  constructor() {
    // Each component exposes its own static getInstance accessor.
    this.retryManager = RetryManager.getInstance();
    this.degradationManager = GracefulDegradationManager.getInstance();
    this.timeoutManager = TimeoutManager.getInstance();
  }

  /**
   * Get singleton instance, creating it on first use.
   */
  static getInstance(): ResilienceSystem {
    if (!this.instance) {
      this.instance = new ResilienceSystem();
    }
    return this.instance;
  }

  /**
   * Return whichever of the two statuses is more severe.
   */
  private static worstStatus(
    current: 'healthy' | 'degraded' | 'critical',
    candidate: 'healthy' | 'degraded' | 'critical'
  ): 'healthy' | 'degraded' | 'critical' {
    const severity = ResilienceSystem.STATUS_SEVERITY;
    return severity[candidate] > severity[current] ? candidate : current;
  }

  /**
   * Execute operation with full resilience protection.
   *
   * Layering, from outermost to innermost call:
   * fallback -> retry -> timeout -> circuit breaker (optional) -> operation.
   *
   * @param operation - The async operation to protect.
   * @param config - Service name plus optional settings. `timeoutConfig` and
   *   `retryPolicy` default to 'generic', `circuitBreaker` defaults to true.
   * @returns The value produced by the degradation manager's
   *   `executeWithFallback` call.
   */
  async executeWithFullProtection<T>(
    operation: () => Promise<T>,
    config: {
      serviceName: string;
      timeoutConfig?: string;
      retryPolicy?: string;
      circuitBreaker?: boolean;
      fallbackValue?: T;
      context?: {
        name: string;
        priority: 'low' | 'medium' | 'high' | 'critical';
        canAbort?: boolean;
        metadata?: Record<string, any>;
      };
    }
  ): Promise<T> {
    const {
      serviceName,
      timeoutConfig = 'generic',
      retryPolicy = 'generic',
      circuitBreaker = true,
      fallbackValue,
      context
    } = config;

    // Create circuit breaker if requested
    const breaker = circuitBreaker
      ? CircuitBreakerFactory.createServiceCircuitBreaker(serviceName)
      : undefined;

    // Timeout context with the optional fields filled in.
    const timeoutContext = context
      ? {
          name: context.name,
          priority: context.priority || 'medium',
          canAbort: context.canAbort || false,
          metadata: context.metadata || {}
        }
      : undefined;

    // Innermost layer: the raw operation. Kept as an async wrapper so a
    // synchronous throw from `operation` surfaces as a rejected promise.
    const circuitOperation = async () => operation();

    // Circuit breaker protection (skipped when no breaker was created).
    const timeoutOperation = async () =>
      breaker ? await breaker.execute(circuitOperation) : await circuitOperation();

    // Timeout protection around the (optionally) breaker-guarded call.
    const resilientOperation = async () =>
      await this.timeoutManager.executeWithTimeout(
        timeoutOperation,
        timeoutConfig,
        timeoutContext
      );

    // Retry with fallback protection
    return await this.degradationManager.executeWithFallback(
      serviceName,
      async () => {
        return await this.retryManager.executeWithRetry(
          resilientOperation,
          retryPolicy,
          context,
          breaker
        );
      },
      context,
      fallbackValue
    );
  }

  /**
   * Get system-wide resilience metrics.
   *
   * @returns Per-breaker metrics keyed by breaker name, the retry manager's
   *   stats, the degradation manager's system health, and the timeout
   *   manager's active timeouts.
   */
  async getResilienceMetrics(): Promise<{
    circuitBreakers: Map<string, any>;
    retryStats: Map<string, any>;
    systemHealth: any;
    activeTimeouts: Map<string, any>;
  }> {
    const circuitBreakers = new Map();
    for (const [name, breaker] of CircuitBreakerFactory.getAllInstances()) {
      circuitBreakers.set(name, breaker.getMetrics());
    }
    return {
      circuitBreakers,
      retryStats: this.retryManager.getRetryStats(),
      systemHealth: await this.degradationManager.getSystemHealth(),
      activeTimeouts: this.timeoutManager.getActiveTimeouts()
    };
  }

  /**
   * Emergency shutdown of all resilience components.
   *
   * Aborts timeouts first, then runs every component's cleanup and waits for
   * all of them to settle regardless of individual failures.
   *
   * @param reason - Text included in the warning log and passed to the
   *   timeout manager's `emergencyAbort`.
   */
  async emergencyShutdown(reason: string): Promise<void> {
    // eslint-disable-next-line no-console
    console.warn(`Emergency shutdown initiated: ${reason}`);

    // Emergency abort all timeouts
    this.timeoutManager.emergencyAbort(reason);

    // Cleanup all components
    await Promise.allSettled([
      CircuitBreakerFactory.cleanup(),
      this.retryManager.cleanup(),
      this.degradationManager.cleanup(),
      this.timeoutManager.cleanup()
    ]);
  }

  /**
   * Get comprehensive system status.
   *
   * Findings are accumulated from circuit breakers, the degradation level and
   * the number of active timeouts. The overall status is the most severe
   * finding: a later, milder finding never downgrades an earlier one.
   *
   * @returns Overall status, per-breaker component states, the raw metrics
   *   and human-readable recommendations.
   */
  async getSystemStatus(): Promise<{
    status: 'healthy' | 'degraded' | 'critical';
    components: Record<string, string>;
    metrics: any;
    recommendations: string[];
  }> {
    const metrics = await this.getResilienceMetrics();
    const health = metrics.systemHealth;
    let overallStatus: 'healthy' | 'degraded' | 'critical' = 'healthy';
    const components: Record<string, string> = {};
    const recommendations: string[] = [];

    // Analyze circuit breakers
    for (const [name, cbMetrics] of metrics.circuitBreakers) {
      if (cbMetrics.state === 'OPEN') {
        components[name] = 'circuit_open';
        overallStatus = ResilienceSystem.worstStatus(overallStatus, 'critical');
        recommendations.push(`Circuit breaker '${name}' is open - investigate underlying service`);
      } else if (cbMetrics.errorRate > 0.3) {
        components[name] = 'high_error_rate';
        overallStatus = ResilienceSystem.worstStatus(overallStatus, 'degraded');
        recommendations.push(`High error rate detected in '${name}' - monitor service health`);
      } else {
        components[name] = 'healthy';
      }
    }

    // Analyze system health
    if (health.overallLevel !== 'NORMAL') {
      overallStatus = ResilienceSystem.worstStatus(
        overallStatus,
        health.overallLevel === 'EMERGENCY' ? 'critical' : 'degraded'
      );
      recommendations.push(`System degradation level: ${health.overallLevel}`);
    }

    // Analyze active timeouts
    if (metrics.activeTimeouts.size > 10) {
      overallStatus = ResilienceSystem.worstStatus(overallStatus, 'degraded');
      recommendations.push('High number of active operations - system may be overloaded');
    }

    return {
      status: overallStatus,
      components,
      metrics,
      recommendations
    };
  }
}
/**
* Convenience functions for common resilience patterns
*/
/**
 * Execute LLM operation with full resilience.
 *
 * Delegates to `ResilienceSystem.executeWithFullProtection` using
 * 'llm-adapter' as service name, timeout configuration and retry policy, with
 * the circuit breaker enabled.
 *
 * @param operation - The async operation to protect.
 * @param options - Optional `fallbackValue` (forwarded only when it is not
 *   `undefined`) and `context` overrides. Context defaults are name
 *   'llm-operation', priority 'high', `canAbort: true` and empty metadata.
 * @returns The promise returned by `executeWithFullProtection`.
 */
export async function executeLLMWithResilience<T>(
  operation: () => Promise<T>,
  options: {
    fallbackValue?: T;
    context?: { name: string; priority?: 'low' | 'medium' | 'high' | 'critical' };
  } = {}
): Promise<T> {
  const { fallbackValue, context } = options;
  return ResilienceSystem.getInstance().executeWithFullProtection(operation, {
    serviceName: 'llm-adapter',
    timeoutConfig: 'llm-adapter',
    retryPolicy: 'llm-adapter',
    circuitBreaker: true,
    ...(fallbackValue !== undefined && { fallbackValue }),
    context: {
      name: 'llm-operation',
      canAbort: true,
      metadata: {},
      ...context,
      // Resolved after the spread so an explicit `priority: undefined` in the
      // caller's context cannot erase the default.
      priority: context?.priority ?? 'high'
    }
  });
}
/**
 * Execute database operation with resilience.
 *
 * Delegates to `ResilienceSystem.executeWithFullProtection` using
 * 'trajectory-store' as service name, timeout configuration and retry policy,
 * with the circuit breaker enabled.
 *
 * @param operation - The async operation to protect.
 * @param options - Optional `fallbackValue` (forwarded only when it is not
 *   `undefined`) and `context` overrides. Context defaults are name
 *   'database-operation', priority 'medium', `canAbort: true` and empty
 *   metadata.
 * @returns The promise returned by `executeWithFullProtection`.
 */
export async function executeDatabaseWithResilience<T>(
  operation: () => Promise<T>,
  options: {
    fallbackValue?: T;
    context?: { name: string; priority?: 'low' | 'medium' | 'high' | 'critical' };
  } = {}
): Promise<T> {
  const { fallbackValue, context } = options;
  return ResilienceSystem.getInstance().executeWithFullProtection(operation, {
    serviceName: 'trajectory-store',
    timeoutConfig: 'trajectory-store',
    retryPolicy: 'trajectory-store',
    circuitBreaker: true,
    ...(fallbackValue !== undefined && { fallbackValue }),
    context: {
      name: 'database-operation',
      canAbort: true,
      metadata: {},
      ...context,
      // Resolved after the spread so an explicit `priority: undefined` in the
      // caller's context cannot erase the default.
      priority: context?.priority ?? 'medium'
    }
  });
}
/**
 * Execute computation with resilience.
 *
 * Delegates to `ResilienceSystem.executeWithFullProtection` using
 * 'pareto-frontier' as service name, timeout configuration and retry policy.
 * The circuit breaker is disabled for this wrapper.
 *
 * @param operation - The async operation to protect.
 * @param options - Optional `fallbackValue` (forwarded only when it is not
 *   `undefined`) and `context` overrides. Context defaults are name
 *   'computation', priority 'medium', `canAbort: true` and empty metadata.
 * @returns The promise returned by `executeWithFullProtection`.
 */
export async function executeComputationWithResilience<T>(
  operation: () => Promise<T>,
  options: {
    fallbackValue?: T;
    context?: { name: string; priority?: 'low' | 'medium' | 'high' | 'critical' };
  } = {}
): Promise<T> {
  const { fallbackValue, context } = options;
  return ResilienceSystem.getInstance().executeWithFullProtection(operation, {
    serviceName: 'pareto-frontier',
    timeoutConfig: 'pareto-frontier',
    retryPolicy: 'pareto-frontier',
    circuitBreaker: false, // Computations don't typically need circuit breakers
    ...(fallbackValue !== undefined && { fallbackValue }),
    context: {
      name: 'computation',
      canAbort: true,
      metadata: {},
      ...context,
      // Resolved after the spread so an explicit `priority: undefined` in the
      // caller's context cannot erase the default.
      priority: context?.priority ?? 'medium'
    }
  });
}