// graceful-degradation.ts (20.6 kB)
/**
* Graceful Degradation System for GEPA
*
* Provides fallback mechanisms, reduced functionality modes,
* priority-based resource allocation, and user-friendly error handling.
*/
import { EventEmitter } from 'events';
import { MemoryLeakIntegration } from '../memory-leak-detector';
/**
 * Degradation levels for a service or for the system as a whole,
 * declared from fully functional to most restricted.
 */
export enum DegradationLevel {
  /** Full functionality */
  NORMAL = 'NORMAL',
  /** Some features limited */
  REDUCED = 'REDUCED',
  /** Basic functionality only */
  MINIMAL = 'MINIMAL',
  /** Critical operations only */
  EMERGENCY = 'EMERGENCY',
}
/**
 * Priority classes that can be assigned to a registered service.
 */
export enum ServicePriority {
  /** Must always work */
  CRITICAL = 'CRITICAL',
  /** Important but can degrade */
  HIGH = 'HIGH',
  /** Nice to have */
  MEDIUM = 'MEDIUM',
  /** Can be disabled */
  LOW = 'LOW',
}
/**
 * Resource allocation strategy.
 *
 * A service configuration maps degradation levels to one of these
 * records. NOTE(review): the manager in this file looks the strategy up
 * and hands it to `executeWithResourceLimits`, which currently does not
 * read any of the fields below, so treat the per-field descriptions as
 * intent rather than enforced behavior.
 */
export interface ResourceStrategy {
  /** Label for the strategy (the built-in defaults use names such as 'normal' and 'reduced'). */
  name: string;

  /** Intended upper bound on concurrently running operations. */
  maxConcurrency: number;

  /** Factor presumably applied to a base timeout (1.0 in the built-in 'normal' strategies). */
  timeoutMultiplier: number;

  /** Factor presumably applied to a base retry count (1.0 in the built-in 'normal' strategies). */
  retryMultiplier: number;

  /** Whether caching should be used under this strategy. */
  enableCaching: boolean;

  /** Whether request batching should be used under this strategy. */
  enableBatching: boolean;
}
/**
 * Fallback configuration for a single service, as accepted by
 * `GracefulDegradationManager.registerService`.
 */
export interface FallbackConfig {
  /** Unique key under which the service is registered and later looked up. */
  serviceName: string;

  /** Priority class of the service. */
  priority: ServicePriority;

  /** Candidate strategies tried (lowest `priority` number first) when an operation fails. */
  fallbackStrategies: FallbackStrategy[];

  /** Resource strategy to apply for each degradation level; levels may be omitted. */
  resourceStrategies: Map<DegradationLevel, ResourceStrategy>;

  /** Resolves to `true` when the service is healthy; a rejection is reported as a failed service. */
  healthCheck: () => Promise<boolean>;

  /** Conditions that are polled periodically and degrade the service when they hold. */
  degradationTriggers: DegradationTrigger[];
}
/**
 * One fallback strategy: a predicate deciding whether it applies to a
 * failure, and the action that produces a substitute result.
 */
export interface FallbackStrategy {
  /** Name used in the fallback events emitted by the manager. */
  name: string;

  /** Returns `true` when this strategy is able to handle the given error. */
  canHandle: (error: Error, context?: any) => boolean;

  /**
   * Produces the substitute result. A rejection means the strategy failed
   * and the next applicable strategy is tried.
   */
  execute: (originalArgs: any[], error: Error, context?: any) => Promise<any>;

  /** Ordering key: lower numbers = higher priority. */
  priority: number;
}
/**
 * A condition that, when it holds, causes a service to be degraded.
 */
export interface DegradationTrigger {
  /** Name of the trigger; combined with the service name to track its cooldown. */
  name: string;

  /** Predicate evaluated on each monitoring pass; may be synchronous or asynchronous. */
  condition: () => boolean | Promise<boolean>;

  /** Level the service is moved to when the condition holds. */
  targetLevel: DegradationLevel;

  /** Minimum time in milliseconds after the trigger fired before it is evaluated again. */
  cooldownMs: number;
}
/**
 * Snapshot of system health as returned by
 * `GracefulDegradationManager.getSystemHealth`.
 */
export interface SystemHealth {
  /** Overall system level (the worst level among all services). */
  overallLevel: DegradationLevel;

  /** Per-service health, keyed by service name. */
  services: Map<
    string,
    {
      /** Outcome of the most recent health check combined with the service level. */
      status: 'healthy' | 'degraded' | 'failed';
      /** Degradation level reported for the service. */
      level: DegradationLevel;
      /** When this entry was produced. */
      lastCheck: Date;
      /** Errors raised by the health check, if any. */
      errors: Error[];
    }
  >;

  /** Coarse resource figures for the current process. */
  resourceUsage: {
    /** Memory usage figure (the manager in this file reports heapUsed / heapTotal). */
    memory: number;
    /** CPU usage figure (the manager in this file always reports 0). */
    cpu: number;
    /** Number of active connections (the manager in this file always reports 0). */
    activeConnections: number;
  };

  /** Recorded level changes. */
  degradationHistory: DegradationEvent[];
}
/**
 * Record of a single change in a service's degradation level.
 */
export interface DegradationEvent {
  /** When the change happened. */
  timestamp: Date;

  /** Name of the affected service. */
  service: string;

  /** Level before the change. */
  from: DegradationLevel;

  /** Level after the change. */
  to: DegradationLevel;

  /** Identifier of what caused the change (for example 'manual'). */
  trigger: string;

  /** Free-form explanation supplied by whoever requested the change. */
  reason: string;
}
/**
 * Graceful Degradation Manager
 *
 * Keeps a registry of services (see FallbackConfig), tracks one
 * degradation level per service plus an overall system level (the worst
 * service level), runs fallback strategies when a protected operation
 * fails, and periodically evaluates each service's degradation triggers.
 *
 * Constructing an instance registers three default services
 * ('llm-adapter', 'trajectory-store', 'pareto-frontier') and starts two
 * timers: a trigger check every 30 seconds and a history cleanup every
 * 5 minutes. Call cleanup() to stop both timers.
 *
 * Events emitted: 'serviceRegistered', 'serviceDegraded',
 * 'serviceRestored', 'systemLevelChanged', 'fallbackAttempt',
 * 'fallbackSuccess', 'fallbackFailure', 'fallbackValueUsed'.
 */
export class GracefulDegradationManager extends EventEmitter {
  /** Lazily created singleton; reset by cleanup() so a fresh one can be built. */
  private static instance: GracefulDegradationManager | undefined;

  /** Registered service configurations keyed by service name. */
  private services = new Map<string, FallbackConfig>();
  /** Overall system level: the worst level found in serviceLevels. */
  private currentLevel = DegradationLevel.NORMAL;
  /** Current degradation level of each service. */
  private serviceLevels = new Map<string, DegradationLevel>();
  /** Chronological log of level changes, pruned by cleanupHistory(). */
  private degradationHistory: DegradationEvent[] = [];
  /** Handle of the 30 s trigger-check timer. */
  private healthCheckInterval?: ReturnType<typeof setInterval>;
  /** Handle of the 5 min history-cleanup timer (kept so cleanup() can stop it). */
  private historyCleanupInterval?: ReturnType<typeof setInterval>;
  /** Time (ms since epoch) each "<service>-<trigger>" last fired, for cooldowns. */
  private lastTriggerCheck = new Map<string, number>();

  constructor() {
    super();
    this.setupDefaultServices();
    this.startHealthMonitoring();
    this.setupMemoryManagement();
  }

  /**
   * Get singleton instance.
   *
   * @returns the shared manager, creating it on first use or after the
   *   previous shared instance was cleaned up.
   */
  static getInstance(): GracefulDegradationManager {
    if (!GracefulDegradationManager.instance) {
      GracefulDegradationManager.instance = new GracefulDegradationManager();
    }
    return GracefulDegradationManager.instance;
  }

  /**
   * Execute operation with fallback protection.
   *
   * @param serviceName name of a registered service; when the service is
   *   unknown the operation runs directly with no protection.
   * @param operation the work to perform.
   * @param context opaque value forwarded to fallback strategies.
   * @param fallbackValue value returned when every strategy fails
   *   (ignored when `undefined`).
   * @returns the operation result, a fallback strategy result, or
   *   `fallbackValue`.
   * @throws Error when the operation and all fallbacks fail and no
   *   `fallbackValue` was supplied.
   */
  async executeWithFallback<T>(
    serviceName: string,
    operation: () => Promise<T>,
    context?: any,
    fallbackValue?: T
  ): Promise<T> {
    const config = this.services.get(serviceName);
    if (!config) {
      // No fallback config - execute directly
      return operation();
    }

    try {
      const currentLevel = this.serviceLevels.get(serviceName) || DegradationLevel.NORMAL;
      // In emergency mode the operation is never attempted; the thrown
      // error is routed to the fallback strategies below.
      if (currentLevel === DegradationLevel.EMERGENCY) {
        throw new Error(`Service ${serviceName} is in emergency mode`);
      }

      // Apply the resource strategy configured for the current level, if any
      const strategy = config.resourceStrategies.get(currentLevel);
      if (strategy) {
        return await this.executeWithResourceLimits(operation, strategy);
      }
      return await operation();
    } catch (error) {
      // Strategies read `error.message`, so non-Error throwables (strings,
      // plain objects) are wrapped to keep the fallback chain from crashing.
      const failure = error instanceof Error ? error : new Error(String(error));
      return await this.tryFallbackStrategies(config, [], failure, context, fallbackValue);
    }
  }

  /**
   * Try fallback strategies in priority order (lowest number first).
   *
   * Emits 'fallbackAttempt' before each strategy, then 'fallbackSuccess'
   * or 'fallbackFailure'. When all strategies fail and `fallbackValue` is
   * not `undefined`, emits 'fallbackValueUsed' and returns it.
   *
   * @throws Error when nothing could produce a result.
   */
  private async tryFallbackStrategies<T>(
    config: FallbackConfig,
    originalArgs: any[],
    error: Error,
    context?: any,
    fallbackValue?: T
  ): Promise<T> {
    const strategies = config.fallbackStrategies
      .filter(strategy => {
        try {
          return strategy.canHandle(error, context);
        } catch (canHandleError) {
          // A broken predicate must not abort the whole chain; report it
          // and treat the strategy as not applicable.
          this.emit('fallbackFailure', {
            service: config.serviceName,
            strategy: strategy.name,
            error: canHandleError instanceof Error ? canHandleError.message : String(canHandleError)
          });
          return false;
        }
      })
      .sort((a, b) => a.priority - b.priority);

    for (const strategy of strategies) {
      try {
        this.emit('fallbackAttempt', {
          service: config.serviceName,
          strategy: strategy.name,
          error: error.message
        });
        const result = await strategy.execute(originalArgs, error, context);
        this.emit('fallbackSuccess', {
          service: config.serviceName,
          strategy: strategy.name
        });
        return result;
      } catch (fallbackError) {
        this.emit('fallbackFailure', {
          service: config.serviceName,
          strategy: strategy.name,
          error: fallbackError instanceof Error ? fallbackError.message : String(fallbackError)
        });
      }
    }

    // All fallbacks failed - return fallback value or throw
    if (fallbackValue !== undefined) {
      this.emit('fallbackValueUsed', {
        service: config.serviceName,
        value: fallbackValue
      });
      return fallbackValue;
    }
    throw new Error(`All fallback strategies failed for ${config.serviceName}: ${error.message}`);
  }

  /**
   * Execute operation with resource limits.
   *
   * Currently a pass-through: the strategy is accepted but not applied
   * (in production, this would use more sophisticated queuing).
   */
  private async executeWithResourceLimits<T>(
    operation: () => Promise<T>,
    _strategy: ResourceStrategy
  ): Promise<T> {
    return operation();
  }

  /**
   * Register a service with fallback configuration.
   *
   * The service starts at NORMAL; registering an existing name replaces
   * its configuration and resets its level. Emits 'serviceRegistered'.
   *
   * @throws Error when the configuration is incomplete (see validateConfig).
   */
  registerService(config: FallbackConfig): void {
    this.validateConfig(config);
    this.services.set(config.serviceName, config);
    this.serviceLevels.set(config.serviceName, DegradationLevel.NORMAL);
    this.emit('serviceRegistered', { name: config.serviceName });
  }

  /**
   * Set a service's degradation level.
   *
   * No-op when the service is already at `level`. Otherwise records a
   * DegradationEvent, recomputes the overall level and emits
   * 'serviceDegraded'.
   *
   * @param serviceName service to change.
   * @param level level to move the service to.
   * @param reason human-readable explanation stored in the event.
   * @param trigger identifier stored in the event's `trigger` field;
   *   defaults to 'manual' for direct callers.
   */
  degradeService(
    serviceName: string,
    level: DegradationLevel,
    reason: string,
    trigger: string = 'manual'
  ): void {
    const currentLevel = this.serviceLevels.get(serviceName) || DegradationLevel.NORMAL;
    if (currentLevel === level) {
      return;
    }

    this.serviceLevels.set(serviceName, level);
    const event: DegradationEvent = {
      timestamp: new Date(),
      service: serviceName,
      from: currentLevel,
      to: level,
      trigger,
      reason
    };
    this.degradationHistory.push(event);
    this.updateOverallLevel();
    this.emit('serviceDegraded', event);
  }

  /**
   * Restore service to normal operation.
   *
   * No-op when the service is unknown or already NORMAL. Otherwise
   * records a DegradationEvent, recomputes the overall level and emits
   * 'serviceRestored'.
   */
  restoreService(serviceName: string, reason: string): void {
    const currentLevel = this.serviceLevels.get(serviceName);
    if (!currentLevel || currentLevel === DegradationLevel.NORMAL) {
      return;
    }

    this.serviceLevels.set(serviceName, DegradationLevel.NORMAL);
    const event: DegradationEvent = {
      timestamp: new Date(),
      service: serviceName,
      from: currentLevel,
      to: DegradationLevel.NORMAL,
      trigger: 'manual',
      reason
    };
    this.degradationHistory.push(event);
    this.updateOverallLevel();
    this.emit('serviceRestored', event);
  }

  /**
   * Get current system health.
   *
   * Runs every registered service's health check in turn. A check that
   * resolves `false` or rejects is reported as 'failed'; a rejecting
   * check is additionally reported at EMERGENCY level with the error
   * attached (the stored service level is not modified).
   *
   * @returns a snapshot including a copy of the degradation history.
   */
  async getSystemHealth(): Promise<SystemHealth> {
    const services: SystemHealth['services'] = new Map();

    for (const [name, config] of Array.from(this.services.entries())) {
      try {
        const isHealthy = await config.healthCheck();
        const level = this.serviceLevels.get(name) || DegradationLevel.NORMAL;
        services.set(name, {
          status: isHealthy ? (level === DegradationLevel.NORMAL ? 'healthy' : 'degraded') : 'failed',
          level,
          lastCheck: new Date(),
          errors: []
        });
      } catch (error) {
        services.set(name, {
          status: 'failed',
          level: DegradationLevel.EMERGENCY,
          lastCheck: new Date(),
          errors: [error instanceof Error ? error : new Error(String(error))]
        });
      }
    }

    return {
      overallLevel: this.currentLevel,
      services,
      resourceUsage: await this.getResourceUsage(),
      degradationHistory: [...this.degradationHistory]
    };
  }

  /**
   * Check degradation triggers for all services.
   *
   * A trigger that fired is skipped until its cooldown has elapsed.
   * Triggers only ever escalate: a firing trigger never moves a service
   * that is already at a worse level to a milder one. Errors from a
   * trigger's condition are logged and do not stop the other checks.
   */
  private async checkDegradationTriggers(): Promise<void> {
    for (const [serviceName, config] of Array.from(this.services.entries())) {
      for (const trigger of config.degradationTriggers) {
        const key = `${serviceName}-${trigger.name}`;
        const lastFired = this.lastTriggerCheck.get(key) || 0;
        const now = Date.now();

        // Check cooldown
        if (now - lastFired < trigger.cooldownMs) {
          continue;
        }

        try {
          const shouldTrigger = await trigger.condition();
          if (shouldTrigger) {
            const currentLevel = this.serviceLevels.get(serviceName) || DegradationLevel.NORMAL;
            // Only escalate; e.g. a REDUCED trigger must not pull a
            // service out of EMERGENCY.
            if (this.getLevelPriority(trigger.targetLevel) > this.getLevelPriority(currentLevel)) {
              this.degradeService(
                serviceName,
                trigger.targetLevel,
                `Trigger: ${trigger.name}`,
                trigger.name
              );
            }
            this.lastTriggerCheck.set(key, now);
          }
        } catch (error) {
          // eslint-disable-next-line no-console
          console.warn(`Trigger check failed for ${serviceName}-${trigger.name}:`, error);
        }
      }
    }
  }

  /**
   * Update overall system degradation level to the worst service level
   * and emit 'systemLevelChanged' when it changes.
   */
  private updateOverallLevel(): void {
    let worstLevel = DegradationLevel.NORMAL;
    for (const level of Array.from(this.serviceLevels.values())) {
      if (this.getLevelPriority(level) > this.getLevelPriority(worstLevel)) {
        worstLevel = level;
      }
    }

    if (worstLevel !== this.currentLevel) {
      const oldLevel = this.currentLevel;
      this.currentLevel = worstLevel;
      this.emit('systemLevelChanged', {
        from: oldLevel,
        to: worstLevel,
        timestamp: new Date()
      });
    }
  }

  /**
   * Get numeric priority for degradation level (higher = worse).
   */
  private getLevelPriority(level: DegradationLevel): number {
    switch (level) {
      case DegradationLevel.NORMAL: return 0;
      case DegradationLevel.REDUCED: return 1;
      case DegradationLevel.MINIMAL: return 2;
      case DegradationLevel.EMERGENCY: return 3;
      default: return 0;
    }
  }

  /**
   * Setup default service configurations for 'llm-adapter',
   * 'trajectory-store' and 'pareto-frontier'.
   */
  private setupDefaultServices(): void {
    // LLM Adapter Service
    this.registerService({
      serviceName: 'llm-adapter',
      priority: ServicePriority.CRITICAL,
      fallbackStrategies: [
        {
          name: 'cached-response',
          priority: 1,
          canHandle: (error) => error.message.includes('rate limit') || error.message.includes('timeout'),
          execute: async (_args, _error, _context) => {
            // Return a cached or simplified response
            return {
              content: 'Service temporarily unavailable. Please try again later.',
              model: 'fallback',
              tokens: { prompt: 0, completion: 0, total: 0 },
              finishReason: 'fallback',
              latency: 0,
              timestamp: new Date()
            };
          }
        },
        {
          name: 'simplified-prompt',
          priority: 2,
          canHandle: (error) => error.message.includes('complexity') || error.message.includes('length'),
          execute: async (_args, _error, _context) => {
            // Use a simpler, shorter prompt
            throw new Error('Simplified prompt fallback not implemented');
          }
        }
      ],
      resourceStrategies: new Map<DegradationLevel, ResourceStrategy>([
        [DegradationLevel.NORMAL, {
          name: 'normal',
          maxConcurrency: 5,
          timeoutMultiplier: 1.0,
          retryMultiplier: 1.0,
          enableCaching: true,
          enableBatching: true
        }],
        [DegradationLevel.REDUCED, {
          name: 'reduced',
          maxConcurrency: 3,
          timeoutMultiplier: 0.8,
          retryMultiplier: 0.5,
          enableCaching: true,
          enableBatching: true
        }],
        [DegradationLevel.MINIMAL, {
          name: 'minimal',
          maxConcurrency: 1,
          timeoutMultiplier: 0.5,
          retryMultiplier: 0.2,
          enableCaching: true,
          enableBatching: false
        }]
      ]),
      healthCheck: async () => {
        // Simple health check - could ping the LLM service
        return true;
      },
      degradationTriggers: [
        {
          name: 'high-error-rate',
          condition: async () => {
            // Check if error rate is too high
            return false; // Implement actual check
          },
          targetLevel: DegradationLevel.REDUCED,
          cooldownMs: 60000
        }
      ]
    });

    // Trajectory Store Service
    this.registerService({
      serviceName: 'trajectory-store',
      priority: ServicePriority.HIGH,
      fallbackStrategies: [
        {
          name: 'memory-cache',
          priority: 1,
          canHandle: (error) => error.message.includes('connection') || error.message.includes('database'),
          execute: async (_args, _error, _context) => {
            // Use in-memory cache as fallback
            // eslint-disable-next-line no-console
            console.warn('Using memory cache fallback for trajectory store');
            return null; // Return cached data or null
          }
        }
      ],
      resourceStrategies: new Map<DegradationLevel, ResourceStrategy>([
        [DegradationLevel.NORMAL, {
          name: 'normal',
          maxConcurrency: 10,
          timeoutMultiplier: 1.0,
          retryMultiplier: 1.0,
          enableCaching: true,
          enableBatching: true
        }],
        [DegradationLevel.REDUCED, {
          name: 'reduced',
          maxConcurrency: 5,
          timeoutMultiplier: 0.7,
          retryMultiplier: 0.5,
          enableCaching: true,
          enableBatching: true
        }]
      ]),
      healthCheck: async () => true,
      degradationTriggers: []
    });

    // Pareto Frontier Service
    this.registerService({
      serviceName: 'pareto-frontier',
      priority: ServicePriority.MEDIUM,
      fallbackStrategies: [
        {
          name: 'simple-selection',
          priority: 1,
          canHandle: (error) => error.message.includes('memory') || error.message.includes('computation'),
          execute: async (_args, _error, _context) => {
            // Use simple best-candidate selection instead of Pareto optimization
            // eslint-disable-next-line no-console
            console.warn('Using simple selection fallback for Pareto frontier');
            return null; // Return simplified result
          }
        }
      ],
      resourceStrategies: new Map<DegradationLevel, ResourceStrategy>([
        [DegradationLevel.NORMAL, {
          name: 'normal',
          maxConcurrency: 5,
          timeoutMultiplier: 1.0,
          retryMultiplier: 1.0,
          enableCaching: true,
          enableBatching: true
        }],
        [DegradationLevel.REDUCED, {
          name: 'reduced',
          maxConcurrency: 2,
          timeoutMultiplier: 0.6,
          retryMultiplier: 0.3,
          enableCaching: true,
          enableBatching: false
        }]
      ]),
      healthCheck: async () => true,
      degradationTriggers: []
    });
  }

  /**
   * Start health monitoring: evaluate degradation triggers every
   * 30 seconds. Failures of a whole pass are logged, never thrown.
   */
  private startHealthMonitoring(): void {
    this.healthCheckInterval = setInterval(async () => {
      try {
        await this.checkDegradationTriggers();
      } catch (error) {
        // eslint-disable-next-line no-console
        console.warn('Health monitoring error:', error);
      }
    }, 30000); // Check every 30 seconds
  }

  /**
   * Get current resource usage.
   *
   * @returns `memory` as heapUsed / heapTotal of this process; `cpu` and
   *   `activeConnections` are placeholders that are always 0.
   */
  private async getResourceUsage(): Promise<{
    memory: number;
    cpu: number;
    activeConnections: number;
  }> {
    const memUsage = process.memoryUsage();
    return {
      memory: memUsage.heapUsed / memUsage.heapTotal,
      cpu: 0, // Would need additional monitoring
      activeConnections: 0 // Would track actual connections
    };
  }

  /**
   * Validate service configuration.
   *
   * @throws Error when the name, the health check, or at least one
   *   fallback strategy and one resource strategy are missing.
   */
  private validateConfig(config: FallbackConfig): void {
    if (!config.serviceName) {
      throw new Error('Service name is required');
    }
    if (!config.fallbackStrategies || config.fallbackStrategies.length === 0) {
      throw new Error('At least one fallback strategy is required');
    }
    if (!config.resourceStrategies || config.resourceStrategies.size === 0) {
      throw new Error('At least one resource strategy is required');
    }
    if (!config.healthCheck) {
      throw new Error('Health check function is required');
    }
  }

  /**
   * Setup memory management: initialize the memory-leak integration and
   * schedule history cleanup every 5 minutes.
   */
  private setupMemoryManagement(): void {
    MemoryLeakIntegration.initialize();

    // Keep the handle so cleanup() can stop the timer; an untracked
    // interval would keep firing (and keep the process alive) forever.
    this.historyCleanupInterval = setInterval(() => {
      this.cleanupHistory();
    }, 300000); // Every 5 minutes
  }

  /**
   * Cleanup old degradation history to prevent memory leaks: drop events
   * older than 24 hours, then keep at most the 1000 most recent.
   */
  private cleanupHistory(): void {
    const cutoff = Date.now() - (24 * 60 * 60 * 1000); // 24 hours
    this.degradationHistory = this.degradationHistory.filter(
      event => event.timestamp.getTime() > cutoff
    );

    // Limit total history size
    if (this.degradationHistory.length > 1000) {
      this.degradationHistory = this.degradationHistory.slice(-1000);
    }
  }

  /**
   * Cleanup resources: stop both timers, remove all listeners and clear
   * all state. If this object is the shared singleton, the singleton
   * slot is released so the next getInstance() builds a working manager
   * instead of returning this emptied one.
   */
  async cleanup(): Promise<void> {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
    }
    if (this.historyCleanupInterval) {
      clearInterval(this.historyCleanupInterval);
    }

    this.removeAllListeners();
    this.services.clear();
    this.serviceLevels.clear();
    this.degradationHistory = [];
    this.lastTriggerCheck.clear();
    // No services remain, so the overall level is NORMAL again.
    this.currentLevel = DegradationLevel.NORMAL;

    if (GracefulDegradationManager.instance === this) {
      GracefulDegradationManager.instance = undefined;
    }
  }
}
/**
 * Translates technical errors into text suitable for end users.
 *
 * Matching is done by case-insensitive substring search on the error
 * message; the first matching pattern (in declaration order) wins.
 */
export class UserFriendlyErrorHandler {
  /** Substring pattern (lower case) -> user-facing message. */
  private static errorMessages = new Map<string, string>([
    ['rate limit', 'The service is currently busy. Please try again in a few moments.'],
    ['timeout', 'The operation took longer than expected. Please try again.'],
    ['network', 'There seems to be a connectivity issue. Please check your connection and try again.'],
    ['authentication', 'Authentication failed. Please check your credentials.'],
    ['permission', 'You don\'t have permission to perform this action.'],
    ['not found', 'The requested resource could not be found.'],
    ['server error', 'We\'re experiencing technical difficulties. Our team has been notified.'],
    ['maintenance', 'The service is currently under maintenance. Please try again later.']
  ]);

  /**
   * Convert a technical error into a user-friendly message.
   *
   * @param error the error whose message is inspected.
   * @param context optional label; when non-empty it is prepended as
   *   "<context>: ".
   * @returns the message of the first matching pattern, or a generic
   *   message when nothing matches.
   */
  static getDisplayMessage(error: Error, context?: string): string {
    const haystack = error.message.toLowerCase();

    const hit = Array.from(this.errorMessages.entries()).find(
      ([needle]) => haystack.includes(needle)
    );
    const friendly = hit
      ? hit[1]
      : 'We encountered an unexpected issue. Please try again.';

    if (context) {
      return `${context}: ${friendly}`;
    }
    return friendly;
  }

  /**
   * Get recommended user actions for an error.
   *
   * @param error the error whose message is inspected.
   * @returns a fresh list of suggestions for the first matching category
   *   (rate limit, timeout, network, authentication), or generic advice.
   */
  static getRecommendedActions(error: Error): string[] {
    const haystack = error.message.toLowerCase();

    // Built per call so every caller receives its own arrays.
    const advice: Array<[string, string[]]> = [
      ['rate limit', ['Wait a few minutes before trying again', 'Consider reducing request frequency']],
      ['timeout', ['Try again with a simpler request', 'Check your internet connection']],
      ['network', ['Check your internet connection', 'Try again in a few moments']],
      ['authentication', ['Verify your login credentials', 'Try logging out and back in']]
    ];

    for (const [needle, actions] of advice) {
      if (haystack.includes(needle)) {
        return actions;
      }
    }
    return ['Try again in a few moments', 'Contact support if the issue persists'];
  }
}