// incident-response.ts (35.1 kB)
/**
* GEPA Incident Response System
*
* Automatic incident creation, escalation, and lifecycle management
* with intelligent alert routing and post-incident analysis.
*/
import { EventEmitter } from 'events';
import { ErrorEvent, ErrorLevel, ErrorCategory } from './error-tracking-system';
import { AnomalyEvent, AnomalySeverity } from './anomaly-detection';
import { PerformanceTracker } from '../../services/performance-tracker';
import { ResilienceSystem } from '../resilience/index';
import { MemoryLeakIntegration } from '../memory-leak-detector';
// Core Incident Types
/**
* A single tracked incident together with its classification, ownership,
* correlated events and audit timeline.
*/
export interface Incident {
/** Unique identifier; the system generates it as `incident_<epoch ms>_<random suffix>`. */
id: string;
/** Short human-readable headline. */
title: string;
/** Longer free-text description of what happened. */
description: string;
/** Impact level of the incident. */
severity: IncidentSeverity;
/** Current lifecycle state; new incidents start as OPEN. */
status: IncidentStatus;
/** Response urgency; initially derived from severity and raised by escalation. */
priority: IncidentPriority;
/** Creation time as epoch milliseconds (`Date.now()`). */
createdAt: number;
/** Time of the last modification as epoch milliseconds. */
updatedAt: number;
/** Epoch milliseconds at which the status was set to RESOLVED; absent until then. */
resolvedAt?: number;
/** User the incident is assigned to, if any. */
assignee?: string;
/** Owning team, set by manual assignment or by system-based auto-assignment. */
team?: string;
/** Free-form labels; alert routing can match against them. */
tags: string[];
/** Names of the systems involved; used for correlation, routing and auto-assignment. */
affectedSystems: string[];
/** Root cause, once identified; quoted in the post-incident report. */
rootCause?: string;
/** Description of how the incident was resolved. */
resolution?: string;
/** Errors, anomalies and other events that triggered or were correlated with this incident. */
relatedEvents: RelatedEvent[];
/** Append-only audit log of actions taken on the incident. */
timeline: IncidentTimelineEntry[];
/** Trigger-specific details, e.g. `triggerType`, `errorCategory` or `anomalyType`. */
metadata: Record<string, any>;
}
/**
* Impact level of an incident. The system derives the initial priority from
* it (CRITICAL -> P1, HIGH -> P2, MEDIUM -> P3, LOW -> P4).
*/
export enum IncidentSeverity {
LOW = 'low',
MEDIUM = 'medium',
HIGH = 'high',
CRITICAL = 'critical'
}
/**
* Lifecycle states of an incident. RESOLVED and CLOSED are treated as
* finished by the correlation and background checks; only CLOSED incidents
* are eligible for age-based cleanup.
*/
export enum IncidentStatus {
OPEN = 'open',
INVESTIGATING = 'investigating',
IDENTIFIED = 'identified',
MONITORING = 'monitoring',
RESOLVED = 'resolved',
CLOSED = 'closed'
}
/**
* Response urgency of an incident, P1 being the most urgent. Each level has
* its own entry in `IncidentResponseConfig.escalationTimeouts`.
*/
export enum IncidentPriority {
P1 = 'P1', // Critical - immediate response
P2 = 'P2', // High - 1 hour response
P3 = 'P3', // Medium - 4 hour response
P4 = 'P4' // Low - 24 hour response
}
/**
* Reference to an external event (error, anomaly, metric or alert) attached
* to an incident.
*/
export interface RelatedEvent {
/** Kind of source event. */
type: 'error' | 'anomaly' | 'metric' | 'alert';
/** Identifier of the source event in its originating subsystem. */
eventId: string;
/** Timestamp copied from the source event. */
timestamp: number;
/** Human-readable summary of the event. */
description: string;
/** The system uses 1.0 for the triggering event and 0.8 for events correlated later. */
correlation: number; // 0-1 correlation score
}
/**
* One entry in an incident's audit timeline.
*/
export interface IncidentTimelineEntry {
/** Epoch milliseconds at which the action was recorded. */
timestamp: number;
/**
* Machine-readable action name. The system itself writes `created`,
* `status_change`, `assignment`, `auto_assignment`, `comment`, `escalation`
* and `event_correlation`.
*/
action: string;
/** Human-readable detail of the action. */
description: string;
actor: string; // system or user
/** Optional extra data attached to the entry. */
metadata?: Record<string, any>;
}
/**
* A rule evaluated whenever an incident is escalated: if every condition
* holds, all actions are executed in order.
*/
export interface EscalationRule {
/** Key under which the rule is stored; configuring a rule with an existing id replaces it. */
id: string;
/** Display name. */
name: string;
/** Conditions that must ALL be satisfied for the rule to fire. */
conditions: EscalationCondition[];
/** Actions run, in order, when the rule fires. */
actions: EscalationAction[];
/** Disabled rules are skipped entirely. */
enabled: boolean;
}
/**
* A single predicate of an escalation rule, comparing one incident attribute
* with `value`.
*/
export interface EscalationCondition {
/**
* Attribute to test: `severity` and `status` read the incident fields,
* `time` is the incident age in milliseconds since creation. `custom`
* conditions are not evaluated by the built-in evaluator and never match.
*/
type: 'time' | 'severity' | 'status' | 'custom';
/** Comparison applied between the attribute and `value`. */
operator: 'equals' | 'greater_than' | 'less_than' | 'contains';
/** Right-hand side of the comparison. */
value: any;
}
/**
* An action executed when an escalation rule fires.
*/
export interface EscalationAction {
/** Kind of action to perform. */
type: 'assign' | 'notify' | 'escalate' | 'auto_resolve' | 'run_playbook';
/** Action-specific target: the assignee for `assign`, the recipient for `notify`. */
target: string;
/** Action-specific options; `assign` reads an optional `team` entry. */
parameters: Record<string, any>;
}
/**
* Routing rule for new-incident alerts: when every condition matches, one
* alert is emitted per destination.
*/
export interface AlertRoute {
/** Key under which the route is stored; configuring a route with an existing id replaces it. */
id: string;
/** Display name. */
name: string;
/** Conditions that must ALL match the incident. */
conditions: AlertCondition[];
/** Destinations that each receive an alert when the route matches. */
destinations: AlertDestination[];
/** Disabled routes are skipped entirely. */
enabled: boolean;
}
/**
* A single predicate of an alert route.
*/
export interface AlertCondition {
/**
* Incident field to test. The built-in evaluator understands `severity`,
* `priority`, `affectedSystems` and `tags` (the two list fields are joined
* with commas before comparison); any other field name never matches.
*/
field: string;
/** Comparison applied between the field and `value`. */
operator: 'equals' | 'contains' | 'greater_than' | 'less_than';
/** Right-hand side of the comparison. */
value: any;
}
/**
* Where an alert should be delivered. The system does not deliver alerts
* itself; it emits an `alert` event carrying the destination.
*/
export interface AlertDestination {
/** Delivery channel. */
type: 'email' | 'slack' | 'webhook' | 'pagerduty';
/** Channel-specific address, e.g. a Slack channel name or a service key. */
address: string;
/** Urgency hint passed along with the alert. */
urgency: 'low' | 'medium' | 'high' | 'critical';
}
/**
* Summary produced for a resolved incident.
*/
export interface PostIncidentReport {
/** Id of the incident the report describes. */
incidentId: string;
/** One-sentence summary including the time to resolution. */
summary: string;
/** The incident's timeline entries. */
timeline: IncidentTimelineEntry[];
/** The incident's recorded root cause, or a placeholder when none was recorded. */
rootCauseAnalysis: string;
impactAssessment: {
/** Milliseconds between creation and resolution. */
duration: number;
/** Heuristic estimate derived from severity and the number of affected systems. */
affectedUsers: number;
/** Textual impact rating derived from severity. */
businessImpact: string;
/** Whether data was lost; the built-in generator always reports `false`. */
dataLoss: boolean;
};
/** Generic lessons derived from the trigger type and the number of related events. */
lessonsLearned: string[];
/** Follow-up tasks generated for the incident. */
actionItems: ActionItem[];
/** Suggested measures to avoid recurrence. */
preventionMeasures: string[];
}
/**
* Follow-up task listed in a post-incident report.
*/
export interface ActionItem {
/** Identifier of the task. */
id: string;
/** What needs to be done. */
description: string;
/** Person or team responsible. */
assignee: string;
/** Due date as epoch milliseconds. */
dueDate: number;
/** Importance of the task. */
priority: 'low' | 'medium' | 'high';
/** Progress state of the task. */
status: 'open' | 'in_progress' | 'completed';
}
/**
* Aggregate statistics over the incidents created inside a time window.
* All durations are in milliseconds.
*/
export interface IncidentMetrics {
/** Number of incidents created inside the window. */
totalIncidents: number;
/** Count per severity; every severity is present, possibly with 0. */
incidentsBySeverity: Record<IncidentSeverity, number>;
/** Count per status; every status is present, possibly with 0. */
incidentsByStatus: Record<IncidentStatus, number>;
/** Not computed by the built-in implementation; always 0. */
meanTimeToDetection: number;
/** Average of `resolvedAt - createdAt` over incidents that have a resolution time. */
meanTimeToResolution: number;
/** Average delay until the first `assignment` timeline entry, over incidents with an assignee. */
meanTimeToAcknowledgment: number;
/** Fraction (0-1) of incidents with at least one `escalation` timeline entry. */
escalationRate: number;
/** Not computed by the built-in implementation; always 0. */
falsePositiveRate: number;
}
/**
* Tunables of the incident response system. Every option may be omitted when
* constructing the system; the defaults are noted per field.
*/
export interface IncidentResponseConfig {
/** Create incidents automatically from error and anomaly events (default: true). */
enableAutoIncidentCreation: boolean;
/** Run the periodic timeout-based escalation check (default: true). */
enableAutoEscalation: boolean;
/** Evaluate alert routes for new incidents (default: true). */
enableIntelligentRouting: boolean;
/** Allow post-incident reports to be generated (default: true). */
enablePostIncidentAnalysis: boolean;
/** Age in ms after which CLOSED incidents are deleted by the daily cleanup (default: 30 days). */
maxIncidentAge: number;
/** Age in ms per severity after which an incident may be auto-resolved; only LOW incidents are ever auto-resolved. */
autoResolveTimeouts: Record<IncidentSeverity, number>;
/** Time in ms per priority after which an unresolved incident is auto-escalated. */
escalationTimeouts: Record<IncidentPriority, number>;
/** Only incidents created within this many ms are considered when correlating a new event (default: 5 minutes). */
correlationWindowMs: number;
/** Master switch for alert routing on incident creation (default: true). */
alertingEnabled: boolean;
}
/**
* Comprehensive Incident Response System
*/
export class IncidentResponseSystem extends EventEmitter {
// Effective configuration: caller-supplied options merged with defaults in the constructor.
private config: Required<IncidentResponseConfig>;
// In-memory incident store keyed by incident id.
private incidents: Map<string, Incident> = new Map();
// Escalation rules keyed by rule id.
private escalationRules: Map<string, EscalationRule> = new Map();
// Alert routes keyed by route id.
private alertRoutes: Map<string, AlertRoute> = new Map();
// Optional sink for "incident-created" metrics.
private performanceTracker?: PerformanceTracker;
// Optional resilience system; currently only referenced by a placeholder hook on resolution.
private resilienceSystem?: ResilienceSystem;
/**
* Build the system: merge `config` over the built-in defaults, remember the
* optional collaborators, register the default rule/route, and start the
* memory monitor and background timers.
*
* @param config Partial configuration; omitted options fall back to defaults.
* @param performanceTracker Optional metric sink notified on incident creation.
* @param resilienceSystem Optional resilience system reference.
*/
constructor(
  config: Partial<IncidentResponseConfig> = {},
  performanceTracker?: PerformanceTracker,
  resilienceSystem?: ResilienceSystem
) {
  super();

  const minuteMs = 60 * 1000;
  const hourMs = 60 * minuteMs;
  const dayMs = 24 * hourMs;

  const defaults: Required<IncidentResponseConfig> = {
    enableAutoIncidentCreation: true,
    enableAutoEscalation: true,
    enableIntelligentRouting: true,
    enablePostIncidentAnalysis: true,
    maxIncidentAge: 30 * dayMs,
    autoResolveTimeouts: {
      [IncidentSeverity.LOW]: dayMs,
      [IncidentSeverity.MEDIUM]: 12 * hourMs,
      [IncidentSeverity.HIGH]: 4 * hourMs,
      [IncidentSeverity.CRITICAL]: 2 * hourMs
    },
    escalationTimeouts: {
      [IncidentPriority.P1]: 15 * minuteMs,
      [IncidentPriority.P2]: hourMs,
      [IncidentPriority.P3]: 4 * hourMs,
      [IncidentPriority.P4]: dayMs
    },
    correlationWindowMs: 5 * minuteMs,
    alertingEnabled: true
  };

  // Timeout tables are taken whole from the caller when supplied; they are not merged per key.
  this.config = {
    enableAutoIncidentCreation: config.enableAutoIncidentCreation ?? defaults.enableAutoIncidentCreation,
    enableAutoEscalation: config.enableAutoEscalation ?? defaults.enableAutoEscalation,
    enableIntelligentRouting: config.enableIntelligentRouting ?? defaults.enableIntelligentRouting,
    enablePostIncidentAnalysis: config.enablePostIncidentAnalysis ?? defaults.enablePostIncidentAnalysis,
    maxIncidentAge: config.maxIncidentAge ?? defaults.maxIncidentAge,
    autoResolveTimeouts: config.autoResolveTimeouts ?? defaults.autoResolveTimeouts,
    escalationTimeouts: config.escalationTimeouts ?? defaults.escalationTimeouts,
    correlationWindowMs: config.correlationWindowMs ?? defaults.correlationWindowMs,
    alertingEnabled: config.alertingEnabled ?? defaults.alertingEnabled
  };

  // Only assign the optional collaborators when they were actually provided.
  if (performanceTracker) {
    this.performanceTracker = performanceTracker;
  }
  if (resilienceSystem) {
    this.resilienceSystem = resilienceSystem;
  }

  this.initializeDefaultRules();
  this.initializeMemoryIntegration();
  this.startBackgroundProcesses();
}
/**
* Turn an error event into an incident.
*
* Returns `null` when automatic creation is disabled or the error does not
* qualify. If a recent, still-active incident looks related, the error is
* attached to it (correlation 0.8) and that incident's id is returned;
* otherwise a new incident is opened with the error as its trigger.
*
* @param error The error event to process.
* @returns Id of the new or matched incident, or `null` when nothing was done.
*/
createIncidentFromError(error: ErrorEvent): string | null {
  const eligible = this.config.enableAutoIncidentCreation && this.shouldCreateIncident(error);
  if (!eligible) {
    return null;
  }

  // Fold the error into an existing incident instead of opening a duplicate.
  const match = this.findSimilarIncident(error);
  if (match) {
    const correlated: RelatedEvent = {
      type: 'error',
      eventId: error.id,
      timestamp: error.timestamp,
      description: `Related error: ${error.message}`,
      correlation: 0.8
    };
    this.addEventToIncident(match.id, correlated);
    return match.id;
  }

  const severity = this.mapErrorLevelToIncidentSeverity(error.level);
  const trigger: RelatedEvent = {
    type: 'error',
    eventId: error.id,
    timestamp: error.timestamp,
    description: error.message,
    correlation: 1.0
  };

  const created = this.createIncident({
    title: `Error Incident: ${error.message.substring(0, 100)}`,
    description: `Incident created from error in ${error.source}: ${error.message}`,
    severity,
    priority: this.mapSeverityToPriority(severity),
    affectedSystems: [error.source],
    tags: [error.category, error.level],
    relatedEvents: [trigger],
    metadata: {
      triggerType: 'error',
      errorCategory: error.category,
      errorLevel: error.level,
      errorFingerprint: error.fingerprint
    }
  });
  return created.id;
}
/**
* Turn an anomaly event into an incident.
*
* Returns `null` when automatic creation is disabled or the anomaly has LOW
* severity. If a recent, still-active incident looks related, the anomaly is
* attached to it (correlation 0.8) and that incident's id is returned;
* otherwise a new incident is opened with the anomaly as its trigger.
*
* @param anomaly The anomaly event to process.
* @returns Id of the new or matched incident, or `null` when nothing was done.
*/
createIncidentFromAnomaly(anomaly: AnomalyEvent): string | null {
  const ignored = !this.config.enableAutoIncidentCreation || anomaly.severity === AnomalySeverity.LOW;
  if (ignored) {
    return null;
  }

  // Fold the anomaly into an existing incident instead of opening a duplicate.
  const match = this.findSimilarAnomalyIncident(anomaly);
  if (match) {
    const correlated: RelatedEvent = {
      type: 'anomaly',
      eventId: anomaly.id,
      timestamp: anomaly.timestamp,
      description: `Related anomaly: ${anomaly.description}`,
      correlation: 0.8
    };
    this.addEventToIncident(match.id, correlated);
    return match.id;
  }

  const severity = this.mapAnomalySeverityToIncidentSeverity(anomaly.severity);
  const trigger: RelatedEvent = {
    type: 'anomaly',
    eventId: anomaly.id,
    timestamp: anomaly.timestamp,
    description: anomaly.description,
    correlation: 1.0
  };

  const created = this.createIncident({
    title: `Anomaly Incident: ${anomaly.type}`,
    description: anomaly.description,
    severity,
    priority: this.mapSeverityToPriority(severity),
    affectedSystems: anomaly.affectedSystems,
    tags: [anomaly.type, anomaly.severity],
    relatedEvents: [trigger],
    metadata: {
      triggerType: 'anomaly',
      anomalyType: anomaly.type,
      confidence: anomaly.confidence,
      detectionMethod: anomaly.detectionMethod
    }
  });
  return created.id;
}
/**
* Open an incident on behalf of a human operator.
*
* The priority is derived from the given severity, the incident starts with
* no related events, and its metadata records `triggerType: 'manual'`.
* This path is not gated by `enableAutoIncidentCreation`.
*
* @param params Title, description, severity, affected systems and optional assignee/tags.
* @returns Id of the newly created incident.
*/
createManualIncident(params: {
  title: string;
  description: string;
  severity: IncidentSeverity;
  affectedSystems: string[];
  assignee?: string;
  tags?: string[];
}): string {
  const derivedPriority = this.mapSeverityToPriority(params.severity);
  const { id } = this.createIncident({
    ...params,
    priority: derivedPriority,
    relatedEvents: [],
    metadata: { triggerType: 'manual' }
  });
  return id;
}
/**
* Move an incident to a new lifecycle status.
*
* A `status_change` entry is appended to the timeline and
* `incident-updated` is emitted. Resolution handling (stamping
* `resolvedAt`, scheduling the post-incident report) runs only on the
* transition INTO `RESOLVED`, so repeating a RESOLVED update neither resets
* the resolution time nor schedules a duplicate report. Moving an incident
* back to an active status clears `resolvedAt`, so reopened incidents are
* not counted as resolved by the metrics.
*
* @param incidentId Id of the incident to update.
* @param status The new status.
* @param details Optional free text appended to the timeline description.
* @returns `false` when the incident does not exist, `true` otherwise.
*/
updateIncidentStatus(incidentId: string, status: IncidentStatus, details?: string): boolean {
  const incident = this.incidents.get(incidentId);
  if (!incident) return false;

  const now = Date.now();
  const previousStatus = incident.status;
  incident.status = status;
  incident.updatedAt = now;

  this.addTimelineEntry(incidentId, {
    timestamp: now,
    action: 'status_change',
    description: `Status changed from ${previousStatus} to ${status}${details ? ': ' + details : ''}`,
    actor: 'system'
  });

  const isFinished = status === IncidentStatus.RESOLVED || status === IncidentStatus.CLOSED;
  if (status === IncidentStatus.RESOLVED && previousStatus !== IncidentStatus.RESOLVED) {
    incident.resolvedAt = now;
    this.handleIncidentResolution(incident);
  } else if (!isFinished && incident.resolvedAt !== undefined) {
    // Reopened: a stale resolution time would skew mean-time-to-resolution.
    delete incident.resolvedAt;
  }

  this.emit('incident-updated', incident);
  return true;
}
/**
* Assign an incident to a user and, optionally, a team.
*
* Records an `assignment` timeline entry and emits `incident-assigned`.
* When `team` is omitted, any existing team is left unchanged.
*
* @param incidentId Id of the incident to assign.
* @param assignee User taking ownership.
* @param team Optional owning team.
* @returns `false` when the incident does not exist, `true` otherwise.
*/
assignIncident(incidentId: string, assignee: string, team?: string): boolean {
  const target = this.incidents.get(incidentId);
  if (target === undefined) {
    return false;
  }

  target.assignee = assignee;
  if (team) {
    target.team = team;
  }
  target.updatedAt = Date.now();

  const teamSuffix = team ? ` (${team})` : '';
  this.addTimelineEntry(incidentId, {
    timestamp: Date.now(),
    action: 'assignment',
    description: `Assigned to ${assignee}${teamSuffix}`,
    actor: 'system'
  });

  this.emit('incident-assigned', target);
  return true;
}
/**
* Attach a comment to an incident's timeline.
*
* The comment is stored as a `comment` timeline entry whose actor is the
* author; `incident-updated` is emitted afterwards.
*
* @param incidentId Id of the incident to comment on.
* @param comment Comment text.
* @param author Name recorded as the entry's actor.
* @returns `false` when the incident does not exist, `true` otherwise.
*/
addIncidentComment(incidentId: string, comment: string, author: string): boolean {
  const target = this.incidents.get(incidentId);
  if (target === undefined) {
    return false;
  }

  const entry: IncidentTimelineEntry = {
    timestamp: Date.now(),
    action: 'comment',
    description: comment,
    actor: author
  };
  this.addTimelineEntry(incidentId, entry);

  target.updatedAt = Date.now();
  this.emit('incident-updated', target);
  return true;
}
/**
* Raise an incident's priority by one level (P1 stays P1), record an
* `escalation` timeline entry, run every matching escalation rule and emit
* `incident-escalated`.
*
* @param incidentId Id of the incident to escalate.
* @param reason Why the incident is being escalated; stored in the timeline.
* @returns `false` when the incident does not exist, `true` otherwise.
*/
escalateIncident(incidentId: string, reason: string): boolean {
  const target = this.incidents.get(incidentId);
  if (target === undefined) {
    return false;
  }

  const raised = this.escalatePriority(target.priority);
  target.priority = raised;
  target.updatedAt = Date.now();

  const entry: IncidentTimelineEntry = {
    timestamp: Date.now(),
    action: 'escalation',
    description: `Escalated to ${raised}: ${reason}`,
    actor: 'system'
  };
  this.addTimelineEntry(incidentId, entry);

  // Rules run after the priority bump so their conditions see the new state.
  this.applyEscalationActions(target, reason);
  this.emit('incident-escalated', target);
  return true;
}
/**
* List incidents, newest first, optionally narrowed by filters.
*
* Every filter that is provided must match. Falsy filter values (an empty
* assignee, `since: 0`, `limit: 0`) are treated as "not set".
*
* @param filters Optional status/severity/assignee/team/since filters and a result limit.
* @returns A new array holding the stored incident objects (not copies).
*/
getIncidents(filters?: {
  status?: IncidentStatus[];
  severity?: IncidentSeverity[];
  assignee?: string;
  team?: string;
  since?: number;
  limit?: number;
}): Incident[] {
  const criteria: NonNullable<typeof filters> = filters ?? {};
  const { status, severity, assignee, team, since, limit } = criteria;

  const matches = (incident: Incident): boolean =>
    (!status || status.includes(incident.status)) &&
    (!severity || severity.includes(incident.severity)) &&
    (!assignee || incident.assignee === assignee) &&
    (!team || incident.team === team) &&
    (!since || incident.createdAt >= since);

  const selected = Array.from(this.incidents.values()).filter(matches);

  // Newest first.
  selected.sort((left, right) => right.createdAt - left.createdAt);

  return limit ? selected.slice(0, limit) : selected;
}
/**
* Look up a single incident.
*
* @param incidentId Id of the incident.
* @returns The stored incident object, or `null` when the id is unknown.
*/
getIncident(incidentId: string): Incident | null {
  const found = this.incidents.get(incidentId);
  return found ? found : null;
}
/**
* Compute aggregate statistics over incidents created within a time window.
*
* @param timeWindow Window length in milliseconds, counted back from now;
*   a missing or zero value means 30 days.
* @returns Counts per severity and status, mean times in milliseconds and
*   the escalation rate. Detection time and false-positive rate are not
*   computed here and are reported as 0.
*/
getIncidentMetrics(timeWindow?: number): IncidentMetrics {
  const windowMs = timeWindow || 2592000000; // 30 days
  const earliest = Date.now() - windowMs;

  const recent: Incident[] = [];
  for (const incident of this.incidents.values()) {
    if (incident.createdAt >= earliest) {
      recent.push(incident);
    }
  }

  // Seed every bucket with 0 so the result always lists all severities/statuses.
  const incidentsBySeverity = {} as Record<IncidentSeverity, number>;
  for (const level of Object.values(IncidentSeverity)) {
    incidentsBySeverity[level] = 0;
  }
  const incidentsByStatus = {} as Record<IncidentStatus, number>;
  for (const state of Object.values(IncidentStatus)) {
    incidentsByStatus[state] = 0;
  }

  const isBucket = (buckets: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(buckets, key);

  let resolvedCount = 0;
  let resolutionTotal = 0;
  let acknowledgedCount = 0;
  let acknowledgmentTotal = 0;
  let escalatedCount = 0;

  for (const incident of recent) {
    if (isBucket(incidentsBySeverity, incident.severity)) {
      incidentsBySeverity[incident.severity] += 1;
    }
    if (isBucket(incidentsByStatus, incident.status)) {
      incidentsByStatus[incident.status] += 1;
    }

    if (incident.resolvedAt) {
      resolvedCount += 1;
      resolutionTotal += incident.resolvedAt - incident.createdAt;
    }

    // Acknowledgment = first explicit assignment; incidents that carry an
    // assignee but no assignment entry contribute a delay of 0.
    if (incident.assignee) {
      const firstAssignment = incident.timeline.find(entry => entry.action === 'assignment');
      const acknowledgedAt = firstAssignment?.timestamp || incident.createdAt;
      acknowledgedCount += 1;
      acknowledgmentTotal += acknowledgedAt - incident.createdAt;
    }

    if (incident.timeline.some(entry => entry.action === 'escalation')) {
      escalatedCount += 1;
    }
  }

  const totalIncidents = recent.length;
  const meanTimeToResolution = resolvedCount > 0 ? resolutionTotal / resolvedCount : 0;
  const meanTimeToAcknowledgment = acknowledgedCount > 0 ? acknowledgmentTotal / acknowledgedCount : 0;
  const escalationRate = totalIncidents > 0 ? escalatedCount / totalIncidents : 0;

  return {
    totalIncidents,
    incidentsBySeverity,
    incidentsByStatus,
    meanTimeToDetection: 0, // Would need integration with monitoring alerts
    meanTimeToResolution,
    meanTimeToAcknowledgment,
    escalationRate,
    falsePositiveRate: 0 // Would be calculated with feedback
  };
}
/**
* Build a post-incident report for a resolved incident.
*
* The report's timeline is a snapshot (entries are shallow-copied), so
* later comments on the incident do not alter an already generated report
* and consumers cannot corrupt the incident by editing the report.
*
* @param incidentId Id of the incident to report on.
* @returns The report, or `null` when post-incident analysis is disabled,
*   the incident is unknown, its status is not `RESOLVED`, or it carries no
*   resolution timestamp.
*/
generatePostIncidentReport(incidentId: string): PostIncidentReport | null {
  if (!this.config.enablePostIncidentAnalysis) return null;

  const incident = this.incidents.get(incidentId);
  if (!incident || incident.status !== IncidentStatus.RESOLVED) return null;

  // The status can be changed on the object returned by getIncident() without
  // going through updateIncidentStatus(); without a timestamp the duration
  // would be NaN, so no report is produced.
  const { resolvedAt } = incident;
  if (resolvedAt === undefined) return null;

  const duration = resolvedAt - incident.createdAt;

  return {
    incidentId,
    summary: `Incident ${incident.title} was resolved after ${Math.round(duration / 60000)} minutes.`,
    timeline: incident.timeline.map(entry => ({ ...entry })),
    rootCauseAnalysis: incident.rootCause || 'Root cause analysis pending',
    impactAssessment: {
      duration,
      affectedUsers: this.estimateAffectedUsers(incident),
      businessImpact: this.assessBusinessImpact(incident),
      dataLoss: false // Would be determined through investigation
    },
    lessonsLearned: this.extractLessonsLearned(incident),
    actionItems: this.generateActionItems(incident),
    preventionMeasures: this.generatePreventionMeasures(incident)
  };
}
/**
* Register an escalation rule, replacing any existing rule with the same id.
*
* @param rule The rule to store.
*/
configureEscalationRule(rule: EscalationRule): void {
  const { id: ruleId } = rule;
  this.escalationRules.set(ruleId, rule);
}
/**
* Register an alert route, replacing any existing route with the same id.
*
* @param route The route to store.
*/
configureAlertRoute(route: AlertRoute): void {
  const { id: routeId } = route;
  this.alertRoutes.set(routeId, route);
}
/**
* Clear incident history.
*
* Removes every incident from the in-memory store, regardless of status.
* Escalation rules and alert routes are left in place, and no event is
* emitted.
*/
clearHistory(): void {
this.incidents.clear();
}
// Private methods
/**
* Create, store and announce a new incident.
*
* Steps, in order: build the incident (status OPEN, with a `created`
* timeline entry), store it, auto-assign an owning team, route alerts (when
* alerting is enabled), emit `incident-created`, and record a metric on the
* performance tracker when one was provided.
*
* Auto-assignment runs before alert routing so that alert handlers already
* see the owning team. The list arguments are copied so that later changes
* to the incident (e.g. correlating more events) never modify arrays owned
* by the caller, and vice versa.
*
* @param params Attributes of the incident to create.
* @returns The stored incident.
*/
private createIncident(params: {
  title: string;
  description: string;
  severity: IncidentSeverity;
  priority: IncidentPriority;
  affectedSystems: string[];
  assignee?: string;
  team?: string;
  tags?: string[];
  relatedEvents: RelatedEvent[];
  metadata: Record<string, any>;
}): Incident {
  const now = Date.now();
  // `slice(2, 11)` keeps the same nine characters the deprecated `substr(2, 9)` produced.
  const id = `incident_${now}_${Math.random().toString(36).slice(2, 11)}`;

  const incident: Incident = {
    id,
    title: params.title,
    description: params.description,
    severity: params.severity,
    status: IncidentStatus.OPEN,
    priority: params.priority,
    createdAt: now,
    updatedAt: now,
    // Optional fields are only added when set, so they are never present as `undefined`.
    ...(params.assignee ? { assignee: params.assignee } : {}),
    ...(params.team ? { team: params.team } : {}),
    tags: [...(params.tags || [])],
    affectedSystems: [...params.affectedSystems],
    relatedEvents: [...params.relatedEvents],
    timeline: [{
      timestamp: now,
      action: 'created',
      description: 'Incident created',
      actor: 'system'
    }],
    metadata: params.metadata
  };
  this.incidents.set(id, incident);

  // Determine ownership first so routed alerts carry the team.
  this.autoAssignIncident(incident);

  if (this.config.alertingEnabled) {
    this.routeIncidentAlert(incident);
  }

  this.emit('incident-created', incident);

  // Track performance
  if (this.performanceTracker) {
    this.performanceTracker.recordMetric({
      id: `incident_created_${id}`,
      name: 'incident-created',
      category: 'incident-response',
      timestamp: now,
      data: {
        severity: params.severity,
        priority: params.priority,
        affectedSystems: params.affectedSystems.length
      }
    });
  }

  return incident;
}
/**
* Decide whether an error is serious enough to open an incident.
*
* CRITICAL and FATAL errors always qualify; ERROR-level errors qualify only
* for the SYSTEM, DATABASE, MEMORY and AUTHENTICATION categories.
*
* @param error The error event to assess.
* @returns `true` when an incident should be created.
*/
private shouldCreateIncident(error: ErrorEvent): boolean {
  const { level, category } = error;

  if (level === ErrorLevel.CRITICAL || level === ErrorLevel.FATAL) {
    return true;
  }

  const incidentWorthyCategories = new Set<ErrorCategory>([
    ErrorCategory.SYSTEM,
    ErrorCategory.DATABASE,
    ErrorCategory.MEMORY,
    ErrorCategory.AUTHENTICATION
  ]);
  return level === ErrorLevel.ERROR && incidentWorthyCategories.has(category);
}
/**
* Find an active incident that a new error most likely belongs to.
*
* Candidates are incidents that are neither RESOLVED nor CLOSED and were
* created within the correlation window. The first candidate (in insertion
* order) wins if it affects the error's source system, or if it has at
* least one related error event and its recorded error category equals the
* error's category.
*
* @param error The incoming error event.
* @returns The matching incident, or `null` when none matches.
*/
private findSimilarIncident(error: ErrorEvent): Incident | null {
  const earliestCreation = Date.now() - this.config.correlationWindowMs;

  for (const candidate of this.incidents.values()) {
    const finished =
      candidate.status === IncidentStatus.RESOLVED ||
      candidate.status === IncidentStatus.CLOSED;
    if (finished || !(candidate.createdAt >= earliestCreation)) {
      continue;
    }

    // Same system involved.
    if (candidate.affectedSystems.includes(error.source)) {
      return candidate;
    }

    // Same error category on an incident that already tracks errors.
    const sameCategory = candidate.metadata.errorCategory === error.category;
    if (sameCategory && candidate.relatedEvents.some(event => event.type === 'error')) {
      return candidate;
    }
  }

  return null;
}
/**
* Find an active incident that a new anomaly most likely belongs to.
*
* Candidates are incidents that are neither RESOLVED nor CLOSED and were
* created within the correlation window. The first candidate (in insertion
* order) wins if it shares an affected system with the anomaly, or if it
* has at least one related anomaly event and its recorded anomaly type
* equals the anomaly's type.
*
* @param anomaly The incoming anomaly event.
* @returns The matching incident, or `null` when none matches.
*/
private findSimilarAnomalyIncident(anomaly: AnomalyEvent): Incident | null {
  const earliestCreation = Date.now() - this.config.correlationWindowMs;

  for (const candidate of this.incidents.values()) {
    const finished =
      candidate.status === IncidentStatus.RESOLVED ||
      candidate.status === IncidentStatus.CLOSED;
    if (finished || !(candidate.createdAt >= earliestCreation)) {
      continue;
    }

    // Overlapping affected systems.
    const sharesSystem = candidate.affectedSystems.some(name =>
      anomaly.affectedSystems.includes(name)
    );
    if (sharesSystem) {
      return candidate;
    }

    // Same anomaly type on an incident that already tracks anomalies.
    const sameType = candidate.metadata.anomalyType === anomaly.type;
    if (sameType && candidate.relatedEvents.some(event => event.type === 'anomaly')) {
      return candidate;
    }
  }

  return null;
}
/**
* Attach a related event to an incident and log the correlation in its
* timeline. Unknown incident ids are ignored.
*
* @param incidentId Id of the incident to extend.
* @param event The event to attach.
*/
private addEventToIncident(incidentId: string, event: RelatedEvent): void {
  const target = this.incidents.get(incidentId);
  if (target === undefined) {
    return;
  }

  target.relatedEvents.push(event);
  target.updatedAt = Date.now();

  const entry: IncidentTimelineEntry = {
    timestamp: Date.now(),
    action: 'event_correlation',
    description: `Correlated ${event.type}: ${event.description}`,
    actor: 'system'
  };
  this.addTimelineEntry(incidentId, entry);
}
/**
* Append an entry to an incident's timeline and refresh its `updatedAt`.
* Unknown incident ids are ignored.
*
* @param incidentId Id of the incident to extend.
* @param entry The timeline entry to append.
*/
private addTimelineEntry(incidentId: string, entry: IncidentTimelineEntry): void {
  const target = this.incidents.get(incidentId);
  if (target !== undefined) {
    target.timeline.push(entry);
    target.updatedAt = Date.now();
  }
}
/**
* Translate an error level into an incident severity: FATAL and CRITICAL
* become CRITICAL, ERROR becomes HIGH, WARNING becomes MEDIUM, and every
* other level becomes LOW.
*
* @param level The error level to translate.
* @returns The corresponding incident severity.
*/
private mapErrorLevelToIncidentSeverity(level: ErrorLevel): IncidentSeverity {
  if (level === ErrorLevel.FATAL || level === ErrorLevel.CRITICAL) {
    return IncidentSeverity.CRITICAL;
  }
  if (level === ErrorLevel.ERROR) {
    return IncidentSeverity.HIGH;
  }
  if (level === ErrorLevel.WARNING) {
    return IncidentSeverity.MEDIUM;
  }
  return IncidentSeverity.LOW;
}
/**
* Translate an anomaly severity into the incident severity of the same
* name; anything other than CRITICAL, HIGH or MEDIUM becomes LOW.
*
* @param severity The anomaly severity to translate.
* @returns The corresponding incident severity.
*/
private mapAnomalySeverityToIncidentSeverity(severity: AnomalySeverity): IncidentSeverity {
  if (severity === AnomalySeverity.CRITICAL) {
    return IncidentSeverity.CRITICAL;
  }
  if (severity === AnomalySeverity.HIGH) {
    return IncidentSeverity.HIGH;
  }
  if (severity === AnomalySeverity.MEDIUM) {
    return IncidentSeverity.MEDIUM;
  }
  return IncidentSeverity.LOW;
}
/**
* Derive the default priority for a severity: CRITICAL -> P1, HIGH -> P2,
* MEDIUM -> P3, anything else -> P4.
*
* @param severity The incident severity.
* @returns The default priority for that severity.
*/
private mapSeverityToPriority(severity: IncidentSeverity): IncidentPriority {
  const prioritiesBySeverity = new Map<IncidentSeverity, IncidentPriority>([
    [IncidentSeverity.CRITICAL, IncidentPriority.P1],
    [IncidentSeverity.HIGH, IncidentPriority.P2],
    [IncidentSeverity.MEDIUM, IncidentPriority.P3]
  ]);
  return prioritiesBySeverity.get(severity) ?? IncidentPriority.P4;
}
/**
* Return the next more urgent priority: P4 -> P3, P3 -> P2, and everything
* else (including P1 itself) -> P1.
*
* @param current The current priority.
* @returns The escalated priority.
*/
private escalatePriority(current: IncidentPriority): IncidentPriority {
  if (current === IncidentPriority.P4) {
    return IncidentPriority.P3;
  }
  if (current === IncidentPriority.P3) {
    return IncidentPriority.P2;
  }
  // P2 moves up to P1; P1 is already the ceiling.
  return IncidentPriority.P1;
}
/**
* Follow-up work after an incident has been resolved.
*
* When post-incident analysis is enabled, a report is generated one minute
* later and emitted as `post-incident-report`; nothing is emitted if report
* generation returns `null` at that point (for example because the incident
* is no longer in RESOLVED status). The resilience system is not invoked
* yet; triggering recovery procedures is a placeholder for future work.
*
* @param incident The incident that was just resolved.
*/
private handleIncidentResolution(incident: Incident): void {
  if (!this.config.enablePostIncidentAnalysis) {
    return;
  }

  const reportDelayMs = 60000; // Generate report 1 minute after resolution
  const publishReport = (): void => {
    const report = this.generatePostIncidentReport(incident.id);
    if (report) {
      this.emit('post-incident-report', report);
    }
  };
  setTimeout(publishReport, reportDelayMs);
}
/**
* Run every enabled escalation rule whose conditions all hold for the
* incident. Rules are evaluated one at a time, in registration order, so
* the actions of an earlier rule can influence the conditions of a later
* one.
*
* @param incident The incident being escalated.
* @param reason Escalation reason forwarded to the actions.
*/
private applyEscalationActions(incident: Incident, reason: string): void {
  for (const { enabled, conditions, actions } of this.escalationRules.values()) {
    const applies =
      enabled &&
      conditions.every(condition => this.evaluateEscalationCondition(incident, condition));
    if (!applies) {
      continue;
    }

    for (const action of actions) {
      this.executeEscalationAction(incident, action, reason);
    }
  }
}
/**
* Evaluate one escalation condition against an incident.
*
* `severity` and `status` compare the incident's fields, `time` compares
* the incident's age in milliseconds; `custom` (and any unknown type) never
* matches.
*
* For `severity`, `greater_than` / `less_than` compare by severity rank
* (low < medium < high < critical) rather than by the enum's string values,
* whose alphabetical order ('critical' < 'high' < 'low' < 'medium') would
* give wrong answers. If the condition value is not a known severity the
* raw values are compared as before.
*
* @param incident The incident to test.
* @param condition The condition to evaluate.
* @returns `true` when the condition holds.
*/
private evaluateEscalationCondition(incident: Incident, condition: EscalationCondition): boolean {
  let actual: string | number;
  let expected = condition.value;

  switch (condition.type) {
    case 'severity':
      actual = incident.severity;
      break;
    case 'status':
      actual = incident.status;
      break;
    case 'time':
      actual = Date.now() - incident.createdAt;
      break;
    default:
      return false;
  }

  const isOrderedComparison =
    condition.operator === 'greater_than' || condition.operator === 'less_than';
  if (condition.type === 'severity' && isOrderedComparison) {
    const severityRank = new Map<string, number>([
      [IncidentSeverity.LOW, 0],
      [IncidentSeverity.MEDIUM, 1],
      [IncidentSeverity.HIGH, 2],
      [IncidentSeverity.CRITICAL, 3]
    ]);
    const actualRank = severityRank.get(incident.severity);
    const expectedRank = severityRank.get(String(condition.value));
    if (actualRank !== undefined && expectedRank !== undefined) {
      actual = actualRank;
      expected = expectedRank;
    }
  }

  switch (condition.operator) {
    case 'equals':
      return actual === expected;
    case 'greater_than':
      return actual > expected;
    case 'less_than':
      return actual < expected;
    case 'contains':
      return String(actual).includes(String(expected));
    default:
      return false;
  }
}
/**
* Carry out a single escalation action.
*
* - `assign`: assigns the incident to `action.target` (team from `parameters.team`).
* - `notify`: emits an `escalation-notification` event for `action.target`.
* - `escalate`: nothing to do here; the priority was already raised by
*   `escalateIncident` before the rules ran.
* - `auto_resolve`: resolves the incident if it is eligible for auto-resolution.
* - `run_playbook`: emits a `playbook-requested` event so an external
*   runner can execute the playbook named by `action.target`. Previously
*   this action type was dropped without any signal.
*
* @param incident The incident being escalated.
* @param action The action to execute.
* @param reason Escalation reason, forwarded to notifications.
*/
private executeEscalationAction(incident: Incident, action: EscalationAction, reason: string): void {
  switch (action.type) {
    case 'assign':
      this.assignIncident(incident.id, action.target, action.parameters.team);
      break;
    case 'notify':
      this.sendEscalationNotification(incident, action.target, reason);
      break;
    case 'escalate':
      // Already handled by caller
      break;
    case 'auto_resolve':
      if (this.canAutoResolve(incident)) {
        this.updateIncidentStatus(incident.id, IncidentStatus.RESOLVED, 'Auto-resolved by escalation rule');
      }
      break;
    case 'run_playbook':
      // Playbooks run outside this class; hand the request to listeners.
      this.emit('playbook-requested', {
        incident,
        playbook: action.target,
        parameters: action.parameters,
        reason,
        timestamp: Date.now()
      });
      break;
  }
}
/**
* Whether an incident may be resolved automatically: it must have LOW
* severity and be older than the configured auto-resolve timeout for that
* severity.
*
* @param incident The incident to check.
* @returns `true` when auto-resolution is allowed.
*/
private canAutoResolve(incident: Incident): boolean {
  if (incident.severity !== IncidentSeverity.LOW) {
    return false;
  }
  const ageMs = Date.now() - incident.createdAt;
  return ageMs > this.config.autoResolveTimeouts[incident.severity];
}
/**
* Send an alert to every destination of every enabled route whose
* conditions all match the incident. Does nothing when intelligent routing
* is disabled.
*
* @param incident The incident to alert on.
*/
private routeIncidentAlert(incident: Incident): void {
  if (!this.config.enableIntelligentRouting) {
    return;
  }

  for (const { enabled, conditions, destinations } of this.alertRoutes.values()) {
    const applies =
      enabled &&
      conditions.every(condition => this.evaluateAlertCondition(incident, condition));
    if (!applies) {
      continue;
    }

    for (const destination of destinations) {
      this.sendAlert(incident, destination);
    }
  }
}
/**
* Evaluate one alert-route condition against an incident.
*
* Supported fields are `severity`, `priority`, `affectedSystems` and
* `tags`; the two list fields are joined with commas before comparison.
* Any other field never matches.
*
* For `severity`, `greater_than` / `less_than` compare by severity rank
* (low < medium < high < critical) rather than by the enum's string values,
* whose alphabetical order ('critical' < 'high' < 'low' < 'medium') would
* give wrong answers. If the condition value is not a known severity the
* raw values are compared as before. Other fields keep plain string
* comparison, so for `priority` "greater" means a larger P-number.
*
* @param incident The incident to test.
* @param condition The condition to evaluate.
* @returns `true` when the condition holds.
*/
private evaluateAlertCondition(incident: Incident, condition: AlertCondition): boolean {
  let actual: string | number;
  let expected = condition.value;

  switch (condition.field) {
    case 'severity':
      actual = incident.severity;
      break;
    case 'priority':
      actual = incident.priority;
      break;
    case 'affectedSystems':
      actual = incident.affectedSystems.join(',');
      break;
    case 'tags':
      actual = incident.tags.join(',');
      break;
    default:
      return false;
  }

  const isOrderedComparison =
    condition.operator === 'greater_than' || condition.operator === 'less_than';
  if (condition.field === 'severity' && isOrderedComparison) {
    const severityRank = new Map<string, number>([
      [IncidentSeverity.LOW, 0],
      [IncidentSeverity.MEDIUM, 1],
      [IncidentSeverity.HIGH, 2],
      [IncidentSeverity.CRITICAL, 3]
    ]);
    const actualRank = severityRank.get(incident.severity);
    const expectedRank = severityRank.get(String(condition.value));
    if (actualRank !== undefined && expectedRank !== undefined) {
      actual = actualRank;
      expected = expectedRank;
    }
  }

  switch (condition.operator) {
    case 'equals':
      return actual === expected;
    case 'contains':
      return String(actual).includes(String(expected));
    case 'greater_than':
      return actual > expected;
    case 'less_than':
      return actual < expected;
    default:
      return false;
  }
}
/**
* Publish an alert for external delivery by emitting an `alert` event with
* the incident, the destination and the current time.
*
* @param incident The incident the alert is about.
* @param destination Where the alert should be delivered.
*/
private sendAlert(incident: Incident, destination: AlertDestination): void {
  const payload = {
    incident,
    destination,
    timestamp: Date.now()
  };
  this.emit('alert', payload);
}
/**
* Publish an escalation notice by emitting an `escalation-notification`
* event with the incident, the target, the reason and the current time.
*
* @param incident The escalated incident.
* @param target Recipient of the notification.
* @param reason Why the incident was escalated.
*/
private sendEscalationNotification(incident: Incident, target: string, reason: string): void {
  const payload = {
    incident,
    target,
    reason,
    timestamp: Date.now()
  };
  this.emit('escalation-notification', payload);
}
/**
* Assign an owning team based on the first affected system that has a
* known owner, and record an `auto_assignment` timeline entry. Incidents
* whose systems have no known owner are left untouched.
*
* Ownership is held in a `Map` rather than a plain object: looking up an
* arbitrary system name such as `constructor` or `toString` on an object
* literal would find an inherited member and assign it as the team.
*
* @param incident The incident to assign.
*/
private autoAssignIncident(incident: Incident): void {
  const systemOwnership = new Map<string, string>([
    ['llm-adapter', 'ml-team'],
    ['trajectory-store', 'backend-team'],
    ['pareto-frontier', 'algorithm-team']
  ]);

  for (const system of incident.affectedSystems) {
    const team = systemOwnership.get(system);
    if (team === undefined) {
      continue;
    }

    incident.team = team;
    this.addTimelineEntry(incident.id, {
      timestamp: Date.now(),
      action: 'auto_assignment',
      description: `Auto-assigned to ${team} based on affected system: ${system}`,
      actor: 'system'
    });
    // The first system with a known owner decides.
    break;
  }
}
/**
* Rough estimate of the number of affected users: a per-severity base
* figure (critical 1000, high 500, medium 100, low 10) multiplied by the
* number of affected systems.
*
* @param incident The incident to assess.
* @returns The estimated user count.
*/
private estimateAffectedUsers(incident: Incident): number {
  const usersPerSystem = {
    [IncidentSeverity.CRITICAL]: 1000,
    [IncidentSeverity.HIGH]: 500,
    [IncidentSeverity.MEDIUM]: 100,
    [IncidentSeverity.LOW]: 10
  };
  const systemCount = incident.affectedSystems.length;
  const perSystem = usersPerSystem[incident.severity];
  return perSystem * systemCount;
}
/**
* Describe the business impact of an incident, based solely on its
* severity.
*
* @param incident The incident to assess.
* @returns A short impact statement.
*/
private assessBusinessImpact(incident: Incident): string {
  switch (incident.severity) {
    case IncidentSeverity.CRITICAL:
      return 'Severe - service completely unavailable';
    case IncidentSeverity.HIGH:
      return 'Significant - major features impacted';
    case IncidentSeverity.MEDIUM:
      return 'Moderate - some features degraded';
    default:
      return 'Minimal - minor issues reported';
  }
}
/**
* Derive generic lessons from an incident's trigger type and the number of
* related events (more than five suggests better correlation is needed).
*
* @param incident The incident to analyse.
* @returns The applicable lessons, possibly empty.
*/
private extractLessonsLearned(incident: Incident): string[] {
  const { triggerType } = incident.metadata;

  const candidates: Array<[boolean, string]> = [
    [triggerType === 'error', 'Improve error handling and circuit breaker patterns'],
    [triggerType === 'anomaly', 'Enhance monitoring and alerting thresholds'],
    [incident.relatedEvents.length > 5, 'Consider implementing better event correlation']
  ];

  return candidates
    .filter(([applies]) => applies)
    .map(([, lesson]) => lesson);
}
/**
* Generate follow-up tasks for an incident. Currently only memory-related
* incidents produce a task: a high-priority item due in one week, assigned
* to the incident's team or to `backend-team` when no team is set.
*
* @param incident The incident to analyse.
* @returns The generated action items, possibly empty.
*/
private generateActionItems(incident: Incident): ActionItem[] {
  if (incident.metadata.errorCategory !== ErrorCategory.MEMORY) {
    return [];
  }

  const oneWeekMs = 604800000;
  const memoryFollowUp: ActionItem = {
    id: `action_${Date.now()}_1`,
    description: 'Implement memory leak detection and monitoring',
    assignee: incident.team || 'backend-team',
    dueDate: Date.now() + oneWeekMs,
    priority: 'high',
    status: 'open'
  };
  return [memoryFollowUp];
}
/**
* Suggest measures to avoid a repeat of the incident: monitoring and
* circuit-breaker reviews for CRITICAL incidents, better isolation when
* more than one system was affected.
*
* @param incident The incident to analyse.
* @returns The suggested measures, possibly empty.
*/
private generatePreventionMeasures(incident: Incident): string[] {
  const forCritical =
    incident.severity === IncidentSeverity.CRITICAL
      ? [
          'Implement additional monitoring and alerting',
          'Review and strengthen circuit breaker patterns'
        ]
      : [];
  const forMultiSystem =
    incident.affectedSystems.length > 1
      ? ['Improve system isolation and failure boundaries']
      : [];

  return [...forCritical, ...forMultiSystem];
}
/**
* Register the built-in configuration: one escalation rule for CRITICAL
* incidents older than 15 minutes, and one alert route sending CRITICAL
* incidents to Slack and PagerDuty destinations.
*/
private initializeDefaultRules(): void {
  const criticalEscalation: EscalationRule = {
    id: 'critical-auto-escalate',
    name: 'Auto-escalate critical incidents',
    conditions: [
      { type: 'severity', operator: 'equals', value: IncidentSeverity.CRITICAL },
      { type: 'time', operator: 'greater_than', value: 900000 } // 15 minutes
    ],
    actions: [
      { type: 'escalate', target: 'on-call-manager', parameters: {} },
      { type: 'notify', target: 'escalation-team', parameters: {} }
    ],
    enabled: true
  };

  const criticalAlerts: AlertRoute = {
    id: 'critical-alerts',
    name: 'Critical incident alerts',
    conditions: [
      { field: 'severity', operator: 'equals', value: IncidentSeverity.CRITICAL }
    ],
    destinations: [
      { type: 'slack', address: '#critical-alerts', urgency: 'critical' },
      { type: 'pagerduty', address: 'critical-service', urgency: 'critical' }
    ],
    enabled: true
  };

  this.configureEscalationRule(criticalEscalation);
  this.configureAlertRoute(criticalAlerts);
}
/**
* Rough estimate, in bytes, of the memory held by this instance, using a
* fixed approximate size per stored incident (2048), escalation rule (512)
* and alert route (256).
*
* @returns The estimated size in bytes.
*/
private estimateMemoryUsage(): number {
  const bytesPerIncident = 2048;
  const bytesPerEscalationRule = 512;
  const bytesPerAlertRoute = 256;

  return (
    this.incidents.size * bytesPerIncident +
    this.escalationRules.size * bytesPerEscalationRule +
    this.alertRoutes.size * bytesPerAlertRoute
  );
}
/** Handles of all recurring background timers, kept so `dispose()` can stop them. */
private readonly backgroundTimers: Array<ReturnType<typeof setInterval>> = [];

/**
* Stop every background timer started by this instance (memory monitor,
* auto-escalation, auto-resolution and cleanup). Safe to call more than
* once. Stored incidents, rules, routes and event listeners are untouched.
*/
dispose(): void {
  for (const timer of this.backgroundTimers) {
    clearInterval(timer);
  }
  this.backgroundTimers.length = 0;
}

/**
* Start a recurring task and remember its timer handle. The timer is
* unref'd so that these monitoring loops never keep the Node.js process
* alive on their own.
*/
private scheduleRecurring(task: () => void, intervalMs: number): void {
  const timer = setInterval(task, intervalMs);
  timer.unref();
  this.backgroundTimers.push(timer);
}

/**
* Initialise the memory-leak integration and report this instance's
* estimated memory usage to it once a minute.
*/
private initializeMemoryIntegration(): void {
  MemoryLeakIntegration.initialize();
  this.scheduleRecurring(() => {
    const memoryUsage = this.estimateMemoryUsage();
    MemoryLeakIntegration.trackIncidentResponse('monitor', memoryUsage);
  }, 60000);
}

/**
* Start the periodic maintenance loops:
*
* - every 5 minutes: auto-escalate unresolved incidents whose escalation
*   timeout (per current priority) has elapsed since the incident was
*   created or last escalated, whichever is later. Measuring from the last
*   escalation gives each priority level its full response window; measuring
*   from creation alone would re-escalate an overdue incident on every tick.
* - every hour: auto-resolve unresolved incidents that are eligible.
* - every day: delete CLOSED incidents older than `maxIncidentAge`.
*   Incidents in any other status are never deleted by this loop.
*/
private startBackgroundProcesses(): void {
  const isFinished = (incident: Incident): boolean =>
    incident.status === IncidentStatus.RESOLVED || incident.status === IncidentStatus.CLOSED;

  // Auto-escalation check every 5 minutes
  this.scheduleRecurring(() => {
    if (!this.config.enableAutoEscalation) return;

    const now = Date.now();
    for (const incident of this.incidents.values()) {
      if (isFinished(incident)) {
        continue;
      }

      // The escalation clock restarts at the most recent escalation.
      let clockStart = incident.createdAt;
      for (const entry of incident.timeline) {
        if (entry.action === 'escalation' && entry.timestamp > clockStart) {
          clockStart = entry.timestamp;
        }
      }

      const escalationTimeout = this.config.escalationTimeouts[incident.priority];
      if (now - clockStart > escalationTimeout) {
        this.escalateIncident(incident.id, 'Automatic escalation due to timeout');
      }
    }
  }, 300000);

  // Auto-resolution check every hour
  this.scheduleRecurring(() => {
    for (const incident of this.incidents.values()) {
      if (isFinished(incident)) {
        continue;
      }
      if (this.canAutoResolve(incident)) {
        this.updateIncidentStatus(incident.id, IncidentStatus.RESOLVED, 'Auto-resolved due to age');
      }
    }
  }, 3600000);

  // Cleanup old incidents
  this.scheduleRecurring(() => {
    const cutoff = Date.now() - this.config.maxIncidentAge;
    for (const [id, incident] of this.incidents) {
      if (incident.createdAt < cutoff && incident.status === IncidentStatus.CLOSED) {
        this.incidents.delete(id);
      }
    }
  }, 86400000); // Daily cleanup
}
}