// index.ts • 19.4 kB
/**
* GEPA Monitoring System - Main Export
*
* Comprehensive error monitoring, incident response, and observability
* for robust system operation and performance tracking.
*/
export {
ErrorTrackingSystem
} from './error-tracking-system';
export type {
ErrorEvent,
ErrorLevel,
ErrorCategory,
ErrorClassification,
ErrorPattern,
ErrorMetrics,
ErrorTrendPrediction,
ErrorTrackingConfig
} from './error-tracking-system';
export {
AnomalyDetectionSystem
} from './anomaly-detection';
export type {
AnomalyEvent,
AnomalyType,
AnomalySeverity,
ImpactAssessment,
BehavioralPattern,
StatisticalModel,
AnomalyDetectionConfig
} from './anomaly-detection';
export {
IncidentResponseSystem
} from './incident-response';
export type {
Incident,
IncidentSeverity,
IncidentStatus,
IncidentPriority,
RelatedEvent,
IncidentTimelineEntry,
EscalationRule,
AlertRoute,
PostIncidentReport,
IncidentMetrics,
IncidentResponseConfig
} from './incident-response';
export {
ObservabilityIntegration
} from './observability-integration';
export type {
MetricDefinition,
MetricType,
AggregationType,
MetricSample,
TraceSpan,
SpanStatus,
TraceLog,
LogEntry,
Dashboard,
DashboardPanel,
PanelType,
QueryDefinition,
VisualizationConfig,
AlertRule,
ObservabilityConfig
} from './observability-integration';
import { EventEmitter } from 'events';
import { ErrorTrackingSystem, ErrorEvent, ErrorTrackingConfig } from './error-tracking-system';
import { AnomalyDetectionSystem, AnomalyEvent, AnomalyDetectionConfig } from './anomaly-detection';
import { IncidentResponseSystem, Incident, IncidentResponseConfig } from './incident-response';
import { ObservabilityIntegration, ObservabilityConfig } from './observability-integration';
import { PerformanceTracker } from '../../services/performance-tracker';
import { ResilienceSystem } from '../resilience/index';
import { MemoryLeakIntegration } from '../memory-leak-detector';
/**
* Unified Monitoring System Configuration
*
* Options accepted by the MonitoringSystem constructor. Every field is
* optional; the constructor fills in the defaults noted on each field.
*/
export interface MonitoringSystemConfig {
/** Overrides handed to the ErrorTrackingSystem constructor (default: `{}`). */
errorTracking?: Partial<ErrorTrackingConfig>;
/** Overrides handed to the AnomalyDetectionSystem constructor (default: `{}`). */
anomalyDetection?: Partial<AnomalyDetectionConfig>;
/** Overrides handed to the IncidentResponseSystem constructor (default: `{}`). */
incidentResponse?: Partial<IncidentResponseConfig>;
/** Overrides handed to the ObservabilityIntegration constructor (default: `{}`). */
observability?: Partial<ObservabilityConfig>;
/**
* When false, MonitoringSystem skips all cross-component event wiring
* (its setupIntegrations() returns immediately). Default: true.
*/
enableIntegrations?: boolean;
/**
* When false, MonitoringSystem starts neither the periodic health check nor
* the periodic memory-usage recording (its startHealthChecks() returns
* immediately). Default: true.
*/
enableHealthChecks?: boolean;
/**
* Delay passed to setInterval for the periodic health check, i.e. a value in
* milliseconds. Default: 60000 (1 minute).
*/
healthCheckInterval?: number;
}
/**
* System Health Status
*
* Snapshot returned by MonitoringSystem.getSystemHealth() and emitted with the
* `health-check` event.
*/
export interface SystemHealthStatus {
/**
* Worst status found among the components: 'critical' if any component is
* critical, otherwise 'degraded' if any is degraded, otherwise 'healthy'.
*/
overall: 'healthy' | 'degraded' | 'critical';
/** Per-component health, one entry per monitoring subsystem. */
components: {
errorTracking: ComponentHealth;
anomalyDetection: ComponentHealth;
incidentResponse: ComponentHealth;
observability: ComponentHealth;
};
/** Headline counters taken from the component metrics. */
metrics: {
/** Filled from the error tracker's `criticalErrors` health field. */
activeErrors: number;
/** Filled from the anomaly detector's `totalAnomalies` metric. */
activeAnomalies: number;
/** Filled from the incident metrics' `incidentsByStatus.open` (0 when absent). */
openIncidents: number;
/** Heuristic load score capped at 100 (see MonitoringSystem.calculateSystemLoad). */
systemLoad: number;
};
/** Concatenation of every component's recommendations. */
recommendations: string[];
/** Date.now() value captured when the snapshot was computed. */
lastUpdated: number;
}
/**
* Health summary for a single monitoring component, as assembled by
* MonitoringSystem.getSystemHealth().
*/
export interface ComponentHealth {
/** Coarse health level of the component. */
status: 'healthy' | 'degraded' | 'critical';
/** Milliseconds since the owning MonitoringSystem was constructed (Date.now() difference). */
uptime: number;
/** Error rate reported for the component; getSystemHealth() sets 0 for every component except error tracking. */
errorRate: number;
/** Score computed by getSystemHealth(); the visible formulas keep it within 0-100 (higher is better). */
performance: number;
/**
* Memory attributed to the component; getSystemHealth() only fills this for
* observability and uses 0 elsewhere.
* NOTE(review): presumably bytes (it is compared against 100 * 1024 * 1024) - confirm.
*/
memoryUsage: number;
/** Human-readable advice strings; empty when nothing needs attention. */
recommendations: string[];
}
/**
 * Comprehensive GEPA Monitoring System
 *
 * Integrates error tracking, anomaly detection, incident response,
 * and observability into a unified monitoring solution.
 *
 * The constructor builds the four components, optionally wires their events
 * together (`enableIntegrations`) and optionally starts periodic health checks
 * (`enableHealthChecks`). Every interval started by this class is recorded so
 * that {@link MonitoringSystem.emergencyShutdown} can really stop it.
 *
 * Events emitted by this class: `incident-auto-created`, `incident-created`,
 * `incident-escalated`, `anomaly-detected`, `health-check`.
 */
export class MonitoringSystem extends EventEmitter {
  private config: Required<MonitoringSystemConfig>;
  private errorTracking!: ErrorTrackingSystem;
  private anomalyDetection!: AnomalyDetectionSystem;
  private incidentResponse!: IncidentResponseSystem;
  private observability!: ObservabilityIntegration;
  private performanceTracker: PerformanceTracker | undefined;
  private resilienceSystem: ResilienceSystem | undefined;
  private startTime: number;
  /** Handles of every interval started by this instance, kept so shutdown can clear them. */
  private backgroundTimers: Array<ReturnType<typeof setInterval>> = [];

  /**
   * @param config - Partial configuration; missing fields get defaults
   *   (empty component configs, integrations on, health checks on, 60 s interval).
   * @param performanceTracker - Optional tracker shared with the components and
   *   used as a source of `metric` events.
   * @param resilienceSystem - Optional resilience system handed to incident
   *   response and polled every 5 minutes when integrations are enabled.
   */
  constructor(
    config: MonitoringSystemConfig = {},
    performanceTracker?: PerformanceTracker,
    resilienceSystem?: ResilienceSystem
  ) {
    super();
    this.config = {
      errorTracking: config.errorTracking || {},
      anomalyDetection: config.anomalyDetection || {},
      incidentResponse: config.incidentResponse || {},
      observability: config.observability || {},
      enableIntegrations: config.enableIntegrations ?? true,
      enableHealthChecks: config.enableHealthChecks ?? true,
      healthCheckInterval: config.healthCheckInterval ?? 60000 // 1 minute
    };
    this.performanceTracker = performanceTracker;
    this.resilienceSystem = resilienceSystem;
    this.startTime = Date.now();
    this.initializeComponents();
    this.setupIntegrations();
    this.startHealthChecks();
  }

  /**
   * Initialize all monitoring components
   */
  private initializeComponents(): void {
    // Initialize error tracking
    this.errorTracking = new ErrorTrackingSystem(
      this.config.errorTracking,
      this.performanceTracker
    );
    // Initialize anomaly detection
    this.anomalyDetection = new AnomalyDetectionSystem(
      this.config.anomalyDetection
    );
    // Initialize incident response
    this.incidentResponse = new IncidentResponseSystem(
      this.config.incidentResponse,
      this.performanceTracker,
      this.resilienceSystem
    );
    // Initialize observability
    this.observability = new ObservabilityIntegration(
      this.config.observability,
      this.performanceTracker
    );
  }

  /**
   * Setup integrations between components
   *
   * No-op when `enableIntegrations` is false.
   */
  private setupIntegrations(): void {
    if (!this.config.enableIntegrations) return;

    // Error tracking -> Anomaly detection
    this.errorTracking.on('error-tracked', (error: ErrorEvent) => {
      // NOTE(review): if analyzeErrorEvent() also emits 'anomaly-detected', the
      // listener registered below would handle the same anomaly twice - confirm.
      const anomalies = this.anomalyDetection.analyzeErrorEvent(error);
      anomalies.forEach(anomaly => this.handleAnomaly(anomaly));
      // Record in observability
      this.observability.handleErrorEvent(error);
    });
    this.errorTracking.on('critical-error', (error: ErrorEvent) => {
      // Auto-create incident for critical errors
      const incidentId = this.incidentResponse.createIncidentFromError(error);
      if (incidentId) {
        this.emit('incident-auto-created', { incidentId, trigger: error });
      }
    });

    // Anomaly detection -> Incident response
    this.anomalyDetection.on('anomaly-detected', (anomaly: AnomalyEvent) => {
      this.handleAnomaly(anomaly);
    });
    this.anomalyDetection.on('anomaly-alert', (anomaly: AnomalyEvent) => {
      // Auto-create incident for high-severity anomalies
      const incidentId = this.incidentResponse.createIncidentFromAnomaly(anomaly);
      if (incidentId) {
        this.emit('incident-auto-created', { incidentId, trigger: anomaly });
      }
    });

    // Incident response -> Observability
    this.incidentResponse.on('incident-created', (incident: Incident) => {
      this.observability.handleIncidentEvent(incident);
      this.emit('incident-created', incident);
    });
    this.incidentResponse.on('incident-escalated', (incident: Incident) => {
      this.observability.handleIncidentEvent(incident);
      this.emit('incident-escalated', incident);
    });

    // Performance tracking integration
    if (this.performanceTracker) {
      // NOTE(review): handleAnomaly() records a metric on this same tracker; if
      // recordMetric() emits 'metric', that metric is analysed here again - confirm
      // the detector does not flag its own 'anomaly-detected' metrics.
      this.performanceTracker.on('metric', (metric) => {
        const anomalies = this.anomalyDetection.analyzePerformanceMetric(metric);
        anomalies.forEach(anomaly => this.handleAnomaly(anomaly));
      });
    }

    // Resilience system integration
    const resilience = this.resilienceSystem;
    if (resilience) {
      // Monitor resilience system health; the handle is kept so shutdown can stop the poll
      this.backgroundTimers.push(
        setInterval(async () => {
          try {
            const status = await resilience.getSystemStatus();
            if (status.status === 'critical') {
              this.trackError(new Error(`Resilience system critical: ${status.recommendations.join(', ')}`), {
                source: 'resilience-system',
                category: 'system'
              });
            }
          } catch (error) {
            this.trackError(error as Error, {
              source: 'resilience-system',
              category: 'system'
            });
          }
        }, 300000) // Every 5 minutes
      );
    }
  }

  /**
   * Track an error event
   *
   * Thin delegate to the error tracking component.
   *
   * @param error - Error (or pre-built ErrorEvent) to record.
   * @param context - Extra key/value context forwarded unchanged.
   * @returns The string returned by ErrorTrackingSystem.trackError
   *   (presumably the tracked error's id - confirm against that class).
   */
  trackError(error: Error | ErrorEvent, context: Record<string, any> = {}): string {
    return this.errorTracking.trackError(error, context);
  }

  /**
   * Get current system health status
   *
   * Pulls metrics from each component, derives a per-component health record
   * and reduces them to an overall status (worst component wins).
   *
   * @returns A snapshot stamped with the current time.
   */
  async getSystemHealth(): Promise<SystemHealthStatus> {
    const now = Date.now();
    // Get component health
    const errorTrackingHealth = this.errorTracking.getHealthStatus();
    const anomalyMetrics = this.anomalyDetection.getAnomalyMetrics();
    const incidentMetrics = this.incidentResponse.getIncidentMetrics();
    const observabilityMetrics = this.observability.getObservabilityMetrics();
    // `open` may be absent from the status map; normalise once so the
    // arithmetic below never yields NaN (`|| 0` also maps NaN to 0).
    const openIncidents = incidentMetrics.incidentsByStatus.open || 0;

    // Calculate component health scores
    const componentHealth = {
      errorTracking: {
        status: errorTrackingHealth.status,
        uptime: now - this.startTime,
        errorRate: errorTrackingHealth.errorRate,
        performance: 100 - Math.min(100, errorTrackingHealth.errorRate * 10),
        memoryUsage: 0, // Would be calculated from actual memory usage
        recommendations: errorTrackingHealth.recommendations
      } as ComponentHealth,
      anomalyDetection: {
        status: anomalyMetrics.totalAnomalies > 10 ? 'degraded' : 'healthy',
        uptime: now - this.startTime,
        errorRate: 0,
        performance: Math.max(0, 100 - anomalyMetrics.totalAnomalies),
        memoryUsage: 0,
        recommendations: anomalyMetrics.totalAnomalies > 10 ? ['High anomaly count detected'] : []
      } as ComponentHealth,
      incidentResponse: {
        status: openIncidents > 5 ? 'degraded' : 'healthy',
        uptime: now - this.startTime,
        errorRate: 0,
        performance: 100 - Math.min(100, openIncidents * 5),
        memoryUsage: 0,
        recommendations: openIncidents > 5 ? ['Multiple open incidents'] : []
      } as ComponentHealth,
      observability: {
        status: 'healthy',
        uptime: now - this.startTime,
        errorRate: 0,
        performance: 100,
        memoryUsage: observabilityMetrics.memoryUsage.total,
        recommendations: observabilityMetrics.memoryUsage.total > 100 * 1024 * 1024 ? ['High memory usage'] : []
      } as ComponentHealth
    };

    // Determine overall health
    const componentStatuses = Object.values(componentHealth).map(c => c.status);
    let overall: 'healthy' | 'degraded' | 'critical' = 'healthy';
    if (componentStatuses.some(s => s === 'critical')) {
      overall = 'critical';
    } else if (componentStatuses.some(s => s === 'degraded')) {
      overall = 'degraded';
    }

    // Collect recommendations
    const recommendations = Object.values(componentHealth)
      .flatMap(c => c.recommendations);

    return {
      overall,
      components: componentHealth,
      metrics: {
        activeErrors: errorTrackingHealth.criticalErrors,
        activeAnomalies: anomalyMetrics.totalAnomalies,
        openIncidents,
        systemLoad: this.calculateSystemLoad()
      },
      recommendations,
      lastUpdated: now
    };
  }

  /**
   * Get comprehensive monitoring metrics
   *
   * @returns The raw metrics object of each component. `systemHealth` is always
   *   `null` here; call getSystemHealth() separately to obtain it.
   */
  getMonitoringMetrics(): {
    errorTracking: any;
    anomalyDetection: any;
    incidentResponse: any;
    observability: any;
    systemHealth: any;
  } {
    return {
      errorTracking: this.errorTracking.getErrorMetrics(),
      anomalyDetection: this.anomalyDetection.getAnomalyMetrics(),
      incidentResponse: this.incidentResponse.getIncidentMetrics(),
      observability: this.observability.getObservabilityMetrics(),
      systemHealth: null // Will be populated by getSystemHealth()
    };
  }

  /**
   * Create a dashboard for monitoring metrics
   *
   * Registers a fixed four-panel overview dashboard (system health, error rate,
   * active incidents, anomalies) with the observability component.
   *
   * @returns The dashboard id, always `'monitoring-overview'`.
   */
  createMonitoringDashboard(): string {
    const dashboardId = 'monitoring-overview';
    this.observability.createDashboard({
      id: dashboardId,
      name: 'GEPA Monitoring Overview',
      description: 'Comprehensive monitoring dashboard for GEPA system',
      refreshInterval: 30000,
      timeRange: { from: 'now-1h', to: 'now' },
      filters: [],
      panels: [
        {
          id: 'system-health',
          title: 'System Health Status',
          type: 'single_stat' as any,
          query: {
            metric: 'system_health_score',
            aggregation: 'average' as any
          },
          visualization: {
            colorScheme: 'green-yellow-red',
            thresholds: [
              { value: 0, color: 'red' },
              { value: 70, color: 'yellow' },
              { value: 90, color: 'green' }
            ],
            unit: '%'
          },
          position: { x: 0, y: 0, width: 6, height: 3 }
        },
        {
          id: 'error-rate',
          title: 'Error Rate',
          type: 'time_series' as any,
          query: {
            metric: 'errors_total',
            aggregation: 'rate' as any,
            groupBy: ['level']
          },
          visualization: {
            colorScheme: 'red',
            unit: 'errors/min'
          },
          position: { x: 6, y: 0, width: 6, height: 6 }
        },
        {
          id: 'active-incidents',
          title: 'Active Incidents',
          type: 'single_stat' as any,
          query: {
            metric: 'incidents_total',
            aggregation: 'sum' as any,
            filters: { status: 'open' }
          },
          visualization: {
            colorScheme: 'orange',
            unit: 'count'
          },
          position: { x: 0, y: 3, width: 3, height: 3 }
        },
        {
          id: 'anomalies-detected',
          title: 'Anomalies (Last Hour)',
          type: 'single_stat' as any,
          query: {
            metric: 'anomalies_total',
            aggregation: 'sum' as any
          },
          visualization: {
            colorScheme: 'purple',
            unit: 'count'
          },
          position: { x: 3, y: 3, width: 3, height: 3 }
        }
      ],
      metadata: {
        createdBy: 'monitoring-system',
        version: '1.0'
      }
    });
    return dashboardId;
  }

  /**
   * Emergency shutdown of monitoring system
   *
   * Logs a warning, files a critical manual incident, stops every interval
   * this instance started, and removes all listeners registered on this
   * emitter. When `reason` contains the substring `memory`, the stored data of
   * all four components is cleared as well.
   *
   * Listeners that this class registered on the components and on the
   * performance tracker are left in place.
   *
   * @param reason - Free-text reason, included in the log line and the incident.
   */
  async emergencyShutdown(reason: string): Promise<void> {
    // eslint-disable-next-line no-console
    console.warn(`Monitoring system emergency shutdown: ${reason}`);
    // Create incident for shutdown
    this.incidentResponse.createManualIncident({
      title: 'Monitoring System Emergency Shutdown',
      description: `Emergency shutdown initiated: ${reason}`,
      severity: 'critical' as any,
      affectedSystems: ['monitoring-system']
    });
    // Stop all background processes
    this.stopBackgroundTimers();
    this.removeAllListeners();
    // Clear data if needed for memory
    if (reason.includes('memory')) {
      this.errorTracking.clearHistory();
      this.anomalyDetection.clearHistory();
      this.incidentResponse.clearHistory();
      this.observability.clearData();
    }
  }

  /**
   * Clear all monitoring data
   *
   * Asks each of the four components to drop its stored history/data.
   */
  clearAllData(): void {
    this.errorTracking.clearHistory();
    this.anomalyDetection.clearHistory();
    this.incidentResponse.clearHistory();
    this.observability.clearData();
  }

  // Private methods

  /** Cancel every interval started by this instance and forget the handles. */
  private stopBackgroundTimers(): void {
    for (const timer of this.backgroundTimers) {
      clearInterval(timer);
    }
    this.backgroundTimers = [];
  }

  /**
   * Fan an anomaly out to observability, to external listeners
   * (`anomaly-detected` event) and, when a tracker is present, to the
   * performance tracker as a `monitoring` metric.
   */
  private handleAnomaly(anomaly: AnomalyEvent): void {
    // Record anomaly in observability
    this.observability.handleAnomalyEvent(anomaly);
    // Emit for external handling
    this.emit('anomaly-detected', anomaly);
    // Record performance impact
    if (this.performanceTracker) {
      this.performanceTracker.recordMetric({
        id: `anomaly_${anomaly.id}`,
        name: 'anomaly-detected',
        category: 'monitoring',
        timestamp: anomaly.timestamp,
        data: {
          type: anomaly.type,
          severity: anomaly.severity,
          confidence: anomaly.confidence
        }
      });
    }
  }

  /**
   * Heuristic load score in the range 0-100: up to 50 points from the error
   * rate, up to 30 from the anomaly count and up to 20 from open incidents.
   */
  private calculateSystemLoad(): number {
    // Simplified system load calculation
    const errorHealth = this.errorTracking.getHealthStatus();
    const anomalyMetrics = this.anomalyDetection.getAnomalyMetrics();
    const incidentMetrics = this.incidentResponse.getIncidentMetrics();
    let load = 0;
    load += Math.min(50, errorHealth.errorRate * 5); // Error contribution
    load += Math.min(30, anomalyMetrics.totalAnomalies); // Anomaly contribution
    load += Math.min(20, (incidentMetrics.incidentsByStatus.open || 0) * 5); // Incident contribution
    return Math.min(100, load);
  }

  /**
   * Start the periodic health check (every `healthCheckInterval` ms) and the
   * periodic memory-usage recording (every 60 s). No-op when
   * `enableHealthChecks` is false. Both handles are kept for shutdown.
   */
  private startHealthChecks(): void {
    if (!this.config.enableHealthChecks) return;

    this.backgroundTimers.push(
      setInterval(async () => {
        try {
          const health = await this.getSystemHealth();
          // Record health metrics
          this.observability.recordMetric('system_health_score',
            health.overall === 'healthy' ? 100 :
            health.overall === 'degraded' ? 50 : 0
          );
          // Emit health status
          this.emit('health-check', health);
          // Create incidents for critical health issues
          // NOTE(review): this files a new incident on every tick while the
          // status stays critical - confirm the incident system de-duplicates.
          if (health.overall === 'critical') {
            this.incidentResponse.createManualIncident({
              title: 'System Health Critical',
              description: `System health is critical: ${health.recommendations.join(', ')}`,
              severity: 'critical' as any,
              affectedSystems: ['monitoring-system']
            });
          }
        } catch (error) {
          this.trackError(error as Error, {
            source: 'health-check',
            category: 'system'
          });
        }
      }, this.config.healthCheckInterval)
    );

    // Memory usage tracking
    this.backgroundTimers.push(
      setInterval(() => {
        const memoryUsage = this.estimateMemoryUsage();
        MemoryLeakIntegration.trackMonitoringSystem('monitor', memoryUsage);
        this.observability.recordMetric('monitoring_memory_usage', memoryUsage, {
          component: 'monitoring-system'
        });
      }, 60000)
    );
  }

  /**
   * Rough memory estimate: a fixed base size, plus the total reported by the
   * observability component, plus a fixed 1 MiB overhead allowance.
   */
  private estimateMemoryUsage(): number {
    const baseSize = 16384; // Base system size
    // This would ideally get actual memory usage from each component
    // For now, we'll estimate based on data sizes
    return baseSize +
      this.observability.getObservabilityMetrics().memoryUsage.total +
      1024 * 1024; // Additional overhead
  }
}
/**
 * Factory for a ready-to-use monitoring system.
 *
 * Initializes the memory-leak integration, constructs a MonitoringSystem from
 * the given arguments and registers the default overview dashboard on it.
 * The new instance is not installed as the global monitoring system.
 *
 * @param config - Optional configuration forwarded to the constructor.
 * @param performanceTracker - Optional tracker forwarded to the constructor.
 * @param resilienceSystem - Optional resilience system forwarded to the constructor.
 * @returns The newly constructed monitoring system.
 */
export function createMonitoringSystem(
  config: MonitoringSystemConfig = {},
  performanceTracker?: PerformanceTracker,
  resilienceSystem?: ResilienceSystem
): MonitoringSystem {
  // Leak tracking has to be initialized before the system is built.
  MemoryLeakIntegration.initialize();

  const monitoring = new MonitoringSystem(config, performanceTracker, resilienceSystem);

  // Every system created through this factory gets the default dashboard.
  monitoring.createMonitoringDashboard();

  return monitoring;
}
/**
 * Global monitoring instance (singleton)
 *
 * Module-level slot holding the process-wide MonitoringSystem; `null` until
 * setGlobalMonitoringSystem() is called with an instance.
 */
let globalMonitoringSystem: MonitoringSystem | null = null;

/**
 * @returns The currently registered global monitoring system, or `null` when
 *   none is registered.
 */
export function getGlobalMonitoringSystem(): MonitoringSystem | null {
  return globalMonitoringSystem;
}

/**
 * Register (or clear) the global monitoring system.
 *
 * @param system - Instance to register, or `null` to clear the slot (e.g.
 *   after a shutdown or between tests). Any previous value is overwritten;
 *   the previous instance is not shut down.
 */
export function setGlobalMonitoringSystem(system: MonitoringSystem | null): void {
  globalMonitoringSystem = system;
}
/**
 * Report an error through the global monitoring system, if one is registered.
 *
 * @param error - Error (or pre-built ErrorEvent) to record.
 * @param context - Extra key/value context forwarded unchanged.
 * @returns The value returned by MonitoringSystem.trackError, or `null` when
 *   no global monitoring system is registered (the error is then dropped).
 */
export function trackGlobalError(error: Error | ErrorEvent, context: Record<string, any> = {}): string | null {
  const monitoring = getGlobalMonitoringSystem();
  if (!monitoring) {
    return null;
  }
  return monitoring.trackError(error, context);
}