// [page chrome] Skip to main content
// Glama
// by Coder-RL
// health-checker.ts (44.6 kB)
import { EventEmitter } from 'events';
import * as crypto from 'crypto';

/** A registered health check: static configuration plus live status counters. */
export interface HealthCheck {
  id: string;
  name: string;
  description: string;
  type: 'http' | 'tcp' | 'database' | 'custom' | 'composite' | 'dependency';
  target: HealthTarget;
  configuration: HealthCheckConfig;
  schedule: ScheduleConfig;
  thresholds: ThresholdConfig;
  actions: ActionConfig[];
  tags: string[];
  dependencies: string[];
  enabled: boolean;
  status: 'healthy' | 'unhealthy' | 'degraded' | 'unknown' | 'maintenance';
  lastCheck: Date;
  nextCheck: Date;
  consecutiveFailures: number;
  consecutiveSuccesses: number;
  uptime: number; // percentage
  created: Date;
  modified: Date;
}

/** The thing being probed by a health check. */
export interface HealthTarget {
  id: string;
  name: string;
  type: 'service' | 'database' | 'external-api' | 'infrastructure' | 'application';
  endpoint: string;
  region?: string;
  environment: string;
  criticality: 'low' | 'medium' | 'high' | 'critical';
  metadata: Record<string, any>;
}

/** Per-check execution settings (timeouts, retries, HTTP expectations, auth). */
export interface HealthCheckConfig {
  timeout: number; // seconds
  retries: number;
  retryDelay: number; // seconds
  method?: string; // for HTTP checks
  headers?: Record<string, string>;
  body?: string;
  expectedStatus?: number[];
  expectedContent?: string;
  expectedContentType?: string;
  authentication?: AuthConfig;
  ssl?: SSLConfig;
  proxy?: ProxyConfig;
  customScript?: string;
  parameters?: Record<string, any>;
}

/** Credentials used to authenticate an HTTP check. */
export interface AuthConfig {
  type: 'basic' | 'bearer' | 'api-key' | 'oauth2' | 'certificate';
  credentials: Record<string, string>;
  refreshToken?: string;
  tokenEndpoint?: string;
}

/** TLS options for a check. */
export interface SSLConfig {
  verify: boolean;
  ca?: string;
  cert?: string;
  key?: string;
  allowSelfSigned: boolean;
}

/** Proxy options for a check. */
export interface ProxyConfig {
  enabled: boolean;
  host: string;
  port: number;
  auth?: AuthConfig;
}

/** When and how often a check runs. */
export interface ScheduleConfig {
  interval: number; // seconds
  jitter: number; // seconds (randomization)
  timezone: string;
  activeHours?: {
    start: string; // HH:MM
    end: string; // HH:MM
    days: number[]; // 0-6, Sunday = 0
  };
  maintenanceWindows?: MaintenanceWindow[];
}

/** A time span during which checks are skipped. */
export interface MaintenanceWindow {
  id: string;
  name: string;
  start: Date;
  end: Date;
  recurring: boolean;
  recurrencePattern?: string; // cron expression
}

/** Limits that drive status transitions for a check. */
export interface ThresholdConfig {
  responseTime: {
    warning: number; // ms
    critical: number; // ms
  };
  failureThreshold: number; // consecutive failures before marking unhealthy
  recoveryThreshold: number; // consecutive successes before marking healthy
  uptimeThreshold: {
    warning: number; // percentage
    critical: number; // percentage
  };
  customThresholds?: CustomThreshold[];
}

/** A user-defined threshold on a named metric. */
export interface CustomThreshold {
  metric: string;
  warning: number;
  critical: number;
  operator: 'greater-than' | 'less-than' | 'equals' | 'not-equals';
}

/** An action run in response to check results. */
export interface ActionConfig {
  id: string;
  name: string;
  type: 'notification' | 'webhook' | 'script' | 'escalation' | 'auto-recovery';
  trigger: 'failure' | 'recovery' | 'degraded' | 'threshold-breach';
  conditions: ActionCondition[];
  configuration: Record<string, any>;
  cooldown: number; // seconds
  enabled: boolean;
  lastExecuted?: Date;
}

/** A single precondition that must hold before an action runs. */
export interface ActionCondition {
  type: 'consecutive-failures' | 'uptime-below' | 'response-time-above' | 'custom';
  value: number;
  duration?: number; // seconds
}

/** The outcome of one execution of a health check. */
export interface HealthCheckResult {
  id: string;
  checkId: string;
  timestamp: Date;
  status: 'success' | 'failure' | 'timeout' | 'error';
  responseTime: number; // ms
  statusCode?: number;
  message: string;
  details?: HealthCheckDetails;
  metrics: HealthMetrics;
  error?: string;
  retryCount: number;
}

/** Request/response details captured for a result. */
export interface HealthCheckDetails {
  endpoint: string;
  method: string;
  headers: Record<string, string>;
  body?: string;
  responseHeaders?: Record<string, string>;
  responseBody?: string;
  contentLength?: number;
  resolvedIP?: string;
  certificateInfo?: CertificateInfo;
}

/** TLS certificate facts attached to a result. */
export interface CertificateInfo {
  subject: string;
  issuer: string;
  validFrom: Date;
  validTo: Date;
  fingerprint: string;
  serialNumber: string;
}

/** Rolling metrics computed from a check's recent results. */
export interface HealthMetrics {
  availability: number; // percentage
  responseTime: number; // ms
  throughput: number; // checks per minute
  errorRate: number; // percentage
  mttr: number; // mean time to recovery in seconds
  mtbf: number; // mean time between failures in seconds
  customMetrics: Record<string, number>;
}

/** Point-in-time roll-up across all registered checks. */
export interface HealthSummary {
  timestamp: Date;
  overallStatus: 'healthy' | 'degraded' | 'unhealthy' | 'critical';
  totalChecks: number;
  healthyChecks: number;
  unhealthyChecks: number;
  degradedChecks: number;
  unknownChecks: number;
  averageResponseTime: number;
  overallAvailability: number;
  criticalServices: ServiceStatus[];
  recentIncidents: HealthIncident[];
  upcomingMaintenance: MaintenanceWindow[];
}

/** Summary row for one service in a HealthSummary. */
export interface ServiceStatus {
  serviceId: string;
  serviceName: string;
  status: string;
  lastCheck: Date;
  responseTime: number;
  availability: number;
  checkCount: number;
}

/** An incident opened when a check becomes unhealthy. */
export interface HealthIncident {
  id: string;
  checkId: string;
  serviceName: string;
  severity: 'low' | 'medium' | 'high' | 'critical';
  status: 'open' | 'investigating' | 'resolved' | 'closed';
  startTime: Date;
  endTime?: Date;
  duration?: number; // seconds
  description: string;
  impact: string;
  resolution?: string;
  assignedTo?: string;
  updates: IncidentUpdate[];
}

/** One entry in an incident's timeline. */
export interface IncidentUpdate {
  id: string;
  timestamp: Date;
  status: string;
  message: string;
  author: string;
}

/** Circuit breaker tuning options. */
export interface CircuitBreakerConfig {
  enabled: boolean;
  failureThreshold: number;
  successThreshold: number;
  timeout: number; // seconds
  resetTimeout: number; // seconds
  monitoringPeriod: number; // seconds
  skipOnCircuitOpen: boolean;
}

/** Live state of the circuit breaker attached to a check. */
export interface CircuitBreakerState {
  checkId: string;
  state: 'closed' | 'open' | 'half-open';
  failures: number;
  successes: number;
  lastFailure: Date;
  lastSuccess: Date;
  lastStateChange: Date;
  nextAttemptTime: Date;
}

/**
 * Registers, schedules and executes health checks, keeps their recent results,
 * tracks incidents and circuit breaker state, and reports everything through
 * EventEmitter events (e.g. 'health-check-completed', 'status-changed',
 * 'incident-created', 'incident-resolved').
 */
export class HealthChecker extends EventEmitter {
  private checks = new Map<string, HealthCheck>();
  private results = new Map<string, HealthCheckResult[]>();
  private incidents = new Map<string, HealthIncident>();
  private circuitBreakers = new Map<string, CircuitBreakerState>();
  private scheduledChecks = new Map<string, NodeJS.Timeout>();
  private maintenanceWindows: MaintenanceWindow[] = [];
  private metricsAggregation: NodeJS.Timeout | null = null;
  private cleanupInterval: NodeJS.Timeout | null = null;

  /** Starts the periodic metrics aggregation and cleanup timers. */
  constructor() {
    super();
    this.startMetricsAggregation();
    this.startCleanupProcess();
  }

  /**
   * Registers a new check, schedules its first run and emits 'health-check-added'.
   *
   * @param check Check definition without the fields this class manages itself.
   * @returns The generated id of the new check.
   */
  addHealthCheck(check: Omit<HealthCheck, 'id' | 'status' | 'lastCheck' | 'nextCheck' | 'consecutiveFailures' | 'consecutiveSuccesses' | 'uptime' | 'created' | 'modified'>): string {
    const healthCheck: HealthCheck = {
      id: crypto.randomUUID(),
      status: 'unknown',
      lastCheck: new Date(0),
      nextCheck: new Date(),
      consecutiveFailures: 0,
      consecutiveSuccesses: 0,
      uptime: 100,
      created: new Date(),
      modified: new Date(),
      ...check
    };

    this.checks.set(healthCheck.id, healthCheck);
    this.results.set(healthCheck.id, []);

    // A circuit breaker is attached only when the custom script mentions it.
    if (check.configuration.customScript?.includes('circuitBreaker')) {
      this.initializeCircuitBreaker(healthCheck.id);
    }

    this.scheduleHealthCheck(healthCheck);
    this.emit('health-check-added', healthCheck);
    return healthCheck.id;
  }

  /**
   * Unregisters a check: cancels its timer, drops its stored data, closes its
   * incidents and emits 'health-check-removed'.
   *
   * @returns false when no check with that id exists.
   */
  removeHealthCheck(checkId: string): boolean {
    if (!this.checks.has(checkId)) {
      return false;
    }

    const pendingTimer = this.scheduledChecks.get(checkId);
    if (pendingTimer) {
      clearTimeout(pendingTimer);
      this.scheduledChecks.delete(checkId);
    }

    this.checks.delete(checkId);
    this.results.delete(checkId);
    this.circuitBreakers.delete(checkId);

    for (const incident of this.incidents.values()) {
      if (incident.checkId !== checkId || incident.status === 'closed') {
        continue;
      }
      incident.status = 'closed';
      // Keep the end time of an incident that was already resolved earlier.
      if (!incident.endTime) {
        incident.endTime = new Date();
        // HealthIncident.duration is in seconds; the timestamps are in ms.
        incident.duration = (incident.endTime.getTime() - incident.startTime.getTime()) / 1000;
      }
    }

    this.emit('health-check-removed', { checkId });
    return true;
  }

  /**
   * Applies a partial update to a check and emits 'health-check-updated'.
   * The check is rescheduled when its schedule or enabled flag is part of the
   * update, so re-enabling a check starts it running again.
   *
   * @returns false when no check with that id exists.
   */
  updateHealthCheck(checkId: string, updates: Partial<HealthCheck>): boolean {
    const check = this.checks.get(checkId);
    if (!check) {
      return false;
    }

    // The id keys every internal map, so an update must never change it.
    Object.assign(check, updates, { id: checkId });
    check.modified = new Date();

    if (updates.schedule !== undefined || updates.enabled !== undefined) {
      // scheduleHealthCheck clears any pending timer before setting a new one.
      this.scheduleHealthCheck(check);
    }

    this.emit('health-check-updated', check);
    return true;
  }

  /**
   * Runs one check now and records the result.
   *
   * Unless `force` is set, a disabled check throws, a check inside one of its
   * maintenance windows yields a synthetic 'success' result, and an open
   * circuit breaker yields a synthetic 'failure' result until its retry time.
   *
   * @throws Error when the check does not exist, or is disabled and not forced.
   */
  async executeHealthCheck(checkId: string, force: boolean = false): Promise<HealthCheckResult> {
    const check = this.checks.get(checkId);
    if (!check) {
      throw new Error(`Health check not found: ${checkId}`);
    }
    if (!force && !check.enabled) {
      throw new Error(`Health check is disabled: ${checkId}`);
    }

    if (!force && this.isInMaintenanceWindow(check)) {
      const skipped: HealthCheckResult = {
        id: crypto.randomUUID(),
        checkId,
        timestamp: new Date(),
        status: 'success',
        responseTime: 0,
        message: 'Skipped - maintenance window',
        metrics: this.getDefaultMetrics(),
        retryCount: 0
      };
      this.recordResult(check, skipped);
      return skipped;
    }

    const circuitBreaker = this.circuitBreakers.get(checkId);
    if (circuitBreaker && circuitBreaker.state === 'open' && !force) {
      if (Date.now() < circuitBreaker.nextAttemptTime.getTime()) {
        const rejected: HealthCheckResult = {
          id: crypto.randomUUID(),
          checkId,
          timestamp: new Date(),
          status: 'failure',
          responseTime: 0,
          message: 'Circuit breaker is open',
          metrics: this.getDefaultMetrics(),
          retryCount: 0
        };
        this.recordResult(check, rejected);
        return rejected;
      }
      // Retry time reached: probe again. Start the success count from zero so
      // successes from before the outage cannot close the circuit early.
      circuitBreaker.state = 'half-open';
      circuitBreaker.successes = 0;
      circuitBreaker.lastStateChange = new Date();
    }

    let result: HealthCheckResult;
    const startTime = Date.now();

    try {
      result = await this.performHealthCheck(check);
      if (circuitBreaker) {
        // Composite and dependency checks report 'failure' without throwing.
        if (result.status === 'success') {
          this.handleCircuitBreakerSuccess(circuitBreaker);
        } else {
          this.handleCircuitBreakerFailure(circuitBreaker);
        }
      }
    } catch (error) {
      const failure = error instanceof Error ? error : new Error(String(error));
      result = {
        id: crypto.randomUUID(),
        checkId,
        timestamp: new Date(),
        // The HTTP and TCP executors tag their timeout errors with this name.
        status: failure.name === 'TimeoutError' ? 'timeout' : 'error',
        responseTime: Date.now() - startTime,
        message: failure.message,
        error: failure.stack,
        metrics: this.getDefaultMetrics(),
        retryCount: 0
      };
      if (circuitBreaker) {
        this.handleCircuitBreakerFailure(circuitBreaker);
      }
    }

    this.recordResult(check, result);
    return result;
  }

  /**
   * Runs the check, retrying up to `configuration.retries` times with
   * `configuration.retryDelay` seconds between attempts.
   *
   * @throws The error of the last failed attempt.
   */
  private async performHealthCheck(check: HealthCheck): Promise<HealthCheckResult> {
    const { retries, retryDelay } = check.configuration;
    // Always make one attempt, even if `retries` is negative or not a number;
    // otherwise the loop never runs and there is no error to throw.
    const maxRetries = Number.isFinite(retries) ? Math.max(0, retries) : 0;
    let lastError = new Error(`Health check failed: ${check.id}`);

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await this.executeCheck(check, attempt);
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));
        if (attempt < maxRetries) {
          await this.delay(retryDelay * 1000);
        }
      }
    }

    throw lastError;
  }

  /** Dispatches one attempt to the executor matching the check type. */
  private async executeCheck(check: HealthCheck, retryCount: number): Promise<HealthCheckResult> {
    const startTime = Date.now();

    switch (check.type) {
      case 'http':
        return this.executeHttpCheck(check, startTime, retryCount);
      case 'tcp':
        return this.executeTcpCheck(check, startTime, retryCount);
      case 'database':
        return this.executeDatabaseCheck(check, startTime, retryCount);
      case 'custom':
        return this.executeCustomCheck(check, startTime, retryCount);
      case 'composite':
        return this.executeCompositeCheck(check, startTime, retryCount);
      case 'dependency':
        return this.executeDependencyCheck(check, startTime, retryCount);
      default:
        throw new Error(`Unsupported health check type: ${check.type}`);
    }
  }

  /**
   * Issues the HTTP request and validates status, body content and content type
   * against the configured expectations.
   *
   * @throws Error prefixed with "HTTP check failed:"; its `name` is
   *   'TimeoutError' when the configured timeout aborted the request.
   */
  private async executeHttpCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    const config = check.configuration;
    const method = config.method || 'GET';
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), config.timeout * 1000);

    try {
      const headers: Record<string, string> = { ...config.headers };
      if (config.authentication) {
        await this.addAuthentication(headers, config.authentication);
      }

      const response = await fetch(check.target.endpoint, {
        method,
        headers,
        body: config.body,
        signal: controller.signal
      });

      // Response time is taken once headers arrive; the body read below is
      // still covered by the timeout because the timer is cleared in `finally`.
      const responseTime = Date.now() - startTime;
      const responseBody = await response.text();

      if (config.expectedStatus && !config.expectedStatus.includes(response.status)) {
        throw new Error(`Unexpected status code: ${response.status}`);
      }
      if (config.expectedContent && !responseBody.includes(config.expectedContent)) {
        throw new Error(`Expected content not found: ${config.expectedContent}`);
      }
      const contentType = response.headers.get('content-type') || '';
      if (config.expectedContentType && !contentType.includes(config.expectedContentType)) {
        throw new Error(`Unexpected content type: ${contentType}`);
      }

      return {
        id: crypto.randomUUID(),
        checkId: check.id,
        timestamp: new Date(),
        status: 'success',
        responseTime,
        statusCode: response.status,
        message: 'Health check successful',
        details: {
          endpoint: check.target.endpoint,
          method,
          headers,
          body: config.body,
          responseHeaders: Object.fromEntries(response.headers.entries()),
          responseBody: responseBody.substring(0, 1000), // Limit body size
          contentLength: parseInt(response.headers.get('content-length') || '0', 10)
        },
        metrics: this.calculateMetrics(check.id, responseTime),
        retryCount
      };
    } catch (error) {
      const cause = error instanceof Error ? error : new Error(String(error));
      const wrapped = new Error(`HTTP check failed: ${cause.message}`);
      // Only the timeout timer aborts this controller.
      if (controller.signal.aborted) {
        wrapped.name = 'TimeoutError';
      }
      throw wrapped;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Opens a TCP connection to the host and port parsed from the endpoint URL.
   * Without an explicit port, 443 is used for https: and 80 otherwise.
   *
   * @throws The socket error, or a 'TimeoutError'-named Error on timeout.
   */
  private async executeTcpCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    const url = new URL(check.target.endpoint);
    const host = url.hostname;
    const port = parseInt(url.port, 10) || (url.protocol === 'https:' ? 443 : 80);

    return new Promise((resolve, reject) => {
      const net = require('net');
      const socket = new net.Socket();

      const timeout = setTimeout(() => {
        socket.destroy();
        const timeoutError = new Error('TCP connection timeout');
        timeoutError.name = 'TimeoutError';
        reject(timeoutError);
      }, check.configuration.timeout * 1000);

      socket.connect(port, host, () => {
        clearTimeout(timeout);
        const responseTime = Date.now() - startTime;
        socket.destroy();

        resolve({
          id: crypto.randomUUID(),
          checkId: check.id,
          timestamp: new Date(),
          status: 'success',
          responseTime,
          message: 'TCP connection successful',
          details: {
            endpoint: check.target.endpoint,
            method: 'TCP',
            headers: {}
          },
          metrics: this.calculateMetrics(check.id, responseTime),
          retryCount
        });
      });

      socket.on('error', (error: Error) => {
        clearTimeout(timeout);
        // Release the handle; a failed socket is of no further use.
        socket.destroy();
        reject(error);
      });
    });
  }

  /**
   * Placeholder database check: waits a random delay of up to 100 ms and
   * reports success. No real database is contacted.
   */
  private async executeDatabaseCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    try {
      await this.delay(Math.random() * 100);
      const responseTime = Date.now() - startTime;

      return {
        id: crypto.randomUUID(),
        checkId: check.id,
        timestamp: new Date(),
        status: 'success',
        responseTime,
        message: 'Database connection successful',
        details: {
          endpoint: check.target.endpoint,
          method: 'DATABASE',
          headers: {}
        },
        metrics: this.calculateMetrics(check.id, responseTime),
        retryCount
      };
    } catch (error) {
      throw new Error(`Database check failed: ${(error as Error).message}`);
    }
  }

  /**
   * Placeholder custom check: requires `customScript` to be set but does not
   * run it; waits a random delay of up to 200 ms and reports success.
   *
   * @throws Error when no custom script is configured.
   */
  private async executeCustomCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    if (!check.configuration.customScript) {
      throw new Error('Custom script not provided');
    }

    try {
      await this.delay(Math.random() * 200);
      const responseTime = Date.now() - startTime;

      return {
        id: crypto.randomUUID(),
        checkId: check.id,
        timestamp: new Date(),
        status: 'success',
        responseTime,
        message: 'Custom check successful',
        details: {
          endpoint: check.target.endpoint,
          method: 'CUSTOM',
          headers: {}
        },
        metrics: this.calculateMetrics(check.id, responseTime),
        retryCount
      };
    } catch (error) {
      throw new Error(`Custom check failed: ${(error as Error).message}`);
    }
  }

  /**
   * Executes every dependency check in sequence and succeeds only when all of
   * them do. Each dependency run is recorded like any other execution.
   * NOTE(review): a dependency cycle would recurse without bound — confirm
   * that callers never configure one.
   */
  private async executeCompositeCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    const dependencyResults: HealthCheckResult[] = [];
    let allHealthy = true;

    for (const dependencyId of check.dependencies) {
      try {
        const dependencyResult = await this.executeHealthCheck(dependencyId);
        dependencyResults.push(dependencyResult);
        if (dependencyResult.status !== 'success') {
          allHealthy = false;
        }
      } catch (error) {
        allHealthy = false;
        dependencyResults.push({
          id: crypto.randomUUID(),
          checkId: dependencyId,
          timestamp: new Date(),
          status: 'error',
          responseTime: 0,
          message: (error as Error).message,
          metrics: this.getDefaultMetrics(),
          retryCount: 0
        });
      }
    }

    const responseTime = Date.now() - startTime;

    return {
      id: crypto.randomUUID(),
      checkId: check.id,
      timestamp: new Date(),
      status: allHealthy ? 'success' : 'failure',
      responseTime,
      message: allHealthy ? 'All dependencies healthy' : 'One or more dependencies unhealthy',
      details: {
        endpoint: check.target.endpoint,
        method: 'COMPOSITE',
        headers: {},
        responseBody: JSON.stringify(dependencyResults)
      },
      metrics: this.calculateMetrics(check.id, responseTime),
      retryCount
    };
  }

  /**
   * Succeeds when every dependency is registered and currently 'healthy'.
   * It reads the stored status and does not execute the dependencies.
   */
  private async executeDependencyCheck(check: HealthCheck, startTime: number, retryCount: number): Promise<HealthCheckResult> {
    const unhealthyDependencies = check.dependencies.filter(
      dependencyId => this.checks.get(dependencyId)?.status !== 'healthy'
    );

    const responseTime = Date.now() - startTime;
    const isHealthy = unhealthyDependencies.length === 0;

    return {
      id: crypto.randomUUID(),
      checkId: check.id,
      timestamp: new Date(),
      status: isHealthy ? 'success' : 'failure',
      responseTime,
      message: isHealthy
        ? 'All dependencies available'
        : `Unhealthy dependencies: ${unhealthyDependencies.join(', ')}`,
      details: {
        endpoint: check.target.endpoint,
        method: 'DEPENDENCY',
        headers: {},
        responseBody: JSON.stringify({ unhealthyDependencies })
      },
      metrics: this.calculateMetrics(check.id, responseTime),
      retryCount
    };
  }

  /**
   * Adds the auth header for the configured scheme to `headers` in place.
   * The 'certificate' type adds nothing here.
   */
  private async addAuthentication(headers: Record<string, string>, auth: AuthConfig): Promise<void> {
    switch (auth.type) {
      case 'basic': {
        const basicAuth = Buffer.from(`${auth.credentials.username}:${auth.credentials.password}`).toString('base64');
        headers['Authorization'] = `Basic ${basicAuth}`;
        break;
      }
      case 'bearer':
        headers['Authorization'] = `Bearer ${auth.credentials.token}`;
        break;
      case 'api-key':
        headers[auth.credentials.headerName || 'X-API-Key'] = auth.credentials.apiKey;
        break;
      case 'oauth2':
        // Uses the stored access token as-is; no token refresh is performed.
        headers['Authorization'] = `Bearer ${auth.credentials.accessToken}`;
        break;
    }
  }

  /**
   * Computes rolling metrics from the last 100 stored results of a check.
   * `responseTime` is accepted for signature compatibility but is not used:
   * the reported response time is the average over the stored results.
   */
  private calculateMetrics(checkId: string, responseTime: number): HealthMetrics {
    const recentResults = (this.results.get(checkId) || []).slice(-100);
    if (recentResults.length === 0) {
      return this.getDefaultMetrics();
    }

    const sampleCount = recentResults.length;
    const successCount = recentResults.filter(r => r.status === 'success').length;
    const availability = (successCount / sampleCount) * 100;
    const averageResponseTime = recentResults.reduce((sum, r) => sum + r.responseTime, 0) / sampleCount;
    const errorRate = ((sampleCount - successCount) / sampleCount) * 100;

    // failureDurations: first failure of a streak -> next success (for MTTR).
    // successDurations: gaps between consecutive successes seen after a
    // recovery (used as the MTBF estimate).
    const failureDurations: number[] = [];
    const successDurations: number[] = [];
    let failureStart: Date | null = null;
    let lastRecovery: Date | null = null;

    for (let i = 0; i < recentResults.length; i++) {
      const result = recentResults[i];
      const succeeded = result.status === 'success';

      if (!succeeded && !failureStart) {
        failureStart = result.timestamp;
      } else if (succeeded && failureStart) {
        failureDurations.push(result.timestamp.getTime() - failureStart.getTime());
        failureStart = null;
        lastRecovery = result.timestamp;
      } else if (succeeded && lastRecovery && i > 0) {
        const prevResult = recentResults[i - 1];
        if (prevResult.status === 'success') {
          successDurations.push(result.timestamp.getTime() - prevResult.timestamp.getTime());
        }
      }
    }

    // Mean of millisecond durations, expressed in seconds (0 when empty).
    const meanSeconds = (durations: number[]): number =>
      durations.length > 0
        ? durations.reduce((sum, d) => sum + d, 0) / durations.length / 1000
        : 0;

    return {
      availability,
      responseTime: averageResponseTime,
      throughput: sampleCount / 60, // Assuming results span roughly 1 hour
      errorRate,
      mttr: meanSeconds(failureDurations),
      mtbf: meanSeconds(successDurations),
      customMetrics: {}
    };
  }

  /** Returns an all-zero metrics object. */
  private getDefaultMetrics(): HealthMetrics {
    return {
      availability: 0,
      responseTime: 0,
      throughput: 0,
      errorRate: 0,
      mttr: 0,
      mtbf: 0,
      customMetrics: {}
    };
  }

  /**
   * Stores a result (keeping the newest 1000), updates the check's counters,
   * status and uptime, handles status changes, runs actions, schedules the next
   * run and emits 'health-check-completed'.
   */
  private recordResult(check: HealthCheck, result: HealthCheckResult): void {
    // The check may have been removed while this execution was in flight.
    // Recording then would resurrect its results entry and its timer.
    if (this.checks.get(check.id) !== check) {
      return;
    }

    const results = this.results.get(check.id) || [];
    results.push(result);
    if (results.length > 1000) {
      results.splice(0, results.length - 1000);
    }
    this.results.set(check.id, results);

    const previousStatus = check.status;

    if (result.status === 'success') {
      check.consecutiveSuccesses++;
      check.consecutiveFailures = 0;
      if (check.consecutiveSuccesses >= check.thresholds.recoveryThreshold) {
        check.status = 'healthy';
      }
    } else {
      check.consecutiveFailures++;
      check.consecutiveSuccesses = 0;
      if (check.consecutiveFailures >= check.thresholds.failureThreshold) {
        check.status = 'unhealthy';
      } else if (check.consecutiveFailures > 1) {
        check.status = 'degraded';
      }
    }

    // A critically slow response downgrades an otherwise healthy check.
    if (result.responseTime > check.thresholds.responseTime.critical && check.status === 'healthy') {
      check.status = 'degraded';
    }

    check.lastCheck = result.timestamp;
    check.nextCheck = new Date(Date.now() + check.schedule.interval * 1000);

    // Uptime is the success percentage over the last 100 results.
    const recentResults = results.slice(-100);
    if (recentResults.length > 0) {
      const successCount = recentResults.filter(r => r.status === 'success').length;
      check.uptime = (successCount / recentResults.length) * 100;
    }

    if (previousStatus !== check.status) {
      this.handleStatusChange(check, previousStatus, result);
    }

    this.executeActions(check, result);
    this.scheduleHealthCheck(check);
    this.emit('health-check-completed', { check, result });
  }

  /**
   * Opens an incident when a check becomes unhealthy, resolves its active
   * incidents when it becomes healthy again, and emits 'status-changed'.
   */
  private handleStatusChange(check: HealthCheck, previousStatus: string, result: HealthCheckResult): void {
    if (check.status === 'unhealthy' && previousStatus !== 'unhealthy') {
      const incident: HealthIncident = {
        id: crypto.randomUUID(),
        checkId: check.id,
        serviceName: check.target.name,
        severity: this.mapCriticalityToSeverity(check.target.criticality),
        status: 'open',
        startTime: result.timestamp,
        description: `${check.name} is unhealthy: ${result.message}`,
        impact: this.calculateImpact(check),
        updates: [{
          id: crypto.randomUUID(),
          timestamp: result.timestamp,
          status: 'detected',
          message: `Service became unhealthy: ${result.message}`,
          author: 'health-checker'
        }]
      };

      this.incidents.set(incident.id, incident);
      this.emit('incident-created', incident);
    } else if (check.status === 'healthy') {
      // Resolve every still-active incident of this check. Already resolved or
      // closed incidents are skipped so an older one cannot mask the open one.
      // The previous status is not required to be 'unhealthy': a recovery can
      // pass through 'degraded' first.
      for (const incident of this.incidents.values()) {
        const isActive = incident.status === 'open' || incident.status === 'investigating';
        if (incident.checkId !== check.id || !isActive) {
          continue;
        }

        incident.status = 'resolved';
        incident.endTime = result.timestamp;
        // HealthIncident.duration is in seconds; the timestamps are in ms.
        incident.duration = (incident.endTime.getTime() - incident.startTime.getTime()) / 1000;
        incident.resolution = `Service recovered: ${result.message}`;
        incident.updates.push({
          id: crypto.randomUUID(),
          timestamp: result.timestamp,
          status: 'resolved',
          message: `Service recovered: ${result.message}`,
          author: 'health-checker'
        });

        this.emit('incident-resolved', incident);
      }
    }

    this.emit('status-changed', {
      checkId: check.id,
      serviceName: check.target.name,
      previousStatus,
      newStatus: check.status,
      timestamp: result.timestamp
    });
  }

  /** Maps a target criticality to an incident severity; unknown values give 'medium'. */
  private mapCriticalityToSeverity(criticality: string): 'low' | 'medium' | 'high' | 'critical' {
    switch (criticality) {
      case 'critical':
        return 'critical';
      case 'high':
        return 'high';
      case 'medium':
        return 'medium';
      case 'low':
        return 'low';
      default:
        return 'medium';
    }
  }

  /** Returns the impact sentence stored on a new incident for this check. */
  private calculateImpact(check: HealthCheck): string {
    switch (check.target.criticality) {
      case 'critical':
        return 'Critical service outage - immediate attention required';
      case 'high':
        return 'High impact service degradation';
      case 'medium':
        return 'Moderate service impact';
      case 'low':
        return 'Low impact service issue';
      default:
        return 'Service issue detected';
    }
  }

  /** Starts every enabled action whose cooldown, trigger and conditions allow it. */
  private executeActions(check: HealthCheck, result: HealthCheckResult): void {
    for (const action of check.actions) {
      if (!action.enabled) continue;

      if (this.shouldExecuteAction(action, check, result)) {
        // Fire-and-forget: executeAction reports failures via 'action-failed'.
        void this.executeAction(action, check, result);
      }
    }
  }

  /**
   * Decides whether an action should run for this result: it must be out of
   * cooldown, its trigger must match, and every condition must hold.
   */
  private shouldExecuteAction(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): boolean {
    if (action.lastExecuted) {
      const timeSinceLastExecution = Date.now() - action.lastExecuted.getTime();
      if (timeSinceLastExecution < action.cooldown * 1000) {
        return false;
      }
    }

    switch (action.trigger) {
      case 'failure':
        if (result.status === 'success') return false;
        break;
      case 'recovery':
        if (result.status !== 'success') return false;
        break;
      case 'degraded':
        if (check.status !== 'degraded') return false;
        break;
      case 'threshold-breach':
        // No trigger-level filter; the action's conditions decide.
        break;
    }

    return action.conditions.every(condition =>
      this.evaluateActionCondition(condition, check, result)
    );
  }

  private
evaluateActionCondition(condition: ActionCondition, check: HealthCheck, result: HealthCheckResult): boolean {
  // Evaluates one action condition against the check's current counters and
  // the latest result. 'custom' and unrecognised types always pass.
  switch (condition.type) {
    case 'consecutive-failures':
      return check.consecutiveFailures >= condition.value;
    case 'uptime-below':
      return check.uptime < condition.value;
    case 'response-time-above':
      return result.responseTime > condition.value;
    case 'custom':
      // Custom condition evaluation is not implemented; treated as satisfied.
      return true;
    default:
      return true;
  }
}

/**
 * Runs one action and reports the outcome as 'action-executed' or
 * 'action-failed'. Never rejects: handler errors are caught and emitted.
 * `lastExecuted` is stamped before the handler runs, so a failing action
 * still starts its cooldown.
 */
private async executeAction(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  action.lastExecuted = new Date();

  try {
    switch (action.type) {
      case 'notification':
        await this.sendNotification(action, check, result);
        break;
      case 'webhook':
        await this.callWebhook(action, check, result);
        break;
      case 'script':
        await this.executeScript(action, check, result);
        break;
      case 'escalation':
        await this.escalateIncident(action, check, result);
        break;
      case 'auto-recovery':
        await this.attemptAutoRecovery(action, check, result);
        break;
    }

    this.emit('action-executed', {
      actionId: action.id,
      actionType: action.type,
      checkId: check.id,
      success: true
    });
  } catch (error) {
    this.emit('action-failed', {
      actionId: action.id,
      actionType: action.type,
      checkId: check.id,
      error: error instanceof Error ? error.message : String(error)
    });
  }
}

/**
 * Emits 'notification-sent' describing the alert. No message is delivered
 * here; delivery is left to whoever listens for the event.
 */
private async sendNotification(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  this.emit('notification-sent', {
    recipient: action.configuration.recipient,
    subject: `Health Check Alert: ${check.name}`,
    message: `${check.target.name} is ${check.status}: ${result.message}`,
    checkId: check.id
  });
}

/**
 * Sends the check status and result as JSON to `action.configuration.url`
 * (method defaults to POST; `configuration.headers` are merged in).
 *
 * The request is aborted after `action.configuration.timeout` seconds
 * (default 30) so that an unresponsive endpoint cannot leave it pending.
 *
 * @throws Error when no url is configured, the request fails or is aborted,
 *   or the response status is not 2xx.
 */
private async callWebhook(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  const { url, method, headers, timeout } = action.configuration;
  if (typeof url !== 'string' || url.length === 0) {
    throw new Error(`Webhook failed: no url configured for action ${action.id}`);
  }

  const payload = {
    checkId: check.id,
    checkName: check.name,
    targetName: check.target.name,
    status: check.status,
    result: result,
    timestamp: new Date().toISOString()
  };

  const timeoutSeconds = typeof timeout === 'number' && timeout > 0 ? timeout : 30;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutSeconds * 1000);

  try {
    const response = await fetch(url, {
      method: method || 'POST',
      headers: {
        'Content-Type': 'application/json',
        ...headers
      },
      body: JSON.stringify(payload),
      signal: controller.signal
    });

    if (!response.ok) {
      // statusText is frequently empty, so always include the numeric status.
      throw new Error(`Webhook failed: ${response.status} ${response.statusText}`.trim());
    }
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Emits 'script-executed' with the configured script. The script itself is
 * not run here.
 */
private async executeScript(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  this.emit('script-executed', {
    script: action.configuration.script,
    checkId: check.id
  });
}

/**
 * Appends an 'escalated' update to the first incident of this check whose
 * status is 'open' and emits 'incident-escalated'. Does nothing when there is
 * no such incident.
 */
private async escalateIncident(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  const incident = Array.from(this.incidents.values())
    .find(i => i.checkId === check.id && i.status === 'open');
  if (!incident) {
    return;
  }

  incident.updates.push({
    id: crypto.randomUUID(),
    timestamp: new Date(),
    status: 'escalated',
    message: `Incident escalated to: ${action.configuration.escalateTo}`,
    author: 'health-checker'
  });

  this.emit('incident-escalated', {
    incidentId: incident.id,
    escalatedTo: action.configuration.escalateTo
  });
}

/**
 * Emits 'auto-recovery-attempted' with the configured recovery action. No
 * recovery is performed here.
 */
private async attemptAutoRecovery(action: ActionConfig, check: HealthCheck, result: HealthCheckResult): Promise<void> {
  this.emit('auto-recovery-attempted', {
    checkId: check.id,
    recoveryAction: action.configuration.recoveryAction
  });
}

private
initializeCircuitBreaker(checkId: string): void {
  // Registers a fresh, closed circuit breaker for the given check.
  const circuitBreaker: CircuitBreakerState = {
    checkId,
    state: 'closed',
    failures: 0,
    successes: 0,
    lastFailure: new Date(0),
    lastSuccess: new Date(0),
    lastStateChange: new Date(),
    nextAttemptTime: new Date()
  };

  this.circuitBreakers.set(checkId, circuitBreaker);
}

/**
 * Records a successful execution. In half-open state the circuit closes after
 * 3 consecutive successes; in closed state the failure count is cleared.
 */
private handleCircuitBreakerSuccess(circuitBreaker: CircuitBreakerState): void {
  const successesToClose = 3; // Configurable threshold

  circuitBreaker.successes++;
  circuitBreaker.lastSuccess = new Date();

  if (circuitBreaker.state === 'half-open') {
    if (circuitBreaker.successes >= successesToClose) {
      circuitBreaker.state = 'closed';
      circuitBreaker.failures = 0;
      // Start counting afresh for the next outage.
      circuitBreaker.successes = 0;
      circuitBreaker.lastStateChange = new Date();
    }
  } else if (circuitBreaker.state === 'closed') {
    circuitBreaker.failures = 0;
  }
}

/**
 * Records a failed execution. A closed circuit opens after 5 failures and a
 * half-open circuit re-opens at once; either way the next attempt is allowed
 * one minute later.
 */
private handleCircuitBreakerFailure(circuitBreaker: CircuitBreakerState): void {
  const failuresToOpen = 5; // Configurable threshold
  const openDurationMs = 60000; // 1 minute timeout

  circuitBreaker.failures++;
  // A failure ends the success streak. Without this reset, successes gathered
  // while closed would let a single half-open success close the circuit.
  circuitBreaker.successes = 0;
  circuitBreaker.lastFailure = new Date();

  const shouldOpen =
    (circuitBreaker.state === 'closed' && circuitBreaker.failures >= failuresToOpen) ||
    circuitBreaker.state === 'half-open';

  if (shouldOpen) {
    circuitBreaker.state = 'open';
    circuitBreaker.lastStateChange = new Date();
    circuitBreaker.nextAttemptTime = new Date(Date.now() + openDurationMs);
  }
}

/**
 * (Re)schedules the next run of a check after `interval` seconds plus a random
 * jitter of up to `jitter` seconds. Any pending timer for the check is
 * cancelled first; a disabled check is left unscheduled.
 */
private scheduleHealthCheck(check: HealthCheck): void {
  const existingSchedule = this.scheduledChecks.get(check.id);
  if (existingSchedule) {
    clearTimeout(existingSchedule);
    // Drop the handle so a disabled check does not keep a stale entry.
    this.scheduledChecks.delete(check.id);
  }

  if (!check.enabled) {
    return;
  }

  const jitterMs = Math.random() * check.schedule.jitter * 1000;
  const delayMs = check.schedule.interval * 1000 + jitterMs;

  const timeout = setTimeout(() => {
    // executeHealthCheck only rejects when the check is missing or disabled;
    // in both cases there is nothing left to run, so the rejection is dropped.
    this.executeHealthCheck(check.id).catch(() => undefined);
  }, delayMs);

  this.scheduledChecks.set(check.id, timeout);
}

/**
 * Reports whether the current time lies inside one of the check's own
 * maintenance windows (bounds inclusive).
 * NOTE(review): windows registered through addMaintenanceWindow are not
 * consulted here, and `recurring` / `recurrencePattern` are ignored —
 * confirm whether that is intended.
 */
private isInMaintenanceWindow(check: HealthCheck): boolean {
  const now = new Date();
  const windows = check.schedule.maintenanceWindows || [];
  return windows.some(window => now >= window.start && now <= window.end);
}

/** Starts emitting 'metrics-aggregated' summaries once per minute. */
private startMetricsAggregation(): void {
  this.metricsAggregation = setInterval(() => {
    this.aggregateMetrics();
  }, 60000); // Every minute
}

/** Builds the current health summary and emits it as 'metrics-aggregated'. */
private aggregateMetrics(): void {
  const summary = this.generateHealthSummary();
  this.emit('metrics-aggregated', summary);
}

/** Starts the hourly purge of old results and incidents. */
private startCleanupProcess(): void {
  this.cleanupInterval = setInterval(() => {
    this.cleanupOldData();
  }, 3600000); // Every hour
}

/** Drops results older than 7 days and incidents that ended more than 7 days ago. */
private cleanupOldData(): void {
  const cutoffTime = Date.now() - 7 * 24 * 60 * 60 * 1000; // 7 days

  for (const [checkId, results] of this.results) {
    this.results.set(checkId, results.filter(r => r.timestamp.getTime() > cutoffTime));
  }

  for (const [incidentId, incident] of this.incidents) {
    if (incident.endTime && incident.endTime.getTime() < cutoffTime) {
      this.incidents.delete(incidentId);
    }
  }
}

/** Resolves after `ms` milliseconds. */
private delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// Public API methods

/**
 * Builds a point-in-time summary: per-status counts, overall status, the
 * status of critical targets, the 10 most recent active incidents and
 * maintenance windows that have not started yet.
 *
 * Overall status is 'critical' when a critical target is unhealthy,
 * 'unhealthy' when any check is unhealthy, 'degraded' when any check is
 * degraded, and 'healthy' otherwise.
 */
generateHealthSummary(): HealthSummary {
  const checks = Array.from(this.checks.values());
  const countWithStatus = (status: HealthCheck['status']): number =>
    checks.filter(c => c.status === status).length;

  const activeIncidents = Array.from(this.incidents.values())
    .filter(i => i.status === 'open' || i.status === 'investigating')
    .sort((a, b) => b.startTime.getTime() - a.startTime.getTime())
    .slice(0, 10);

  const healthyChecks = countWithStatus('healthy');
  const unhealthyChecks = countWithStatus('unhealthy');
  const degradedChecks = countWithStatus('degraded');
  const unknownChecks = countWithStatus('unknown');

  let overallStatus: 'healthy' | 'degraded' | 'unhealthy' | 'critical';
  if (unhealthyChecks > 0) {
    const criticalDown = checks.some(
      c => c.status === 'unhealthy' && c.target.criticality === 'critical'
    );
    overallStatus = criticalDown ? 'critical' : 'unhealthy';
  } else if (degradedChecks > 0) {
    overallStatus = 'degraded';
  } else {
    overallStatus = 'healthy';
  }

  const criticalServices = checks
    .filter(c => c.target.criticality === 'critical')
    .map(c => ({
      serviceId: c.target.id,
      serviceName: c.target.name,
      status: c.status,
      lastCheck: c.lastCheck,
      responseTime: this.getAverageResponseTime(c.id),
      availability: c.uptime,
      checkCount: (this.results.get(c.id) || []).length
    }));

  return {
    timestamp: new Date(),
    overallStatus,
    totalChecks: checks.length,
    healthyChecks,
    unhealthyChecks,
    degradedChecks,
    unknownChecks,
    averageResponseTime: this.calculateOverallAverageResponseTime(),
    overallAvailability: this.calculateOverallAvailability(),
    criticalServices,
    recentIncidents: activeIncidents,
    upcomingMaintenance: this.maintenanceWindows.filter(w => w.start > new Date())
  };
}

/** Mean response time (ms) over the last 10 results of a check; 0 when it has none. */
private getAverageResponseTime(checkId: string): number {
  const recentResults = (this.results.get(checkId) || []).slice(-10);
  if (recentResults.length === 0) return 0;

  return recentResults.reduce((sum, r) => sum + r.responseTime, 0) / recentResults.length;
}

/**
 * Mean response time (ms) over the 100 most recent results across all checks;
 * 0 when there are none.
 */
private calculateOverallAverageResponseTime(): number {
  const sampleSize = 100;

  // Results are stored per check, so the tail of a plain concatenation would
  // mostly hold the last-registered check. Take each check's newest entries
  // and order them by time before picking the overall newest.
  const candidates = Array.from(this.results.values())
    .flatMap(results => results.slice(-sampleSize));
  if (candidates.length === 0) return 0;

  const recentResults = candidates
    .sort((a, b) => a.timestamp.getTime() - b.timestamp.getTime())
    .slice(-sampleSize);

  return recentResults.reduce((sum, r) => sum + r.responseTime, 0) / recentResults.length;
}

/** Mean uptime percentage across all checks; 100 when none are registered. */
private calculateOverallAvailability(): number {
  const checks = Array.from(this.checks.values());
  if (checks.length === 0) return 100;

  const totalUptime = checks.reduce((sum, c) => sum + c.uptime, 0);
  return totalUptime / checks.length;
}

/** Returns all registered checks (the live objects, not copies). */
getHealthChecks(): HealthCheck[] {
  return Array.from(this.checks.values());
}

/** Returns the check with the given id, or null when it is not registered. */
getHealthCheck(checkId: string): HealthCheck | null {
  return this.checks.get(checkId) ?? null;
}
/**
 * Returns the last `limit` stored results for a check, oldest first.
 *
 * @param checkId id of the health check
 * @param limit maximum number of results to return (default 100); fractional
 *   values are rounded down
 * @returns a new array; empty when the check has no results or when `limit`
 *   is not a positive number
 */
getHealthCheckResults(checkId: string, limit: number = 100): HealthCheckResult[] {
  const results = this.results.get(checkId) || [];

  // slice(-0) is slice(0), so a zero, sub-1 or NaN limit used to return
  // every result, and a negative limit returned an arbitrary suffix.
  const count = Math.floor(limit);
  if (!(count > 0)) {
    return [];
  }

  return results.slice(-count);
}

/** Returns all known incidents as a new array. */
getIncidents(): HealthIncident[] {
  return Array.from(this.incidents.values());
}

/** Returns the incident with the given id, or null if there is none. */
getIncident(incidentId: string): HealthIncident | null {
  return this.incidents.get(incidentId) || null;
}

/** Returns the state of every circuit breaker as a new array. */
getCircuitBreakerStates(): CircuitBreakerState[] {
  return Array.from(this.circuitBreakers.values());
}

/**
 * Registers a maintenance window and emits 'maintenance-window-added' with it.
 */
addMaintenanceWindow(window: MaintenanceWindow): void {
  this.maintenanceWindows.push(window);
  this.emit('maintenance-window-added', window);
}

/**
 * Returns counters for checks, results, incidents and circuit breakers,
 * broken down by status/state, plus the number of registered maintenance
 * windows. The return type is left as `any` to keep the existing signature.
 */
getStats(): any {
  const checks = Array.from(this.checks.values());
  const results = Array.from(this.results.values()).flat();
  const incidents = Array.from(this.incidents.values());
  // Snapshot once instead of rebuilding the array for every state.
  const breakers = Array.from(this.circuitBreakers.values());

  const count = <T>(items: T[], matches: (item: T) => boolean): number =>
    items.filter(matches).length;

  return {
    checks: {
      total: checks.length,
      enabled: count(checks, c => c.enabled),
      healthy: count(checks, c => c.status === 'healthy'),
      unhealthy: count(checks, c => c.status === 'unhealthy'),
      degraded: count(checks, c => c.status === 'degraded'),
      unknown: count(checks, c => c.status === 'unknown')
    },
    results: {
      total: results.length,
      successful: count(results, r => r.status === 'success'),
      failed: count(results, r => r.status === 'failure'),
      errors: count(results, r => r.status === 'error'),
      timeouts: count(results, r => r.status === 'timeout')
    },
    incidents: {
      total: incidents.length,
      open: count(incidents, i => i.status === 'open'),
      investigating: count(incidents, i => i.status === 'investigating'),
      resolved: count(incidents, i => i.status === 'resolved'),
      closed: count(incidents, i => i.status === 'closed')
    },
    circuitBreakers: {
      total: this.circuitBreakers.size,
      closed: count(breakers, cb => cb.state === 'closed'),
      open: count(breakers, cb => cb.state === 'open'),
      halfOpen: count(breakers, cb => cb.state === 'half-open')
    },
    maintenance: this.maintenanceWindows.length
  };
}

/**
 * Shuts the checker down: cancels every scheduled check timer, stops the
 * metrics and cleanup intervals, clears all stored state and removes all
 * event listeners.
 */
destroy(): void {
  // Clear all scheduled checks
  for (const timeout of this.scheduledChecks.values()) {
    clearTimeout(timeout);
  }

  // Clear intervals
  if (this.metricsAggregation) {
    clearInterval(this.metricsAggregation);
  }
  if (this.cleanupInterval) {
    clearInterval(this.cleanupInterval);
  }

  this.checks.clear();
  this.results.clear();
  this.incidents.clear();
  this.circuitBreakers.clear();
  this.scheduledChecks.clear();
  this.maintenanceWindows = [];

  this.removeAllListeners();
}
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Coder-RL/Claude_MCPServer_Dev1'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.