# Columbia MCP Server

# Prometheus alerting rules for the MCP server and its supporting services
# (Redis, Nginx). Every rule must hold for its `for:` duration before firing.
groups:
  - name: mcp_alerts
    rules:
      # Per-series CPU time rate over a 5m window, expressed as a percentage
      # of one core; fires above 80%.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High CPU usage detected
          description: "CPU usage is above 80% for 5 minutes"

      # Resident memory as a percentage of the memory limit; fires above 85%.
      # NOTE(review): this division only yields results when both metrics
      # carry identical label sets -- confirm that container_memory_limit_bytes
      # is exported with the same labels as process_resident_memory_bytes.
      - alert: HighMemoryUsage
        expr: (process_resident_memory_bytes / container_memory_limit_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High memory usage detected
          description: "Memory usage is above 85% for 5 minutes"

      # Redis used memory as a percentage of its configured maximum; fires
      # above 80%. The `> 0` filter drops series whose maximum is 0, because
      # dividing by zero yields +Inf and would keep the alert firing forever.
      - alert: HighRedisMemory
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High Redis memory usage
          description: "Redis memory usage is above 80% for 5 minutes"

      # Fires when the 0.9 quantile of request duration exceeds 1 second.
      - alert: SlowEndpoints
        expr: 'http_request_duration_seconds{quantile="0.9"} > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Slow endpoint detected
          description: "90th percentile of request duration is above 1s for 5 minutes"

      # Share of requests answered with a 5xx status, as a percentage; fires
      # above 5%. Both sides are aggregated with sum() so the label sets match:
      # dividing the raw vectors would pair each 5xx series with itself and
      # always evaluate to 100%. Add a `by (...)` clause to both sums if a
      # per-job or per-instance ratio is wanted instead of a global one.
      - alert: HighErrorRate
        expr: |-
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m]))
            * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: "Error rate is above 5% for 5 minutes"

      # Fires when redis_up has reported 0 for one minute.
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Redis instance is down
          description: "Redis instance has been down for more than 1 minute"

      # Fires when nginx_up has reported 0 for one minute.
      - alert: NginxDown
        expr: nginx_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Nginx instance is down
          description: "Nginx instance has been down for more than 1 minute"