Skip to main content
Glama
alerts.yml (8.45 kB)
# Prometheus Alert Rules for KYC MCP Server
#
# A single rule group, evaluated every 30s, covering host resources
# (node_exporter metrics), Redis, the HTTP application, Docker containers
# (cAdvisor-style metrics), SSL certificates and process file descriptors.
# Every rule carries a `severity` label (warning|critical) and a `component`
# label.
groups:
  - name: kyc_mcp_server_alerts
    interval: 30s
    rules:
      # Instance down alert
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      # High CPU usage
      # rate() rather than irate(): irate only looks at the last two samples,
      # so a brief dip can reset the `for` clause and suppress the alert.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% (current value: {{ $value }}%)"

      # Critical CPU usage
      # rate() rather than irate() for the same reason as HighCPUUsage.
      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Critical CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 95% (current value: {{ $value }}%)"

      # High memory usage
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 80% (current value: {{ $value }}%)"

      # Critical memory usage
      - alert: CriticalMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 95
        for: 2m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 95% (current value: {{ $value }}%)"

      # High disk usage (pseudo / read-only filesystem types are excluded)
      - alert: HighDiskUsage
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"}
          - node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"}
          * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 80% (current value: {{ $value }}%)"

      # Critical disk usage (pseudo / read-only filesystem types are excluded)
      - alert: CriticalDiskUsage
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"}
          - node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|vfat"}
          * 100 > 90
        for: 2m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Critical disk usage on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 90% (current value: {{ $value }}%)"

      # High disk I/O (fraction of each second the device spent doing I/O)
      - alert: HighDiskIO
        expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High disk I/O on {{ $labels.instance }}"
          description: "Disk I/O time is high on {{ $labels.device }} (current value: {{ $value }})"

      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
          component: redis
        annotations:
          summary: "Redis is down on {{ $labels.instance }}"
          description: "Redis instance has been down for more than 1 minute."

      # High Redis memory usage
      # The `and ... > 0` guard skips instances whose max-memory metric is 0
      # (no limit configured); dividing by 0 yields +Inf and would otherwise
      # keep this alert firing permanently.
      - alert: HighRedisMemoryUsage
        expr: >-
          redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
          and redis_memory_max_bytes > 0
        for: 5m
        labels:
          severity: warning
          component: redis
        annotations:
          summary: "High Redis memory usage on {{ $labels.instance }}"
          description: "Redis memory usage is above 80% (current value: {{ $value }}%)"

      # High number of rejected connections
      - alert: HighRejectedConnections
        expr: rate(redis_rejected_connections_total[5m]) > 0
        for: 2m
        labels:
          severity: warning
          component: redis
        annotations:
          summary: "Redis rejecting connections on {{ $labels.instance }}"
          description: "Redis is rejecting connections (rate: {{ $value }} per second)"

      # Application error rate
      # Both sides are aggregated per instance before dividing. Without the
      # aggregation the binary operator matches series on all labels, so each
      # 5xx series is divided by itself and the ratio is always 100%.
      - alert: HighErrorRate
        expr: >-
          sum by(instance) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum by(instance) (rate(http_requests_total[5m]))
          * 100 > 5
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate on {{ $labels.instance }}"
          description: "Error rate is above 5% (current value: {{ $value }}%)"

      # High response time
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High response time on {{ $labels.instance }}"
          description: "95th percentile response time is above 1 second (current value: {{ $value }}s)"

      # Rate limit exceeded frequently
      - alert: FrequentRateLimitExceeded
        expr: rate(rate_limit_exceeded_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "Frequent rate limit exceeded on {{ $labels.instance }}"
          description: "Rate limit is being exceeded frequently (rate: {{ $value }} per second)"

      # Container restart
      # Detects a change of the container's start timestamp. The previous
      # expression took rate() of container_last_seen, a timestamp gauge that
      # advances continuously, so it was > 0 for every running container.
      # NOTE(review): assumes the exporter (cAdvisor) keeps the same series
      # across a restart and exposes container_start_time_seconds - confirm.
      - alert: ContainerRestart
        expr: changes(container_start_time_seconds[5m]) > 0
        for: 1m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "Container {{ $labels.name }} restarted"
          description: "Container {{ $labels.name }} has restarted"

      # High container CPU usage (CPU-seconds consumed per second, as percent)
      - alert: HighContainerCPU
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High CPU usage in container {{ $labels.name }}"
          description: "Container CPU usage is above 80% (current value: {{ $value }}%)"

      # High container memory usage
      # The `and ... > 0` guard skips containers whose limit metric is 0 (no
      # memory limit set); dividing by 0 yields +Inf and would otherwise keep
      # this alert firing permanently.
      - alert: HighContainerMemory
        expr: >-
          container_memory_usage_bytes / container_spec_memory_limit_bytes * 100 > 80
          and container_spec_memory_limit_bytes > 0
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High memory usage in container {{ $labels.name }}"
          description: "Container memory usage is above 80% (current value: {{ $value }}%)"

      # SSL certificate expiring soon (less than 30 days left; 86400 s = 1 day)
      - alert: SSLCertificateExpiringSoon
        expr: (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
          component: ssl
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in {{ $value }} days"

      # SSL certificate expired
      - alert: SSLCertificateExpired
        expr: ssl_certificate_expiry_seconds - time() < 0
        for: 1m
        labels:
          severity: critical
          component: ssl
        annotations:
          summary: "SSL certificate expired"
          description: "SSL certificate has expired"

      # Too many open file descriptors
      - alert: TooManyOpenFileDescriptors
        expr: process_open_fds / process_max_fds * 100 > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "Too many open file descriptors on {{ $labels.instance }}"
          description: "Open file descriptors usage is above 80% (current value: {{ $value }}%)"

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/CTD-Techs/CTD-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.