prometheus.rules.yml•1.23 kB
groups:
- name: sync_alerts
rules:
- alert: SyncJobFailed
expr: sync_operations_total{status="failed"} > 0
for: 1m
labels:
severity: critical
annotations:
summary: Sync job failed
description: The sync job has failed. Please investigate immediately.
- alert: SyncJobSlow
expr: histogram_quantile(0.95, sum(rate(sync_operation_duration_seconds_bucket[5m])) by (le)) > 3600
for: 5m
labels:
severity: warning
annotations:
summary: Sync job is running slow
description: The 95th percentile of sync job duration is over 1 hour.
- name: memory_leak_alerts
rules:
- alert: MemoryLeaksDetected
expr: memscope_leaked_allocations > 0
for: 2m
labels:
severity: critical
annotations:
summary: Memory leaks detected by memscope
description: One or more leaked allocations have been observed. Export report via /memory/leaks.
- alert: RisingActiveMemory
expr: increase(memscope_active_memory_bytes[10m]) > 104857600
for: 10m
labels:
severity: warning
annotations:
summary: Active heap grew >100MB in the last 10m
description: Possible leak or memory growth trend. Investigate recent changes.