alerts: add ServiceDown alert to detect "dead" services (#1196)

Roman Khavronenko, 2021-04-08 16:23:10 +01:00, committed by GitHub
parent 712725b4a5
commit c4f6b79d76

@@ -3,8 +3,8 @@
 # and threshold calibration according to every specific setup.
 groups:
   - name: serviceHealth
-    # note the `job` filter and update accordingly to your setup
     rules:
+      # note the `job` filter and update accordingly to your setup
       - alert: TooManyRestarts
         expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
         labels:
@@ -14,6 +14,15 @@ groups:
           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
             It might be crashlooping."
+
+      - alert: ServiceDown
+        expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
+        for: 2m
+        labels:
+          severity: "critical"
+        annotations:
+          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
 
 # Alerts group for VM cluster assumes that Grafana dashboard
 # https://grafana.com/grafana/dashboards/11176 is installed.
 # Please, update the `dashboard` annotation according to your setup.
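
As an illustration of the `dashboard` annotation mentioned in the comment above, a rule in the cluster group could link back to dashboard 11176 roughly as in the sketch below. The alert name, expression, threshold, Grafana address, dashboard UID and panel id are all placeholders and need to be replaced with the values from your own setup.

      # Hypothetical rule, shown only to illustrate the `dashboard` annotation;
      # replace the Grafana address, dashboard UID and panel id with your own values.
      - alert: TooHighChurnRate
        expr: rate(vm_new_timeseries_created_total[5m]) > 1000
        for: 15m
        labels:
          severity: "warning"
        annotations:
          summary: "High churn rate on {{ $labels.instance }}"
          dashboard: "http://your-grafana:3000/d/<dashboard-uid>?viewPanel=<panel-id>&var-instance={{ $labels.instance }}"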
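
The new ServiceDown rule only takes effect once vmalert loads the updated file. A minimal docker-compose sketch for running vmalert against it is shown below; the service names `victoriametrics` and `alertmanager`, the mount path and the evaluation interval are assumptions for this example and should match your own deployment.

services:
  vmalert:
    image: victoriametrics/vmalert:latest
    volumes:
      # mount the rules file edited in this commit (path is an assumption)
      - ./alerts.yml:/etc/alerts/alerts.yml:ro
    command:
      - "-rule=/etc/alerts/alerts.yml"
      - "-datasource.url=http://victoriametrics:8428"  # where `up` and other metrics are queried
      - "-notifier.url=http://alertmanager:9093"       # where firing ServiceDown alerts are sent
      - "-evaluationInterval=30s"

With a 30s evaluation interval and `for: 2m`, a target has to report `up == 0` on roughly four consecutive evaluations before ServiceDown starts firing.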