Patch summary (text before the `diff --git` header is not part of the patch):
  * hunk 1 moves the "note the `job` filter" comment from below `rules:` to above it
    in the `serviceHealth` group;
  * hunk 2 appends a `ServiceDown` alert that fires with severity "critical" when
    `up == 0` has held for 2m for the listed jobs.
NOTE(review): line breaks were restored from the hunk line counts; the indentation
widths are inferred from the rule-file nesting -- confirm against the target file.

diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml
index fd61a943e..ad5fd58a3 100644
--- a/deployment/docker/alerts.yml
+++ b/deployment/docker/alerts.yml
@@ -3,8 +3,8 @@
 # and threshold calibration according to every specific setup.
 groups:
   - name: serviceHealth
+    # note the `job` filter and update accordingly to your setup
     rules:
-      # note the `job` filter and update accordingly to your setup
       - alert: TooManyRestarts
         expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
         labels:
@@ -14,6 +14,15 @@ groups:
           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
             It might be crashlooping."
 
+      - alert: ServiceDown
+        expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
+        for: 2m
+        labels:
+          severity: "critical"
+        annotations:
+          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
+
   # Alerts group for VM cluster assumes that Grafana dashboard
   # https://grafana.com/grafana/dashboards/11176 is installed.
   # Please, update the `dashboard` annotation according to your setup.