alerts: add ServiceDown alert to detect "dead" services (#1196)

Roman Khavronenko, 2021-04-08 16:23:10 +01:00, committed by GitHub
parent 712725b4a5
commit c4f6b79d76

@@ -3,8 +3,8 @@
 # and threshold calibration according to every specific setup.
 groups:
   - name: serviceHealth
-    # note the `job` filter and update accordingly to your setup
     rules:
+      # note the `job` filter and update accordingly to your setup
       - alert: TooManyRestarts
         expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
         labels:
@@ -14,6 +14,15 @@ groups:
           description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
             It might be crashlooping."
+
+      - alert: ServiceDown
+        expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
+        for: 2m
+        labels:
+          severity: "critical"
+        annotations:
+          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
 
 # Alerts group for VM cluster assumes that Grafana dashboard
 # https://grafana.com/grafana/dashboards/11176 is installed.
 # Please, update the `dashboard` annotation according to your setup.
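
As an illustration of the `dashboard` annotation mentioned in the comment above, a rule in the cluster group could link back to dashboard 11176 roughly as in the sketch below. The alert name, expression, threshold, Grafana address, dashboard UID and panel id are all placeholders and need to be replaced with the values from your own setup.

      # Hypothetical rule, shown only to illustrate the `dashboard` annotation;
      # replace the Grafana address, dashboard UID and panel id with your own values.
      - alert: TooHighChurnRate
        expr: rate(vm_new_timeseries_created_total[5m]) > 1000
        for: 15m
        labels:
          severity: "warning"
        annotations:
          summary: "High churn rate on {{ $labels.instance }}"
          dashboard: "http://your-grafana:3000/d/<dashboard-uid>?viewPanel=<panel-id>&var-instance={{ $labels.instance }}"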
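
The new ServiceDown rule only takes effect once vmalert loads the updated file. A minimal docker-compose sketch for running vmalert against it is shown below; the service names `victoriametrics` and `alertmanager`, the mount path and the evaluation interval are assumptions for this example and should match your own deployment.

services:
  vmalert:
    image: victoriametrics/vmalert:latest
    volumes:
      # mount the rules file edited in this commit (path is an assumption)
      - ./alerts.yml:/etc/alerts/alerts.yml:ro
    command:
      - "-rule=/etc/alerts/alerts.yml"
      - "-datasource.url=http://victoriametrics:8428"  # where `up` and other metrics are queried
      - "-notifier.url=http://alertmanager:9093"       # where firing ServiceDown alerts are sent
      - "-evaluationInterval=30s"

With a 30s evaluation interval and `for: 2m`, a target has to report `up == 0` on roughly four consecutive evaluations before ServiceDown starts firing.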