mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
alerts: add ServiceDown
alert to detect "dead" services (#1196)
This commit is contained in:
parent
712725b4a5
commit
c4f6b79d76
1 changed files with 10 additions and 1 deletions
|
@ -3,8 +3,8 @@
|
||||||
# and threshold calibration according to every specific setup.
|
# and threshold calibration according to every specific setup.
|
||||||
groups:
|
groups:
|
||||||
- name: serviceHealth
|
- name: serviceHealth
|
||||||
rules:
|
|
||||||
# note the `job` filter and update accordingly to your setup
|
# note the `job` filter and update accordingly to your setup
|
||||||
|
rules:
|
||||||
- alert: TooManyRestarts
|
- alert: TooManyRestarts
|
||||||
expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
|
expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
|
||||||
labels:
|
labels:
|
||||||
|
@ -14,6 +14,15 @@ groups:
|
||||||
description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
|
description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
|
||||||
It might be crashlooping."
|
It might be crashlooping."
|
||||||
|
|
||||||
|
- alert: ServiceDown
|
||||||
|
expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: "critical"
|
||||||
|
annotations:
|
||||||
|
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
|
||||||
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
|
||||||
|
|
||||||
# Alerts group for VM cluster assumes that Grafana dashboard
|
# Alerts group for VM cluster assumes that Grafana dashboard
|
||||||
# https://grafana.com/grafana/dashboards/11176 is installed.
|
# https://grafana.com/grafana/dashboards/11176 is installed.
|
||||||
# Please, update the `dashboard` annotation according to your setup.
|
# Please, update the `dashboard` annotation according to your setup.
|
||||||
|
|
Loading…
Reference in a new issue