mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
alerts: add ServiceDown
alert to detect "dead" services (#1196)
This commit is contained in:
parent
712725b4a5
commit
c4f6b79d76
1 changed file with 10 additions and 1 deletion
|
@ -3,8 +3,8 @@
|
|||
# and threshold calibration according to every specific setup.
|
||||
groups:
|
||||
- name: serviceHealth
|
||||
# note the `job` filter and update it according to your setup
|
||||
rules:
|
||||
# note the `job` filter and update it according to your setup
|
||||
- alert: TooManyRestarts
|
||||
expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
|
||||
labels:
|
||||
|
@ -14,6 +14,15 @@ groups:
|
|||
description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
|
||||
It might be crashlooping."
|
||||
|
||||
- alert: ServiceDown
|
||||
expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: "critical"
|
||||
annotations:
|
||||
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
|
||||
|
||||
# Alerts group for VM cluster assumes that the Grafana dashboard
|
||||
# https://grafana.com/grafana/dashboards/11176 is installed.
|
||||
# Please update the `dashboard` annotation according to your setup.
|
||||
|
|
Loading…
Reference in a new issue