mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-03-21 15:45:01 +00:00
Alerts single update (#1510)
* alerts: move `ProcessNearFDLimits` to `vm-health` group since it is relevant for all services
* alerts: add new `TooHighMemoryUsage` alerting rule
This commit is contained in:
parent
66eb60f20d
commit
408ba43092
1 changed file with 21 additions and 12 deletions
|
@@ -19,11 +19,31 @@ groups:
|
||||||
expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0
|
expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: "critical"
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
|
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
|
||||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
|
||||||
|
|
||||||
|
- alert: ProcessNearFDLimits
|
||||||
|
expr: (process_max_fds - process_open_fds) < 100
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
|
||||||
|
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
|
||||||
|
Consider to increase the limit as fast as possible."
|
||||||
|
|
||||||
|
- alert: TooHighMemoryUsage
|
||||||
|
expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
|
||||||
|
description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
|
||||||
|
Consider to either increase available memory or decrease the load on the process."
|
||||||
|
|
||||||
# Alerts group for VM single assumes that Grafana dashboard
|
# Alerts group for VM single assumes that Grafana dashboard
|
||||||
# https://grafana.com/grafana/dashboards/10229 is installed.
|
# https://grafana.com/grafana/dashboards/10229 is installed.
|
||||||
# Pls update the `dashboard` annotation according to your setup.
|
# Pls update the `dashboard` annotation according to your setup.
|
||||||
|
@@ -166,17 +186,6 @@ groups:
|
||||||
description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion
|
description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion
|
||||||
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."
|
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."
|
||||||
|
|
||||||
- alert: ProcessNearFDLimits
|
|
||||||
expr: (process_max_fds - process_open_fds) < 100
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=75&var-instance={{ $labels.instance }}"
|
|
||||||
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
|
|
||||||
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
|
|
||||||
Consider to increase the limit as fast as possible."
|
|
||||||
|
|
||||||
- alert: LabelsLimitExceededOnIngestion
|
- alert: LabelsLimitExceededOnIngestion
|
||||||
expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
|
expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
|
Loading…
Reference in a new issue