Alerts single update (#1510)

* alerts: move `ProcessNearFDLimits` to `vm-health` group since it is relevant for all services

* alerts: add new `TooHighMemoryUsage` alerting rule
This commit is contained in:
Roman Khavronenko 2021-08-02 15:51:24 +03:00 committed by GitHub
parent 66eb60f20d
commit 408ba43092
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -19,11 +19,31 @@ groups:
expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0 expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0
for: 2m for: 2m
labels: labels:
severity: "critical" severity: critical
annotations: annotations:
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
- alert: ProcessNearFDLimits
expr: (process_max_fds - process_open_fds) < 100
for: 5m
labels:
severity: critical
annotations:
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
Consider to increase the limit as fast as possible."
- alert: TooHighMemoryUsage
expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9
for: 5m
labels:
severity: critical
annotations:
summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
Consider to either increase available memory or decrease the load on the process."
# Alerts group for VM single assumes that Grafana dashboard # Alerts group for VM single assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/10229 is installed. # https://grafana.com/grafana/dashboards/10229 is installed.
# Pls update the `dashboard` annotation according to your setup. # Pls update the `dashboard` annotation according to your setup.
@ -166,17 +186,6 @@ groups:
description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series." for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."
- alert: ProcessNearFDLimits
expr: (process_max_fds - process_open_fds) < 100
for: 5m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=75&var-instance={{ $labels.instance }}"
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
Consider to increase the limit as fast as possible."
- alert: LabelsLimitExceededOnIngestion - alert: LabelsLimitExceededOnIngestion
expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
for: 15m for: 15m