From 8fb68152e67712ed2c16dcfccf7cf4d0af140835 Mon Sep 17 00:00:00 2001 From: hagen1778 Date: Mon, 11 Dec 2023 15:17:30 +0100 Subject: [PATCH] alerts: simplify aggregation of alerting rules This is a follow-up to https://github.com/VictoriaMetrics/VictoriaMetrics/commit/75196d7234afde97f9be46b36f25a0f2675731f9 It updates some of the alerting rules to remove unnecessary aggregations. It keeps aggregations for expressions that use multiple time series filters, to make sure their labels match. Signed-off-by: hagen1778 --- deployment/docker/alerts-cluster.yml | 6 +++--- deployment/docker/alerts-health.yml | 4 ++-- deployment/docker/alerts-vmalert.yml | 12 ++++++------ deployment/docker/alerts.yml | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml index 2994b6f9a..2d6a1b8ac 100644 --- a/deployment/docker/alerts-cluster.yml +++ b/deployment/docker/alerts-cluster.yml @@ -81,7 +81,7 @@ groups: Possible reasons for errors are misconfiguration, overload, network blips or unreachable components." - alert: RowsRejectedOnIngestion - expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 + expr: rate(vm_rows_ignored_total[5m]) > 0 for: 15m labels: severity: warning @@ -113,7 +113,7 @@ groups: expr: | sum(increase(vm_new_timeseries_created_total[24h])) > - (sum(vm_cache_entries{type="storage/hour_metric_ids"})* 3) + (sum(vm_cache_entries{type="storage/hour_metric_ids"}) * 3) for: 15m labels: severity: warning @@ -155,7 +155,7 @@ groups: Consider to increase the limit as fast as possible."
- alert: LabelsLimitExceededOnIngestion - expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 + expr: increase(vm_metrics_with_dropped_labels_total[5m]) > 0 for: 15m labels: severity: warning diff --git a/deployment/docker/alerts-health.yml b/deployment/docker/alerts-health.yml index caf830d93..808850339 100644 --- a/deployment/docker/alerts-health.yml +++ b/deployment/docker/alerts-health.yml @@ -55,7 +55,7 @@ groups: Consider to either increase available CPU resources or decrease the load on the process." - alert: TooManyLogs - expr: sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0 + expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 for: 15m labels: severity: warning @@ -65,7 +65,7 @@ groups: Worth to check logs for specific error messages." - alert: TooManyTSIDMisses - expr: sum(rate(vm_missing_tsids_for_metric_id_total[5m])) by (job, instance) > 0 + expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0 for: 10m labels: severity: critical diff --git a/deployment/docker/alerts-vmalert.yml b/deployment/docker/alerts-vmalert.yml index 44af7ecaa..53f86a287 100644 --- a/deployment/docker/alerts-vmalert.yml +++ b/deployment/docker/alerts-vmalert.yml @@ -18,7 +18,7 @@ groups: Check vmalert's logs for detailed error message." - alert: AlertingRulesError - expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) by(job, instance, group, file) > 0 + expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0 for: 5m labels: severity: warning @@ -29,7 +29,7 @@ groups: Check vmalert's logs for detailed error message." 
- alert: RecordingRulesError - expr: sum(increase(vmalert_recording_rules_errors_total[5m])) by(job, instance, group, file) > 0 + expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0 for: 5m labels: severity: warning @@ -40,7 +40,7 @@ groups: Check vmalert's logs for detailed error message." - alert: RecordingRulesNoData - expr: sum(vmalert_recording_rules_last_evaluation_samples) by(job, group, recording, file) < 1 + expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1 for: 30m labels: severity: info @@ -52,7 +52,7 @@ groups: or incorrect query expression." - alert: TooManyMissedIterations - expr: sum(increase(vmalert_iteration_missed_total[5m])) by(job, instance, group, file) > 0 + expr: increase(vmalert_iteration_missed_total[5m]) > 0 for: 15m labels: severity: warning @@ -65,7 +65,7 @@ groups: If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries." - alert: RemoteWriteErrors - expr: sum(increase(vmalert_remotewrite_errors_total[5m])) by(job, instance) > 0 + expr: increase(vmalert_remotewrite_errors_total[5m]) > 0 for: 15m labels: severity: warning @@ -75,7 +75,7 @@ groups: or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message." - alert: AlertmanagerErrors - expr: sum(increase(vmalert_alerts_send_errors_total[5m])) by(job, instance, addr) > 0 + expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 for: 15m labels: severity: warning diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 62df9af9d..d962c9fd6 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -61,7 +61,7 @@ groups: Please verify if clients are sending correct requests."
- alert: RowsRejectedOnIngestion - expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 + expr: rate(vm_rows_ignored_total[5m]) > 0 for: 15m labels: severity: warning @@ -124,7 +124,7 @@ groups: See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183" - alert: LabelsLimitExceededOnIngestion - expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 + expr: increase(vm_metrics_with_dropped_labels_total[5m]) > 0 for: 15m labels: severity: warning