From 75196d7234afde97f9be46b36f25a0f2675731f9 Mon Sep 17 00:00:00 2001 From: 7840vz <122374011+7840vz@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:01:29 +0300 Subject: [PATCH] alerts: inverse grouping in vmagent alerts (#5429) Aggregations with by() have one side effect: any custom labels you add to hosts, which can be used for alert routing, are dropped too. Therefore, a good practice is to use without() instead, listing only the labels to aggregate away, e.g. without(path) or without(url), to get the same aggregations while leaving any external labels intact. --- deployment/docker/alerts-vmagent.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/deployment/docker/alerts-vmagent.yml b/deployment/docker/alerts-vmagent.yml index 6d7201fff..e850bc0cb 100644 --- a/deployment/docker/alerts-vmagent.yml +++ b/deployment/docker/alerts-vmagent.yml @@ -10,7 +10,7 @@ groups: concurrency: 2 rules: - alert: PersistentQueueIsDroppingData - expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0 + expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0 for: 10m labels: severity: critical @@ -21,7 +21,7 @@ groups: on instance {{ $labels.instance }} for the last 10m." - alert: RejectedRemoteWriteDataBlocksAreDropped - expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0 + expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0 for: 15m labels: severity: warning @@ -31,7 +31,7 @@ groups: remote-write server data blocks. Check the logs to find the reason for rejects." 
- alert: TooManyScrapeErrors - expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0 + expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0 for: 15m labels: severity: warning @@ -41,9 +41,9 @@ groups: - alert: TooManyWriteErrors expr: | - (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance) + (sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type) + - sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0 + sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0 for: 15m labels: severity: warning @@ -52,7 +52,7 @@ groups: summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m." - alert: TooManyRemoteWriteErrors - expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0 + expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0 for: 15m labels: severity: warning @@ -65,9 +65,9 @@ groups: - alert: RemoteWriteConnectionIsSaturated expr: | ( - sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url) + rate(vmagent_remotewrite_send_duration_seconds_total[5m]) / - max(vmagent_remotewrite_queues) by(job, instance, url) + vmagent_remotewrite_queues ) > 0.9 for: 15m labels: