From d0e41909699186341a727859d8ca41b95eba6ddd Mon Sep 17 00:00:00 2001 From: hagen1778 Date: Tue, 16 Jan 2024 09:49:39 +0100 Subject: [PATCH] deployment/alerts: add `job` label to `DiskRunsOutOfSpace` alerting rule So it is easier to understand to which installation the triggered instance belongs. Signed-off-by: hagen1778 --- deployment/docker/alerts-cluster.yml | 8 ++++---- deployment/docker/alerts.yml | 8 ++++---- docs/CHANGELOG.md | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml index 2d6a1b8ac7..4817d81b31 100644 --- a/deployment/docker/alerts-cluster.yml +++ b/deployment/docker/alerts-cluster.yml @@ -34,17 +34,17 @@ groups: - alert: DiskRunsOutOfSpace expr: | - sum(vm_data_size_bytes) by(instance) / + sum(vm_data_size_bytes) by(job, instance) / ( - sum(vm_free_disk_space_bytes) by(instance) + - sum(vm_data_size_bytes) by(instance) + sum(vm_free_disk_space_bytes) by(job, instance) + + sum(vm_data_size_bytes) by(job, instance) ) > 0.8 for: 30m labels: severity: critical annotations: dashboard: http://localhost:3000/d/oS7Bi_0Wz?viewPanel=200&var-instance={{ $labels.instance }}" - summary: "Instance {{ $labels.instance }} will run out of disk space soon" + summary: "Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon" description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index d962c9fd62..785417278c 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -34,17 +34,17 @@ groups: - alert: DiskRunsOutOfSpace expr: | - sum(vm_data_size_bytes) by(instance) / + sum(vm_data_size_bytes) by(job, instance) / ( - sum(vm_free_disk_space_bytes) by(instance) + - sum(vm_data_size_bytes) by(instance) + sum(vm_free_disk_space_bytes) by(job, instance) + + sum(vm_data_size_bytes) by(job, instance) ) > 0.8 for: 30m labels: severity: critical annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}" - summary: "Instance {{ $labels.instance }} will run out of disk space soon" + summary: "Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon" description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 41ee0efd7e..0f72ef000c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -43,6 +43,7 @@ The sandbox cluster installation is running under the constant load generated by * FEATURE: dashboards/cluster: add panels for detailed visualization of traffic usage between vmstorage, vminsert, vmselect components and their clients. New panels are available in the rows dedicated to specific components. * FEATURE: dashboards/cluster: update "Slow Queries" panel to show percentage of the slow queries to the total number of read queries served by vmselect. The percentage value should make it more clear for users whether there is a service degradation. * FEATURE [vmctl](https://docs.victoriametrics.com/vmctl.html): add `-vm-native-src-insecure-skip-verify` and `-vm-native-dst-insecure-skip-verify` command-line flags for native protocol. It can be used for skipping TLS certificate verification when connecting to the source or destination addresses. +* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): add `job` label to `DiskRunsOutOfSpace` alerting rule, so it is easier to understand to which installation the triggered instance belongs. * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly return full results when `-search.skipSlowReplicas` command-line flag is passed to `vmselect` and when [vmstorage groups](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmstorage-groups-at-vmselect) are in use. Previously partial results could be returned in this case. * BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped.