From 4369bc1df2260e0e48657aad156534e3352a7657 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 8 Feb 2024 16:43:39 +0800 Subject: [PATCH] deployment/dashboards: fix `Storage full ETA` panels (#5747) During background downsampling, rate(vm_deduplicated_samples_total{type="merge"}) could be much bigger than rate(vm_rows_added_to_storage_total) and it could last quite some time, which causes negative values of Storage full ETA and confuses users, see playground. Instead of trying to get more accurate results during downsampling, I think it's ok to ignore vm_deduplicated_samples_total at all, it's more reasonable to see Storage full ETA increase after downsampling. --------- Co-authored-by: Aliaksandr Valialkin --- dashboards/victoriametrics-cluster.json | 8 ++++---- dashboards/victoriametrics.json | 4 ++-- dashboards/vm/victoriametrics-cluster.json | 8 ++++---- dashboards/vm/victoriametrics.json | 4 ++-- deployment/docker/alerts-cluster.yml | 5 +---- deployment/docker/alerts.yml | 5 +---- docs/CHANGELOG.md | 1 + 7 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dashboards/victoriametrics-cluster.json b/dashboards/victoriametrics-cluster.json index 80bc199a10..8ede168a1e 100644 --- a/dashboards/victoriametrics-cluster.json +++ b/dashboards/victoriametrics-cluster.json @@ -4857,7 +4857,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Shows the approx time needed to reach 100% of disk capacity for at least one vmstorage node based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", + "description": "Shows the approx time needed to reach 100% of disk capacity for at least one vmstorage node based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and increase the 
time.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", "fieldConfig": { "defaults": { "color": { @@ -4952,7 +4952,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "min(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n))", + "expr": "min(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n))", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -9050,7 +9050,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", + "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and increase the time.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", 
"fieldConfig": { "defaults": { "color": { @@ -9139,7 +9139,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n)", + "expr": "vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n)", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 6b75227802..35db488aec 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -4122,7 +4122,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", + "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and increase the time.\n\nUse this panel for capacity planning in order to estimate the time 
remaining for running out of the disk space.", "fieldConfig": { "defaults": { "color": { @@ -4211,7 +4211,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"} \n/ ignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job\", instance=~\"$instance\"}[1d]) \n - ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) \n / sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n )", + "expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"} \n/ ignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job\", instance=~\"$instance\"}[1d]) \n * scalar(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) \n / sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n )", "format": "time_series", "hide": false, "interval": "", diff --git a/dashboards/vm/victoriametrics-cluster.json b/dashboards/vm/victoriametrics-cluster.json index 7226feb264..e5c894dfaf 100644 --- a/dashboards/vm/victoriametrics-cluster.json +++ b/dashboards/vm/victoriametrics-cluster.json @@ -4858,7 +4858,7 @@ "type": "victoriametrics-datasource", "uid": "$ds" }, - "description": "Shows the approx time needed to reach 100% of disk capacity for at least one vmstorage node based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", + "description": "Shows the approx time needed to reach 100% of disk capacity for at least one vmstorage node based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and 
increase the time.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", "fieldConfig": { "defaults": { "color": { @@ -4953,7 +4953,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "min(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n))", + "expr": "min(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n))", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -9051,7 +9051,7 @@ "type": "victoriametrics-datasource", "uid": "$ds" }, - "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.", + "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and increase the time.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of 
the disk space.", "fieldConfig": { "defaults": { "color": { @@ -9140,7 +9140,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n)", + "expr": "vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])\n * scalar(\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n)", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/dashboards/vm/victoriametrics.json b/dashboards/vm/victoriametrics.json index ec8c1e6749..6e95f9ad91 100644 --- a/dashboards/vm/victoriametrics.json +++ b/dashboards/vm/victoriametrics.json @@ -4123,7 +4123,7 @@ "type": "victoriametrics-datasource", "uid": "$ds" }, - "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", + "description": "Shows the approx time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* compression.\n\nnote: this panel doesn't count deduplication operations which could release disk and increase the time.\n\nUse this panel for 
capacity planning in order to estimate the time remaining for running out of the disk space.", "fieldConfig": { "defaults": { "color": { @@ -4212,7 +4212,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"} \n/ ignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job\", instance=~\"$instance\"}[1d]) \n - ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) \n / sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n )", + "expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"} \n/ ignoring(path) (\n rate(vm_rows_added_to_storage_total{job=~\"$job\", instance=~\"$instance\"}[1d]) \n * scalar(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) \n / sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n )", "format": "time_series", "hide": false, "interval": "", diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml index 09d8cf9573..aab0850dc5 100644 --- a/deployment/docker/alerts-cluster.yml +++ b/deployment/docker/alerts-cluster.yml @@ -13,10 +13,7 @@ groups: expr: | vm_free_disk_space_bytes / ignoring(path) ( - ( - rate(vm_rows_added_to_storage_total[1d]) - - ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d]) - ) + rate(vm_rows_added_to_storage_total[1d]) * scalar( sum(vm_data_size_bytes{type!~"indexdb.*"}) / sum(vm_rows{type!~"indexdb.*"}) diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 785417278c..13e69abe78 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -13,10 +13,7 @@ groups: expr: | vm_free_disk_space_bytes / ignoring(path) ( - ( - rate(vm_rows_added_to_storage_total[1d]) - - ignoring(type) 
rate(vm_deduplicated_samples_total{type="merge"}[1d]) - ) + rate(vm_rows_added_to_storage_total[1d]) * scalar( sum(vm_data_size_bytes{type!~"indexdb.*"}) / sum(vm_rows{type!~"indexdb.*"}) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 4f7ff35cbd..a4f51e054a 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -59,6 +59,7 @@ The v1.97.x line will be supported for at least 12 months since [v1.97.0](https: * BUGFIX: fix `runtime error: slice bounds out of range` panic, which can occur during query execution. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5733). The bug has been introduced in `v1.97.0`. * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle `avg_over_time({some_filter}[d]) keep_metric_names` queries, where [`some_filter`](https://docs.victoriametrics.com/keyconcepts/#filtering) matches multiple time series with multiple names, while `d` is bigger or equal to `3h`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5556). * BUGFIX: [dashboards/single](https://grafana.com/grafana/dashboards/10229): fix typo in query for `version` annotation which falsely produced many version change events. +* BUGFIX: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): avoid possible negative results in `Storage full ETA` panels, which could be caused by background merges. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5747). ## [v1.97.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.97.0)