From 73340dcb01f2788a51cdcd995fb98e49eb472136 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 5 Dec 2022 08:35:33 +0100 Subject: [PATCH] dashboards: add `Disk space usage %` and `Disk space usage % by type` panels (#3436) The new panels have been added to the vmstorage and drilldown rows. `Disk space usage %` is supposed to show disk space usage percentage. This panel is now also referenced by the `DiskRunsOutOfSpace` alerting rule. This panel has a Drilldown option to show absolute values. `Disk space usage % by type` shows the relation between datapoints and indexdb size. It is supposed to help identify cases when indexdb starts to take too much disk space. This panel has a Drilldown option to show absolute values. Signed-off-by: hagen1778 --- dashboards/victoriametrics-cluster.json | 507 ++++++++++++++++++------ deployment/docker/alerts-cluster.yml | 2 +- 2 files changed, 392 insertions(+), 117 deletions(-) diff --git a/dashboards/victoriametrics-cluster.json b/dashboards/victoriametrics-cluster.json index 75be1b925..1274cd6f2 100644 --- a/dashboards/victoriametrics-cluster.json +++ b/dashboards/victoriametrics-cluster.json @@ -1612,8 +1612,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1629,7 +1628,7 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 30 }, "id": 66, "links": [], @@ -1724,8 +1723,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1741,7 +1739,7 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 30 }, "id": 138, "links": [], @@ -1835,8 +1833,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1852,7 +1849,7 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 38 }, "id": 64, "links": [], @@ -1942,8 +1939,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1972,7 +1968,7 @@ "h": 8, "w": 12, "x": 12, - 
"y": 22 + "y": 38 }, "id": 122, "links": [], @@ -2080,8 +2076,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2113,7 +2108,7 @@ "h": 8, "w": 12, "x": 0, - "y": 30 + "y": 46 }, "id": 117, "links": [], @@ -2201,8 +2196,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2218,7 +2212,7 @@ "h": 8, "w": 12, "x": 12, - "y": 30 + "y": 46 }, "id": 119, "options": { @@ -2306,8 +2300,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2323,7 +2316,7 @@ "h": 8, "w": 12, "x": 0, - "y": 38 + "y": 54 }, "id": 68, "links": [], @@ -2411,8 +2404,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2428,7 +2420,7 @@ "h": 8, "w": 12, "x": 12, - "y": 38 + "y": 54 }, "id": 120, "options": { @@ -2516,8 +2508,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2533,7 +2524,7 @@ "h": 8, "w": 12, "x": 0, - "y": 46 + "y": 62 }, "id": 70, "links": [], @@ -2675,7 +2666,7 @@ "h": 8, "w": 12, "x": 0, - "y": 15 + "y": 31 }, "id": 102, "options": { @@ -2789,7 +2780,7 @@ "h": 8, "w": 12, "x": 12, - "y": 15 + "y": 31 }, "id": 108, "options": { @@ -2890,7 +2881,7 @@ "h": 8, "w": 12, "x": 0, - "y": 23 + "y": 39 }, "id": 142, "links": [ @@ -3001,7 +2992,7 @@ "h": 8, "w": 12, "x": 12, - "y": 23 + "y": 39 }, "id": 107, "options": { @@ -3100,7 +3091,7 @@ "h": 8, "w": 12, "x": 0, - "y": 31 + "y": 47 }, "id": 170, "links": [], @@ -3206,7 +3197,7 @@ "h": 8, "w": 12, "x": 12, - "y": 31 + "y": 47 }, "id": 116, "links": [], @@ -3308,7 +3299,7 @@ "h": 9, "w": 12, "x": 0, - "y": 39 + "y": 55 }, "id": 144, "options": { @@ -3411,7 +3402,7 @@ "h": 9, "w": 12, "x": 12, - "y": 39 + "y": 55 }, "id": 58, "links": [], @@ -3515,7 +3506,7 @@ "h": 7, "w": 24, "x": 0, - "y": 48 + "y": 64 }, "id": 
183, "options": { @@ -3663,7 +3654,7 @@ "h": 9, "w": 12, "x": 0, - "y": 5 + "y": 21 }, "id": 76, "links": [], @@ -3779,7 +3770,7 @@ "h": 9, "w": 12, "x": 12, - "y": 5 + "y": 21 }, "id": 86, "links": [], @@ -3904,7 +3895,7 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 30 }, "id": 80, "links": [], @@ -4009,7 +4000,7 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 30 }, "id": 78, "links": [], @@ -4125,7 +4116,7 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 38 }, "id": 82, "options": { @@ -4232,7 +4223,7 @@ "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 38 }, "id": 74, "options": { @@ -4334,7 +4325,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4445,7 +4437,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4557,7 +4550,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4702,7 +4696,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4839,7 +4834,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -4942,7 +4938,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5084,7 +5081,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5187,7 +5185,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5246,7 +5245,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Shows amount of on-disk space occupied by data points.", + "description": "Shows the percentage of used disk space. 
It is recommended to have at least 20% of free disk space for the best performance.", "fieldConfig": { "defaults": { "color": { @@ -5259,7 +5258,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -5276,20 +5275,27 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "normal" + "mode": "none" }, "thresholdsStyle": { - "mode": "off" + "mode": "line" } }, - "links": [], + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/oS7Bi_0Wz?viewPanel=200&var-ds=$ds&var-instance=$instance&${__url_time_range}" + } + ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5297,7 +5303,7 @@ } ] }, - "unit": "bytes" + "unit": "percentunit" }, "overrides": [] }, @@ -5307,7 +5313,7 @@ "x": 0, "y": 37 }, - "id": 18, + "id": 20, "links": [], "options": { "legend": { @@ -5324,7 +5330,7 @@ }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, "pluginVersion": "9.1.0", @@ -5335,15 +5341,43 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!=\"indexdb\"}) ", + "expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "disk usage", + "legendFormat": "max", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "min(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, 
instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "min", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "avg(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "avg", + "range": true, + "refId": "C" } ], - "title": "Disk space usage (datapoints) ($instance)", + "title": "Disk space usage % ($instance)", "type": "timeseries" }, { @@ -5394,7 +5428,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5457,7 +5492,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Shows amount of on-disk space occupied by inverted index.", + "description": "Shows the percentage of used disk space by type: datapoints or indexdb. Normally, indexdb takes much less space compared to datapoints. But with a high churn rate the size of the indexdb could grow significantly.\n\nThe sum of the % can be > 100% since the panel shows max % per-job and per-instance. 
It means different instances can have different ratios between datapoints and indexdb size.", "fieldConfig": { "defaults": { "color": { @@ -5470,7 +5505,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -5487,28 +5522,31 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "normal" + "mode": "none" }, "thresholdsStyle": { - "mode": "off" + "mode": "line" } }, - "links": [], + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/oS7Bi_0Wz?viewPanel=201&var-ds=$ds&var-instance=$instance&${__url_time_range}" + } + ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" - }, - { - "color": "red", - "value": 80 + "color": "green", + "value": null } ] }, - "unit": "bytes" + "unit": "percentunit" }, "overrides": [] }, @@ -5518,7 +5556,7 @@ "x": 0, "y": 45 }, - "id": 20, + "id": 202, "links": [], "options": { "legend": { @@ -5535,7 +5573,7 @@ }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, "pluginVersion": "9.1.0", @@ -5546,15 +5584,29 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type=\"indexdb\"})", + "expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type=\"indexdb\"}) by(job, instance)\n / \n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "disk usage", + "legendFormat": "indexdb", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) by(job, instance)\n / \n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n)", + "format": "time_series", + "hide": false, + 
"intervalFactor": 1, + "legendFormat": "datapoints", + "range": true, + "refId": "B" } ], - "title": "Disk space usage (index) ($instance)", + "title": "Disk space usage % by type ($instance)", "type": "timeseries" }, { @@ -5605,7 +5657,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5740,7 +5793,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -5755,7 +5809,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 53 }, "id": 135, @@ -5862,8 +5916,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5879,7 +5932,7 @@ "h": 8, "w": 12, "x": 0, - "y": 82 + "y": 98 }, "id": 92, "links": [], @@ -5969,8 +6022,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6006,7 +6058,7 @@ "h": 8, "w": 12, "x": 12, - "y": 82 + "y": 98 }, "id": 95, "links": [], @@ -6112,8 +6164,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6129,7 +6180,7 @@ "h": 8, "w": 12, "x": 0, - "y": 90 + "y": 106 }, "id": 163, "links": [], @@ -6257,8 +6308,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6274,7 +6324,7 @@ "h": 8, "w": 12, "x": 12, - "y": 90 + "y": 106 }, "id": 165, "links": [], @@ -6398,8 +6448,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6415,7 +6464,7 @@ "h": 8, "w": 12, "x": 0, - "y": 98 + "y": 114 }, "id": 178, "links": [], @@ -6506,8 +6555,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6523,7 +6571,7 @@ "h": 8, "w": 12, "x": 12, - "y": 98 + "y": 114 }, "id": 180, "links": [], @@ -6630,7 +6678,7 @@ "h": 8, "w": 12, "x": 0, - "y": 106 + "y": 
122 }, "id": 179, "links": [], @@ -6737,7 +6785,7 @@ "h": 8, "w": 12, "x": 12, - "y": 106 + "y": 122 }, "id": 181, "links": [], @@ -6855,7 +6903,7 @@ "h": 8, "w": 24, "x": 0, - "y": 114 + "y": 130 }, "id": 93, "links": [], @@ -6991,7 +7039,7 @@ "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 24 }, "id": 97, "links": [], @@ -7117,7 +7165,7 @@ "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 24 }, "id": 99, "links": [], @@ -7241,7 +7289,7 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 32 }, "id": 185, "links": [], @@ -7385,7 +7433,7 @@ "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 32 }, "id": 187, "links": [], @@ -7523,7 +7571,7 @@ "h": 8, "w": 12, "x": 0, - "y": 24 + "y": 40 }, "id": 90, "links": [], @@ -7631,7 +7679,7 @@ "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 40 }, "id": 88, "links": [], @@ -7738,7 +7786,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 48 }, "id": 139, "links": [], @@ -7845,7 +7893,7 @@ "h": 8, "w": 12, "x": 12, - "y": 32 + "y": 48 }, "id": 114, "links": [], @@ -7911,10 +7959,15 @@ }, "id": 198, "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, "content": "Drilldown row is used by other panels on the dashboard to show more detailed metrics per-instance.", "mode": "markdown" }, - "pluginVersion": "9.1.0", + "pluginVersion": "9.2.6", "transparent": true, "type": "text" }, @@ -7966,7 +8019,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -8067,7 +8121,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -8168,7 +8223,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -8271,7 +8327,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -8328,6 +8385,224 @@ ], "title": "Storage full ETA ($instance)", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + 
"description": "Shows the absolute size of the data stored on disk per job and instance. It is recommended to have at least 20% of free disk space for the best performance.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 200, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Disk space usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 201, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type=\"indexdb\"}) by(job, instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{job}}:{{instance}} (indexdb)", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) by(job, instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{job}}:{{instance}} (datapoints)", + "range": true, + "refId": "B" + } + ], + "title": "Disk space usage by type ($instance)", + "type": "timeseries" } ], "title": "Drilldown", diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml index 1a99a08fb..15c305452 100644 --- 
a/deployment/docker/alerts-cluster.yml +++ b/deployment/docker/alerts-cluster.yml @@ -43,7 +43,7 @@ groups: labels: severity: critical annotations: - dashboard: http://localhost:3000/d/oS7Bi_0Wz?viewPanel=110&var-instance={{ $labels.instance }}" + dashboard: http://localhost:3000/d/oS7Bi_0Wz?viewPanel=200&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} will run out of disk space soon" description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance.