From 27f1c65074b1daafd07359b98f7b89b4f798a786 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 18 Jul 2022 13:31:35 +0200 Subject: [PATCH] vmagent: expose metric `vmagent_remotewrite_queues` (#2871) The new metric `vmagent_remotewrite_queues` exports a static value of the number of configured remote write queues. This metric is useful to calculate total saturation per each configured URL with the given number of queues. See corresponding changes to vmagent alerts and dashboard. Signed-off-by: hagen1778 --- app/vmagent/remotewrite/client.go | 3 ++ dashboards/vmagent.json | 81 ++++++++++++++++--------------- deployment/docker/alerts.yml | 4 +- 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/app/vmagent/remotewrite/client.go b/app/vmagent/remotewrite/client.go index 54cb61ea8..d50eeea31 100644 --- a/app/vmagent/remotewrite/client.go +++ b/app/vmagent/remotewrite/client.go @@ -154,6 +154,9 @@ func (c *client) init(argIdx, concurrency int, sanitizedURL string) { c.packetsDropped = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_packets_dropped_total{url=%q}`, c.sanitizedURL)) c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.sanitizedURL)) c.sendDuration = metrics.GetOrCreateFloatCounter(fmt.Sprintf(`vmagent_remotewrite_send_duration_seconds_total{url=%q}`, c.sanitizedURL)) + metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_queues{url=%q}`, c.sanitizedURL), func() float64 { + return float64(*queues) + }) for i := 0; i < concurrency; i++ { c.wg.Add(1) go func() { diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 0c4b8a610..c292ede53 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -6,7 +6,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.5.3" + "version": "8.4.4" }, { "type": "panel", @@ -61,12 +61,12 @@ } ] }, - "description": "Overview for VictoriaMetrics vmagent v1.73.0 or higher", + "description": 
"Overview for VictoriaMetrics vmagent v1.80.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, - "iteration": 1656943336787, + "iteration": 1657810604530, "links": [ { "icon": "doc", @@ -154,7 +154,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -218,7 +218,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -285,7 +285,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -344,7 +344,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -490,7 +490,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -589,7 +589,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -702,7 +702,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -805,7 +805,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -946,7 +946,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", 
@@ -1039,7 +1039,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -1138,7 +1138,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -1237,7 +1237,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -1344,7 +1344,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2457,7 +2457,7 @@ "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 43 }, "hiddenSeries": false, "id": 60, @@ -2480,7 +2480,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2555,7 +2555,7 @@ "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 43 }, "hiddenSeries": false, "id": 66, @@ -2578,7 +2578,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2652,7 +2652,7 @@ "h": 8, "w": 12, "x": 0, - "y": 12 + "y": 51 }, "hiddenSeries": false, "id": 61, @@ -2675,7 +2675,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2748,7 +2748,7 @@ "h": 8, "w": 12, "x": 12, - "y": 12 + "y": 51 }, "hiddenSeries": false, "id": 65, @@ -2771,7 +2771,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2837,7 +2837,7 @@ "h": 8, "w": 12, "x": 0, - "y": 20 + "y": 59 }, "heatmap": {}, "hideZeroBuckets": false, @@ -2881,9 +2881,10 @@ "dashLength": 
10, "dashes": false, "datasource": { + "type": "prometheus", "uid": "$ds" }, - "description": "Shows saturation of every connection to remote storage. If the threshold of 0.9sec is reached, then the connection is saturated by more than 90% and vmagent won't be able to keep up. This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.\n", + "description": "Shows saturation of every connection to remote storage. If the threshold of 90% is reached, then the connection is saturated (busy or slow) by more than 90%, so vmagent won't be able to keep up and can start buffering data. \n\nThis usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.\n", "fieldConfig": { "defaults": { "links": [] @@ -2896,7 +2897,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 59 }, "hiddenSeries": false, "id": 84, @@ -2919,7 +2920,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -2930,7 +2931,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(vmagent_remotewrite_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by (instance, url)", + "expr": "sum(rate(vmagent_remotewrite_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by (instance, url)\n/\nmax(vmagent_remotewrite_queues{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}) by(instance, url)", "interval": "", "legendFormat": "", "refId": "A" @@ -2943,7 +2944,7 @@ "fill": true, "line": true, "op": "gt", - "value": 0.9, + "value": 90, "yaxis": "left" } ], @@ -2963,7 +2964,7 @@ "yaxes": [ { "$$hashKey": "object:662", - "format": "s", + "format": "percentunit", "logBase": 1, "min": "0", "show": true @@ -2997,7 +2998,7 @@ "h": 8, 
"w": 12, "x": 0, - "y": 28 + "y": 67 }, "heatmap": {}, "hideZeroBuckets": false, @@ -3053,7 +3054,7 @@ "h": 8, "w": 12, "x": 12, - "y": 28 + "y": 67 }, "heatmap": {}, "hideZeroBuckets": false, @@ -3104,7 +3105,7 @@ "h": 8, "w": 12, "x": 0, - "y": 36 + "y": 75 }, "hiddenSeries": false, "id": 88, @@ -3124,7 +3125,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -3207,7 +3208,7 @@ "h": 8, "w": 12, "x": 12, - "y": 36 + "y": 75 }, "hiddenSeries": false, "id": 90, @@ -3227,7 +3228,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.5.3", + "pluginVersion": "8.4.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -4567,7 +4568,7 @@ } ], "refresh": "", - "schemaVersion": 36, + "schemaVersion": 35, "style": "dark", "tags": [ "vmagent", @@ -4577,7 +4578,9 @@ "list": [ { "current": { - "selected": false + "selected": true, + "text": "VM", + "value": "VM" }, "hide": 0, "includeAll": false, diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 0feaf4590..6f34571c3 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -270,7 +270,9 @@ groups: Ensure that destination is up and reachable." - alert: RemoteWriteConnectionIsSaturated - expr: rate(vmagent_remotewrite_send_duration_seconds_total[5m]) > 0.9 + expr: | + sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url) + > 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url) for: 15m labels: severity: warning