From a47127c1a633aa80ffc3ab23764a7159c954a7e7 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Sun, 14 Jan 2024 22:52:47 +0200 Subject: [PATCH] app/vmalert/remotewrite: properly calculate vmalert_remotewrite_dropped_rows_total It was calculating the number of dropped time series instead of the number of dropped samples. While at it, drop vmalert_remotewrite_dropped_bytes_total metric, since it was inconsistently calculated - at one place it was calculating raw protobuf-encoded sample sizes, while at another place it was calculating the size of snappy-compressed prompbmarshal.WriteRequest protobuf message. Additionally, this metric has zero practical sense, so just drop it in order to reduce the level of confusion. --- app/vmalert/remotewrite/client.go | 10 +++++----- docs/CHANGELOG.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/app/vmalert/remotewrite/client.go b/app/vmalert/remotewrite/client.go index 6c012a350..ac3dffa65 100644 --- a/app/vmalert/remotewrite/client.go +++ b/app/vmalert/remotewrite/client.go @@ -123,14 +123,12 @@ func (c *Client) Push(s prompbmarshal.TimeSeries) error { case <-c.doneCh: rwErrors.Inc() droppedRows.Add(len(s.Samples)) - droppedBytes.Add(s.Size()) return fmt.Errorf("client is closed") case c.input <- s: return nil default: rwErrors.Inc() droppedRows.Add(len(s.Samples)) - droppedBytes.Add(s.Size()) return fmt.Errorf("failed to push timeseries - queue is full (%d entries). 
"+ "Queue size is controlled by -remoteWrite.maxQueueSize flag", c.maxQueueSize) @@ -195,7 +193,6 @@ var ( sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`) sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`) droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`) - droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`) sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`) bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`) @@ -276,8 +273,11 @@ L: } rwErrors.Inc() - droppedRows.Add(len(wr.Timeseries)) - droppedBytes.Add(len(b)) + rows := 0 + for _, ts := range wr.Timeseries { + rows += len(ts.Samples) + } + droppedRows.Add(rows) logger.Errorf("attempts to send remote-write request failed - dropping %d time series", len(wr.Timeseries)) } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 15bb378d3..aaa01c9d2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -180,7 +180,7 @@ Released at 2023-11-15 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details. 
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. Before, this metric was incremented only if remote-write client's buffer is overloaded. -* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_dropped_rows_total` metric if remote-write client's buffer is overloaded. Before, this metric was incremented only after unsuccessful HTTP calls. * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087). * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component. * BUGFIX: dashboards: respect `job` and `instance` filters for `alerts` annotation in cluster and single-node dashboards.