From f68d93cca20918c1049bc5c2f23af85912d49f11 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 16 May 2023 18:51:38 +0200 Subject: [PATCH] vmalert: follow-up after 669becd0119eedeedb667df6d93bb5382e22df2b (#4318) * vmalert: follow-up after 669becd0119eedeedb667df6d93bb5382e22df2b Signed-off-by: hagen1778 * vmalert: follow-up after 669becd0119eedeedb667df6d93bb5382e22df2b Signed-off-by: hagen1778 * vmalert: follow-up after 669becd0119eedeedb667df6d93bb5382e22df2b Signed-off-by: hagen1778 --------- Signed-off-by: hagen1778 --- app/vmalert/remotewrite/remotewrite.go | 49 +++++++++++++++++--------- docs/CHANGELOG.md | 1 + 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/app/vmalert/remotewrite/remotewrite.go b/app/vmalert/remotewrite/remotewrite.go index 9ba29bb96..8157e4198 100644 --- a/app/vmalert/remotewrite/remotewrite.go +++ b/app/vmalert/remotewrite/remotewrite.go @@ -186,13 +186,9 @@ var ( ) // flush is a blocking function that marshals WriteRequest and sends -// it to remote write endpoint. Flush performs limited amount of retries +// it to remote-write endpoint. Flush performs limited amount of retries // if request fails. func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) { - const ( - retryCount = 5 - retryBackoff = time.Second - ) if len(wr.Timeseries) < 1 { return } @@ -207,29 +203,42 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) { b := snappy.Encode(nil, data) - attempts := 0 - for ; attempts < retryCount; attempts++ { + const ( + retryCount = 5 + retryBackoff = time.Second + ) + + for attempts := 0; attempts < retryCount; attempts++ { err := c.send(ctx, b) if err == nil { sentRows.Add(len(wr.Timeseries)) sentBytes.Add(len(b)) return } - logger.Warnf("attempt %d to send request failed: %s", attempts+1, err) - if _, ok := err.(*retriableError); ok { - // sleeping to avoid remote db hammering - time.Sleep(retryBackoff) - continue - } else { + _, isRetriable := err.(*retriableError) + logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, isRetriable) + + if !isRetriable { + // exit fast if error isn't retriable break } + + // check if request has been cancelled before backoff + select { + case <-ctx.Done(): + break + default: + } + + // sleeping to avoid remote db hammering + time.Sleep(retryBackoff) } droppedRows.Add(len(wr.Timeseries)) droppedBytes.Add(len(b)) - logger.Errorf("all %d attempts to send request failed - dropping %d time series", - attempts, len(wr.Timeseries)) + logger.Errorf("attempts to send remote-write request failed - dropping %d time series", + len(wr.Timeseries)) } func (c *Client) send(ctx context.Context, data []byte) error { @@ -258,14 +267,22 @@ func (c *Client) send(ctx context.Context, data []byte) error { req.URL.Redacted(), err, len(data), r.Size()) } defer func() { _ = resp.Body.Close() }() + body, _ := io.ReadAll(resp.Body) + + // according to https://prometheus.io/docs/concepts/remote_write_spec/ + // Prometheus remote Write compatible receivers MUST switch resp.StatusCode / 100 { case 2: + // respond with a HTTP 2xx status code when the write is successful. return nil case 5: + // respond with HTTP status code 5xx when the write fails and SHOULD be retried. return &retriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q", resp.StatusCode, req.URL.Redacted(), body)} default: + // respond with HTTP status code 4xx when the request is invalid, will never be able to succeed + // and should not be retried. return fmt.Errorf("unexpected response code %d for %s. Response body %q", resp.StatusCode, req.URL.Redacted(), body) } @@ -276,5 +293,5 @@ type retriableError struct { } func (e *retriableError) Error() string { - return e.Error() + return e.err.Error() } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index a3e8ff0a5..83a12664d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -39,6 +39,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): detect alerting rules which don't match any series. See [these docs](https://docs.victoriametrics.com/vmalert.html#never-firing-alerts) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support loading rules via HTTP URL. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3352). Thanks to @Haleygo for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4212). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add buttons for filtering groups/rules with errors or with no-match warning in web UI for page `/groups`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not retry remote-write requests for responses with 4XX status codes. This aligns with [Prometheus remote write specification](https://prometheus.io/docs/concepts/remote_write_spec/). Thanks to @MichaHoffmann for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4134). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to filter incoming requests by IP. See [these docs](https://docs.victoriametrics.com/vmauth.html#ip-filters) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3491). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to proxy requests to the specified backends for unauthorized users. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4083). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to specify default route for unmatched requests. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4084).