vmalert: follow-up after 669becd011 (#4318)

* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: follow-up after 669becd011

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2023-05-16 18:51:38 +02:00 committed by GitHub
parent 242050ba94
commit f68d93cca2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 16 deletions

View file

@ -186,13 +186,9 @@ var (
) )
// flush is a blocking function that marshals WriteRequest and sends // flush is a blocking function that marshals WriteRequest and sends
// it to remote write endpoint. Flush performs limited amount of retries // it to remote-write endpoint. Flush performs limited amount of retries
// if request fails. // if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) { func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
const (
retryCount = 5
retryBackoff = time.Second
)
if len(wr.Timeseries) < 1 { if len(wr.Timeseries) < 1 {
return return
} }
@ -207,29 +203,42 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
b := snappy.Encode(nil, data) b := snappy.Encode(nil, data)
attempts := 0 const (
for ; attempts < retryCount; attempts++ { retryCount = 5
retryBackoff = time.Second
)
for attempts := 0; attempts < retryCount; attempts++ {
err := c.send(ctx, b) err := c.send(ctx, b)
if err == nil { if err == nil {
sentRows.Add(len(wr.Timeseries)) sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b)) sentBytes.Add(len(b))
return return
} }
logger.Warnf("attempt %d to send request failed: %s", attempts+1, err)
if _, ok := err.(*retriableError); ok { _, isRetriable := err.(*retriableError)
// sleeping to avoid remote db hammering logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, isRetriable)
time.Sleep(retryBackoff)
continue if !isRetriable {
} else { // exit fast if error isn't retriable
break break
} }
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
break
default:
}
// sleeping to avoid remote db hammering
time.Sleep(retryBackoff)
} }
droppedRows.Add(len(wr.Timeseries)) droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b)) droppedBytes.Add(len(b))
logger.Errorf("all %d attempts to send request failed - dropping %d time series", logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
attempts, len(wr.Timeseries)) len(wr.Timeseries))
} }
func (c *Client) send(ctx context.Context, data []byte) error { func (c *Client) send(ctx context.Context, data []byte) error {
@ -258,14 +267,22 @@ func (c *Client) send(ctx context.Context, data []byte) error {
req.URL.Redacted(), err, len(data), r.Size()) req.URL.Redacted(), err, len(data), r.Size())
} }
defer func() { _ = resp.Body.Close() }() defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body) body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 { switch resp.StatusCode / 100 {
case 2: case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil return nil
case 5: case 5:
// respond with HTTP status code 5xx when the write fails and SHOULD be retried.
return &retriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q", return &retriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)} resp.StatusCode, req.URL.Redacted(), body)}
default: default:
// respond with HTTP status code 4xx when the request is invalid, will never be able to succeed
// and should not be retried.
return fmt.Errorf("unexpected response code %d for %s. Response body %q", return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body) resp.StatusCode, req.URL.Redacted(), body)
} }
@ -276,5 +293,5 @@ type retriableError struct {
} }
func (e *retriableError) Error() string { func (e *retriableError) Error() string {
return e.Error() return e.err.Error()
} }

View file

@ -39,6 +39,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): detect alerting rules which don't match any series. See [these docs](https://docs.victoriametrics.com/vmalert.html#never-firing-alerts) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): detect alerting rules which don't match any series. See [these docs](https://docs.victoriametrics.com/vmalert.html#never-firing-alerts) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support loading rules via HTTP URL. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3352). Thanks to @Haleygo for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4212). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support loading rules via HTTP URL. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3352). Thanks to @Haleygo for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4212).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add buttons for filtering groups/rules with errors or with no-match warning in web UI for page `/groups`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add buttons for filtering groups/rules with errors or with no-match warning in web UI for page `/groups`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not retry remote-write requests for responses with 4XX status codes. This aligns with the [Prometheus remote write specification](https://prometheus.io/docs/concepts/remote_write_spec/). Thanks to @MichaHoffmann for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4134).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to filter incoming requests by IP. See [these docs](https://docs.victoriametrics.com/vmauth.html#ip-filters) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3491). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to filter incoming requests by IP. See [these docs](https://docs.victoriametrics.com/vmauth.html#ip-filters) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3491).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to proxy requests to the specified backends for unauthorized users. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4083). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to proxy requests to the specified backends for unauthorized users. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4083).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to specify default route for unmatched requests. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4084). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to specify default route for unmatched requests. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4084).