fix: do not use exponential backoff for first retry of scrape request (#3824)
* fix: do not use exponential backoff for first retry of scrape request (#3293)

* lib/promscrape: refactor `doRequestWithPossibleRetry` backoff to simplify logic

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

* Update lib/promscrape/client.go

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>

* lib/promscrape: refactor `doRequestWithPossibleRetry` to make it more straightforward

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>

---------

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
This commit is contained in:
parent 8efa9159cf
commit 5ea6d71cb3

3 changed files with 57 additions and 22 deletions
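Both code hunks below restructure `doRequestWithPossibleRetry` (once in the scrape client, once in the service discovery client) around the same control flow: perform the request once, retry immediately if the failure is retriable, and only apply exponentially growing sleeps from the second retry onwards, capped by the deadline. The following is a minimal, self-contained Go sketch of that flow; the names `retryWithDeadline`, `attempt` and `errRetriable` are illustrative and are not part of the VictoriaMetrics code:

package main

import (
    "errors"
    "fmt"
    "time"
)

var errRetriable = errors.New("retriable failure")

// retryWithDeadline runs attempt once, retries immediately on the first
// retriable failure, and then retries with exponentially growing sleeps
// (a 1s base doubled each round) until the deadline budget is exhausted.
// It mirrors the shape of the refactored doRequestWithPossibleRetry; the
// retriability check is simplified for illustration.
func retryWithDeadline(attempt func() error, deadline time.Time) error {
    err := attempt()
    if err == nil || !errors.Is(err, errRetriable) {
        return err // done: success or a non-retriable error
    }

    sleepTime := time.Second
    maxSleepTime := time.Until(deadline)
    for {
        // The first retry happens without any sleep.
        err = attempt()
        if err == nil || !errors.Is(err, errRetriable) {
            return err
        }

        // Later retries back off exponentially, capped by the deadline.
        if sleepTime > maxSleepTime {
            return fmt.Errorf("deadline exceeded: %w", err)
        }
        sleepTime += sleepTime
        if sleepTime > maxSleepTime {
            sleepTime = maxSleepTime
        }
        time.Sleep(sleepTime)
    }
}

func main() {
    calls := 0
    err := retryWithDeadline(func() error {
        calls++
        if calls < 3 {
            return errRetriable
        }
        return nil
    }, time.Now().Add(10*time.Second))
    fmt.Println(calls, err) // 3 <nil>: initial attempt, immediate retry, one backed-off retry
}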
@@ -42,6 +42,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: prevent from possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with small number of CPU cores (1 or 2 CPU cores). The issue has been introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).
 * BUGFIX: properly parse timestamps in milliseconds when [ingesting data via OpenTSDB telnet put protocol](https://docs.victoriametrics.com/#sending-data-via-telnet-put-protocol). Previously timestamps in milliseconds were mistakenly multiplied by 1000. Thanks to @Droxenator for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3810).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): do not add extrapolated points outside the real points when using [interpolate()](https://docs.victoriametrics.com/MetricsQL.html#interpolate) function. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3816).
+* BUGFIX: do not use exponential backoff for first retry of scrape request. Previously the first retry of scrape request was delayed by 2 seconds, which could lead to increased delays in case of connection being closed by short keep-alive timeout. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3293).

 ## [v1.87.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.1)

@@ -351,33 +351,48 @@ var (
 )
 
 func doRequestWithPossibleRetry(ctx context.Context, hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response, deadline time.Time) error {
-    sleepTime := time.Second
     scrapeRequests.Inc()
     reqCtx, cancel := context.WithDeadline(ctx, deadline)
     defer cancel()
-    for {
+
+    var reqErr error
+    // Return true if the request execution is completed and retry is not required
+    attempt := func() bool {
         // Use DoCtx instead of Do in order to support context cancellation
-        err := hc.DoCtx(reqCtx, req, resp)
-        if err == nil {
+        reqErr = hc.DoCtx(reqCtx, req, resp)
+        if reqErr == nil {
             statusCode := resp.StatusCode()
             if statusCode != fasthttp.StatusTooManyRequests {
-                return nil
+                return true
             }
-        } else if err != fasthttp.ErrConnectionClosed && !strings.Contains(err.Error(), "broken pipe") {
-            return err
+        } else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+            return true
         }
+
+        return false
+    }
+
+    if attempt() {
+        return reqErr
+    }
+
+    sleepTime := time.Second
+    maxSleepTime := time.Until(deadline)
+    for {
+        scrapeRetries.Inc()
+        if attempt() {
+            return reqErr
+        }
+
         // Retry request after exponentially increased sleep.
-        maxSleepTime := time.Until(deadline)
         if sleepTime > maxSleepTime {
-            return fmt.Errorf("the server closes all the connection attempts: %w", err)
+            return fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
         }
         sleepTime += sleepTime
         if sleepTime > maxSleepTime {
             sleepTime = maxSleepTime
         }
         time.Sleep(sleepTime)
-        scrapeRetries.Inc()
     }
 }
 
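With this hunk, the effective delay before each retry of a scrape request changes as follows: previously even the first retry slept 2 seconds (the 1-second base is doubled before the first sleep), while now the first retry fires immediately and only later retries sleep 2s, 4s, 8s and so on, capped by the time left until the deadline. A tiny helper that prints the new schedule; `printBackoffSchedule` is hypothetical and only illustrates the arithmetic, it is not part of the repository:

package main

import (
    "fmt"
    "time"
)

// printBackoffSchedule prints the sleep applied before each retry under the
// refactored logic: the first retry sleeps 0s, later retries double a 1s base,
// capped by the total time budget. Purely illustrative.
func printBackoffSchedule(budget time.Duration, retries int) {
    sleepTime := time.Second
    maxSleepTime := budget
    for i := 1; i <= retries; i++ {
        if i == 1 {
            fmt.Printf("retry %d: sleep 0s\n", i)
            continue
        }
        if sleepTime > maxSleepTime {
            fmt.Printf("retry %d: budget exhausted, give up\n", i)
            return
        }
        sleepTime += sleepTime
        if sleepTime > maxSleepTime {
            sleepTime = maxSleepTime
        }
        fmt.Printf("retry %d: sleep %s\n", i, sleepTime)
    }
}

func main() {
    printBackoffSchedule(30*time.Second, 6)
    // retry 1: sleep 0s
    // retry 2: sleep 2s
    // retry 3: sleep 4s
    // retry 4: sleep 8s
    // retry 5: sleep 16s
    // retry 6: sleep 30s
}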
@@ -16,6 +16,7 @@ import (
     "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
     "github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
     "github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
+    "github.com/VictoriaMetrics/fasthttp"
     "github.com/VictoriaMetrics/metrics"
 )
 
@@ -270,30 +271,48 @@ func (c *Client) Stop() {
 }
 
 func doRequestWithPossibleRetry(hc *HTTPClient, req *http.Request, deadline time.Time) (*http.Response, error) {
-    sleepTime := time.Second
     discoveryRequests.Inc()
 
-    for {
-        resp, err := hc.client.Do(req)
-        if err == nil {
+    var (
+        reqErr error
+        resp   *http.Response
+    )
+    // Return true if the request execution is completed and retry is not required
+    attempt := func() bool {
+        // Use DoCtx instead of Do in order to support context cancellation
+        resp, reqErr = hc.client.Do(req)
+        if reqErr == nil {
             statusCode := resp.StatusCode
-            if statusCode != http.StatusTooManyRequests {
-                return resp, nil
+            if statusCode != fasthttp.StatusTooManyRequests {
+                return true
             }
-        } else if err != net.ErrClosed && !strings.Contains(err.Error(), "broken pipe") {
-            return nil, err
+        } else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+            return true
         }
-        // Retry request after exponentially increased sleep.
-        maxSleepTime := time.Until(deadline)
+
+        return false
+    }
+
+    if attempt() {
+        return resp, reqErr
+    }
+
+    sleepTime := time.Second
+    maxSleepTime := time.Until(deadline)
+    for {
+        discoveryRetries.Inc()
+        if attempt() {
+            return resp, reqErr
+        }
+
         if sleepTime > maxSleepTime {
-            return nil, fmt.Errorf("the server closes all the connection attempts: %w", err)
+            return nil, fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
         }
         sleepTime += sleepTime
         if sleepTime > maxSleepTime {
             sleepTime = maxSleepTime
         }
         time.Sleep(sleepTime)
-        discoveryRetries.Inc()
     }
 }
 
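The discovery client above applies the same rule for deciding whether another attempt is worthwhile: a 429 (StatusTooManyRequests) response or a closed-connection / broken-pipe error triggers a retry, anything else is returned to the caller. Below is a self-contained sketch of that behaviour against a throwaway `net/http/httptest` server; all names are illustrative and this is not the repository's unexported `doRequestWithPossibleRetry`:

package main

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "sync/atomic"
    "time"
)

func main() {
    // Throwaway server: answer 429 twice, then 200, so the retry path below
    // gets exercised. Standalone illustration only, not VictoriaMetrics code.
    var calls int32
    srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
        if atomic.AddInt32(&calls, 1) <= 2 {
            w.WriteHeader(http.StatusTooManyRequests)
            return
        }
        w.WriteHeader(http.StatusOK)
    }))
    defer srv.Close()

    var (
        resp *http.Response
        err  error
    )
    // attempt reports whether the request is finished: success, a non-429
    // status, or a transport error all end the retries in this sketch.
    attempt := func() bool {
        resp, err = http.Get(srv.URL)
        if err != nil {
            return true
        }
        resp.Body.Close()
        return resp.StatusCode != http.StatusTooManyRequests
    }

    if !attempt() {
        sleepTime := time.Second
        deadline := time.Now().Add(10 * time.Second)
        // The first pass through the loop condition is the immediate retry;
        // only later retries sleep, doubling the delay up to the deadline.
        for !attempt() {
            maxSleepTime := time.Until(deadline)
            if sleepTime > maxSleepTime {
                break
            }
            sleepTime += sleepTime
            if sleepTime > maxSleepTime {
                sleepTime = maxSleepTime
            }
            time.Sleep(sleepTime)
        }
    }

    if err != nil {
        fmt.Println("requests made:", atomic.LoadInt32(&calls), "error:", err)
        return
    }
    fmt.Println("requests made:", atomic.LoadInt32(&calls), "final status:", resp.StatusCode)
}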