app/vmalert: adds idleConnTimeout flags and retry trivial network errors (#6382)

* "*.idleConnTimeout" flags must reduce probability of `write: broken pipe` and `read: connection reset by peer` errors Those errors may occur if remote server closes TCP socket for connection, while it's still exist at client. * single time retries for `write: broken pipe` and `read: connection reset by peer` must handle a case for incorrectly configured timeouts at middleware proxies, mitigate minor network issues. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661 ### Describe Your Changes Please provide a brief description of the changes you made. Be as specific as possible to help others understand the purpose and impact of your modifications. --------- Co-authored-by: Roman Khavronenko <roman@victoriametrics.com> (cherry picked from commit b97916276f)
2025-01-10 15:14:09 +00:00 · 2024-05-30 17:54:42 +02:00 · 2024-05-30 17:54:42 +02:00 · 908a50f79d
commit 908a50f79d
parent b994e64567
7 changed files with 24 additions and 5 deletions
--- a/app/vmalert/datasource/init.go
+++ b/app/vmalert/datasource/init.go
@ -57,8 +57,9 @@ var (
 		`Whether to align "time" parameter with evaluation interval. `+
 		"Alignment supposed to produce deterministic results despite number of vmalert replicas or time they were started. "+
 		"See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1257")
-	maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
+	maxIdleConnections    = flag.Int("datasource.maxIdleConnections", 100, `Defines the number of idle (keep-alive connections) to each configured datasource. Consider setting this value equal to the value: groups_total * group.concurrency. Too low a value may result in a high number of sockets in TIME_WAIT state.`)
-	disableKeepAlive   = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
+	idleConnectionTimeout = flag.Duration("datasource.idleConnTimeout", 50*time.Second, `Defines a duration for idle (keep-alive connections) to exist. Consider setting this value less than "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors.`)
 	disableKeepAlive      = flag.Bool("datasource.disableKeepAlive", false, `Whether to disable long-lived connections to the datasource. `+
 		`If true, disables HTTP keep-alives and will only use the connection to the server for a single HTTP request.`)
 	roundDigits = flag.Int("datasource.roundDigits", 0, `Adds "round_digits" GET param to datasource requests. `+
 		`In VM "round_digits" limits the number of digits after the decimal point in response values.`)
@ -105,6 +106,7 @@ func Init(extraParams url.Values) (QuerierBuilder, error) {
 	if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost {
 		tr.MaxIdleConns = tr.MaxIdleConnsPerHost
 	}
 	tr.IdleConnTimeout = *idleConnectionTimeout
 	if extraParams == nil {
 		extraParams = url.Values{}
--- a/app/vmalert/datasource/vm.go
+++ b/app/vmalert/datasource/vm.go
@ -11,6 +11,7 @@ import (
 	"time"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
 )
@ -140,7 +141,7 @@ func (s *VMStorage) Query(ctx context.Context, query string, ts time.Time) (Resu
 	}
 	resp, err := s.do(req)
 	if err != nil {
-		if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
+		if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) && !netutil.IsTrivialNetworkError(err) {
 			// Return unexpected error to the caller.
 			return Result{}, nil, err
 		}
@ -185,7 +186,7 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
 	}
 	resp, err := s.do(req)
 	if err != nil {
-		if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
+		if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) && !netutil.IsTrivialNetworkError(err) {
 			// Return unexpected error to the caller.
 			return res, err
 		}
--- a/app/vmalert/remoteread/init.go
+++ b/app/vmalert/remoteread/init.go
@ -4,6 +4,7 @@ import (
 	"flag"
 	"fmt"
 	"net/http"
 	"time"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
@ -32,6 +33,8 @@ var (
 	bearerToken     = flag.String("remoteRead.bearerToken", "", "Optional bearer auth token to use for -remoteRead.url.")
 	bearerTokenFile = flag.String("remoteRead.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteRead.url.")
 	idleConnectionTimeout = flag.Duration("remoteRead.idleConnTimeout", 50*time.Second, `Defines a duration for idle (keep-alive connections) to exist. Consider settings this value less to the value of "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors.`)
 	tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
 	tlsCertFile           = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
 	tlsKeyFile            = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
@ -66,6 +69,7 @@ func Init() (datasource.QuerierBuilder, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to create transport: %w", err)
 	}
 	tr.IdleConnTimeout = *idleConnectionTimeout
 	tr.DialContext = httputils.GetStatDialFunc("vmalert_remoteread")
 	endpointParams, err := flagutil.ParseJSONMap(*oauth2EndpointParams)
--- a/app/vmalert/remotewrite/client.go
+++ b/app/vmalert/remotewrite/client.go
@ -16,6 +16,7 @@ import (
 	"github.com/golang/snappy"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
 	"github.com/VictoriaMetrics/metrics"
@ -235,7 +236,7 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
 L:
 	for attempts := 0; ; attempts++ {
 		err := c.send(ctx, b)
-		if errors.Is(err, io.EOF) {
+		if err != nil && (errors.Is(err, io.EOF) || netutil.IsTrivialNetworkError(err)) {
 			// Something in the middle between client and destination might be closing
 			// the connection. So we do a one more attempt in hope request will succeed.
 			err = c.send(ctx, b)
--- a/app/vmalert/remotewrite/init.go
+++ b/app/vmalert/remotewrite/init.go
@ -31,6 +31,8 @@ var (
 	bearerToken     = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url.")
 	bearerTokenFile = flag.String("remoteWrite.bearerTokenFile", "", "Optional path to bearer token file to use for -remoteWrite.url.")
 	idleConnectionTimeout = flag.Duration("remoteWrite.idleConnTimeout", 50*time.Second, `Defines a duration for idle (keep-alive connections) to exist. Consider settings this value less to the value of "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors.`)
 	maxQueueSize  = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
 	maxBatchSize  = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
 	concurrency   = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote write endpoint")
@ -71,6 +73,7 @@ func Init(ctx context.Context) (*Client, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to create transport: %w", err)
 	}
 	t.IdleConnTimeout = *idleConnectionTimeout
 	t.DialContext = httputils.GetStatDialFunc("vmalert_remotewrite")
 	endpointParams, err := flagutil.ParseJSONMap(*oauth2EndpointParams)
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -50,6 +50,8 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow configuring `-remoteWrite.disableOnDiskQueue` and `-remoteWrite.dropSamplesOnOverload` cmd-line flags per each `-remoteWrite.url`. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementaion!
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add labels `path` and `url` to metrics `vmagent_remotewrite_push_failures_total` and `vmagent_remotewrite_samples_dropped_total`. Now number of failed pushes and dropped samples can be tracked per `-remoteWrite.url`.
 * FEATURE: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): add [rate_sum](https://docs.victoriametrics.com/stream-aggregation/#rate_sum) and [rate_avg](https://docs.victoriametrics.com/stream-aggregation/#rate_avg) aggregation outputs.
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): add `datasource.idleConnTimeout`, `remoteWrite.idleConnTimeout` and `remoteRead.idleConnTimeout` flags. These flags are set to 50s by default and should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmalert logs. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661) for details.
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): add auto request retry for trivial network errors, such as `broken pipe` and `connection reset` for requests to `remoteRead`, `remoteWrite` and `datasource` URLs. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661) for details.
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): reduce CPU usage when evaluating high number of alerting and recording rules.
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): speed up retrieving rules files from object storages by skipping unchanged objects during reloading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6210).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): support reading [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) records in `-datasource.url`, `-remoteWrite.url` and `-remoteRead.url` command-line option. For example, `-remoteWrite.url=http://srv+victoria-metrics` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends data to one of the addresses. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
--- a/docs/vmalert.md
+++ b/docs/vmalert.md
@ -1007,6 +1007,8 @@ The shortlist of configuration flags is the following:
     Whether to disable adding 'step' param to the issued instant queries. This might be useful when using vmalert with datasources that do not support 'step' param for instant queries, like Google Managed Prometheus. It is not recommended to enable this flag if you use vmalert with VictoriaMetrics.
  -datasource.headers string
     Optional HTTP extraHeaders to send with each request to the corresponding -datasource.url. For example, -datasource.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -datasource.url. Multiple headers must be delimited by '^^': -datasource.headers='header1:value1^^header2:value2'
  -datasource.idleConnTimeout duration
     Defines a duration for idle (keep-alive connections) to exist. Consider settings this value less to the value of "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors. (default 50s)
  -datasource.lookback duration
     Deprecated: please adjust "-search.latencyOffset" at datasource side or specify "latency_offset" in rule group's params. Lookback defines how far into the past to look when evaluating queries. For example, if the datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
  -datasource.maxIdleConnections int
@ -1277,6 +1279,8 @@ The shortlist of configuration flags is the following:
     Whether to disable automatic appending of '/api/v1/query' path to the configured -datasource.url and -remoteRead.url
  -remoteRead.headers string
     Optional HTTP headers to send with each request to the corresponding -remoteRead.url. For example, -remoteRead.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteRead.url. Multiple headers must be delimited by '^^': -remoteRead.headers='header1:value1^^header2:value2'
  -remoteRead.idleConnTimeout duration
     Defines a duration for idle (keep-alive connections) to exist. Consider settings this value less to the value of "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors. (default 50s)
  -remoteRead.ignoreRestoreErrors
     Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases. (default true)
  -remoteRead.lookback duration
@ -1325,6 +1329,8 @@ The shortlist of configuration flags is the following:
     Defines interval of flushes to remote write endpoint (default 5s)
  -remoteWrite.headers string
     Optional HTTP headers to send with each request to the corresponding -remoteWrite.url. For example, -remoteWrite.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteWrite.url. Multiple headers must be delimited by '^^': -remoteWrite.headers='header1:value1^^header2:value2'
  -remoteWrite.idleConnTimeout duration
     Defines a duration for idle (keep-alive connections) to exist. Consider settings this value less to the value of "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors. (default 50s)
  -remoteWrite.maxBatchSize int
     Defines max number of timeseries to be flushed at once (default 1000)
  -remoteWrite.maxQueueSize int