From f082e64e0c36fb929f11bb194c1d9a97cd0d5b48 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 6 Apr 2022 12:28:54 +0300 Subject: [PATCH] app/vmagent: reduce the probability of TLS handshake timeout when dialing the remote storage The following actions are taken: - Increase the TLS handshake timeout from 5 seconds to 10 seconds - Increase dial timeout from 5 seconds to 30 seconds - Specify DialContext instead of Dial in http.Transport. This allows properly handling the Context arg when dialing the remote storage Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1699 --- app/vmagent/remotewrite/client.go | 4 ++-- app/vmagent/remotewrite/statconn.go | 22 ++++++++++++++++++++-- docs/CHANGELOG.md | 1 + 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/app/vmagent/remotewrite/client.go b/app/vmagent/remotewrite/client.go index 0493758b8..d265ac0ff 100644 --- a/app/vmagent/remotewrite/client.go +++ b/app/vmagent/remotewrite/client.go @@ -92,9 +92,9 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste } tlsCfg := authCfg.NewTLSConfig() tr := &http.Transport{ - Dial: statDial, + DialContext: statDial, TLSClientConfig: tlsCfg, - TLSHandshakeTimeout: 5 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, MaxConnsPerHost: 2 * concurrency, MaxIdleConnsPerHost: 2 * concurrency, IdleConnTimeout: time.Minute, diff --git a/app/vmagent/remotewrite/statconn.go b/app/vmagent/remotewrite/statconn.go index 5d597e9bf..9aa064c29 100644 --- a/app/vmagent/remotewrite/statconn.go +++ b/app/vmagent/remotewrite/statconn.go @@ -1,7 +1,9 @@ package remotewrite import ( + "context" "net" + "sync" "sync/atomic" "time" @@ -9,9 +11,25 @@ import ( "github.com/VictoriaMetrics/metrics" ) -func statDial(networkUnused, addr string) (conn net.Conn, err error) { +func getStdDialer() *net.Dialer { + stdDialerOnce.Do(func() { + stdDialer = &net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + DualStack: 
netutil.TCP6Enabled(), + } + }) + return stdDialer +} + +var ( + stdDialer *net.Dialer + stdDialerOnce sync.Once +) + +func statDial(ctx context.Context, networkUnused, addr string) (conn net.Conn, err error) { network := netutil.GetTCPNetwork() - conn, err = net.DialTimeout(network, addr, 5*time.Second) + conn, err = getStdDialer().DialContext(ctx, network, addr) dialsTotal.Inc() if err != nil { dialErrors.Inc() diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 85a4b010a..cf6cdb785 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -33,6 +33,7 @@ Previously the `-search.maxUniqueTimeseries` command-line flag was used as a glo When using [cluster version of VictoriaMetrics](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html), these command-line flags (including `-search.maxUniqueTimeseries`) must be passed to `vmselect` instead of `vmstorage`. +* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html) and [vmauth](https://docs.victoriametrics.com/vmauth.html): reduce the probability of `TLS handshake error from XX.XX.XX.XX: EOF` errors when `-remoteWrite.url` points to HTTPS url at `vmauth`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1699). * BUGFIX: return `Content-Type: text/html` response header when requesting `/` HTTP path at VictoriaMetrics components. Previously `text/plain` response header was returned, which could lead to broken page formatting. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2323). * BUGFIX: [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage): accept floating-point values for [maxDataPoints](https://graphite.readthedocs.io/en/stable/render_api.html#maxdatapoints) query arg, since some clients send floating-point values instead of integer values for this arg.