diff --git a/app/vmauth/main.go b/app/vmauth/main.go index 87659350d..9611b19a5 100644 --- a/app/vmauth/main.go +++ b/app/vmauth/main.go @@ -37,6 +37,8 @@ var ( "With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing") maxIdleConnsPerBackend = flag.Int("maxIdleConnsPerBackend", 100, "The maximum number of idle connections vmauth can open per each backend host. "+ "See also -maxConcurrentRequests") + idleConnTimeout = flag.Duration("idleConnTimeout", 50*time.Second, `Defines a duration for idle (keep-alive connections) to exist. + Consider setting this value less than "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors.`) responseTimeout = flag.Duration("responseTimeout", 5*time.Minute, "The timeout for receiving a response from backend") maxConcurrentRequests = flag.Int("maxConcurrentRequests", 1000, "The maximum number of concurrent requests vmauth can process. Other requests are rejected with "+ "'429 Too Many Requests' http status code. See also -maxConcurrentPerUserRequests and -maxIdleConnsPerBackend command-line options") @@ -199,10 +201,8 @@ func processRequest(w http.ResponseWriter, r *http.Request, ui *UserInfo) { isDefault = true } maxAttempts := up.getBackendsCount() - if maxAttempts > 1 { - r.Body = &readTrackingBody{ - r: r.Body, - } + r.Body = &readTrackingBody{ + r: r.Body, } for i := 0; i < maxAttempts; i++ { bu := up.getBackendURL() @@ -243,8 +243,10 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url req.Host = targetURL.Host } updateHeadersByConfig(req.Header, hc.RequestHeaders) - res, err := ui.rt.RoundTrip(req) + var trivialRetries int rtb, rtbOK := req.Body.(*readTrackingBody) +again: + res, err := ui.rt.RoundTrip(req) if err != nil { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { // Do not retry canceled or timed out requests @@ -267,6 +269,12 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url ui.backendErrors.Inc() return true } + // one time retry trivial network errors, such as proxy idle timeout misconfiguration + // or socket close by OS + if (netutil.IsTrivialNetworkError(err) || errors.Is(err, io.EOF)) && trivialRetries < 1 { + trivialRetries++ + goto again + } // Retry the request if its body wasn't read yet. This usually means that the backend isn't reachable. remoteAddr := httpserver.GetQuotedRemoteAddr(r) // NOTE: do not use httpserver.GetRequestURI @@ -434,6 +442,7 @@ func newRoundTripper(caFileOpt, certFileOpt, keyFileOpt, serverNameOpt string, i tr.ResponseHeaderTimeout = *responseTimeout // Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535 tr.DisableCompression = true + tr.IdleConnTimeout = *idleConnTimeout tr.MaxIdleConnsPerHost = *maxIdleConnsPerBackend if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost { tr.MaxIdleConns = tr.MaxIdleConnsPerHost diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 87d26420d..3d3765656 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -34,6 +34,8 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * FEATURE: [alerts-vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmagent.yml): add new alerting rules `StreamAggrFlushTimeout` and `StreamAggrDedupFlushTimeout` to notify about issues during stream aggregation. * FEATURE: [dashboards/vmagent](https://grafana.com/grafana/dashboards/12683): add row `Streaming aggregation` with panels related to [streaming aggregation](https://docs.victoriametrics.com/stream-aggregation/) process. +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs. +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add auto request retry for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends. * BUGFIX: all VictoriaMetrics components: prioritize `-configAuthKey` and `-reloadAuthKey` over `-httpAuth.*` settings. This change aligns behavior of mentioned flags with other auth flags like `-metricsAuthKey`, `-flagsAuthKey`, `-pprofAuthKey`. Check [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6329). * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): add `--disable-progress-bar` global command-line flag. It can be used for disabling dynamic progress bar for all migration modes. `--vm-disable-progress-bar` command-line flag is deprecated and will be removed in the future releases. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6367). diff --git a/docs/vmauth.md b/docs/vmauth.md index 798fc0ff5..7956b694a 100644 --- a/docs/vmauth.md +++ b/docs/vmauth.md @@ -1206,6 +1206,8 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . Whether to use proxy protocol for connections accepted at the corresponding -httpListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt . With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing Supports array of values separated by comma or specified via multiple flags. Empty values are set to false. + -idleConnTimeout duration + Defines a duration for idle (keep-alive connections) to exist. Consider setting this value less than "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors. (default 50s) -internStringCacheExpireDuration duration The expiry duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s) -internStringDisableCache