diff --git a/app/vmalert/datasource/init.go b/app/vmalert/datasource/init.go index a6f44aa8a..671b0ee9e 100644 --- a/app/vmalert/datasource/init.go +++ b/app/vmalert/datasource/init.go @@ -41,7 +41,7 @@ func Init() (QuerierBuilder, error) { if err != nil { return nil, fmt.Errorf("failed to create transport: %w", err) } - tr.MaxIdleConns = *maxIdleConnections + tr.MaxIdleConnsPerHost = *maxIdleConnections var rd string if *roundDigits > 0 { diff --git a/app/vmauth/README.md b/app/vmauth/README.md index 464865c8b..f9cb74396 100644 --- a/app/vmauth/README.md +++ b/app/vmauth/README.md @@ -221,6 +221,8 @@ See the docs at https://docs.victoriametrics.com/vmauth.html . Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC") -loggerWarnsPerSecondLimit int Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit + -maxIdleConnsPerBackend int + The maximum number of idle connections vmauth can open per each backend host (default 100) -memory.allowedBytes size Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0) diff --git a/app/vmauth/main.go b/app/vmauth/main.go index bc6ae38ba..5979577aa 100644 --- a/app/vmauth/main.go +++ b/app/vmauth/main.go @@ -17,7 +17,8 @@ import ( ) var ( - httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections") + httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections") + maxIdleConnsPerBackend = flag.Int("maxIdleConnsPerBackend", 100, "The maximum number of idle connections vmauth can open per each backend host") ) func main() { @@ -85,6 +86,7 @@ var reverseProxy = &httputil.ReverseProxy{ tr.DisableCompression = true // Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this). tr.ForceAttemptHTTP2 = false + tr.MaxIdleConnsPerHost = *maxIdleConnsPerBackend return tr }(), FlushInterval: time.Second, diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index d1c9ffa50..72a545c57 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -19,6 +19,7 @@ sort: 15 * BUGFIX: properly remove stale parts outside the configured retention if `-retentionPeriod` is smaller than one month. Previously stale parts could remain active for up to a month after they go outside the retention. * BUGFIX: stop the process on panic errors, since such errors may leave the process in inconsistent state. Previously panics could be recovered, which could result in unexpected hard-to-debug further behavior of running process. * BUGFIX: vminsert, vmagent: make sure data ingestion connections are closed before completing graceful shutdown. Previously the connection may remain open, which could result in trailing samples loss. +* BUGFIX: vmauth, vmalert: properly re-use HTTP keep-alive connections to backends and datasources. Previously only 2 keep-alive connections per backend could be re-used. Other connections were closed after the first request. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1300) for details. ## tip diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go index 20ae13294..aaf230a55 100644 --- a/lib/promscrape/client.go +++ b/lib/promscrape/client.go @@ -118,6 +118,7 @@ func newClient(sw *ScrapeWork) *client { DisableCompression: *disableCompression || sw.DisableCompression, DisableKeepAlives: *disableKeepAlive || sw.DisableKeepAlive, DialContext: statStdDial, + MaxIdleConnsPerHost: 100, // Set timeout for receiving the first response byte, // since the duration for reading the full response can be much bigger because of stream parsing. diff --git a/lib/promscrape/discovery/kubernetes/api_watcher.go b/lib/promscrape/discovery/kubernetes/api_watcher.go index 1dbbbc1cc..645a3a62c 100644 --- a/lib/promscrape/discovery/kubernetes/api_watcher.go +++ b/lib/promscrape/discovery/kubernetes/api_watcher.go @@ -178,6 +178,7 @@ func newGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string, Proxy: proxy, TLSHandshakeTimeout: 10 * time.Second, IdleConnTimeout: *apiServerTimeout, + MaxIdleConnsPerHost: 100, }, Timeout: *apiServerTimeout, } diff --git a/lib/promscrape/discovery/openstack/api.go b/lib/promscrape/discovery/openstack/api.go index fecfd3158..8e29d1d0c 100644 --- a/lib/promscrape/discovery/openstack/api.go +++ b/lib/promscrape/discovery/openstack/api.go @@ -70,7 +70,11 @@ func getAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { func newAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { cfg := &apiConfig{ - client: &http.Client{}, + client: &http.Client{ + Transport: &http.Transport{ + MaxIdleConnsPerHost: 100, + }, + }, availability: sdc.Availability, region: sdc.Region, allTenants: sdc.AllTenants, @@ -82,7 +86,8 @@ func newAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { return nil, err } cfg.client.Transport = &http.Transport{ - TLSClientConfig: ac.NewTLSConfig(), + TLSClientConfig: ac.NewTLSConfig(), + MaxIdleConnsPerHost: 100, } } // use public compute endpoint by default