From dc326f70b4026a802ba0db0c891b856d56b08c0d Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Apr 2024 20:47:59 +0200 Subject: [PATCH] app/vmagent: support for DNS SRV urls at -remoteWrite.url, scrape target urls and service discovery urls Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053 --- app/vmagent/remotewrite/statconn.go | 21 +-------------- docs/CHANGELOG.md | 6 +++-- docs/vmagent.md | 35 +++++++++++++++++++------ lib/netutil/netutil.go | 4 +++ lib/promscrape/discovery/dns/dns.go | 12 +++------ lib/promscrape/discoveryutils/client.go | 7 +++-- lib/promscrape/statconn.go | 21 +-------------- 7 files changed, 44 insertions(+), 62 deletions(-) diff --git a/app/vmagent/remotewrite/statconn.go b/app/vmagent/remotewrite/statconn.go index ff28e7580..924835496 100644 --- a/app/vmagent/remotewrite/statconn.go +++ b/app/vmagent/remotewrite/statconn.go @@ -3,34 +3,15 @@ package remotewrite import ( "context" "net" - "sync" "sync/atomic" - "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/metrics" ) -func getStdDialer() *net.Dialer { - stdDialerOnce.Do(func() { - stdDialer = &net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - DualStack: netutil.TCP6Enabled(), - } - }) - return stdDialer -} - -var ( - stdDialer *net.Dialer - stdDialerOnce sync.Once -) - func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) { network := netutil.GetTCPNetwork() - d := getStdDialer() - conn, err = d.DialContext(ctx, network, addr) + conn, err = netutil.DialMaybeSRV(ctx, network, addr) dialsTotal.Inc() if err != nil { dialErrors.Inc() diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index a40bd5a4d..cddd72f78 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -30,9 +30,11 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). 
## tip -* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list. -* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070). +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) addresses in `-remoteWrite.url` command-line option and in scrape target urls. For example, `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends the collected metrics to these TCP addresses. See [these docs](https://docs.victoriametrics.com/vmagent/#srv-urls) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053). +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support automatic discovering and load balancing for TCP addresses behind DNS SRV addresses. These addresses can be put inside `url_prefix` urls in the form `http://srv+addr/path`, where the `addr` is the [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address, which is automatically resolved to hostnames with TCP ports. See [these docs](https://docs.victoriametrics.com/vmauth/#srv-urls) for details. * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support specifying client TLS certificates and TLS ServerName for requests to HTTPS backends. See [these docs](https://docs.victoriametrics.com/vmauth/#backend-tls-setup). 
+* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070). +* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list. * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): supported any status codes from the range 200-299 from alertmanager. Previously, only 200 status code considered a successful action. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6110). * BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): don't treat concurrency limit hit as an error of the backend. Previously, hitting the concurrency limit would increment both `vmauth_concurrent_requests_limit_reached_total` and `vmauth_user_request_backend_errors_total` counters. Now, only concurrency limit counter is incremented. Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5565). diff --git a/docs/vmagent.md b/docs/vmagent.md index 53c616556..391ece303 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -68,6 +68,7 @@ and sending the data to the Prometheus-compatible remote storage: to run `vmagent` with `-promscrape.config.strictParse=false` command-line flag. In this case `vmagent` ignores unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections). * `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics, where to send the data to. + The `-remoteWrite.url` may refer to [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address. See [these docs](#srv-urls) for details. 
 Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent) to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`: @@ -129,7 +130,7 @@ additionally to pull-based Prometheus-compatible targets' scraping: * Sending HTTP request to `http://vmagent:8429/-/reload` endpoint. This endpoint can be protected with `-reloadAuthKey` command-line flag. -There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file. +There is also `-promscrape.configCheckInterval` command-line flag, which can be used for automatic reloading configs from updated `-promscrape.config` file. ## Use cases @@ -272,6 +273,24 @@ for the collected samples. Examples: ./vmagent -remoteWrite=http://remote-storage/api/v1/write -streamAggr.dropInputLabels=replica -remoteWrite.streamAggr.dedupInterval=60s ``` +## SRV urls + +If `vmagent` encounters urls with `srv+` prefix in hostname (such as `http://srv+some-addr/some/path`), then it resolves `some-addr` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) +record into TCP address with hostname and TCP port, and then uses the resulting url when it needs to connect to it. + +SRV urls are supported in the following places: + +- In `-remoteWrite.url` command-line flags. For example, if `victoria-metrics` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) record contains + `victoria-metrics-host:8428` TCP address, then `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` is automatically resolved into + `-remoteWrite.url=http://victoria-metrics-host:8428/api/v1/write`. If the DNS SRV record is resolved into multiple TCP addresses, then `vmagent` + uses randomly chosen address per each connection it establishes to the remote storage. 
+ +- In scrape target addresses aka `__address__` label - see [these docs](https://docs.victoriametrics.com/relabeling/#how-to-modify-scrape-urls-in-targets) for details. + +- In urls used for [service discovery](https://docs.victoriametrics.com/sd_configs/). + +SRV urls are useful when HTTP services run on different TCP ports or when they can change TCP ports over time (for instance, after the restart). + ## VictoriaMetrics remote write protocol `vmagent` supports sending data to the configured `-remoteWrite.url` either via Prometheus remote write protocol @@ -419,7 +438,7 @@ There is no need in specifying top-level `scrape_configs` section in these files The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format). Additionally, `vmagent` doesn't support `refresh_interval` option at service discovery sections. -This option is substituted with `-promscrape.*CheckInterval` command-line options, which are specific per each service discovery type. +This option is substituted with `-promscrape.*CheckInterval` command-line flags, which are specific per each service discovery type. See [the full list of command-line flags for vmagent](#advanced-usage). ## Adding labels to metrics @@ -506,7 +525,7 @@ and attaches `instance`, `job` and other target-specific labels to these metrics sum_over_time(scrape_series_added[1h]) > 1000 ``` - `vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line option + `vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line flag or when it scrapes target with `no_stale_markers: true` option, e.g. when [staleness markers](#prometheus-staleness-markers) are disabled. * `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](#cardinality-limiter). 
@@ -1117,14 +1136,14 @@ If you have suggestions for improvements or have found a bug - please open an is as `vmagent` establishes at least a single TCP connection per target. * If `vmagent` uses too big amounts of memory, then the following options can help: - * Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option. + * Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line flag. Another option is to reduce memory limits in Docker and/or Kubernetes if `vmagent` runs under these systems. * Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`, where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems. * Disabling staleness tracking with `-promscrape.noStaleMarkers` option. See [these docs](#prometheus-staleness-markers). * Enabling stream parsing mode if `vmagent` scrapes targets with millions of metrics per target. See [these docs](#stream-parsing-mode). - * Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line option. - * Passing `-promscrape.dropOriginalLabels` command-line option to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html) + * Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line flag. + * Passing `-promscrape.dropOriginalLabels` command-line flag to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html) big number of targets and many of these targets are [dropped](https://docs.victoriametrics.com/relabeling.html#how-to-drop-discovered-targets) before scraping. In this case `vmagent` drops `"discoveredLabels"` and `"droppedTargets"` lists at `http://vmagent-host:8429/service-discovery` page. 
This reduces memory usage when scraping big number of targets at the cost @@ -1142,7 +1161,7 @@ If you have suggestions for improvements or have found a bug - please open an is may result in increased memory usage if a big number of scrape targets are dropped during relabeling. * It is recommended increaseing `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` [metric](#monitoring) - grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line options in this case. + grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line flags in this case. This can improve data ingestion performance to the configured remote storage systems at the cost of higher memory usage. * If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, @@ -1387,7 +1406,7 @@ See how to request a free trial license [here](https://victoriametrics.com/produ ### Reading metrics from Kafka [Enterprise version](https://docs.victoriametrics.com/enterprise/) of `vmagent` can read metrics in various formats from Kafka messages. -These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line options. The following formats are supported: +These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line flags. The following formats are supported: * `promremotewrite` - [Prometheus remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). Messages in this format can be sent by vmagent - see [these docs](#writing-metrics-to-kafka). 
diff --git a/lib/netutil/netutil.go b/lib/netutil/netutil.go index 7d830cc13..39ffc609c 100644 --- a/lib/netutil/netutil.go +++ b/lib/netutil/netutil.go @@ -26,6 +26,10 @@ func IsTrivialNetworkError(err error) bool { func DialMaybeSRV(ctx context.Context, network, addr string) (net.Conn, error) { if strings.HasPrefix(addr, "srv+") { addr = strings.TrimPrefix(addr, "srv+") + if n := strings.IndexByte(addr, ':'); n >= 0 { + // Drop port, since it should be automatically resolved via DNS SRV lookup below. + addr = addr[:n] + } _, addrs, err := Resolver.LookupSRV(ctx, "", "", addr) if err != nil { return nil, fmt.Errorf("cannot resolve SRV addr %s: %w", addr, err) diff --git a/lib/promscrape/discovery/dns/dns.go b/lib/promscrape/discovery/dns/dns.go index edc35e650..977fbb9fd 100644 --- a/lib/promscrape/discovery/dns/dns.go +++ b/lib/promscrape/discovery/dns/dns.go @@ -10,6 +10,7 @@ import ( "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" ) @@ -74,7 +75,7 @@ func getMXAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels { ch := make(chan result, len(sdc.Names)) for _, name := range sdc.Names { go func(name string) { - mx, err := resolver.LookupMX(ctx, name) + mx, err := netutil.Resolver.LookupMX(ctx, name) ch <- result{ name: name, mx: mx, @@ -109,7 +110,7 @@ func getSRVAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels { ch := make(chan result, len(sdc.Names)) for _, name := range sdc.Names { go func(name string) { - _, as, err := resolver.LookupSRV(ctx, "", "", name) + _, as, err := netutil.Resolver.LookupSRV(ctx, "", "", name) ch <- result{ name: name, as: as, @@ -148,7 +149,7 @@ func getAAddrLabels(ctx context.Context, sdc *SDConfig, lookupType string) ([]*p ch := make(chan result, len(sdc.Names)) for _, name := range sdc.Names { 
go func(name string) { - ips, err := resolver.LookupIPAddr(ctx, name) + ips, err := netutil.Resolver.LookupIPAddr(ctx, name) ch <- result{ name: name, ips: ips, @@ -192,8 +193,3 @@ func appendAddrLabels(ms []*promutils.Labels, name, target string, port int) []* m.Add("__meta_dns_srv_record_port", strconv.Itoa(port)) return append(ms, m) } - -var resolver = &net.Resolver{ - PreferGo: true, - StrictErrors: true, -} diff --git a/lib/promscrape/discoveryutils/client.go b/lib/promscrape/discoveryutils/client.go index 1ee73ca00..f95543808 100644 --- a/lib/promscrape/discoveryutils/client.go +++ b/lib/promscrape/discoveryutils/client.go @@ -15,6 +15,7 @@ import ( "github.com/VictoriaMetrics/metrics" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy" "github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool" @@ -87,8 +88,6 @@ func (hc *HTTPClient) stop() { hc.client.CloseIdleConnections() } -var defaultDialer = &net.Dialer{} - // NewClient returns new Client for the given args. 
func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxyAC *promauth.Config, httpCfg *promauth.HTTPClientConfig) (*Client, error) { u, err := url.Parse(apiServer) @@ -96,13 +95,13 @@ func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxy return nil, fmt.Errorf("cannot parse apiServer=%q: %w", apiServer, err) } - dialFunc := defaultDialer.DialContext + dialFunc := netutil.DialMaybeSRV if u.Scheme == "unix" { // special case for unix socket connection dialAddr := u.Path apiServer = "http://unix" dialFunc = func(ctx context.Context, _, _ string) (net.Conn, error) { - return defaultDialer.DialContext(ctx, "unix", dialAddr) + return netutil.Dialer.DialContext(ctx, "unix", dialAddr) } } diff --git a/lib/promscrape/statconn.go b/lib/promscrape/statconn.go index 296f8d1d6..d637ffeca 100644 --- a/lib/promscrape/statconn.go +++ b/lib/promscrape/statconn.go @@ -6,18 +6,15 @@ import ( "net" "strconv" "strings" - "sync" "sync/atomic" - "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/metrics" ) func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) { - d := getStdDialer() network := netutil.GetTCPNetwork() - conn, err := d.DialContext(ctx, network, addr) + conn, err := netutil.DialMaybeSRV(ctx, network, addr) dialsTotal.Inc() if err != nil { dialErrors.Inc() @@ -33,22 +30,6 @@ func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) { return sc, nil } -func getStdDialer() *net.Dialer { - stdDialerOnce.Do(func() { - stdDialer = &net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - DualStack: netutil.TCP6Enabled(), - } - }) - return stdDialer -} - -var ( - stdDialer *net.Dialer - stdDialerOnce sync.Once -) - var ( dialsTotal = metrics.NewCounter(`vm_promscrape_dials_total`) dialErrors = metrics.NewCounter(`vm_promscrape_dial_errors_total`)