app/vmagent: support for DNS SRV urls at -remoteWrite.url, scrape target urls and service discovery urls

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053
This commit is contained in:
Aliaksandr Valialkin 2024-04-17 20:47:59 +02:00
parent b426d10847
commit dc326f70b4
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
7 changed files with 44 additions and 62 deletions

View file

@ -3,34 +3,15 @@ package remotewrite
import ( import (
"context" "context"
"net" "net"
"sync"
"sync/atomic" "sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/metrics" "github.com/VictoriaMetrics/metrics"
) )
func getStdDialer() *net.Dialer {
stdDialerOnce.Do(func() {
stdDialer = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
DualStack: netutil.TCP6Enabled(),
}
})
return stdDialer
}
var (
stdDialer *net.Dialer
stdDialerOnce sync.Once
)
func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) { func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) {
network := netutil.GetTCPNetwork() network := netutil.GetTCPNetwork()
d := getStdDialer() conn, err = netutil.DialMaybeSRV(ctx, network, addr)
conn, err = d.DialContext(ctx, network, addr)
dialsTotal.Inc() dialsTotal.Inc()
if err != nil { if err != nil {
dialErrors.Inc() dialErrors.Inc()

View file

@ -30,9 +30,11 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
## tip ## tip
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) addresses in `-remoteWrite.url` command-line option and in scrape target urls. For example, `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends the collected metrics to these TCP addresses. See [these docs](https://docs.victoriametrics.com/vmagent/#srv-urls) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support automatic discovering and load balancing for TCP addresses behind DNS SRV addresses. These addresses can be put inside `url_prefix` urls in the form `http://srv+addr/path`, where the `addr` is the [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address, which is automatically resolved to hostnames with TCP ports. See [these docs](https://docs.victoriametrics.com/vmauth/#srv-urls) for details.
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support specifying client TLS certificates and TLS ServerName for requests to HTTPS backends. See [these docs](https://docs.victoriametrics.com/vmauth/#backend-tls-setup). * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support specifying client TLS certificates and TLS ServerName for requests to HTTPS backends. See [these docs](https://docs.victoriametrics.com/vmauth/#backend-tls-setup).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): supported any status codes from the range 200-299 from alertmanager. Previously, only 200 status code considered a successful action. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6110). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): supported any status codes from the range 200-299 from alertmanager. Previously, only 200 status code considered a successful action. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6110).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): don't treat concurrency limit hit as an error of the backend. Previously, hitting the concurrency limit would increment both `vmauth_concurrent_requests_limit_reached_total` and `vmauth_user_request_backend_errors_total` counters. Now, only concurrency limit counter is incremented. Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5565). * BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): don't treat concurrency limit hit as an error of the backend. Previously, hitting the concurrency limit would increment both `vmauth_concurrent_requests_limit_reached_total` and `vmauth_user_request_backend_errors_total` counters. Now, only concurrency limit counter is incremented. Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5565).

View file

@ -68,6 +68,7 @@ and sending the data to the Prometheus-compatible remote storage:
to run `vmagent` with `-promscrape.config.strictParse=false` command-line flag. to run `vmagent` with `-promscrape.config.strictParse=false` command-line flag.
In this case `vmagent` ignores unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections). In this case `vmagent` ignores unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections).
* `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics, where to send the data to. * `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics, where to send the data to.
The `-remoteWrite.url` may refer to [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address. See [these docs](#srv-urls) for details.
Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent) Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent)
to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`: to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`:
@ -129,7 +130,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
* Sending HTTP request to `http://vmagent:8429/-/reload` endpoint. This endpoint can be protected with `-reloadAuthKey` command-line flag. * Sending HTTP request to `http://vmagent:8429/-/reload` endpoint. This endpoint can be protected with `-reloadAuthKey` command-line flag.
There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file. There is also `-promscrape.configCheckInterval` command-line flag, which can be used for automatic reloading configs from updated `-promscrape.config` file.
## Use cases ## Use cases
@ -272,6 +273,24 @@ for the collected samples. Examples:
./vmagent -remoteWrite=http://remote-storage/api/v1/write -streamAggr.dropInputLabels=replica -remoteWrite.streamAggr.dedupInterval=60s ./vmagent -remoteWrite=http://remote-storage/api/v1/write -streamAggr.dropInputLabels=replica -remoteWrite.streamAggr.dedupInterval=60s
``` ```
## SRV urls
If `vmagent` encounters urls with `srv+` prefix in hostname (such as `http://srv+some-addr/some/path`), then it resolves `some-addr` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record)
record into TCP address with hostname and TCP port, and then uses the resulting url when it needs connecting to it.
SRV urls are supported in the following places:
- In `-remoteWrite.url` command-line flags. For example, if `victoria-metrics` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) record contains
`victoria-metrics-host:8428` TCP address, then `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` is automatically resolved into
`-remoteWrite.url=http://victoria-metrics-host:8428/api/v1/write`. If the DNS SRV record is resolved into multiple TCP addresses, then `vmauth`
uses randomly chosen address per each connection it establishes to the remote storage.
- In scrape target addresses aka `__address__` label - see [these docs](https://docs.victoriametrics.com/relabeling/#how-to-modify-scrape-urls-in-targets) for details.
- In urls used for [service discovery](https://docs.victoriametrics.com/sd_configs/).
SRV urls are useful when HTTP services run on different TCP ports or when they can change TCP ports over time (for instance, after the restart).
## VictoriaMetrics remote write protocol ## VictoriaMetrics remote write protocol
`vmagent` supports sending data to the configured `-remoteWrite.url` either via Prometheus remote write protocol `vmagent` supports sending data to the configured `-remoteWrite.url` either via Prometheus remote write protocol
@ -419,7 +438,7 @@ There is no need in specifying top-level `scrape_configs` section in these files
The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format). The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format).
Additionally, `vmagent` doesn't support `refresh_interval` option at service discovery sections. Additionally, `vmagent` doesn't support `refresh_interval` option at service discovery sections.
This option is substituted with `-promscrape.*CheckInterval` command-line options, which are specific per each service discovery type. This option is substituted with `-promscrape.*CheckInterval` command-line flags, which are specific per each service discovery type.
See [the full list of command-line flags for vmagent](#advanced-usage). See [the full list of command-line flags for vmagent](#advanced-usage).
## Adding labels to metrics ## Adding labels to metrics
@ -506,7 +525,7 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
sum_over_time(scrape_series_added[1h]) > 1000 sum_over_time(scrape_series_added[1h]) > 1000
``` ```
`vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line option `vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line flag
or when it scrapes target with `no_stale_markers: true` option, e.g. when [staleness markers](#prometheus-staleness-markers) are disabled. or when it scrapes target with `no_stale_markers: true` option, e.g. when [staleness markers](#prometheus-staleness-markers) are disabled.
* `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](#cardinality-limiter). * `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](#cardinality-limiter).
@ -1117,14 +1136,14 @@ If you have suggestions for improvements or have found a bug - please open an is
as `vmagent` establishes at least a single TCP connection per target. as `vmagent` establishes at least a single TCP connection per target.
* If `vmagent` uses too big amounts of memory, then the following options can help: * If `vmagent` uses too big amounts of memory, then the following options can help:
* Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option. * Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line flag.
Another option is to reduce memory limits in Docker and/or Kubernetes if `vmagent` runs under these systems. Another option is to reduce memory limits in Docker and/or Kubernetes if `vmagent` runs under these systems.
* Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`, * Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`,
where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems. where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems.
* Disabling staleness tracking with `-promscrape.noStaleMarkers` option. See [these docs](#prometheus-staleness-markers). * Disabling staleness tracking with `-promscrape.noStaleMarkers` option. See [these docs](#prometheus-staleness-markers).
* Enabling stream parsing mode if `vmagent` scrapes targets with millions of metrics per target. See [these docs](#stream-parsing-mode). * Enabling stream parsing mode if `vmagent` scrapes targets with millions of metrics per target. See [these docs](#stream-parsing-mode).
* Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line option. * Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line flag.
* Passing `-promscrape.dropOriginalLabels` command-line option to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html) * Passing `-promscrape.dropOriginalLabels` command-line flag to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html)
big number of targets and many of these targets are [dropped](https://docs.victoriametrics.com/relabeling.html#how-to-drop-discovered-targets) big number of targets and many of these targets are [dropped](https://docs.victoriametrics.com/relabeling.html#how-to-drop-discovered-targets)
before scraping. In this case `vmagent` drops `"discoveredLabels"` and `"droppedTargets"` before scraping. In this case `vmagent` drops `"discoveredLabels"` and `"droppedTargets"`
lists at `http://vmagent-host:8429/service-discovery` page. This reduces memory usage when scraping big number of targets at the cost lists at `http://vmagent-host:8429/service-discovery` page. This reduces memory usage when scraping big number of targets at the cost
@ -1142,7 +1161,7 @@ If you have suggestions for improvements or have found a bug - please open an is
may result in increased memory usage if a big number of scrape targets are dropped during relabeling. may result in increased memory usage if a big number of scrape targets are dropped during relabeling.
* It is recommended increaseing `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` [metric](#monitoring) * It is recommended increaseing `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` [metric](#monitoring)
grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line options in this case. grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line flags in this case.
This can improve data ingestion performance to the configured remote storage systems at the cost of higher memory usage. This can improve data ingestion performance to the configured remote storage systems at the cost of higher memory usage.
* If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, * If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set,
@ -1387,7 +1406,7 @@ See how to request a free trial license [here](https://victoriametrics.com/produ
### Reading metrics from Kafka ### Reading metrics from Kafka
[Enterprise version](https://docs.victoriametrics.com/enterprise/) of `vmagent` can read metrics in various formats from Kafka messages. [Enterprise version](https://docs.victoriametrics.com/enterprise/) of `vmagent` can read metrics in various formats from Kafka messages.
These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line options. The following formats are supported: These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line flags. The following formats are supported:
* `promremotewrite` - [Prometheus remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). * `promremotewrite` - [Prometheus remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write).
Messages in this format can be sent by vmagent - see [these docs](#writing-metrics-to-kafka). Messages in this format can be sent by vmagent - see [these docs](#writing-metrics-to-kafka).

View file

@ -26,6 +26,10 @@ func IsTrivialNetworkError(err error) bool {
func DialMaybeSRV(ctx context.Context, network, addr string) (net.Conn, error) { func DialMaybeSRV(ctx context.Context, network, addr string) (net.Conn, error) {
if strings.HasPrefix(addr, "srv+") { if strings.HasPrefix(addr, "srv+") {
addr = strings.TrimPrefix(addr, "srv+") addr = strings.TrimPrefix(addr, "srv+")
if n := strings.IndexByte(addr, ':'); n >= 0 {
// Drop port, since it should be automatically resolved via DNS SRV lookup below.
addr = addr[:n]
}
_, addrs, err := Resolver.LookupSRV(ctx, "", "", addr) _, addrs, err := Resolver.LookupSRV(ctx, "", "", addr)
if err != nil { if err != nil {
return nil, fmt.Errorf("cannot resolve SRV addr %s: %w", addr, err) return nil, fmt.Errorf("cannot resolve SRV addr %s: %w", addr, err)

View file

@ -10,6 +10,7 @@ import (
"time" "time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
) )
@ -74,7 +75,7 @@ func getMXAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels {
ch := make(chan result, len(sdc.Names)) ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names { for _, name := range sdc.Names {
go func(name string) { go func(name string) {
mx, err := resolver.LookupMX(ctx, name) mx, err := netutil.Resolver.LookupMX(ctx, name)
ch <- result{ ch <- result{
name: name, name: name,
mx: mx, mx: mx,
@ -109,7 +110,7 @@ func getSRVAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels {
ch := make(chan result, len(sdc.Names)) ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names { for _, name := range sdc.Names {
go func(name string) { go func(name string) {
_, as, err := resolver.LookupSRV(ctx, "", "", name) _, as, err := netutil.Resolver.LookupSRV(ctx, "", "", name)
ch <- result{ ch <- result{
name: name, name: name,
as: as, as: as,
@ -148,7 +149,7 @@ func getAAddrLabels(ctx context.Context, sdc *SDConfig, lookupType string) ([]*p
ch := make(chan result, len(sdc.Names)) ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names { for _, name := range sdc.Names {
go func(name string) { go func(name string) {
ips, err := resolver.LookupIPAddr(ctx, name) ips, err := netutil.Resolver.LookupIPAddr(ctx, name)
ch <- result{ ch <- result{
name: name, name: name,
ips: ips, ips: ips,
@ -192,8 +193,3 @@ func appendAddrLabels(ms []*promutils.Labels, name, target string, port int) []*
m.Add("__meta_dns_srv_record_port", strconv.Itoa(port)) m.Add("__meta_dns_srv_record_port", strconv.Itoa(port))
return append(ms, m) return append(ms, m)
} }
var resolver = &net.Resolver{
PreferGo: true,
StrictErrors: true,
}

View file

@ -15,6 +15,7 @@ import (
"github.com/VictoriaMetrics/metrics" "github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy" "github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool" "github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
@ -87,8 +88,6 @@ func (hc *HTTPClient) stop() {
hc.client.CloseIdleConnections() hc.client.CloseIdleConnections()
} }
var defaultDialer = &net.Dialer{}
// NewClient returns new Client for the given args. // NewClient returns new Client for the given args.
func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxyAC *promauth.Config, httpCfg *promauth.HTTPClientConfig) (*Client, error) { func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxyAC *promauth.Config, httpCfg *promauth.HTTPClientConfig) (*Client, error) {
u, err := url.Parse(apiServer) u, err := url.Parse(apiServer)
@ -96,13 +95,13 @@ func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxy
return nil, fmt.Errorf("cannot parse apiServer=%q: %w", apiServer, err) return nil, fmt.Errorf("cannot parse apiServer=%q: %w", apiServer, err)
} }
dialFunc := defaultDialer.DialContext dialFunc := netutil.DialMaybeSRV
if u.Scheme == "unix" { if u.Scheme == "unix" {
// special case for unix socket connection // special case for unix socket connection
dialAddr := u.Path dialAddr := u.Path
apiServer = "http://unix" apiServer = "http://unix"
dialFunc = func(ctx context.Context, _, _ string) (net.Conn, error) { dialFunc = func(ctx context.Context, _, _ string) (net.Conn, error) {
return defaultDialer.DialContext(ctx, "unix", dialAddr) return netutil.Dialer.DialContext(ctx, "unix", dialAddr)
} }
} }

View file

@ -6,18 +6,15 @@ import (
"net" "net"
"strconv" "strconv"
"strings" "strings"
"sync"
"sync/atomic" "sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/metrics" "github.com/VictoriaMetrics/metrics"
) )
func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) { func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) {
d := getStdDialer()
network := netutil.GetTCPNetwork() network := netutil.GetTCPNetwork()
conn, err := d.DialContext(ctx, network, addr) conn, err := netutil.DialMaybeSRV(ctx, network, addr)
dialsTotal.Inc() dialsTotal.Inc()
if err != nil { if err != nil {
dialErrors.Inc() dialErrors.Inc()
@ -33,22 +30,6 @@ func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) {
return sc, nil return sc, nil
} }
func getStdDialer() *net.Dialer {
stdDialerOnce.Do(func() {
stdDialer = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
DualStack: netutil.TCP6Enabled(),
}
})
return stdDialer
}
var (
stdDialer *net.Dialer
stdDialerOnce sync.Once
)
var ( var (
dialsTotal = metrics.NewCounter(`vm_promscrape_dials_total`) dialsTotal = metrics.NewCounter(`vm_promscrape_dials_total`)
dialErrors = metrics.NewCounter(`vm_promscrape_dial_errors_total`) dialErrors = metrics.NewCounter(`vm_promscrape_dial_errors_total`)