diff --git a/README.md b/README.md index 37c14ea02..820b6d1c2 100644 --- a/README.md +++ b/README.md @@ -1918,8 +1918,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages - -snapshotsMaxAge duration + -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted + The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) -sortLabels Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit -storage.cacheSizeIndexDBDataBlocks size diff --git a/app/vmagent/README.md b/app/vmagent/README.md index 7bc2bbb6f..d6ca120b0 100644 --- a/app/vmagent/README.md +++ b/app/vmagent/README.md @@ -926,6 +926,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . -remoteWrite.aws.secretKey array Optional AWS SecretKey to use for -remoteWrite.url if -remoteWrite.aws.useSigv4 is set. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url Supports an array of values separated by comma or specified via multiple flags. + -remoteWrite.aws.serice array + Optional AWS Service to use for -remoteWrite.url if -remoteWrite.aws.useSigv4 is set. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url. Defaults to "aps". 
+ Supports an array of values separated by comma or specified via multiple flags. -remoteWrite.aws.useSigv4 array Enables SigV4 request signing for -remoteWrite.url. It is expected that other -remoteWrite.aws.* command-line flags are set if sigv4 request signing is enabled. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url Supports array of values separated by comma or specified via multiple flags. diff --git a/app/vmalert/README.md b/app/vmalert/README.md index d100edb60..0a54570ac 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -605,7 +605,7 @@ The shortlist of configuration flags is the following: -datasource.tlsServerName string Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used -datasource.url string - VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428 + VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend -defaultTenant.graphite string Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy -defaultTenant.prometheus string @@ -748,7 +748,7 @@ The shortlist of configuration flags is the following: -remoteRead.bearerTokenFile string Optional path to bearer token file to use for -remoteRead.url. -remoteRead.disablePathAppend - Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url. + Whether to disable automatic appending of '/api/v1/query' path to the configured -datasource.url and -remoteRead.url -remoteRead.ignoreRestoreErrors Whether to ignore errors from remote storage when restoring alerts state on startup. (default true) -remoteRead.lookback duration @@ -817,6 +817,8 @@ The shortlist of configuration flags is the following: Optional TLS server name to use for connections to -remoteWrite.url. 
By default the server name from -remoteWrite.url is used -remoteWrite.url string Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend + -replay.disableProgressBar + Whether to disable rendering progress bars during the replay. Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode. -replay.maxDatapointsPerQuery int Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000) -replay.ruleRetryAttempts int @@ -836,17 +838,20 @@ The shortlist of configuration flags is the following: absolute path to all .yaml files in root. Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars. Supports an array of values separated by comma or specified via multiple flags. - -rule.templates - Path or glob pattern to location with go template definitions for rules annotations templating. Flag can be specified multiple times. - Examples: - -rule.templates="/path/to/file". Path to a single file with go templates - -rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder, absolute path to all .tpl files in root. -rule.configCheckInterval duration Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead -rule.maxResolveDuration duration Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group. 
-rule.resendDelay duration Minimum amount of time to wait before resending an alert to notifier + -rule.templates array + Path or glob pattern to location with go template definitions + for rules annotations templating. Flag can be specified multiple times. + Examples: + -rule.templates="/path/to/file". Path to a single file with go templates + -rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder, + absolute path to all .tpl files in root. + Supports an array of values separated by comma or specified via multiple flags. -rule.validateExpressions Whether to validate rules expressions via MetricsQL engine (default true) -rule.validateTemplates diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 3009a7650..c428537ba 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -49,14 +49,21 @@ type groupMetrics struct { iterationTotal *utils.Counter iterationDuration *utils.Summary iterationMissed *utils.Counter + iterationInterval *utils.Gauge } -func newGroupMetrics(name, file string) *groupMetrics { +func newGroupMetrics(g *Group) *groupMetrics { m := &groupMetrics{} - labels := fmt.Sprintf(`group=%q, file=%q`, name, file) + labels := fmt.Sprintf(`group=%q, file=%q`, g.Name, g.File) m.iterationTotal = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_iteration_total{%s}`, labels)) m.iterationDuration = utils.GetOrCreateSummary(fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels)) m.iterationMissed = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_iteration_missed_total{%s}`, labels)) + m.iterationInterval = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_iteration_interval_seconds{%s}`, labels), func() float64 { + g.mu.RLock() + i := g.Interval.Seconds() + g.mu.RUnlock() + return i + }) return m } @@ -92,13 +99,13 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti finishedCh: make(chan struct{}), updateCh: make(chan *Group), } - g.metrics = newGroupMetrics(g.Name, g.File) 
if g.Interval == 0 { g.Interval = defaultInterval } if g.Concurrency < 1 { g.Concurrency = 1 } + g.metrics = newGroupMetrics(g) rules := make([]Rule, len(cfg.Rules)) for i, r := range cfg.Rules { var extraLabels map[string]string @@ -222,6 +229,8 @@ func (g *Group) close() { g.metrics.iterationDuration.Unregister() g.metrics.iterationTotal.Unregister() + g.metrics.iterationMissed.Unregister() + g.metrics.iterationInterval.Unregister() for _, rule := range g.Rules { rule.Close() } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index b8bcf6243..a6034622c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -15,18 +15,24 @@ The following tip changes can be tested by building VictoriaMetrics components f ## tip +## [v1.77.2](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.77.2) + +Released at 21-05-2022 + * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support [reusable templates](https://prometheus.io/docs/prometheus/latest/configuration/template_examples/#defining-reusable-templates) for rules annotations. The path to the template files can be specified via `-rule.templates` flag. See more about this feature [here](https://docs.victoriametrics.com/vmalert.html#reusable-templates). Thanks to @AndrewChubatiuk for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2532). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2510). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): expose `vmalert_iteration_interval_seconds` metric at `http://vmalert:8880/metrics`. This metric shows the configured per-group evaluation interval. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2618). 
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `influx-prometheus-mode` command-line flag, which allows to restore the original time series written from Prometheus into InfluxDB during data migration from InfluxDB to VictoriaMetrics. See [this feature request](https://github.com/VictoriaMetrics/vmctl/issues/8). Thanks to @mback2k for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2545). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to specify AWS service name when issuing requests to AWS api. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2605). Thanks to @transacid for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2604). +* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fix a bug, which could lead to incomplete discovery of scrape targets in Kubernetes (aka `kubernetes_sd_config`). The bug has been introduced in [v1.77.0](https://docs.victoriametrics.com/CHANGELOG.html#v1770). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): support `scalar` result type in response. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2607). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): support strings in `humanize.*` template function in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2569). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): proxy `/rules` requests to vmalert from Grafana's alerting UI. This removes errors in Grafana's UI for Grafana versions older than `8.5.*`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2583) +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not add `/api/v1/query` suffix to `-datasource.url` if `-remoteRead.disablePathAppend` command-line flag is set. 
Previously this flag was applied only to `-remoteRead.url`, which could confuse users. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): prevent from possible resource leak on config update, which could lead to the slowdown of `vmalert` over time. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2577). * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): do not return values from [label_value()](https://docs.victoriametrics.com/MetricsQL.html#label_value) function if the original time series has no values at the selected timestamps. * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): limit the number of concurrently established connections from vmselect to vmstorage. This should prevent from potentially high spikes in the number of established connections after temporary slowdown in connection handshake procedure between vmselect and vmstorage because of spikes in workload. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2552). * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): fix build for Solaris / SmartOS. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1322#issuecomment-1120276146). -* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not add `/api/v1/query` suffix to `-datasource.url` if `-remoteRead.disablePathAppend` command-line flag is set. Previously this flag was applied only to `-remoteRead.url`, which could confuse users. -* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): prevent from possible resource leak on config update, which could lead to the slowdown of `vmalert` over time. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2577). 
## [v1.77.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.77.1) diff --git a/docs/README.md b/docs/README.md index 37c14ea02..820b6d1c2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1918,8 +1918,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages - -snapshotsMaxAge duration + -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted + The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) -sortLabels Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit -storage.cacheSizeIndexDBDataBlocks size diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index d6ab05e44..46943d458 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -1922,8 +1922,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages - -snapshotsMaxAge duration + -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. 
Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted + The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) -sortLabels Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit -storage.cacheSizeIndexDBDataBlocks size diff --git a/docs/guides/README.md b/docs/guides/README.md index 80fa0b5b9..8dbbaad7b 100644 --- a/docs/guides/README.md +++ b/docs/guides/README.md @@ -9,3 +9,5 @@ sort: 22 3. [HA monitoring setup in K8s via VM Cluster](https://docs.victoriametrics.com/guides/k8s-ha-monitoring-via-vm-cluster.html) 4. [Getting started with VM Operator](https://docs.victoriametrics.com/guides/getting-started-with-vm-operator.html) 5. [Multi Retention Setup within VictoriaMetrics Cluster](https://docs.victoriametrics.com/guides/guide-vmcluster-multiple-retention-setup.html) +6. [Migrate from InfluxDB to VictoriaMetrics](https://docs.victoriametrics.com/guides/migrate-from-influx.html) +7. [Multi-regional setup with VictoriaMetrics: Dedicated regions for monitoring](https://docs.victoriametrics.com/guides/multi-regional-setup-dedicated-regions.html) diff --git a/docs/vmagent.md b/docs/vmagent.md index 395d5a533..257ee8bed 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -930,6 +930,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . -remoteWrite.aws.secretKey array Optional AWS SecretKey to use for -remoteWrite.url if -remoteWrite.aws.useSigv4 is set. 
If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url Supports an array of values separated by comma or specified via multiple flags. + -remoteWrite.aws.serice array + Optional AWS Service to use for -remoteWrite.url if -remoteWrite.aws.useSigv4 is set. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url. Defaults to "aps". + Supports an array of values separated by comma or specified via multiple flags. -remoteWrite.aws.useSigv4 array Enables SigV4 request signing for -remoteWrite.url. It is expected that other -remoteWrite.aws.* command-line flags are set if sigv4 request signing is enabled. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url Supports array of values separated by comma or specified via multiple flags. diff --git a/docs/vmalert.md b/docs/vmalert.md index 885d5877c..79ce0da20 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -609,7 +609,7 @@ The shortlist of configuration flags is the following: -datasource.tlsServerName string Optional TLS server name to use for connections to -datasource.url. By default, the server name from -datasource.url is used -datasource.url string - VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428 + VictoriaMetrics or vmselect url. Required parameter. E.g. http://127.0.0.1:8428 . See also -remoteRead.disablePathAppend -defaultTenant.graphite string Default tenant for Graphite alerting groups. See https://docs.victoriametrics.com/vmalert.html#multitenancy -defaultTenant.prometheus string @@ -752,7 +752,7 @@ The shortlist of configuration flags is the following: -remoteRead.bearerTokenFile string Optional path to bearer token file to use for -remoteRead.url. -remoteRead.disablePathAppend - Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url. 
+ Whether to disable automatic appending of '/api/v1/query' path to the configured -datasource.url and -remoteRead.url -remoteRead.ignoreRestoreErrors Whether to ignore errors from remote storage when restoring alerts state on startup. (default true) -remoteRead.lookback duration @@ -821,6 +821,8 @@ The shortlist of configuration flags is the following: Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used -remoteWrite.url string Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend + -replay.disableProgressBar + Whether to disable rendering progress bars during the replay. Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode. -replay.maxDatapointsPerQuery int Max number of data points expected in one request. The higher the value, the less requests will be made during replay. (default 1000) -replay.ruleRetryAttempts int @@ -840,17 +842,20 @@ The shortlist of configuration flags is the following: absolute path to all .yaml files in root. Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars. Supports an array of values separated by comma or specified via multiple flags. - -rule.templates - Path or glob pattern to location with go template definitions for rules annotations templating. Flag can be specified multiple times. - Examples: - -rule.templates="/path/to/file". Path to a single file with go templates - -rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder, absolute path to all .tpl files in root. 
-rule.configCheckInterval duration Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead -rule.maxResolveDuration duration Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group. -rule.resendDelay duration Minimum amount of time to wait before resending an alert to notifier + -rule.templates array + Path or glob pattern to location with go template definitions + for rules annotations templating. Flag can be specified multiple times. + Examples: + -rule.templates="/path/to/file". Path to a single file with go templates + -rule.templates="dir/*.tpl" -rule.templates="/*.tpl". Relative path to all .tpl files in "dir" folder, + absolute path to all .tpl files in root. + Supports an array of values separated by comma or specified via multiple flags. -rule.validateExpressions Whether to validate rules expressions via MetricsQL engine (default true) -rule.validateTemplates diff --git a/lib/promscrape/discovery/kubernetes/api_watcher.go b/lib/promscrape/discovery/kubernetes/api_watcher.go index f1dbc5002..6f76d5d0e 100644 --- a/lib/promscrape/discovery/kubernetes/api_watcher.go +++ b/lib/promscrape/discovery/kubernetes/api_watcher.go @@ -55,7 +55,7 @@ type apiWatcher struct { gw *groupWatcher - // swos contains per-urlWatcher maps of ScrapeWork objects for the given apiWatcher + // swosByURLWatcher contains per-urlWatcher maps of ScrapeWork objects for the given apiWatcher swosByURLWatcher map[*urlWatcher]map[string][]interface{} swosByURLWatcherLock sync.Mutex @@ -91,23 +91,51 @@ func (aw *apiWatcher) mustStart() { aw.gw.startWatchersForRole(aw.role, aw) } +func (aw *apiWatcher) updateSwosCount(multiplier int, swosByKey map[string][]interface{}) { + n := 0 + for _, swos := range swosByKey { + n += len(swos) + } + n *= multiplier + aw.swosCount.Add(n) +} 
+ func (aw *apiWatcher) mustStop() { aw.gw.unsubscribeAPIWatcher(aw) aw.swosByURLWatcherLock.Lock() for _, swosByKey := range aw.swosByURLWatcher { - aw.swosCount.Add(-len(swosByKey)) + aw.updateSwosCount(-1, swosByKey) } aw.swosByURLWatcher = make(map[*urlWatcher]map[string][]interface{}) aw.swosByURLWatcherLock.Unlock() } -func (aw *apiWatcher) reloadScrapeWorks(uw *urlWatcher, swosByKey map[string][]interface{}) { +func (aw *apiWatcher) replaceScrapeWorks(uw *urlWatcher, swosByKey map[string][]interface{}) { aw.swosByURLWatcherLock.Lock() - aw.swosCount.Add(len(swosByKey) - len(aw.swosByURLWatcher[uw])) + aw.updateSwosCount(-1, aw.swosByURLWatcher[uw]) + aw.updateSwosCount(1, swosByKey) aw.swosByURLWatcher[uw] = swosByKey aw.swosByURLWatcherLock.Unlock() } +func (aw *apiWatcher) updateScrapeWorks(uw *urlWatcher, swosByKey map[string][]interface{}) { + aw.swosByURLWatcherLock.Lock() + dst := aw.swosByURLWatcher[uw] + if dst == nil { + dst = make(map[string][]interface{}) + aw.swosByURLWatcher[uw] = dst + } + for key, swos := range swosByKey { + aw.swosCount.Add(len(swos) - len(dst[key])) + if len(swos) == 0 { + delete(dst, key) + } else { + dst[key] = swos + } + } + aw.swosByURLWatcherLock.Unlock() +} + func (aw *apiWatcher) setScrapeWorks(uw *urlWatcher, key string, labels []map[string]string) { swos := getScrapeWorkObjectsForLabels(aw.swcFunc, labels) aw.swosByURLWatcherLock.Lock() @@ -117,10 +145,10 @@ func (aw *apiWatcher) setScrapeWorks(uw *urlWatcher, key string, labels []map[st aw.swosByURLWatcher[uw] = swosByKey } aw.swosCount.Add(len(swos) - len(swosByKey[key])) - if len(swos) > 0 { - swosByKey[key] = swos - } else { + if len(swos) == 0 { delete(swosByKey, key) + } else { + swosByKey[key] = swos } aw.swosByURLWatcherLock.Unlock() } @@ -250,6 +278,45 @@ var ( }) ) +type swosByKeyWithLock struct { + mu sync.Mutex + swosByKey map[string][]interface{} +} + +func (gw *groupWatcher) getScrapeWorkObjectsByAPIWatcherLocked(objectsByKey map[string]object, awsMap 
map[*apiWatcher]struct{}) map[*apiWatcher]*swosByKeyWithLock { + if len(awsMap) == 0 { + return nil + } + swosByAPIWatcher := make(map[*apiWatcher]*swosByKeyWithLock, len(awsMap)) + for aw := range awsMap { + swosByAPIWatcher[aw] = &swosByKeyWithLock{ + swosByKey: make(map[string][]interface{}), + } + } + + // Generate ScrapeWork objects in parallel on available CPU cores. + // This should reduce the time needed for their generation on systems with many CPU cores. + var wg sync.WaitGroup + limiterCh := make(chan struct{}, cgroup.AvailableCPUs()) + for key, o := range objectsByKey { + labels := o.getTargetLabels(gw) + wg.Add(1) + limiterCh <- struct{}{} + go func(key string, labels []map[string]string) { + for aw, e := range swosByAPIWatcher { + swos := getScrapeWorkObjectsForLabels(aw.swcFunc, labels) + e.mu.Lock() + e.swosByKey[key] = swos + e.mu.Unlock() + } + wg.Done() + <-limiterCh + }(key, labels) + } + wg.Wait() + return swosByAPIWatcher +} + func (gw *groupWatcher) getObjectByRoleLocked(role, namespace, name string) object { if role == "node" { // Node objects have no namespace @@ -310,9 +377,9 @@ func (gw *groupWatcher) startWatchersForRole(role string, aw *apiWatcher) { time.Sleep(sleepTime) startTime := time.Now() gw.mu.Lock() - if uw.needUpdateScrapeWorks { - uw.needUpdateScrapeWorks = false - uw.updateScrapeWorksLocked(uw.objectsByKey, uw.aws) + if uw.needRecreateScrapeWorks { + uw.needRecreateScrapeWorks = false + uw.recreateScrapeWorksLocked(uw.objectsByKey, uw.aws) sleepTime = time.Since(startTime) if sleepTime < minSleepTime { sleepTime = minSleepTime @@ -401,7 +468,7 @@ type urlWatcher struct { // objectsByKey contains the latest state for objects obtained from apiURL objectsByKey map[string]object - needUpdateScrapeWorks bool + needRecreateScrapeWorks bool resourceVersion string @@ -450,7 +517,7 @@ func (uw *urlWatcher) registerPendingAPIWatchersLocked() { if len(uw.awsPending) == 0 { return } - uw.updateScrapeWorksLocked(uw.objectsByKey, 
uw.awsPending) + uw.recreateScrapeWorksLocked(uw.objectsByKey, uw.awsPending) for aw := range uw.awsPending { uw.aws[aw] = struct{}{} } @@ -460,44 +527,23 @@ func (uw *urlWatcher) registerPendingAPIWatchersLocked() { metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_subscribers{role=%q,status="pending"}`, uw.role)).Add(-awsPendingLen) } -func (uw *urlWatcher) updateScrapeWorksLocked(objectsByKey map[string]object, awsMap map[*apiWatcher]struct{}) { - if len(objectsByKey) == 0 || len(awsMap) == 0 { - return - } - aws := make([]*apiWatcher, 0, len(awsMap)) - for aw := range awsMap { - aws = append(aws, aw) - } - swosByKey := make([]map[string][]interface{}, len(aws)) - for i := range aws { - swosByKey[i] = make(map[string][]interface{}) - } - - // Generate ScrapeWork objects in parallel on available CPU cores. - // This should reduce the time needed for their generation on systems with many CPU cores. - var swosByKeyLock sync.Mutex - var wg sync.WaitGroup - limiterCh := make(chan struct{}, cgroup.AvailableCPUs()) - for key, o := range objectsByKey { - labels := o.getTargetLabels(uw.gw) - wg.Add(1) - limiterCh <- struct{}{} - go func(key string, labels []map[string]string) { - for i, aw := range aws { - swos := getScrapeWorkObjectsForLabels(aw.swcFunc, labels) - if len(swos) > 0 { - swosByKeyLock.Lock() - swosByKey[i][key] = swos - swosByKeyLock.Unlock() - } +func (uw *urlWatcher) recreateScrapeWorksLocked(objectsByKey map[string]object, awsMap map[*apiWatcher]struct{}) { + es := uw.gw.getScrapeWorkObjectsByAPIWatcherLocked(objectsByKey, awsMap) + for aw, e := range es { + swosByKey := e.swosByKey + for key, swos := range swosByKey { + if len(swos) == 0 { + delete(swosByKey, key) } - wg.Done() - <-limiterCh - }(key, labels) + } + aw.replaceScrapeWorks(uw, swosByKey) } - wg.Wait() - for i, aw := range aws { - aw.reloadScrapeWorks(uw, swosByKey[i]) +} + +func (uw *urlWatcher) updateScrapeWorksLocked(objectsByKey map[string]object, awsMap 
map[*apiWatcher]struct{}) { + es := uw.gw.getScrapeWorkObjectsByAPIWatcherLocked(objectsByKey, awsMap) + for aw, e := range es { + aw.updateScrapeWorks(uw, e.swosByKey) } } @@ -574,7 +620,7 @@ func (uw *urlWatcher) reloadObjects() string { uw.removeScrapeWorksLocked(objectsRemoved) uw.updateScrapeWorksLocked(objectsUpdated, uw.aws) uw.updateScrapeWorksLocked(objectsAdded, uw.aws) - uw.needUpdateScrapeWorks = false + uw.needRecreateScrapeWorks = false if len(objectsRemoved) > 0 || len(objectsUpdated) > 0 || len(objectsAdded) > 0 { uw.maybeUpdateDependedScrapeWorksLocked() } @@ -756,12 +802,12 @@ func (uw *urlWatcher) maybeUpdateDependedScrapeWorksLocked() { } if (role == "pod" || role == "service") && (uwx.role == "endpoints" || uwx.role == "endpointslice") { // endpoints and endpointslice objects depend on pods and service objects - uwx.needUpdateScrapeWorks = true + uwx.needRecreateScrapeWorks = true continue } if attachNodeMetadata && role == "node" && uwx.role == "pod" { // pod objects depend on node objects if attachNodeMetadata is set - uwx.needUpdateScrapeWorks = true + uwx.needRecreateScrapeWorks = true continue } }