From 03d88bc066627cf4d32be681c85bb85aabdc86f3 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 21 Nov 2022 23:38:43 +0100 Subject: [PATCH] vmagent: expose metrics for tracking config state (#3375) Expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345 Signed-off-by: hagen1778 Signed-off-by: hagen1778 --- app/vmagent/remotewrite/remotewrite.go | 16 ++++++++++++++++ deployment/docker/alerts-vmagent.yml | 14 +++++++++++++- docs/CHANGELOG.md | 1 + lib/promscrape/scraper.go | 19 +++++++++++++++++-- 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index a7326f66ed..c48143926e 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -13,6 +13,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" @@ -139,6 +140,8 @@ func Init() { logger.Fatalf("cannot load relabel configs: %s", err) } allRelabelConfigs.Store(rcs) + configSuccess.Set(1) + configTimestamp.Set(fasttime.UnixTimestamp()) if len(*remoteWriteURLs) > 0 { rwctxsDefault = newRemoteWriteCtxs(nil, *remoteWriteURLs) @@ -154,18 +157,31 @@ func Init() { case <-stopCh: return } + configReloads.Inc() logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig") rcs, err := loadRelabelConfigs() if err != nil { + configReloadErrors.Inc() + configSuccess.Set(0) logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err) continue } + allRelabelConfigs.Store(rcs) + configSuccess.Set(1) + configTimestamp.Set(fasttime.UnixTimestamp()) logger.Infof("Successfully reloaded relabel configs") } }() } +var ( + configReloads = metrics.NewCounter(`vm_relabel_config_reloads_total`) + configReloadErrors = metrics.NewCounter(`vm_relabel_config_reloads_errors_total`) + configSuccess = metrics.NewCounter(`vm_relabel_config_last_reload_successful`) + configTimestamp = metrics.NewCounter(`vm_relabel_config_last_reload_success_timestamp_seconds`) +) + func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx { if len(urls) == 0 { logger.Panicf("BUG: urls must be non-empty") diff --git a/deployment/docker/alerts-vmagent.yml b/deployment/docker/alerts-vmagent.yml index 0fd9b8d56d..560e7be86a 100644 --- a/deployment/docker/alerts-vmagent.yml +++ b/deployment/docker/alerts-vmagent.yml @@ -119,4 +119,16 @@ groups: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} reached 90% of the limit" description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. - Then samples for new time series will be dropped instead of sending them to remote storage systems." \ No newline at end of file + Then samples for new time series will be dropped instead of sending them to remote storage systems." + + - alert: ConfigurationReloadFailure + expr: | + vm_promscrape_config_last_reload_successful != 1 + or + vm_relabel_config_last_reload_successful != 1 + labels: + severity: warning + annotations: + summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}" + description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}. + Check vmagent's logs for detailed error message." \ No newline at end of file diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index a9d8c2ab16..b9163388dd 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add copy button to row on Table view. The button copies row in MetricQL format. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2815). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to "stick" a tooltip on the chart by clicking on a data point. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3321) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3376) * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add default alert list for vmalert's metrics. See [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml). +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345). * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return an empty result from [limit_offset](https://docs.victoriametrics.com/MetricsQL.html#limit_offset) if the `offset` arg exceeds the number of inner time series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3312). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover GCE zones when `filter` option is set at [gce_sd_configs](https://docs.victoriametrics.com/sd_configs.html#gce_sd_configs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3202). diff --git a/lib/promscrape/scraper.go b/lib/promscrape/scraper.go index 9ae5eed6db..470a387bd5 100644 --- a/lib/promscrape/scraper.go +++ b/lib/promscrape/scraper.go @@ -4,12 +4,13 @@ import ( "bytes" "flag" "fmt" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "io" "sync" "sync/atomic" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" @@ -112,6 +113,9 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh configData.Store(&marshaledData) cfg.mustStart() + configSuccess.Set(1) + configTimestamp.Set(fasttime.UnixTimestamp()) + scs := newScrapeConfigs(pushData, globalStopCh) scs.add("azure_sd_configs", *azure.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getAzureSDScrapeWork(swsPrev) }) scs.add("consul_sd_configs", *consul.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getConsulSDScrapeWork(swsPrev) }) @@ -143,6 +147,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh logger.Infof("SIGHUP received; reloading Prometheus configs from %q", configFile) cfgNew, dataNew, err := loadConfig(configFile) if err != nil { + configReloadErrors.Inc() + configSuccess.Set(0) logger.Errorf("cannot read %q on SIGHUP: %s; continuing with the previous config", configFile, err) goto waitForChans } @@ -158,6 +164,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh case <-tickerCh: cfgNew, dataNew, err := loadConfig(configFile) if err != nil { + configReloadErrors.Inc() + configSuccess.Set(0) logger.Errorf("cannot read %q: %s; continuing with the previous config", configFile, err) goto waitForChans } @@ -180,10 +188,17 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh } logger.Infof("found changes in %q; applying these changes", configFile) configReloads.Inc() + configSuccess.Set(1) + configTimestamp.Set(fasttime.UnixTimestamp()) } } -var configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`) +var ( + configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`) + configReloadErrors = metrics.NewCounter(`vm_promscrape_config_reloads_errors_total`) + configSuccess = metrics.NewCounter(`vm_promscrape_config_last_reload_successful`) + configTimestamp = metrics.NewCounter(`vm_promscrape_config_last_reload_success_timestamp_seconds`) +) type scrapeConfigs struct { pushData func(at *auth.Token, wr *prompbmarshal.WriteRequest)