vmagent: expose metrics for tracking config state (#3375)

Expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics
for tracking relabel and scrape configuration hot-reloads.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345
Signed-off-by: hagen1778 <roman@victoriametrics.com>

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2022-11-21 23:38:43 +01:00 committed by GitHub
parent 2ddfde78c3
commit 03d88bc066
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 3 deletions

View file

@ -13,6 +13,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@ -139,6 +140,8 @@ func Init() {
logger.Fatalf("cannot load relabel configs: %s", err)
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
if len(*remoteWriteURLs) > 0 {
rwctxsDefault = newRemoteWriteCtxs(nil, *remoteWriteURLs)
@ -154,18 +157,31 @@ func Init() {
case <-stopCh:
return
}
configReloads.Inc()
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
rcs, err := loadRelabelConfigs()
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
continue
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Successfully reloaded relabel configs")
}
}()
}
var (
configReloads = metrics.NewCounter(`vm_relabel_config_reloads_total`)
configReloadErrors = metrics.NewCounter(`vm_relabel_config_reloads_errors_total`)
configSuccess = metrics.NewCounter(`vm_relabel_config_last_reload_successful`)
configTimestamp = metrics.NewCounter(`vm_relabel_config_last_reload_success_timestamp_seconds`)
)
func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
if len(urls) == 0 {
logger.Panicf("BUG: urls must be non-empty")

View file

@ -120,3 +120,15 @@ groups:
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: ConfigurationReloadFailure
expr: |
vm_promscrape_config_last_reload_successful != 1
or
vm_relabel_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
Check vmagent's logs for detailed error message."

View file

@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add copy button to row on Table view. The button copies row in MetricQL format. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2815).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to "stick" a tooltip on the chart by clicking on a data point. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3321) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3376)
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add default alert list for vmalert's metrics. See [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345).
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return an empty result from [limit_offset](https://docs.victoriametrics.com/MetricsQL.html#limit_offset) if the `offset` arg exceeds the number of inner time series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3312).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover GCE zones when `filter` option is set at [gce_sd_configs](https://docs.victoriametrics.com/sd_configs.html#gce_sd_configs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3202).

View file

@ -4,12 +4,13 @@ import (
"bytes"
"flag"
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"io"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
@ -112,6 +113,9 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
configData.Store(&marshaledData)
cfg.mustStart()
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
scs := newScrapeConfigs(pushData, globalStopCh)
scs.add("azure_sd_configs", *azure.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getAzureSDScrapeWork(swsPrev) })
scs.add("consul_sd_configs", *consul.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getConsulSDScrapeWork(swsPrev) })
@ -143,6 +147,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
logger.Infof("SIGHUP received; reloading Prometheus configs from %q", configFile)
cfgNew, dataNew, err := loadConfig(configFile)
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot read %q on SIGHUP: %s; continuing with the previous config", configFile, err)
goto waitForChans
}
@ -158,6 +164,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
case <-tickerCh:
cfgNew, dataNew, err := loadConfig(configFile)
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot read %q: %s; continuing with the previous config", configFile, err)
goto waitForChans
}
@ -180,10 +188,17 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
}
logger.Infof("found changes in %q; applying these changes", configFile)
configReloads.Inc()
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
}
}
var configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
var (
configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
configReloadErrors = metrics.NewCounter(`vm_promscrape_config_reloads_errors_total`)
configSuccess = metrics.NewCounter(`vm_promscrape_config_last_reload_successful`)
configTimestamp = metrics.NewCounter(`vm_promscrape_config_last_reload_success_timestamp_seconds`)
)
type scrapeConfigs struct {
pushData func(at *auth.Token, wr *prompbmarshal.WriteRequest)