mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmagent: expose metrics for tracking config state (#3375)
Expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345 Signed-off-by: hagen1778 <roman@victoriametrics.com> Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
2ddfde78c3
commit
03d88bc066
4 changed files with 47 additions and 3 deletions
|
@ -13,6 +13,7 @@ import (
|
|||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
|
@ -139,6 +140,8 @@ func Init() {
|
|||
logger.Fatalf("cannot load relabel configs: %s", err)
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
|
||||
if len(*remoteWriteURLs) > 0 {
|
||||
rwctxsDefault = newRemoteWriteCtxs(nil, *remoteWriteURLs)
|
||||
|
@ -154,18 +157,31 @@ func Init() {
|
|||
case <-stopCh:
|
||||
return
|
||||
}
|
||||
configReloads.Inc()
|
||||
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
|
||||
rcs, err := loadRelabelConfigs()
|
||||
if err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
|
||||
continue
|
||||
}
|
||||
|
||||
allRelabelConfigs.Store(rcs)
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
logger.Infof("Successfully reloaded relabel configs")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
configReloads = metrics.NewCounter(`vm_relabel_config_reloads_total`)
|
||||
configReloadErrors = metrics.NewCounter(`vm_relabel_config_reloads_errors_total`)
|
||||
configSuccess = metrics.NewCounter(`vm_relabel_config_last_reload_successful`)
|
||||
configTimestamp = metrics.NewCounter(`vm_relabel_config_last_reload_success_timestamp_seconds`)
|
||||
)
|
||||
|
||||
func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
|
||||
if len(urls) == 0 {
|
||||
logger.Panicf("BUG: urls must be non-empty")
|
||||
|
|
|
@ -119,4 +119,16 @@ groups:
|
|||
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
|
||||
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
||||
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
|
||||
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
||||
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
||||
|
||||
- alert: ConfigurationReloadFailure
|
||||
expr: |
|
||||
vm_promscrape_config_last_reload_successful != 1
|
||||
or
|
||||
vm_relabel_config_last_reload_successful != 1
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
|
||||
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
|
||||
Check vmagent's logs for detailed error message."
|
|
@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
|||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add copy button to row on Table view. The button copies row in MetricQL format. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2815).
|
||||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to "stick" a tooltip on the chart by clicking on a data point. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3321) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3376)
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add default alert list for vmalert's metrics. See [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345).
|
||||
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return an empty result from [limit_offset](https://docs.victoriametrics.com/MetricsQL.html#limit_offset) if the `offset` arg exceeds the number of inner time series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3312).
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover GCE zones when `filter` option is set at [gce_sd_configs](https://docs.victoriametrics.com/sd_configs.html#gce_sd_configs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3202).
|
||||
|
|
|
@ -4,12 +4,13 @@ import (
|
|||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
|
||||
"io"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
|
@ -112,6 +113,9 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
|
|||
configData.Store(&marshaledData)
|
||||
cfg.mustStart()
|
||||
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
|
||||
scs := newScrapeConfigs(pushData, globalStopCh)
|
||||
scs.add("azure_sd_configs", *azure.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getAzureSDScrapeWork(swsPrev) })
|
||||
scs.add("consul_sd_configs", *consul.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getConsulSDScrapeWork(swsPrev) })
|
||||
|
@ -143,6 +147,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
|
|||
logger.Infof("SIGHUP received; reloading Prometheus configs from %q", configFile)
|
||||
cfgNew, dataNew, err := loadConfig(configFile)
|
||||
if err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("cannot read %q on SIGHUP: %s; continuing with the previous config", configFile, err)
|
||||
goto waitForChans
|
||||
}
|
||||
|
@ -158,6 +164,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
|
|||
case <-tickerCh:
|
||||
cfgNew, dataNew, err := loadConfig(configFile)
|
||||
if err != nil {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
logger.Errorf("cannot read %q: %s; continuing with the previous config", configFile, err)
|
||||
goto waitForChans
|
||||
}
|
||||
|
@ -180,10 +188,17 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
|
|||
}
|
||||
logger.Infof("found changes in %q; applying these changes", configFile)
|
||||
configReloads.Inc()
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(fasttime.UnixTimestamp())
|
||||
}
|
||||
}
|
||||
|
||||
var configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
|
||||
var (
|
||||
configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
|
||||
configReloadErrors = metrics.NewCounter(`vm_promscrape_config_reloads_errors_total`)
|
||||
configSuccess = metrics.NewCounter(`vm_promscrape_config_last_reload_successful`)
|
||||
configTimestamp = metrics.NewCounter(`vm_promscrape_config_last_reload_success_timestamp_seconds`)
|
||||
)
|
||||
|
||||
type scrapeConfigs struct {
|
||||
pushData func(at *auth.Token, wr *prompbmarshal.WriteRequest)
|
||||
|
|
Loading…
Reference in a new issue