vmalert: cleanup config reload metrics handling (#4790)

* rename `configErr` to `lastConfigErr` to reduce confusion
* add tests to verify metrics and msg are set properly
* fix mistake when config success metric wasn't restored after an error

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2023-08-07 21:58:40 +02:00 committed by Aliaksandr Valialkin
parent c0fdd73313
commit 4c91773a15
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
5 changed files with 65 additions and 40 deletions

View file

@ -319,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
validateTplFn = notifier.ValidateTemplates
}
// init reload metrics with positive values to improve alerting conditions
setConfigSuccess(fasttime.UnixTimestamp())
// init metrics for config state with positive values to improve alerting conditions
setConfigSuccessAt(fasttime.UnixTimestamp())
parseFn := config.Parse
for {
select {
@ -358,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
}
if configsEqual(newGroupsCfg, groupsCfg) {
templates.Reload()
// set success to 1 since previous reload
// could have been unsuccessful
// set success to 1 since previous reload could have been unsuccessful
// do not update configTimestamp as config version remains old.
configSuccess.Set(1)
setConfigError(nil)
// config didn't change - skip it
// reset the last config error since the config change was rolled back
setLastConfigErr(nil)
// config didn't change - skip iteration
continue
}
if err := m.update(ctx, newGroupsCfg, false); err != nil {
@ -372,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
}
templates.Reload()
groupsCfg = newGroupsCfg
setConfigSuccess(fasttime.UnixTimestamp())
setConfigSuccessAt(fasttime.UnixTimestamp())
logger.Infof("Rules reloaded successfully from %q", *rulePath)
}
}
@ -389,39 +391,36 @@ func configsEqual(a, b []config.Group) bool {
return true
}
// setConfigSuccess sets config reload status to 1.
func setConfigSuccess(at uint64) {
// setConfigSuccessAt updates config related metrics as successful.
func setConfigSuccessAt(at uint64) {
configSuccess.Set(1)
configTimestamp.Set(at)
// reset the error if any
setConfigErr(nil)
// reset the lastConfigErr
setLastConfigErr(nil)
}
// setConfigError sets config reload status to 0.
// setConfigError updates config related metrics according to the error.
func setConfigError(err error) {
configReloadErrors.Inc()
configSuccess.Set(0)
setConfigErr(err)
setLastConfigErr(err)
}
var (
configErrMu sync.RWMutex
// configErr represent the error message from the last
// config reload.
configErr error
lastConfigErrMu sync.RWMutex
// lastConfigErr represent the error message from the last config reload.
// The message is used in web UI as notification
lastConfigErr error
)
func setConfigErr(err error) {
configErrMu.Lock()
configErr = err
configErrMu.Unlock()
func setLastConfigErr(err error) {
lastConfigErrMu.Lock()
lastConfigErr = err
lastConfigErrMu.Unlock()
}
func configError() error {
configErrMu.RLock()
defer configErrMu.RUnlock()
if configErr != nil {
return configErr
}
return nil
func getLastConfigError() error {
lastConfigErrMu.RLock()
defer lastConfigErrMu.RUnlock()
return lastConfigErr
}

View file

@ -118,7 +118,29 @@ groups:
return len(m.groups)
}
checkCfg := func(err error) {
cErr := getLastConfigError()
cfgSuc := configSuccess.Get()
if err != nil {
if cErr == nil {
t.Fatalf("expected to have config error %s; got nil instead", cErr)
}
if cfgSuc != 0 {
t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc)
}
return
}
if cErr != nil {
t.Fatalf("unexpected config error: %s", cErr)
}
if cfgSuc != 1 {
t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc)
}
}
time.Sleep(*configCheckInterval * 2)
checkCfg(nil)
groupsLen := lenLocked(m)
if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
@ -126,6 +148,7 @@ groups:
writeToFile(t, f.Name(), rules2)
time.Sleep(*configCheckInterval * 2)
checkCfg(nil)
groupsLen = lenLocked(m)
if groupsLen != 2 {
fmt.Println(m.groups)
@ -135,6 +158,7 @@ groups:
writeToFile(t, f.Name(), rules1)
procutil.SelfSIGHUP()
time.Sleep(*configCheckInterval / 2)
checkCfg(nil)
groupsLen = lenLocked(m)
if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
@ -143,6 +167,7 @@ groups:
writeToFile(t, f.Name(), `corrupted`)
procutil.SelfSIGHUP()
time.Sleep(*configCheckInterval / 2)
checkCfg(fmt.Errorf("config error"))
groupsLen = lenLocked(m)
if groupsLen != 1 { // should remain unchanged
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)

View file

@ -12,7 +12,7 @@
{% func Welcome(r *http.Request) %}
{%= tpl.Header(r, navItems, "vmalert", configError()) %}
{%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}
<p>
API:<br>
{% for _, p := range apiLinks %}
@ -40,7 +40,7 @@ btn-primary
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups", configError()) %}
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
{%code
filter := r.URL.Query().Get("filter")
rOk := make(map[string]int)
@ -168,7 +168,7 @@ btn-primary
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Alerts", configError()) %}
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
{% if len(groupAlerts) > 0 %}
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
@ -255,7 +255,7 @@ btn-primary
{% endfunc %}
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
{%= tpl.Header(r, navItems, "Notifiers", configError()) %}
{%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %}
{% if len(targets) > 0 %}
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
@ -312,7 +312,7 @@ btn-primary
{% func Alert(r *http.Request, alert *APIAlert) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", configError()) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
var labelKeys []string
for k := range alert.Labels {
@ -399,7 +399,7 @@ btn-primary
{% func RuleDetails(r *http.Request, rule APIRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", configError()) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
var labelKeys []string
for k := range rule.Labels {

View file

@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) {
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:15
tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError())
tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError())
//line app/vmalert/web.qtpl:15
qw422016.N().S(`
<p>
@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:43
tpl.StreamHeader(qw422016, r, navItems, "Groups", configError())
tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError())
//line app/vmalert/web.qtpl:43
qw422016.N().S(`
`)
@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:171
tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError())
tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError())
//line app/vmalert/web.qtpl:171
qw422016.N().S(`
`)
@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:258
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError())
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError())
//line app/vmalert/web.qtpl:258
qw422016.N().S(`
`)
@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:315
tpl.StreamHeader(qw422016, r, navItems, "", configError())
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
//line app/vmalert/web.qtpl:315
qw422016.N().S(`
`)
@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`
`)
//line app/vmalert/web.qtpl:402
tpl.StreamHeader(qw422016, r, navItems, "", configError())
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
//line app/vmalert/web.qtpl:402
qw422016.N().S(`
`)

View file

@ -40,6 +40,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543).
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).