diff --git a/app/vmalert/main.go b/app/vmalert/main.go index 5a15eff22..4e81ce838 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -319,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig validateTplFn = notifier.ValidateTemplates } - // init reload metrics with positive values to improve alerting conditions - setConfigSuccess(fasttime.UnixTimestamp()) + // init metrics for config state with positive values to improve alerting conditions + setConfigSuccessAt(fasttime.UnixTimestamp()) + parseFn := config.Parse for { select { @@ -358,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig } if configsEqual(newGroupsCfg, groupsCfg) { templates.Reload() - // set success to 1 since previous reload - // could have been unsuccessful + // set success to 1 since previous reload could have been unsuccessful + // do not update configTimestamp as config version remains old. configSuccess.Set(1) - setConfigError(nil) - // config didn't change - skip it + // reset the last config error since the config change was rolled back + setLastConfigErr(nil) + // config didn't change - skip iteration continue } if err := m.update(ctx, newGroupsCfg, false); err != nil { @@ -372,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig } templates.Reload() groupsCfg = newGroupsCfg - setConfigSuccess(fasttime.UnixTimestamp()) + setConfigSuccessAt(fasttime.UnixTimestamp()) logger.Infof("Rules reloaded successfully from %q", *rulePath) } } @@ -389,39 +391,36 @@ func configsEqual(a, b []config.Group) bool { return true } -// setConfigSuccess sets config reload status to 1. -func setConfigSuccess(at uint64) { +// setConfigSuccessAt updates config related metrics as successful. +func setConfigSuccessAt(at uint64) { configSuccess.Set(1) configTimestamp.Set(at) - // reset the error if any - setConfigErr(nil) + // reset the lastConfigErr + setLastConfigErr(nil) } -// setConfigError sets config reload status to 0. +// setConfigError updates config related metrics according to the error. func setConfigError(err error) { configReloadErrors.Inc() configSuccess.Set(0) - setConfigErr(err) + setLastConfigErr(err) } var ( - configErrMu sync.RWMutex - // configErr represent the error message from the last - // config reload. - configErr error + lastConfigErrMu sync.RWMutex + // lastConfigErr represent the error message from the last config reload. + // The message is used in web UI as notification + lastConfigErr error ) -func setConfigErr(err error) { - configErrMu.Lock() - configErr = err - configErrMu.Unlock() +func setLastConfigErr(err error) { + lastConfigErrMu.Lock() + lastConfigErr = err + lastConfigErrMu.Unlock() } -func configError() error { - configErrMu.RLock() - defer configErrMu.RUnlock() - if configErr != nil { - return configErr - } - return nil +func getLastConfigError() error { + lastConfigErrMu.RLock() + defer lastConfigErrMu.RUnlock() + return lastConfigErr } diff --git a/app/vmalert/main_test.go b/app/vmalert/main_test.go index ea9be4310..2c7e0254d 100644 --- a/app/vmalert/main_test.go +++ b/app/vmalert/main_test.go @@ -118,7 +118,29 @@ groups: return len(m.groups) } + checkCfg := func(err error) { + cErr := getLastConfigError() + cfgSuc := configSuccess.Get() + if err != nil { + if cErr == nil { + t.Fatalf("expected to have config error %s; got nil instead", cErr) + } + if cfgSuc != 0 { + t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc) + } + return + } + + if cErr != nil { + t.Fatalf("unexpected config error: %s", cErr) + } + if cfgSuc != 1 { + t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc) + } + } + time.Sleep(*configCheckInterval * 2) + checkCfg(nil) groupsLen := lenLocked(m) if groupsLen != 1 { t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) @@ -126,6 +148,7 @@ groups: writeToFile(t, f.Name(), rules2) time.Sleep(*configCheckInterval * 2) + checkCfg(nil) groupsLen = lenLocked(m) if groupsLen != 2 { fmt.Println(m.groups) @@ -135,6 +158,7 @@ groups: writeToFile(t, f.Name(), rules1) procutil.SelfSIGHUP() time.Sleep(*configCheckInterval / 2) + checkCfg(nil) groupsLen = lenLocked(m) if groupsLen != 1 { t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) @@ -143,6 +167,7 @@ groups: writeToFile(t, f.Name(), `corrupted`) procutil.SelfSIGHUP() time.Sleep(*configCheckInterval / 2) + checkCfg(fmt.Errorf("config error")) groupsLen = lenLocked(m) if groupsLen != 1 { // should remain unchanged t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) diff --git a/app/vmalert/web.qtpl b/app/vmalert/web.qtpl index 2cbf0a2ce..425ea6ea5 100644 --- a/app/vmalert/web.qtpl +++ b/app/vmalert/web.qtpl @@ -12,7 +12,7 @@ {% func Welcome(r *http.Request) %} - {%= tpl.Header(r, navItems, "vmalert", configError()) %} + {%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}

API:
{% for _, p := range apiLinks %} @@ -40,7 +40,7 @@ btn-primary {% func ListGroups(r *http.Request, originGroups []APIGroup) %} {%code prefix := utils.Prefix(r.URL.Path) %} - {%= tpl.Header(r, navItems, "Groups", configError()) %} + {%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %} {%code filter := r.URL.Query().Get("filter") rOk := make(map[string]int) @@ -168,7 +168,7 @@ btn-primary {% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %} {%code prefix := utils.Prefix(r.URL.Path) %} - {%= tpl.Header(r, navItems, "Alerts", configError()) %} + {%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %} {% if len(groupAlerts) > 0 %} Collapse All Expand All @@ -255,7 +255,7 @@ btn-primary {% endfunc %} {% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %} - {%= tpl.Header(r, navItems, "Notifiers", configError()) %} + {%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %} {% if len(targets) > 0 %} Collapse All Expand All @@ -312,7 +312,7 @@ btn-primary {% func Alert(r *http.Request, alert *APIAlert) %} {%code prefix := utils.Prefix(r.URL.Path) %} - {%= tpl.Header(r, navItems, "", configError()) %} + {%= tpl.Header(r, navItems, "", getLastConfigError()) %} {%code var labelKeys []string for k := range alert.Labels { @@ -399,7 +399,7 @@ btn-primary {% func RuleDetails(r *http.Request, rule APIRule) %} {%code prefix := utils.Prefix(r.URL.Path) %} - {%= tpl.Header(r, navItems, "", configError()) %} + {%= tpl.Header(r, navItems, "", getLastConfigError()) %} {%code var labelKeys []string for k := range rule.Labels { diff --git a/app/vmalert/web.qtpl.go b/app/vmalert/web.qtpl.go index 0fa83c873..bb95cc89b 100644 --- a/app/vmalert/web.qtpl.go +++ b/app/vmalert/web.qtpl.go @@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) { qw422016.N().S(` `) //line app/vmalert/web.qtpl:15 - tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError()) + tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError()) //line app/vmalert/web.qtpl:15 qw422016.N().S(`

@@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [ qw422016.N().S(` `) //line app/vmalert/web.qtpl:43 - tpl.StreamHeader(qw422016, r, navItems, "Groups", configError()) + tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError()) //line app/vmalert/web.qtpl:43 qw422016.N().S(` `) @@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts [] qw422016.N().S(` `) //line app/vmalert/web.qtpl:171 - tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError()) + tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError()) //line app/vmalert/web.qtpl:171 qw422016.N().S(` `) @@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n qw422016.N().S(` `) //line app/vmalert/web.qtpl:258 - tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError()) + tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError()) //line app/vmalert/web.qtpl:258 qw422016.N().S(` `) @@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) { qw422016.N().S(` `) //line app/vmalert/web.qtpl:315 - tpl.StreamHeader(qw422016, r, navItems, "", configError()) + tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError()) //line app/vmalert/web.qtpl:315 qw422016.N().S(` `) @@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule) qw422016.N().S(` `) //line app/vmalert/web.qtpl:402 - tpl.StreamHeader(qw422016, r, navItems, "", configError()) + tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError()) //line app/vmalert/web.qtpl:402 qw422016.N().S(` `) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 8f6d6d47f..f05553ee8 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -40,6 +40,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components * FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components. * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680). +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543). * BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).