vmalert: cleanup config reload metrics handling (#4790)

* rename `configErr` to `lastConfigErr` to reduce confusion
* add tests to verify metrics and msg are set properly
* fix mistake when config success metric wasn't restored after an error

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2023-08-07 21:58:40 +02:00 committed by Aliaksandr Valialkin
parent c0fdd73313
commit 4c91773a15
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
5 changed files with 65 additions and 40 deletions

View file

@ -319,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
validateTplFn = notifier.ValidateTemplates validateTplFn = notifier.ValidateTemplates
} }
// init reload metrics with positive values to improve alerting conditions // init metrics for config state with positive values to improve alerting conditions
setConfigSuccess(fasttime.UnixTimestamp()) setConfigSuccessAt(fasttime.UnixTimestamp())
parseFn := config.Parse parseFn := config.Parse
for { for {
select { select {
@ -358,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
} }
if configsEqual(newGroupsCfg, groupsCfg) { if configsEqual(newGroupsCfg, groupsCfg) {
templates.Reload() templates.Reload()
// set success to 1 since previous reload // set success to 1 since previous reload could have been unsuccessful
// could have been unsuccessful // do not update configTimestamp as config version remains old.
configSuccess.Set(1) configSuccess.Set(1)
setConfigError(nil) // reset the last config error since the config change was rolled back
// config didn't change - skip it setLastConfigErr(nil)
// config didn't change - skip iteration
continue continue
} }
if err := m.update(ctx, newGroupsCfg, false); err != nil { if err := m.update(ctx, newGroupsCfg, false); err != nil {
@ -372,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
} }
templates.Reload() templates.Reload()
groupsCfg = newGroupsCfg groupsCfg = newGroupsCfg
setConfigSuccess(fasttime.UnixTimestamp()) setConfigSuccessAt(fasttime.UnixTimestamp())
logger.Infof("Rules reloaded successfully from %q", *rulePath) logger.Infof("Rules reloaded successfully from %q", *rulePath)
} }
} }
@ -389,39 +391,36 @@ func configsEqual(a, b []config.Group) bool {
return true return true
} }
// setConfigSuccess sets config reload status to 1. // setConfigSuccessAt updates config related metrics as successful.
func setConfigSuccess(at uint64) { func setConfigSuccessAt(at uint64) {
configSuccess.Set(1) configSuccess.Set(1)
configTimestamp.Set(at) configTimestamp.Set(at)
// reset the error if any // reset the lastConfigErr
setConfigErr(nil) setLastConfigErr(nil)
} }
// setConfigError sets config reload status to 0. // setConfigError updates config related metrics according to the error.
func setConfigError(err error) { func setConfigError(err error) {
configReloadErrors.Inc() configReloadErrors.Inc()
configSuccess.Set(0) configSuccess.Set(0)
setConfigErr(err) setLastConfigErr(err)
} }
var ( var (
configErrMu sync.RWMutex lastConfigErrMu sync.RWMutex
// configErr represent the error message from the last // lastConfigErr represent the error message from the last config reload.
// config reload. // The message is used in web UI as notification
configErr error lastConfigErr error
) )
func setConfigErr(err error) { func setLastConfigErr(err error) {
configErrMu.Lock() lastConfigErrMu.Lock()
configErr = err lastConfigErr = err
configErrMu.Unlock() lastConfigErrMu.Unlock()
} }
func configError() error { func getLastConfigError() error {
configErrMu.RLock() lastConfigErrMu.RLock()
defer configErrMu.RUnlock() defer lastConfigErrMu.RUnlock()
if configErr != nil { return lastConfigErr
return configErr
}
return nil
} }

View file

@ -118,7 +118,29 @@ groups:
return len(m.groups) return len(m.groups)
} }
checkCfg := func(err error) {
cErr := getLastConfigError()
cfgSuc := configSuccess.Get()
if err != nil {
if cErr == nil {
t.Fatalf("expected to have config error %s; got nil instead", cErr)
}
if cfgSuc != 0 {
t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc)
}
return
}
if cErr != nil {
t.Fatalf("unexpected config error: %s", cErr)
}
if cfgSuc != 1 {
t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc)
}
}
time.Sleep(*configCheckInterval * 2) time.Sleep(*configCheckInterval * 2)
checkCfg(nil)
groupsLen := lenLocked(m) groupsLen := lenLocked(m)
if groupsLen != 1 { if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
@ -126,6 +148,7 @@ groups:
writeToFile(t, f.Name(), rules2) writeToFile(t, f.Name(), rules2)
time.Sleep(*configCheckInterval * 2) time.Sleep(*configCheckInterval * 2)
checkCfg(nil)
groupsLen = lenLocked(m) groupsLen = lenLocked(m)
if groupsLen != 2 { if groupsLen != 2 {
fmt.Println(m.groups) fmt.Println(m.groups)
@ -135,6 +158,7 @@ groups:
writeToFile(t, f.Name(), rules1) writeToFile(t, f.Name(), rules1)
procutil.SelfSIGHUP() procutil.SelfSIGHUP()
time.Sleep(*configCheckInterval / 2) time.Sleep(*configCheckInterval / 2)
checkCfg(nil)
groupsLen = lenLocked(m) groupsLen = lenLocked(m)
if groupsLen != 1 { if groupsLen != 1 {
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
@ -143,6 +167,7 @@ groups:
writeToFile(t, f.Name(), `corrupted`) writeToFile(t, f.Name(), `corrupted`)
procutil.SelfSIGHUP() procutil.SelfSIGHUP()
time.Sleep(*configCheckInterval / 2) time.Sleep(*configCheckInterval / 2)
checkCfg(fmt.Errorf("config error"))
groupsLen = lenLocked(m) groupsLen = lenLocked(m)
if groupsLen != 1 { // should remain unchanged if groupsLen != 1 { // should remain unchanged
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)

View file

@ -12,7 +12,7 @@
{% func Welcome(r *http.Request) %} {% func Welcome(r *http.Request) %}
{%= tpl.Header(r, navItems, "vmalert", configError()) %} {%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}
<p> <p>
API:<br> API:<br>
{% for _, p := range apiLinks %} {% for _, p := range apiLinks %}
@ -40,7 +40,7 @@ btn-primary
{% func ListGroups(r *http.Request, originGroups []APIGroup) %} {% func ListGroups(r *http.Request, originGroups []APIGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups", configError()) %} {%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
{%code {%code
filter := r.URL.Query().Get("filter") filter := r.URL.Query().Get("filter")
rOk := make(map[string]int) rOk := make(map[string]int)
@ -168,7 +168,7 @@ btn-primary
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %} {% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Alerts", configError()) %} {%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
{% if len(groupAlerts) > 0 %} {% if len(groupAlerts) > 0 %}
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a> <a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a> <a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
@ -255,7 +255,7 @@ btn-primary
{% endfunc %} {% endfunc %}
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %} {% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
{%= tpl.Header(r, navItems, "Notifiers", configError()) %} {%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %}
{% if len(targets) > 0 %} {% if len(targets) > 0 %}
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a> <a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a> <a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
@ -312,7 +312,7 @@ btn-primary
{% func Alert(r *http.Request, alert *APIAlert) %} {% func Alert(r *http.Request, alert *APIAlert) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", configError()) %} {%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code {%code
var labelKeys []string var labelKeys []string
for k := range alert.Labels { for k := range alert.Labels {
@ -399,7 +399,7 @@ btn-primary
{% func RuleDetails(r *http.Request, rule APIRule) %} {% func RuleDetails(r *http.Request, rule APIRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", configError()) %} {%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code {%code
var labelKeys []string var labelKeys []string
for k := range rule.Labels { for k := range rule.Labels {

View file

@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) {
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:15 //line app/vmalert/web.qtpl:15
tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError()) tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError())
//line app/vmalert/web.qtpl:15 //line app/vmalert/web.qtpl:15
qw422016.N().S(` qw422016.N().S(`
<p> <p>
@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:43 //line app/vmalert/web.qtpl:43
tpl.StreamHeader(qw422016, r, navItems, "Groups", configError()) tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError())
//line app/vmalert/web.qtpl:43 //line app/vmalert/web.qtpl:43
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:171 //line app/vmalert/web.qtpl:171
tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError()) tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError())
//line app/vmalert/web.qtpl:171 //line app/vmalert/web.qtpl:171
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:258 //line app/vmalert/web.qtpl:258
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError()) tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError())
//line app/vmalert/web.qtpl:258 //line app/vmalert/web.qtpl:258
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:315 //line app/vmalert/web.qtpl:315
tpl.StreamHeader(qw422016, r, navItems, "", configError()) tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
//line app/vmalert/web.qtpl:315 //line app/vmalert/web.qtpl:315
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(` qw422016.N().S(`
`) `)
//line app/vmalert/web.qtpl:402 //line app/vmalert/web.qtpl:402
tpl.StreamHeader(qw422016, r, navItems, "", configError()) tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
//line app/vmalert/web.qtpl:402 //line app/vmalert/web.qtpl:402
qw422016.N().S(` qw422016.N().S(`
`) `)

View file

@ -40,6 +40,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components. * FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543).
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704). * BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).