mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: cleanup config reload metrics handling (#4790)
* rename `configErr` to `lastConfigErr` to reduce confusion * add tests to verify metrics and msg are set properly * fix mistake when config success metric wasn't restored after an error Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
c0fdd73313
commit
4c91773a15
5 changed files with 65 additions and 40 deletions
|
@ -319,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
|||
validateTplFn = notifier.ValidateTemplates
|
||||
}
|
||||
|
||||
// init reload metrics with positive values to improve alerting conditions
|
||||
setConfigSuccess(fasttime.UnixTimestamp())
|
||||
// init metrics for config state with positive values to improve alerting conditions
|
||||
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||
|
||||
parseFn := config.Parse
|
||||
for {
|
||||
select {
|
||||
|
@ -358,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
|||
}
|
||||
if configsEqual(newGroupsCfg, groupsCfg) {
|
||||
templates.Reload()
|
||||
// set success to 1 since previous reload
|
||||
// could have been unsuccessful
|
||||
// set success to 1 since previous reload could have been unsuccessful
|
||||
// do not update configTimestamp as config version remains old.
|
||||
configSuccess.Set(1)
|
||||
setConfigError(nil)
|
||||
// config didn't change - skip it
|
||||
// reset the last config error since the config change was rolled back
|
||||
setLastConfigErr(nil)
|
||||
// config didn't change - skip iteration
|
||||
continue
|
||||
}
|
||||
if err := m.update(ctx, newGroupsCfg, false); err != nil {
|
||||
|
@ -372,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
|||
}
|
||||
templates.Reload()
|
||||
groupsCfg = newGroupsCfg
|
||||
setConfigSuccess(fasttime.UnixTimestamp())
|
||||
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||
}
|
||||
}
|
||||
|
@ -389,39 +391,36 @@ func configsEqual(a, b []config.Group) bool {
|
|||
return true
|
||||
}
|
||||
|
||||
// setConfigSuccess sets config reload status to 1.
|
||||
func setConfigSuccess(at uint64) {
|
||||
// setConfigSuccessAt updates config related metrics as successful.
|
||||
func setConfigSuccessAt(at uint64) {
|
||||
configSuccess.Set(1)
|
||||
configTimestamp.Set(at)
|
||||
// reset the error if any
|
||||
setConfigErr(nil)
|
||||
// reset the lastConfigErr
|
||||
setLastConfigErr(nil)
|
||||
}
|
||||
|
||||
// setConfigError sets config reload status to 0.
|
||||
// setConfigError updates config related metrics according to the error.
|
||||
func setConfigError(err error) {
|
||||
configReloadErrors.Inc()
|
||||
configSuccess.Set(0)
|
||||
setConfigErr(err)
|
||||
setLastConfigErr(err)
|
||||
}
|
||||
|
||||
var (
|
||||
configErrMu sync.RWMutex
|
||||
// configErr represent the error message from the last
|
||||
// config reload.
|
||||
configErr error
|
||||
lastConfigErrMu sync.RWMutex
|
||||
// lastConfigErr represent the error message from the last config reload.
|
||||
// The message is used in web UI as notification
|
||||
lastConfigErr error
|
||||
)
|
||||
|
||||
func setConfigErr(err error) {
|
||||
configErrMu.Lock()
|
||||
configErr = err
|
||||
configErrMu.Unlock()
|
||||
func setLastConfigErr(err error) {
|
||||
lastConfigErrMu.Lock()
|
||||
lastConfigErr = err
|
||||
lastConfigErrMu.Unlock()
|
||||
}
|
||||
|
||||
func configError() error {
|
||||
configErrMu.RLock()
|
||||
defer configErrMu.RUnlock()
|
||||
if configErr != nil {
|
||||
return configErr
|
||||
}
|
||||
return nil
|
||||
func getLastConfigError() error {
|
||||
lastConfigErrMu.RLock()
|
||||
defer lastConfigErrMu.RUnlock()
|
||||
return lastConfigErr
|
||||
}
|
||||
|
|
|
@ -118,7 +118,29 @@ groups:
|
|||
return len(m.groups)
|
||||
}
|
||||
|
||||
checkCfg := func(err error) {
|
||||
cErr := getLastConfigError()
|
||||
cfgSuc := configSuccess.Get()
|
||||
if err != nil {
|
||||
if cErr == nil {
|
||||
t.Fatalf("expected to have config error %s; got nil instead", cErr)
|
||||
}
|
||||
if cfgSuc != 0 {
|
||||
t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if cErr != nil {
|
||||
t.Fatalf("unexpected config error: %s", cErr)
|
||||
}
|
||||
if cfgSuc != 1 {
|
||||
t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc)
|
||||
}
|
||||
}
|
||||
|
||||
time.Sleep(*configCheckInterval * 2)
|
||||
checkCfg(nil)
|
||||
groupsLen := lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
|
@ -126,6 +148,7 @@ groups:
|
|||
|
||||
writeToFile(t, f.Name(), rules2)
|
||||
time.Sleep(*configCheckInterval * 2)
|
||||
checkCfg(nil)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 2 {
|
||||
fmt.Println(m.groups)
|
||||
|
@ -135,6 +158,7 @@ groups:
|
|||
writeToFile(t, f.Name(), rules1)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*configCheckInterval / 2)
|
||||
checkCfg(nil)
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 {
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
|
@ -143,6 +167,7 @@ groups:
|
|||
writeToFile(t, f.Name(), `corrupted`)
|
||||
procutil.SelfSIGHUP()
|
||||
time.Sleep(*configCheckInterval / 2)
|
||||
checkCfg(fmt.Errorf("config error"))
|
||||
groupsLen = lenLocked(m)
|
||||
if groupsLen != 1 { // should remain unchanged
|
||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
|
||||
{% func Welcome(r *http.Request) %}
|
||||
{%= tpl.Header(r, navItems, "vmalert", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}
|
||||
<p>
|
||||
API:<br>
|
||||
{% for _, p := range apiLinks %}
|
||||
|
@ -40,7 +40,7 @@ btn-primary
|
|||
|
||||
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Groups", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
|
||||
{%code
|
||||
filter := r.URL.Query().Get("filter")
|
||||
rOk := make(map[string]int)
|
||||
|
@ -168,7 +168,7 @@ btn-primary
|
|||
|
||||
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Alerts", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
|
||||
{% if len(groupAlerts) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||
|
@ -255,7 +255,7 @@ btn-primary
|
|||
{% endfunc %}
|
||||
|
||||
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
|
||||
{%= tpl.Header(r, navItems, "Notifiers", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %}
|
||||
{% if len(targets) > 0 %}
|
||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||
|
@ -312,7 +312,7 @@ btn-primary
|
|||
|
||||
{% func Alert(r *http.Request, alert *APIAlert) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range alert.Labels {
|
||||
|
@ -399,7 +399,7 @@ btn-primary
|
|||
|
||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range rule.Labels {
|
||||
|
|
|
@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) {
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:15
|
||||
tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:15
|
||||
qw422016.N().S(`
|
||||
<p>
|
||||
|
@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:43
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Groups", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:43
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:171
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:171
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:258
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:258
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:315
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:315
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
qw422016.N().S(`
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:402
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
||||
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||
//line app/vmalert/web.qtpl:402
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
|
|
@ -40,6 +40,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
|
|||
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
|
||||
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543).
|
||||
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue