mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: cleanup config reload metrics handling (#4790)
* rename `configErr` to `lastConfigErr` to reduce confusion * add tests to verify metrics and msg are set properly * fix mistake when config success metric wasn't restored after an error Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
c0fdd73313
commit
4c91773a15
5 changed files with 65 additions and 40 deletions
|
@ -319,8 +319,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||||
validateTplFn = notifier.ValidateTemplates
|
validateTplFn = notifier.ValidateTemplates
|
||||||
}
|
}
|
||||||
|
|
||||||
// init reload metrics with positive values to improve alerting conditions
|
// init metrics for config state with positive values to improve alerting conditions
|
||||||
setConfigSuccess(fasttime.UnixTimestamp())
|
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||||
|
|
||||||
parseFn := config.Parse
|
parseFn := config.Parse
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
|
@ -358,11 +359,12 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||||
}
|
}
|
||||||
if configsEqual(newGroupsCfg, groupsCfg) {
|
if configsEqual(newGroupsCfg, groupsCfg) {
|
||||||
templates.Reload()
|
templates.Reload()
|
||||||
// set success to 1 since previous reload
|
// set success to 1 since previous reload could have been unsuccessful
|
||||||
// could have been unsuccessful
|
// do not update configTimestamp as config version remains old.
|
||||||
configSuccess.Set(1)
|
configSuccess.Set(1)
|
||||||
setConfigError(nil)
|
// reset the last config error since the config change was rolled back
|
||||||
// config didn't change - skip it
|
setLastConfigErr(nil)
|
||||||
|
// config didn't change - skip iteration
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := m.update(ctx, newGroupsCfg, false); err != nil {
|
if err := m.update(ctx, newGroupsCfg, false); err != nil {
|
||||||
|
@ -372,7 +374,7 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sig
|
||||||
}
|
}
|
||||||
templates.Reload()
|
templates.Reload()
|
||||||
groupsCfg = newGroupsCfg
|
groupsCfg = newGroupsCfg
|
||||||
setConfigSuccess(fasttime.UnixTimestamp())
|
setConfigSuccessAt(fasttime.UnixTimestamp())
|
||||||
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
logger.Infof("Rules reloaded successfully from %q", *rulePath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -389,39 +391,36 @@ func configsEqual(a, b []config.Group) bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// setConfigSuccess sets config reload status to 1.
|
// setConfigSuccessAt updates config related metrics as successful.
|
||||||
func setConfigSuccess(at uint64) {
|
func setConfigSuccessAt(at uint64) {
|
||||||
configSuccess.Set(1)
|
configSuccess.Set(1)
|
||||||
configTimestamp.Set(at)
|
configTimestamp.Set(at)
|
||||||
// reset the error if any
|
// reset the lastConfigErr
|
||||||
setConfigErr(nil)
|
setLastConfigErr(nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// setConfigError sets config reload status to 0.
|
// setConfigError updates config related metrics according to the error.
|
||||||
func setConfigError(err error) {
|
func setConfigError(err error) {
|
||||||
configReloadErrors.Inc()
|
configReloadErrors.Inc()
|
||||||
configSuccess.Set(0)
|
configSuccess.Set(0)
|
||||||
setConfigErr(err)
|
setLastConfigErr(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
configErrMu sync.RWMutex
|
lastConfigErrMu sync.RWMutex
|
||||||
// configErr represent the error message from the last
|
// lastConfigErr represent the error message from the last config reload.
|
||||||
// config reload.
|
// The message is used in web UI as notification
|
||||||
configErr error
|
lastConfigErr error
|
||||||
)
|
)
|
||||||
|
|
||||||
func setConfigErr(err error) {
|
func setLastConfigErr(err error) {
|
||||||
configErrMu.Lock()
|
lastConfigErrMu.Lock()
|
||||||
configErr = err
|
lastConfigErr = err
|
||||||
configErrMu.Unlock()
|
lastConfigErrMu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func configError() error {
|
func getLastConfigError() error {
|
||||||
configErrMu.RLock()
|
lastConfigErrMu.RLock()
|
||||||
defer configErrMu.RUnlock()
|
defer lastConfigErrMu.RUnlock()
|
||||||
if configErr != nil {
|
return lastConfigErr
|
||||||
return configErr
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -118,7 +118,29 @@ groups:
|
||||||
return len(m.groups)
|
return len(m.groups)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
checkCfg := func(err error) {
|
||||||
|
cErr := getLastConfigError()
|
||||||
|
cfgSuc := configSuccess.Get()
|
||||||
|
if err != nil {
|
||||||
|
if cErr == nil {
|
||||||
|
t.Fatalf("expected to have config error %s; got nil instead", cErr)
|
||||||
|
}
|
||||||
|
if cfgSuc != 0 {
|
||||||
|
t.Fatalf("expected to have metric configSuccess to be set to 0; got %d instead", cfgSuc)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if cErr != nil {
|
||||||
|
t.Fatalf("unexpected config error: %s", cErr)
|
||||||
|
}
|
||||||
|
if cfgSuc != 1 {
|
||||||
|
t.Fatalf("expected to have metric configSuccess to be set to 1; got %d instead", cfgSuc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
time.Sleep(*configCheckInterval * 2)
|
time.Sleep(*configCheckInterval * 2)
|
||||||
|
checkCfg(nil)
|
||||||
groupsLen := lenLocked(m)
|
groupsLen := lenLocked(m)
|
||||||
if groupsLen != 1 {
|
if groupsLen != 1 {
|
||||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||||
|
@ -126,6 +148,7 @@ groups:
|
||||||
|
|
||||||
writeToFile(t, f.Name(), rules2)
|
writeToFile(t, f.Name(), rules2)
|
||||||
time.Sleep(*configCheckInterval * 2)
|
time.Sleep(*configCheckInterval * 2)
|
||||||
|
checkCfg(nil)
|
||||||
groupsLen = lenLocked(m)
|
groupsLen = lenLocked(m)
|
||||||
if groupsLen != 2 {
|
if groupsLen != 2 {
|
||||||
fmt.Println(m.groups)
|
fmt.Println(m.groups)
|
||||||
|
@ -135,6 +158,7 @@ groups:
|
||||||
writeToFile(t, f.Name(), rules1)
|
writeToFile(t, f.Name(), rules1)
|
||||||
procutil.SelfSIGHUP()
|
procutil.SelfSIGHUP()
|
||||||
time.Sleep(*configCheckInterval / 2)
|
time.Sleep(*configCheckInterval / 2)
|
||||||
|
checkCfg(nil)
|
||||||
groupsLen = lenLocked(m)
|
groupsLen = lenLocked(m)
|
||||||
if groupsLen != 1 {
|
if groupsLen != 1 {
|
||||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||||
|
@ -143,6 +167,7 @@ groups:
|
||||||
writeToFile(t, f.Name(), `corrupted`)
|
writeToFile(t, f.Name(), `corrupted`)
|
||||||
procutil.SelfSIGHUP()
|
procutil.SelfSIGHUP()
|
||||||
time.Sleep(*configCheckInterval / 2)
|
time.Sleep(*configCheckInterval / 2)
|
||||||
|
checkCfg(fmt.Errorf("config error"))
|
||||||
groupsLen = lenLocked(m)
|
groupsLen = lenLocked(m)
|
||||||
if groupsLen != 1 { // should remain unchanged
|
if groupsLen != 1 { // should remain unchanged
|
||||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
|
|
||||||
|
|
||||||
{% func Welcome(r *http.Request) %}
|
{% func Welcome(r *http.Request) %}
|
||||||
{%= tpl.Header(r, navItems, "vmalert", configError()) %}
|
{%= tpl.Header(r, navItems, "vmalert", getLastConfigError()) %}
|
||||||
<p>
|
<p>
|
||||||
API:<br>
|
API:<br>
|
||||||
{% for _, p := range apiLinks %}
|
{% for _, p := range apiLinks %}
|
||||||
|
@ -40,7 +40,7 @@ btn-primary
|
||||||
|
|
||||||
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
|
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
|
||||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||||
{%= tpl.Header(r, navItems, "Groups", configError()) %}
|
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
|
||||||
{%code
|
{%code
|
||||||
filter := r.URL.Query().Get("filter")
|
filter := r.URL.Query().Get("filter")
|
||||||
rOk := make(map[string]int)
|
rOk := make(map[string]int)
|
||||||
|
@ -168,7 +168,7 @@ btn-primary
|
||||||
|
|
||||||
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
|
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
|
||||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||||
{%= tpl.Header(r, navItems, "Alerts", configError()) %}
|
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
|
||||||
{% if len(groupAlerts) > 0 %}
|
{% if len(groupAlerts) > 0 %}
|
||||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||||
|
@ -255,7 +255,7 @@ btn-primary
|
||||||
{% endfunc %}
|
{% endfunc %}
|
||||||
|
|
||||||
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
|
{% func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Target) %}
|
||||||
{%= tpl.Header(r, navItems, "Notifiers", configError()) %}
|
{%= tpl.Header(r, navItems, "Notifiers", getLastConfigError()) %}
|
||||||
{% if len(targets) > 0 %}
|
{% if len(targets) > 0 %}
|
||||||
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
<a class="btn btn-primary" role="button" onclick="collapseAll()">Collapse All</a>
|
||||||
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
<a class="btn btn-primary" role="button" onclick="expandAll()">Expand All</a>
|
||||||
|
@ -312,7 +312,7 @@ btn-primary
|
||||||
|
|
||||||
{% func Alert(r *http.Request, alert *APIAlert) %}
|
{% func Alert(r *http.Request, alert *APIAlert) %}
|
||||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||||
{%code
|
{%code
|
||||||
var labelKeys []string
|
var labelKeys []string
|
||||||
for k := range alert.Labels {
|
for k := range alert.Labels {
|
||||||
|
@ -399,7 +399,7 @@ btn-primary
|
||||||
|
|
||||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||||
{%= tpl.Header(r, navItems, "", configError()) %}
|
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||||
{%code
|
{%code
|
||||||
var labelKeys []string
|
var labelKeys []string
|
||||||
for k := range rule.Labels {
|
for k := range rule.Labels {
|
||||||
|
|
|
@ -34,7 +34,7 @@ func StreamWelcome(qw422016 *qt422016.Writer, r *http.Request) {
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:15
|
//line app/vmalert/web.qtpl:15
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "vmalert", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "vmalert", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:15
|
//line app/vmalert/web.qtpl:15
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
<p>
|
<p>
|
||||||
|
@ -207,7 +207,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:43
|
//line app/vmalert/web.qtpl:43
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "Groups", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "Groups", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:43
|
//line app/vmalert/web.qtpl:43
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
|
@ -647,7 +647,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:171
|
//line app/vmalert/web.qtpl:171
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "Alerts", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "Alerts", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:171
|
//line app/vmalert/web.qtpl:171
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
|
@ -922,7 +922,7 @@ func StreamListTargets(qw422016 *qt422016.Writer, r *http.Request, targets map[n
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:258
|
//line app/vmalert/web.qtpl:258
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "Notifiers", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:258
|
//line app/vmalert/web.qtpl:258
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
|
@ -1102,7 +1102,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:315
|
//line app/vmalert/web.qtpl:315
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:315
|
//line app/vmalert/web.qtpl:315
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
|
@ -1311,7 +1311,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
//line app/vmalert/web.qtpl:402
|
//line app/vmalert/web.qtpl:402
|
||||||
tpl.StreamHeader(qw422016, r, navItems, "", configError())
|
tpl.StreamHeader(qw422016, r, navItems, "", getLastConfigError())
|
||||||
//line app/vmalert/web.qtpl:402
|
//line app/vmalert/web.qtpl:402
|
||||||
qw422016.N().S(`
|
qw422016.N().S(`
|
||||||
`)
|
`)
|
||||||
|
|
|
@ -40,6 +40,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
|
||||||
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
|
* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components.
|
||||||
|
|
||||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680).
|
||||||
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly set `vmalert_config_last_reload_successful` value on configuration updates or rollbacks. The bug was introduced in [v1.92.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.92.0) in [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4543).
|
||||||
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).
|
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue