vmalert: add flag to control behaviour on startup for state restore errors (#1265)

Alerting rules can now return a specific error, ErrStateRestore, to indicate
that the state restore procedure failed. Such errors were already returned and
logged before. But now the user can specify whether to just log these errors
(remoteRead.ignoreRestoreErrors=true) or to stop the process
(remoteRead.ignoreRestoreErrors=false). The latter is important when VM isn't
ready yet to serve queries from vmalert and vmalert needs to wait for it.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1252
This commit is contained in:
Roman Khavronenko 2021-05-05 08:07:19 +01:00 committed by Aliaksandr Valialkin
parent e6c19cb09d
commit bb7e113dd4
3 changed files with 18 additions and 5 deletions

View file

@ -2,6 +2,7 @@ package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
@ -404,6 +405,8 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
}
// ErrStateRestore is a sentinel error that is wrapped into the error returned
// when restoring the alerts state fails. Callers can detect such failures
// with errors.Is(err, ErrStateRestore).
var ErrStateRestore = errors.New("failed to restore the state")
// Restore restores the state of active alerts based on previously written timeseries.
// Restore restores only the Start field. The State field will always be Pending and is
// supposed to be updated on the next Exec, as is the Value field.
@ -428,7 +431,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return err
return fmt.Errorf("%s: %w", err, ErrStateRestore)
}
for _, m := range qMetrics {

View file

@ -47,6 +47,7 @@ eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
)

View file

@ -2,6 +2,7 @@ package main
import (
"context"
"errors"
"fmt"
"strings"
"sync"
@ -51,7 +52,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
}
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
return m.update(ctx, path, validateTpl, validateExpr, true)
err := m.update(ctx, path, validateTpl, validateExpr, true)
if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) {
logger.Errorf("%s", err)
return nil
}
return err
}
func (m *manager) close() {
@ -64,11 +70,11 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err)
}
}
@ -79,6 +85,7 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
m.wg.Done()
}()
m.groups[id] = group
return nil
}
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
@ -117,7 +124,9 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
}
}
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
if err := m.startGroup(ctx, ng, restore); err != nil {
return err
}
}
m.groupsMu.Unlock()