mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: add flag to control behaviour on startup for state restore errors (#1265)
Alerting rules can now return a specific error, ErrStateRestore, to indicate that the state restore procedure failed. Such errors were already returned and logged before, but now the user can specify whether to just log them (remoteRead.ignoreRestoreErrors=true) or to stop the process (remoteRead.ignoreRestoreErrors=false). The latter is important when VM isn't yet ready to serve queries from vmalert and vmalert needs to wait for it. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1252
This commit is contained in:
parent
e6c19cb09d
commit
bb7e113dd4
3 changed files with 18 additions and 5 deletions
|
@ -2,6 +2,7 @@ package main
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
|
@ -404,6 +405,8 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
|
|||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||
}
|
||||
|
||||
var ErrStateRestore = errors.New("failed to restore the state")
|
||||
|
||||
// Restore restores the state of active alerts based on previously written time series.
|
||||
// Restore restores only the Start field. The State field will always be Pending and is expected
|
||||
// to be updated on next Exec, as well as Value field.
|
||||
|
@ -428,7 +431,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
|||
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
||||
qMetrics, err := q.Query(ctx, expr)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s: %w", err, ErrStateRestore)
|
||||
}
|
||||
|
||||
for _, m := range qMetrics {
|
||||
|
|
|
@ -47,6 +47,7 @@ eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{
|
|||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
|
||||
)
|
||||
|
|
|
@ -2,6 +2,7 @@ package main
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
@ -51,7 +52,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
|||
}
|
||||
|
||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
err := m.update(ctx, path, validateTpl, validateExpr, true)
|
||||
if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) {
|
||||
logger.Errorf("%s", err)
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *manager) close() {
|
||||
|
@ -64,11 +70,11 @@ func (m *manager) close() {
|
|||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
||||
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
|
||||
if restore && m.rr != nil {
|
||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -79,6 +85,7 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
|
|||
m.wg.Done()
|
||||
}()
|
||||
m.groups[id] = group
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
|
||||
|
@ -117,7 +124,9 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
|
|||
}
|
||||
}
|
||||
for _, ng := range groupsRegistry {
|
||||
m.startGroup(ctx, ng, restore)
|
||||
if err := m.startGroup(ctx, ng, restore); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
m.groupsMu.Unlock()
|
||||
|
||||
|
|
Loading…
Reference in a new issue