diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go
index 75c80f2de..5ecf4da08 100644
--- a/app/vmalert/alerting.go
+++ b/app/vmalert/alerting.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"hash/fnv"
 	"sort"
@@ -404,6 +405,10 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
 	return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
 }
 
+// ErrStateRestore is wrapped into the error returned by AlertingRule.Restore
+// when the query for previously written state fails; match it with errors.Is.
+var ErrStateRestore = errors.New("failed to restore the state")
+
 // Restore restores the state of active alerts basing on previously written timeseries.
 // Restore restores only Start field. Field State will be always Pending and supposed
 // to be updated on next Exec, as well as Value field.
@@ -428,7 +433,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
 		alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
 	qMetrics, err := q.Query(ctx, expr)
 	if err != nil {
-		return err
+		return fmt.Errorf("%w: %s", ErrStateRestore, err)
 	}
 
 	for _, m := range qMetrics {
diff --git a/app/vmalert/main.go b/app/vmalert/main.go
index a2a5ff75a..886dcb0b8 100644
--- a/app/vmalert/main.go
+++ b/app/vmalert/main.go
@@ -47,6 +47,7 @@ eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{
 
 	remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
 		" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
+	remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.")
 
 	dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
 )
diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go
index cd248f5bd..22de88ae8 100644
--- a/app/vmalert/manager.go
+++ b/app/vmalert/manager.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strings"
 	"sync"
@@ -64,11 +65,17 @@ func (m *manager) close() {
 	m.wg.Wait()
 }
 
-func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
+// startGroup restores the group state when restore is set and remote read is
+// configured, then registers the group in m.groups. A restore failure is only
+// logged when it is an ErrStateRestore and -remoteRead.ignoreRestoreErrors is set.
+func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
 	if restore && m.rr != nil {
 		err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
 		if err != nil {
+			if !*remoteReadIgnoreRestoreErrors || !errors.Is(err, ErrStateRestore) {
+				return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err)
+			}
 			logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
 		}
 	}
 
@@ -79,6 +86,7 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
 		m.wg.Done()
 	}()
 	m.groups[id] = group
+	return nil
 }
 
 func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
@@ -117,6 +125,11 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
 		}
 	}
 	for _, ng := range groupsRegistry {
-		m.startGroup(ctx, ng, restore)
+		if err := m.startGroup(ctx, ng, restore); err != nil {
+			// Release the lock before bailing out: the regular
+			// Unlock after the loop is skipped on early return.
+			m.groupsMu.Unlock()
+			return err
+		}
 	}
 	m.groupsMu.Unlock()