From 4247168a2dae35648fa108f8f98b016a2843431e Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 10 May 2021 09:06:31 +0100 Subject: [PATCH] vmalert: fix error when rule didn't start if restore failed (#1279) Previously, `startGroup` could exit on restore errors despite the `remoteRead.ignoreRestoreErrors` flag value. Now vmalert checks the flag value before deciding whether to return an error or just log it. --- app/vmalert/alerting.go | 6 +----- app/vmalert/manager.go | 13 +++++-------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 82a53348b..75c80f2de 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -2,7 +2,6 @@ package main import ( "context" - "errors" "fmt" "hash/fnv" "sort" @@ -405,9 +404,6 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p return newTimeSeries(float64(a.Start.Unix()), labels, timestamp) } -// ErrStateRestore indicates that the vmalert state failed to restore during startup. -var ErrStateRestore = errors.New("failed to restore the state") - // Restore restores the state of active alerts basing on previously written timeseries. // Restore restores only Start field. Field State will be always Pending and supposed // to be updated on next Exec, as well as Value field. 
@@ -432,7 +428,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds())) qMetrics, err := q.Query(ctx, expr) if err != nil { - return fmt.Errorf("%s: %w", err, ErrStateRestore) + return err } for _, m := range qMetrics { diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go index 22de88ae8..4d9b0595e 100644 --- a/app/vmalert/manager.go +++ b/app/vmalert/manager.go @@ -2,7 +2,6 @@ package main import ( "context" - "errors" "fmt" "strings" "sync" @@ -52,12 +51,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) { } func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error { - err := m.update(ctx, path, validateTpl, validateExpr, true) - if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) { - logger.Errorf("%s", err) - return nil - } - return err + return m.update(ctx, path, validateTpl, validateExpr, true) } func (m *manager) close() { @@ -74,7 +68,10 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er if restore && m.rr != nil { err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels) if err != nil { - return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err) + if !*remoteReadIgnoreRestoreErrors { + return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err) + } + logger.Errorf("error while restoring state for group %q: %s", group.Name, err) } }