vmalert: fix error when rule didn't start if restore failed (#1279)

Previously, `startGroup` could exit on restore errors regardless of the
`remoteRead.ignoreRestoreErrors` flag value. Now vmalert checks the
flag value before deciding whether to return the error or just log it.
This commit is contained in:
Roman Khavronenko 2021-05-10 09:06:31 +01:00 committed by Aliaksandr Valialkin
parent 2dddd68feb
commit 35237fe1f5
2 changed files with 6 additions and 13 deletions

View file

@ -2,7 +2,6 @@ package main
import (
"context"
"errors"
"fmt"
"hash/fnv"
"sort"
@ -405,9 +404,6 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
}
// ErrStateRestore indicates that the vmalert state failed to restore during startup.
var ErrStateRestore = errors.New("failed to restore the state")
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
@ -432,7 +428,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return fmt.Errorf("%s: %w", err, ErrStateRestore)
return err
}
for _, m := range qMetrics {

View file

@ -2,7 +2,6 @@ package main
import (
"context"
"errors"
"fmt"
"strings"
"sync"
@ -52,12 +51,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
}
// start delegates to m.update for the given paths, passing true as the final
// argument, and returns the resulting error.
// NOTE(review): this hunk is shown without +/- markers. Going by the hunk header
// (-52,12 +51,7), the lines from `err := m.update(...)` through `return err` are
// the removed pre-change body, and the lone `return m.update(...)` line replaces them.
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
err := m.update(ctx, path, validateTpl, validateExpr, true)
if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) {
logger.Errorf("%s", err)
return nil
}
return err
return m.update(ctx, path, validateTpl, validateExpr, true)
}
func (m *manager) close() {
@ -74,7 +68,10 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err)
if !*remoteReadIgnoreRestoreErrors {
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
}
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
}
}