vmalert: fix rules not starting when state restore fails (#1279)

Previously, `startGroup` could exit on restore errors regardless of the
`remoteRead.ignoreRestoreErrors` flag value, so the affected group was never
started. Now vmalert checks the flag value before deciding whether to return
the error or just log it.
This commit is contained in:
Roman Khavronenko 2021-05-10 09:06:31 +01:00 committed by GitHub
parent 6ff19096be
commit 4247168a2d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 13 deletions

View file

@ -2,7 +2,6 @@ package main
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"hash/fnv" "hash/fnv"
"sort" "sort"
@ -405,9 +404,6 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp) return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
} }
// ErrStateRestore indicates that the vmalert state failed to restore during startup.
var ErrStateRestore = errors.New("failed to restore the state")
// Restore restores the state of active alerts basing on previously written timeseries. // Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed // Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field. // to be updated on next Exec, as well as Value field.
@ -432,7 +428,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds())) alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr) qMetrics, err := q.Query(ctx, expr)
if err != nil { if err != nil {
return fmt.Errorf("%s: %w", err, ErrStateRestore) return err
} }
for _, m := range qMetrics { for _, m := range qMetrics {

View file

@ -2,7 +2,6 @@ package main
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"strings" "strings"
"sync" "sync"
@ -52,12 +51,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
} }
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error { func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
err := m.update(ctx, path, validateTpl, validateExpr, true) return m.update(ctx, path, validateTpl, validateExpr, true)
if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) {
logger.Errorf("%s", err)
return nil
}
return err
} }
func (m *manager) close() { func (m *manager) close() {
@ -74,7 +68,10 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
if restore && m.rr != nil { if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels) err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil { if err != nil {
return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err) if !*remoteReadIgnoreRestoreErrors {
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
}
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
} }
} }