mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: fix error when rule didn't start if restore failed (#1279)
Previously, `startGroup` could return early on a restore error even when the `remoteRead.ignoreRestoreErrors` flag was set, so the group's rules were never started. Now vmalert checks the flag value before deciding whether to return the error or just log it.
This commit is contained in:
parent
6ff19096be
commit
4247168a2d
2 changed files with 6 additions and 13 deletions
|
@ -2,7 +2,6 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
"sort"
|
"sort"
|
||||||
|
@ -405,9 +404,6 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
|
||||||
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ErrStateRestore indicates that the vmalert state failed to restore during startup.
|
|
||||||
var ErrStateRestore = errors.New("failed to restore the state")
|
|
||||||
|
|
||||||
// Restore restores the state of active alerts basing on previously written timeseries.
|
// Restore restores the state of active alerts basing on previously written timeseries.
|
||||||
// Restore restores only Start field. Field State will be always Pending and supposed
|
// Restore restores only Start field. Field State will be always Pending and supposed
|
||||||
// to be updated on next Exec, as well as Value field.
|
// to be updated on next Exec, as well as Value field.
|
||||||
|
@ -432,7 +428,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
|
||||||
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
||||||
qMetrics, err := q.Query(ctx, expr)
|
qMetrics, err := q.Query(ctx, expr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s: %w", err, ErrStateRestore)
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, m := range qMetrics {
|
for _, m := range qMetrics {
|
||||||
|
|
|
@ -2,7 +2,6 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -52,12 +51,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
|
||||||
err := m.update(ctx, path, validateTpl, validateExpr, true)
|
return m.update(ctx, path, validateTpl, validateExpr, true)
|
||||||
if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) {
|
|
||||||
logger.Errorf("%s", err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *manager) close() {
|
func (m *manager) close() {
|
||||||
|
@ -74,7 +68,10 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
|
||||||
if restore && m.rr != nil {
|
if restore && m.rr != nil {
|
||||||
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err)
|
if !*remoteReadIgnoreRestoreErrors {
|
||||||
|
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
||||||
|
}
|
||||||
|
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue