clean up the code for alerts' stale metrics

This commit is contained in:
Haley Wang 2024-11-12 22:39:46 +08:00
parent 09e9f82758
commit 9e79fb2e11
No known key found for this signature in database
GPG key ID: C6299A8A1D6CC50C
3 changed files with 29 additions and 36 deletions

View file

@ -456,17 +456,18 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
ar.logDebugf(ts, a, "created in state PENDING") ar.logDebugf(ts, a, "created in state PENDING")
} }
var numActivePending int var numActivePending int
var tss []prompbmarshal.TimeSeries
// store alerts' labels which are `FIRING => INACTIVE`, `PENDING => INACTIVE` or `PENDING => FIRING` in this iteration, // store alerts' labels which are `FIRING => INACTIVE`, `PENDING => INACTIVE` or `PENDING => FIRING` in this iteration,
// need to create stale time series for them later. // need to create stale time series for them later.
var pendingToFiring, pendingToInactive, firingToInactive []map[string]string
for h, a := range ar.alerts { for h, a := range ar.alerts {
// if alert wasn't updated in this iteration // if alert wasn't updated in this iteration
// means it is resolved already // means it is resolved already
if _, ok := updated[h]; !ok { if _, ok := updated[h]; !ok {
if a.State == notifier.StatePending { if a.State == notifier.StatePending {
// alert was in Pending state - it is not // alert was in Pending state - it is not active anymore
// active anymore // add stale time series for it
pendingToInactive = append(pendingToInactive, a.Labels) tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), true)...)
delete(ar.alerts, h) delete(ar.alerts, h)
ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round") ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round")
continue continue
@ -484,7 +485,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
if ts.Sub(a.KeepFiringSince) >= ar.KeepFiringFor { if ts.Sub(a.KeepFiringSince) >= ar.KeepFiringFor {
a.State = notifier.StateInactive a.State = notifier.StateInactive
a.ResolvedAt = ts a.ResolvedAt = ts
firingToInactive = append(firingToInactive, a.Labels) // add stale time series for it
tss = append(tss, firingAlertStaleTimeSeries(a.Labels, ts.Unix())...)
ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round") ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
continue continue
} }
@ -497,7 +500,8 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
a.Start = ts a.Start = ts
alertsFired.Inc() alertsFired.Inc()
if ar.For > 0 { if ar.For > 0 {
pendingToFiring = append(pendingToFiring, a.Labels) // add stale time series for it
tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), false)...)
} }
ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt) ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt)
} }
@ -507,8 +511,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending) curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.Err return nil, curState.Err
} }
return append(tss, ar.toTimeSeries(ts.Unix())...), nil
return ar.toTimeSeries(ts.Unix(), pendingToFiring, firingToInactive, pendingToInactive), nil
} }
func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.QueryFn, ts time.Time) (*labelSet, map[string]string, error) { func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.QueryFn, ts time.Time) (*labelSet, map[string]string, error) {
@ -533,9 +536,8 @@ func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.Query
return ls, as, nil return ls, as, nil
} }
// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts, // toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts
// also includes stale metrics for alerts which changed their state. func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToInactive, pendingToInactive []map[string]string) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts { for _, a := range ar.alerts {
if a.State == notifier.StateInactive { if a.State == notifier.StateInactive {
@ -544,15 +546,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToI
ts := ar.alertToTimeSeries(a, timestamp) ts := ar.alertToTimeSeries(a, timestamp)
tss = append(tss, ts...) tss = append(tss, ts...)
} }
for i := range pendingToFiring {
tss = append(tss, pendingAlertStaleTimeSeries(pendingToFiring[i], timestamp, false)...)
}
for i := range pendingToInactive {
tss = append(tss, pendingAlertStaleTimeSeries(pendingToInactive[i], timestamp, true)...)
}
for i := range firingToInactive {
tss = append(tss, firingAlertStaleTimeSeries(firingToInactive[i], timestamp)...)
}
return tss return tss
} }

View file

@ -27,7 +27,7 @@ func TestAlertingRuleToTimeSeries(t *testing.T) {
t.Helper() t.Helper()
rule.alerts[alert.ID] = alert rule.alerts[alert.ID] = alert
tss := rule.toTimeSeries(timestamp.Unix(), nil, nil, nil) tss := rule.toTimeSeries(timestamp.Unix())
if err := compareTimeSeries(t, tssExpected, tss); err != nil { if err := compareTimeSeries(t, tssExpected, tss); err != nil {
t.Fatalf("timeseries mismatch for rule %q: %s", rule.Name, err) t.Fatalf("timeseries mismatch for rule %q: %s", rule.Name, err)
} }
@ -207,7 +207,7 @@ func TestAlertingRule_Exec(t *testing.T) {
// check generate time series // check generate time series
if _, ok := tssExpected[i]; ok { if _, ok := tssExpected[i]; ok {
if err := compareTimeSeries(t, tssExpected[i], tss); err != nil { if err := compareTimeSeries(t, tssExpected[i], tss); err != nil {
t.Fatalf("generated time series mismatch for rule %q: %s", rule.Name, err) t.Fatalf("generated time series mismatch for rule %q in step %d: %s", rule.Name, i, err)
} }
} }
@ -350,28 +350,28 @@ func TestAlertingRule_Exec(t *testing.T) {
Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
}, },
1: { 1: {
// new time series for foo1
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
// stale time series for foo, `firing -> inactive` // stale time series for foo, `firing -> inactive`
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
// new time series for foo1
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
}, },
2: { 2: {
// new time series for foo2
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
// stale time series for foo1 // stale time series for foo1
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
// new time series for foo2
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
}, },
}) })
@ -395,13 +395,13 @@ func TestAlertingRule_Exec(t *testing.T) {
Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
}, },
1: { 1: {
// stale time series for `pending -> firing`
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "name", Value: "foo"}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
// stale time series for `pending -> firing`
{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
}, },
}) })

View file

@ -9,8 +9,8 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"