From 9e79fb2e1156617b56284323e19554486e51898f Mon Sep 17 00:00:00 2001 From: Haley Wang Date: Tue, 12 Nov 2024 22:39:46 +0800 Subject: [PATCH] clear the code for alerts stale metrics --- app/vmalert/rule/alerting.go | 33 ++++++++++++------------------- app/vmalert/rule/alerting_test.go | 30 ++++++++++++++-------------- app/vmalert/rule/recording.go | 2 +- 3 files changed, 29 insertions(+), 36 deletions(-) diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go index 955c46c62..338a4e8d7 100644 --- a/app/vmalert/rule/alerting.go +++ b/app/vmalert/rule/alerting.go @@ -456,17 +456,18 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr ar.logDebugf(ts, a, "created in state PENDING") } var numActivePending int + var tss []prompbmarshal.TimeSeries // store alerts' labels which are `FIRING => INACTIVE`, `PENDING => INACTIVE` or `PENDING => FIRING` in this iteration, // need to create stale time series for them later. - var pendingToFiring, pendingToInactive, firingToInactive []map[string]string for h, a := range ar.alerts { // if alert wasn't updated in this iteration // means it is resolved already if _, ok := updated[h]; !ok { if a.State == notifier.StatePending { - // alert was in Pending state - it is not - // active anymore - pendingToInactive = append(pendingToInactive, a.Labels) + // alert was in Pending state - it is not active anymore + // add stale time series for it + tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), true)...) + delete(ar.alerts, h) ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round") continue @@ -484,7 +485,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr if ts.Sub(a.KeepFiringSince) >= ar.KeepFiringFor { a.State = notifier.StateInactive a.ResolvedAt = ts - firingToInactive = append(firingToInactive, a.Labels) + // add stale time series for it + tss = append(tss, firingAlertStaleTimeSeries(a.Labels, ts.Unix())...) + ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round") continue } @@ -497,7 +500,8 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr a.Start = ts alertsFired.Inc() if ar.For > 0 { - pendingToFiring = append(pendingToFiring, a.Labels) + // add stale time series for it + tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), false)...) } ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt) } @@ -507,8 +511,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending) return nil, curState.Err } - - return ar.toTimeSeries(ts.Unix(), pendingToFiring, firingToInactive, pendingToInactive), nil + return append(tss, ar.toTimeSeries(ts.Unix())...), nil } func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.QueryFn, ts time.Time) (*labelSet, map[string]string, error) { @@ -533,9 +536,8 @@ func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.Query return ls, as, nil } -// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts, -// also includes stale metrics for alerts which changed their state. -func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToInactive, pendingToInactive []map[string]string) []prompbmarshal.TimeSeries { +// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts +func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries { var tss []prompbmarshal.TimeSeries for _, a := range ar.alerts { if a.State == notifier.StateInactive { @@ -544,15 +546,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToI ts := ar.alertToTimeSeries(a, timestamp) tss = append(tss, ts...) } - for i := range pendingToFiring { - tss = append(tss, pendingAlertStaleTimeSeries(pendingToFiring[i], timestamp, false)...) - } - for i := range pendingToInactive { - tss = append(tss, pendingAlertStaleTimeSeries(pendingToInactive[i], timestamp, true)...) - } - for i := range firingToInactive { - tss = append(tss, firingAlertStaleTimeSeries(firingToInactive[i], timestamp)...) - } return tss } diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go index 898ea40d2..d54aefd34 100644 --- a/app/vmalert/rule/alerting_test.go +++ b/app/vmalert/rule/alerting_test.go @@ -27,7 +27,7 @@ func TestAlertingRuleToTimeSeries(t *testing.T) { t.Helper() rule.alerts[alert.ID] = alert - tss := rule.toTimeSeries(timestamp.Unix(), nil, nil, nil) + tss := rule.toTimeSeries(timestamp.Unix()) if err := compareTimeSeries(t, tssExpected, tss); err != nil { t.Fatalf("timeseries mismatch for rule %q: %s", rule.Name, err) } @@ -207,7 +207,7 @@ func TestAlertingRule_Exec(t *testing.T) { // check generate time series if _, ok := tssExpected[i]; ok { if err := compareTimeSeries(t, tssExpected[i], tss); err != nil { - t.Fatalf("generated time series mismatch for rule %q: %s", rule.Name, err) + t.Fatalf("generated time series mismatch for rule %q in step %d: %s", rule.Name, i, err) } } @@ -350,28 +350,28 @@ func TestAlertingRule_Exec(t *testing.T) { Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}}, }, 1: { - // new time series for foo1 - {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}}, - Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, - {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}}, - Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, // stale time series for foo, `firing -> inactive` {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo"}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, + // new time series for foo1 + {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}}, + Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, + {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}}, + Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, }, 2: { - // new time series for foo2 - {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}}, - Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, - {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}}, - Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, // stale time series for foo1 {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}}, Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, + // new time series for foo2 + {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}}, + Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, + {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}}, + Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}}, }, }) @@ -395,13 +395,13 @@ func TestAlertingRule_Exec(t *testing.T) { Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}}, }, 1: { + // stale time series for `pending -> firing` + {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}}, + Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}}, Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "name", Value: "foo"}}, Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, - // stale time series for `pending -> firing` - {Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}}, - Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}}, }, }) diff --git a/app/vmalert/rule/recording.go b/app/vmalert/rule/recording.go index fa002365a..a616fe794 100644 --- a/app/vmalert/rule/recording.go +++ b/app/vmalert/rule/recording.go @@ -9,8 +9,8 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"