clean up the code for alerts' stale metrics

2024-11-21 14:44:00 +00:00 · 2024-11-12 22:39:46 +08:00 · 2024-11-12 22:39:46 +08:00 · 9e79fb2e11
commit 9e79fb2e11
parent 09e9f82758
3 changed files with 29 additions and 36 deletions
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -456,17 +456,18 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 		ar.logDebugf(ts, a, "created in state PENDING")
 	}
 	var numActivePending int
 	var tss []prompbmarshal.TimeSeries
 	// store alerts' labels which are `FIRING => INACTIVE`, `PENDING => INACTIVE` or `PENDING => FIRING` in this iteration,
 	// need to create stale time series for them later.
 	var pendingToFiring, pendingToInactive, firingToInactive []map[string]string
 	for h, a := range ar.alerts {
 		// if alert wasn't updated in this iteration
 		// means it is resolved already
 		if _, ok := updated[h]; !ok {
 			if a.State == notifier.StatePending {
-				// alert was in Pending state - it is not
+				// alert was in Pending state - it is not active anymore
-				// active anymore
+				// add stale time series for it
-				pendingToInactive = append(pendingToInactive, a.Labels)
+				tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), true)...)
 				delete(ar.alerts, h)
 				ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round")
 				continue
@@ -484,7 +485,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 				if ts.Sub(a.KeepFiringSince) >= ar.KeepFiringFor {
 					a.State = notifier.StateInactive
 					a.ResolvedAt = ts
-					firingToInactive = append(firingToInactive, a.Labels)
+					// add stale time series for it
+					tss = append(tss, firingAlertStaleTimeSeries(a.Labels, ts.Unix())...)
 					ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
 					continue
 				}
@@ -497,7 +500,8 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 			a.Start = ts
 			alertsFired.Inc()
 			if ar.For > 0 {
-				pendingToFiring = append(pendingToFiring, a.Labels)
+				// add stale time series for it
+				tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), false)...)
 			}
 			ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt)
 		}
@@ -507,8 +511,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 		curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
 		return nil, curState.Err
 	}
-
+	return append(tss, ar.toTimeSeries(ts.Unix())...), nil
-	return ar.toTimeSeries(ts.Unix(), pendingToFiring, firingToInactive, pendingToInactive), nil
 }
 func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.QueryFn, ts time.Time) (*labelSet, map[string]string, error) {
@@ -533,9 +536,8 @@ func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.Query
 	return ls, as, nil
 }
-// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts,
+// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts
-// also includes stale metrics for alerts which changed their state.
+func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
-func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToInactive, pendingToInactive []map[string]string) []prompbmarshal.TimeSeries {
 	var tss []prompbmarshal.TimeSeries
 	for _, a := range ar.alerts {
 		if a.State == notifier.StateInactive {
@@ -544,15 +546,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToI
 		ts := ar.alertToTimeSeries(a, timestamp)
 		tss = append(tss, ts...)
 	}
-	for i := range pendingToFiring {
-		tss = append(tss, pendingAlertStaleTimeSeries(pendingToFiring[i], timestamp, false)...)
-	}
-	for i := range pendingToInactive {
-		tss = append(tss, pendingAlertStaleTimeSeries(pendingToInactive[i], timestamp, true)...)
-	}
-	for i := range firingToInactive {
-		tss = append(tss, firingAlertStaleTimeSeries(firingToInactive[i], timestamp)...)
-	}
 	return tss
 }
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -27,7 +27,7 @@ func TestAlertingRuleToTimeSeries(t *testing.T) {
 		t.Helper()
 		rule.alerts[alert.ID] = alert
-		tss := rule.toTimeSeries(timestamp.Unix(), nil, nil, nil)
+		tss := rule.toTimeSeries(timestamp.Unix())
 		if err := compareTimeSeries(t, tssExpected, tss); err != nil {
 			t.Fatalf("timeseries mismatch for rule %q: %s", rule.Name, err)
 		}
@@ -207,7 +207,7 @@ func TestAlertingRule_Exec(t *testing.T) {
 			// check generate time series
 			if _, ok := tssExpected[i]; ok {
 				if err := compareTimeSeries(t, tssExpected[i], tss); err != nil {
-					t.Fatalf("generated time series mismatch for rule %q: %s", rule.Name, err)
+					t.Fatalf("generated time series mismatch for rule %q in step %d: %s", rule.Name, i, err)
 				}
 			}
@@ -350,28 +350,28 @@ func TestAlertingRule_Exec(t *testing.T) {
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
 		},
 		1: {
 			// new time series for foo1
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			// stale time series for foo, `firing -> inactive`
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			// new time series for foo1
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 		},
 		2: {
 			// new time series for foo2
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			// stale time series for foo1
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			// new time series for foo2
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 		},
 	})
@@ -395,13 +395,13 @@ func TestAlertingRule_Exec(t *testing.T) {
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
 		},
 		1: {
 			// stale time series for `pending -> firing`
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			// stale time series for `pending -> firing`
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 		},
 	})
--- a/app/vmalert/rule/recording.go
+++ b/app/vmalert/rule/recording.go
@@ -9,8 +9,8 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"