From 9e79fb2e1156617b56284323e19554486e51898f Mon Sep 17 00:00:00 2001
From: Haley Wang <haley@victoriametrics.com>
Date: Tue, 12 Nov 2024 22:39:46 +0800
Subject: [PATCH] clean up the code for alerts stale metrics

---
 app/vmalert/rule/alerting.go      | 33 ++++++++++++-------------------
 app/vmalert/rule/alerting_test.go | 30 ++++++++++++++--------------
 app/vmalert/rule/recording.go     |  2 +-
 3 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go
index 955c46c62..338a4e8d7 100644
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -456,17 +456,18 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 		ar.logDebugf(ts, a, "created in state PENDING")
 	}
 	var numActivePending int
+	var tss []prompbmarshal.TimeSeries
 	// store alerts' labels which are `FIRING => INACTIVE`, `PENDING => INACTIVE` or `PENDING => FIRING` in this iteration,
 	// need to create stale time series for them later.
-	var pendingToFiring, pendingToInactive, firingToInactive []map[string]string
 	for h, a := range ar.alerts {
 		// if alert wasn't updated in this iteration
 		// means it is resolved already
 		if _, ok := updated[h]; !ok {
 			if a.State == notifier.StatePending {
-				// alert was in Pending state - it is not
-				// active anymore
-				pendingToInactive = append(pendingToInactive, a.Labels)
+				// alert was in Pending state - it is not active anymore
+				// add stale time series for it
+				tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), true)...)
+
 				delete(ar.alerts, h)
 				ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round")
 				continue
@@ -484,7 +485,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 				if ts.Sub(a.KeepFiringSince) >= ar.KeepFiringFor {
 					a.State = notifier.StateInactive
 					a.ResolvedAt = ts
-					firingToInactive = append(firingToInactive, a.Labels)
+					// add stale time series for it
+					tss = append(tss, firingAlertStaleTimeSeries(a.Labels, ts.Unix())...)
+
 					ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
 					continue
 				}
@@ -497,7 +500,8 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 			a.Start = ts
 			alertsFired.Inc()
 			if ar.For > 0 {
-				pendingToFiring = append(pendingToFiring, a.Labels)
+				// add stale time series for its previous pending state
+				tss = append(tss, pendingAlertStaleTimeSeries(a.Labels, ts.Unix(), false)...)
 			}
 			ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt)
 		}
@@ -507,8 +511,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 		curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
 		return nil, curState.Err
 	}
-
-	return ar.toTimeSeries(ts.Unix(), pendingToFiring, firingToInactive, pendingToInactive), nil
+	return append(tss, ar.toTimeSeries(ts.Unix())...), nil
 }
 
 func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.QueryFn, ts time.Time) (*labelSet, map[string]string, error) {
@@ -533,9 +536,8 @@ func (ar *AlertingRule) expandTemplates(m datasource.Metric, qFn templates.Query
 	return ls, as, nil
 }
 
-// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` for active alerts,
-// also includes stale metrics for alerts which changed their state.
-func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToInactive, pendingToInactive []map[string]string) []prompbmarshal.TimeSeries {
+// toTimeSeries creates `ALERTS` and `ALERTS_FOR_STATE` time series for active alerts.
+func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
 	var tss []prompbmarshal.TimeSeries
 	for _, a := range ar.alerts {
 		if a.State == notifier.StateInactive {
@@ -544,15 +546,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64, pendingToFiring, firingToI
 		ts := ar.alertToTimeSeries(a, timestamp)
 		tss = append(tss, ts...)
 	}
-	for i := range pendingToFiring {
-		tss = append(tss, pendingAlertStaleTimeSeries(pendingToFiring[i], timestamp, false)...)
-	}
-	for i := range pendingToInactive {
-		tss = append(tss, pendingAlertStaleTimeSeries(pendingToInactive[i], timestamp, true)...)
-	}
-	for i := range firingToInactive {
-		tss = append(tss, firingAlertStaleTimeSeries(firingToInactive[i], timestamp)...)
-	}
 	return tss
 }
 
diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go
index 898ea40d2..d54aefd34 100644
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -27,7 +27,7 @@ func TestAlertingRuleToTimeSeries(t *testing.T) {
 		t.Helper()
 
 		rule.alerts[alert.ID] = alert
-		tss := rule.toTimeSeries(timestamp.Unix(), nil, nil, nil)
+		tss := rule.toTimeSeries(timestamp.Unix())
 		if err := compareTimeSeries(t, tssExpected, tss); err != nil {
 			t.Fatalf("timeseries mismatch for rule %q: %s", rule.Name, err)
 		}
@@ -207,7 +207,7 @@ func TestAlertingRule_Exec(t *testing.T) {
 			// check generate time series
 			if _, ok := tssExpected[i]; ok {
 				if err := compareTimeSeries(t, tssExpected[i], tss); err != nil {
-					t.Fatalf("generated time series mismatch for rule %q: %s", rule.Name, err)
+					t.Fatalf("generated time series mismatch for rule %q in step %d: %s", rule.Name, i, err)
 				}
 			}
 
@@ -350,28 +350,28 @@ func TestAlertingRule_Exec(t *testing.T) {
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
 		},
 		1: {
-			// new time series for foo1
-			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
-				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
-			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
-				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			// stale time series for foo, `firing -> inactive`
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
+			// new time series for foo1
+			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
+				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
+			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
+				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 		},
 		2: {
-			// new time series for foo2
-			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
-				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
-			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
-				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			// stale time series for foo1
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo1"}},
 				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
+			// new time series for foo2
+			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo2"}},
+				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
+			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "multiple-steps-firing"}, {Name: "name", Value: "foo2"}},
+				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(2 * defaultStep).Unix()), Timestamp: ts.Add(2*defaultStep).UnixNano() / 1e6}}},
 		},
 	})
 
@@ -395,13 +395,13 @@ func TestAlertingRule_Exec(t *testing.T) {
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Unix()), Timestamp: ts.UnixNano() / 1e6}}},
 		},
 		1: {
+			// stale time series for `pending -> firing`
+			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
+				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "firing"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: 1, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertForStateMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "name", Value: "foo"}},
 				Samples: []prompbmarshal.Sample{{Value: float64(ts.Add(defaultStep).Unix()), Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
-			// stale time series for `pending -> firing`
-			{Labels: []prompbmarshal.Label{{Name: "__name__", Value: alertMetricName}, {Name: "alertname", Value: "for-fired"}, {Name: "alertstate", Value: "pending"}, {Name: "name", Value: "foo"}},
-				Samples: []prompbmarshal.Sample{{Value: decimal.StaleNaN, Timestamp: ts.Add(defaultStep).UnixNano() / 1e6}}},
 		},
 	})
 
diff --git a/app/vmalert/rule/recording.go b/app/vmalert/rule/recording.go
index fa002365a..a616fe794 100644
--- a/app/vmalert/rule/recording.go
+++ b/app/vmalert/rule/recording.go
@@ -9,8 +9,8 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
-	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"