From c7f3e58032ff4058aa219d340f8ccabdba193678 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko <hagen1778@gmail.com>
Date: Sun, 17 May 2020 15:13:22 +0100
Subject: [PATCH] vmalert: avoid sending resolves for pending alerts (#498)

Before the change we were sending notifications to notifier
if following conditions are met:
* alert is in Fire state
* alert is in Inactive state

We were sending Inactive notifications to resolve alert ASAP.
Unfortunately, we were sending resolves for Pending alerts that become
Inactive, which is wrong.

In this change we delete alert from the active list if
it was Pending and become Inactive. In this way we now
have Inactive alerts only if they were in state Fire before.
See test change for example.
---
 app/vmalert/group.go     | 55 ++++++++++++++++++++++++++--------------
 app/vmalert/rule.go      | 18 ++++++-------
 app/vmalert/rule_test.go |  8 +++---
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/app/vmalert/group.go b/app/vmalert/group.go
index f666c69341..4d5e5a3182 100644
--- a/app/vmalert/group.go
+++ b/app/vmalert/group.go
@@ -69,7 +69,7 @@ func (g *Group) updateWith(newGroup Group) {
 
 		// copy all significant fields.
 		// alerts state isn't copied since
-		// it should be updated in next 2 Evals
+		// it should be updated in next 2 Execs
 		or.For = nr.For
 		or.Expr = nr.Expr
 		or.Labels = nr.Labels
@@ -151,30 +151,47 @@ func (g *Group) start(ctx context.Context, interval time.Duration,
 
 				var alertsToSend []notifier.Alert
 				for _, a := range rule.alerts {
-					if a.State != notifier.StatePending {
+					switch a.State {
+					case notifier.StateFiring:
+						// set End to execStart + 3 intervals
+						// so notifier can resolve it automatically if `vmalert`
+						// won't be able to send resolve for some reason
+						a.End = execStart.Add(3 * interval)
+						alertsToSend = append(alertsToSend, *a)
+						pushToRW(rw, rule, a, execStart)
+					case notifier.StatePending:
+						pushToRW(rw, rule, a, execStart)
+					case notifier.StateInactive:
+						// set End to execStart to notify
+						// that it was just resolved
+						a.End = execStart
 						alertsToSend = append(alertsToSend, *a)
 					}
-					if a.State == notifier.StateInactive || rw == nil {
-						continue
-					}
-					tss := rule.AlertToTimeSeries(a, execStart)
-					for _, ts := range tss {
-						remoteWriteSent.Inc()
-						if err := rw.Push(ts); err != nil {
-							remoteWriteErrors.Inc()
-							logger.Errorf("failed to push timeseries to remotewrite: %s", err)
-						}
-					}
 				}
-				if len(alertsToSend) > 0 {
-					alertsSent.Add(len(alertsToSend))
-					if err := nr.Send(ctx, alertsToSend); err != nil {
-						alertsSendErrors.Inc()
-						logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
-					}
+				if len(alertsToSend) == 0 {
+					continue
+				}
+				alertsSent.Add(len(alertsToSend))
+				if err := nr.Send(ctx, alertsToSend); err != nil {
+					alertsSendErrors.Inc()
+					logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
 				}
 			}
 			iterationDuration.UpdateDuration(iterationStart)
 		}
 	}
 }
+
+func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
+	if rw == nil {
+		return
+	}
+	tss := rule.AlertToTimeSeries(a, timestamp)
+	remoteWriteSent.Add(len(tss))
+	for _, ts := range tss {
+		if err := rw.Push(ts); err != nil {
+			remoteWriteErrors.Inc()
+			logger.Errorf("failed to push timeseries to remotewrite: %s", err)
+		}
+	}
+}
diff --git a/app/vmalert/rule.go b/app/vmalert/rule.go
index 08d1d64d1c..bdb0d9134e 100644
--- a/app/vmalert/rule.go
+++ b/app/vmalert/rule.go
@@ -71,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
 	}
 
 	for h, a := range r.alerts {
-		// cleanup inactive alerts from previous Eval
+		// cleanup inactive alerts from previous Exec
 		if a.State == notifier.StateInactive {
 			delete(r.alerts, h)
 		}
@@ -109,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
 		// if alert wasn't updated in this iteration
 		// means it is resolved already
 		if _, ok := updated[h]; !ok {
+			if a.State == notifier.StatePending {
+				// alert was in Pending state - it is not
+				// active anymore
+				delete(r.alerts, h)
+				continue
+			}
 			a.State = notifier.StateInactive
-			// set endTime to last execution time
-			// so it can be sent by notifier on next step
-			a.End = r.lastExecTime
 			continue
 		}
 		if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
 			a.State = notifier.StateFiring
 			alertsFired.Inc()
 		}
-		if a.State == notifier.StateFiring {
-			a.End = r.lastExecTime.Add(3 * *evaluationInterval)
-		}
 	}
 	return nil
 }
@@ -293,7 +293,7 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time)
 
 // Restore restores the state of active alerts basing on previously written timeseries.
 // Restore restores only Start field. Field State will be always Pending and supposed
-// to be updated on next Eval, as well as Value field.
+// to be updated on next Exec, as well as Value field.
 func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
 	if q == nil {
 		return fmt.Errorf("querier is nil")
@@ -313,7 +313,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
 		labels := m.Labels
 		m.Labels = make([]datasource.Label, 0)
 		// drop all extra labels, so hash key will
-		// be identical to timeseries received in Eval
+		// be identical to timeseries received in Exec
 		for _, l := range labels {
 			if l.Name == alertNameLabel {
 				continue
diff --git a/app/vmalert/rule_test.go b/app/vmalert/rule_test.go
index ae11105baf..73227b4760 100644
--- a/app/vmalert/rule_test.go
+++ b/app/vmalert/rule_test.go
@@ -296,16 +296,14 @@ func TestRule_Exec(t *testing.T) {
 			},
 		},
 		{
-			newTestRule("for-pending=>inactive", time.Millisecond),
+			newTestRule("for-pending=>empty", time.Second),
 			[][]datasource.Metric{
 				{metricWithLabels(t, "name", "foo")},
 				{metricWithLabels(t, "name", "foo")},
-				// empty step to reset pending alerts
+				// empty step to reset and delete pending alerts
 				{},
 			},
-			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
-			},
+			map[uint64]*notifier.Alert{},
 		},
 		{
 			newTestRule("for-pending=>firing=>inactive", time.Millisecond),