From c7f3e58032ff4058aa219d340f8ccabdba193678 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Sun, 17 May 2020 15:13:22 +0100 Subject: [PATCH] vmalert: avoid sending resolves for pending alerts (#498) Before the change we were sending notifications to notifier if following conditions are met: * alert is in Fire state * alert is in Inactive state We were sending Inactive notifications to resolve alert ASAP. Unfortunately, we were sending resolves for Pending alerts that become Inactive, which is wrong. In this change we delete alert from the active list if it was Pending and become Inactive. In this way we now have Inactive alerts only if they were in state Fire before. See test change for example. --- app/vmalert/group.go | 55 ++++++++++++++++++++++++++-------------- app/vmalert/rule.go | 18 ++++++------- app/vmalert/rule_test.go | 8 +++--- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/app/vmalert/group.go b/app/vmalert/group.go index f666c69341..4d5e5a3182 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -69,7 +69,7 @@ func (g *Group) updateWith(newGroup Group) { // copy all significant fields. // alerts state isn't copied since - // it should be updated in next 2 Evals + // it should be updated in next 2 Execs or.For = nr.For or.Expr = nr.Expr or.Labels = nr.Labels @@ -151,30 +151,47 @@ func (g *Group) start(ctx context.Context, interval time.Duration, var alertsToSend []notifier.Alert for _, a := range rule.alerts { - if a.State != notifier.StatePending { + switch a.State { + case notifier.StateFiring: + // set End to execStart + 3 intervals + // so notifier can resolve it automatically if `vmalert` + // won't be able to send resolve for some reason + a.End = execStart.Add(3 * interval) + alertsToSend = append(alertsToSend, *a) + pushToRW(rw, rule, a, execStart) + case notifier.StatePending: + pushToRW(rw, rule, a, execStart) + case notifier.StateInactive: + // set End to execStart to notify + // that it was just resolved + a.End = execStart alertsToSend = append(alertsToSend, *a) } - if a.State == notifier.StateInactive || rw == nil { - continue - } - tss := rule.AlertToTimeSeries(a, execStart) - for _, ts := range tss { - remoteWriteSent.Inc() - if err := rw.Push(ts); err != nil { - remoteWriteErrors.Inc() - logger.Errorf("failed to push timeseries to remotewrite: %s", err) - } - } } - if len(alertsToSend) > 0 { - alertsSent.Add(len(alertsToSend)) - if err := nr.Send(ctx, alertsToSend); err != nil { - alertsSendErrors.Inc() - logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err) - } + if len(alertsToSend) == 0 { + continue + } + alertsSent.Add(len(alertsToSend)) + if err := nr.Send(ctx, alertsToSend); err != nil { + alertsSendErrors.Inc() + logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err) } } iterationDuration.UpdateDuration(iterationStart) } } } + +func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) { + if rw == nil { + return + } + tss := rule.AlertToTimeSeries(a, timestamp) + remoteWriteSent.Add(len(tss)) + for _, ts := range tss { + if err := rw.Push(ts); err != nil { + remoteWriteErrors.Inc() + logger.Errorf("failed to push timeseries to remotewrite: %s", err) + } + } +} diff --git a/app/vmalert/rule.go b/app/vmalert/rule.go index 08d1d64d1c..bdb0d9134e 100644 --- a/app/vmalert/rule.go +++ b/app/vmalert/rule.go @@ -71,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error { } for h, a := range r.alerts { - // cleanup inactive alerts from previous Eval + // cleanup inactive alerts from previous Exec if a.State == notifier.StateInactive { delete(r.alerts, h) } @@ -109,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error { // if alert wasn't updated in this iteration // means it is resolved already if _, ok := updated[h]; !ok { + if a.State == notifier.StatePending { + // alert was in Pending state - it is not + // active anymore + delete(r.alerts, h) + continue + } a.State = notifier.StateInactive - // set endTime to last execution time - // so it can be sent by notifier on next step - a.End = r.lastExecTime continue } if a.State == notifier.StatePending && time.Since(a.Start) >= r.For { a.State = notifier.StateFiring alertsFired.Inc() } - if a.State == notifier.StateFiring { - a.End = r.lastExecTime.Add(3 * *evaluationInterval) - } } return nil } @@ -293,7 +293,7 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) // Restore restores the state of active alerts basing on previously written timeseries. // Restore restores only Start field. Field State will be always Pending and supposed -// to be updated on next Eval, as well as Value field. +// to be updated on next Exec, as well as Value field. func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error { if q == nil { return fmt.Errorf("querier is nil") @@ -313,7 +313,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time. labels := m.Labels m.Labels = make([]datasource.Label, 0) // drop all extra labels, so hash key will - // be identical to timeseries received in Eval + // be identical to timeseries received in Exec for _, l := range labels { if l.Name == alertNameLabel { continue diff --git a/app/vmalert/rule_test.go b/app/vmalert/rule_test.go index ae11105baf..73227b4760 100644 --- a/app/vmalert/rule_test.go +++ b/app/vmalert/rule_test.go @@ -296,16 +296,14 @@ func TestRule_Exec(t *testing.T) { }, }, { - newTestRule("for-pending=>inactive", time.Millisecond), + newTestRule("for-pending=>empty", time.Second), [][]datasource.Metric{ {metricWithLabels(t, "name", "foo")}, {metricWithLabels(t, "name", "foo")}, - // empty step to reset pending alerts + // empty step to reset and delete pending alerts {}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive}, - }, + map[uint64]*notifier.Alert{}, }, { newTestRule("for-pending=>firing=>inactive", time.Millisecond),