From a1f3207e93a0bc85ca9ecc076c0feb87705f2505 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 28 Mar 2024 08:55:10 +0100 Subject: [PATCH] vmalert: fix sending alert messages (#6028) * vmalert: fix sending alert messages 1. fix the `endsAt` field in messages sent to Alertmanager; previously, a rule with a small evaluation interval could never be triggered; 2. fix the behavior of `-rule.resendDelay`; previously, it could prevent a firing message from being sent when the rule state was volatile. Signed-off-by: hagen1778 --------- Signed-off-by: hagen1778 Co-authored-by: hagen1778 (cherry picked from commit d7224b2d1cecdef881793e1675bb2f1090e1d864) Signed-off-by: hagen1778 --- app/vmalert/alerting.go | 14 +++++--- app/vmalert/alerting_test.go | 70 ++++++++++++------------------------ app/vmalert/group.go | 2 +- docs/CHANGELOG.md | 3 ++ 4 files changed, 36 insertions(+), 53 deletions(-) diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 779423b6f..14ade0cd1 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -708,15 +708,19 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts ti // alertsToSend walks through the current alerts of AlertingRule // and returns only those which should be sent to notifier. // Isn't concurrent safe. 
-func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert { +func (ar *AlertingRule) alertsToSend(resolveDuration, resendDelay time.Duration) []notifier.Alert { + currentTime := time.Now() needsSending := func(a *notifier.Alert) bool { if a.State == notifier.StatePending { return false } - if a.ResolvedAt.After(a.LastSent) { + if a.State == notifier.StateFiring && a.End.Before(a.LastSent) { return true } - return a.LastSent.Add(resendDelay).Before(ts) + if a.State == notifier.StateInactive && a.ResolvedAt.After(a.LastSent) { + return true + } + return a.LastSent.Add(resendDelay).Before(currentTime) } var alerts []notifier.Alert @@ -724,11 +728,11 @@ func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay if !needsSending(a) { continue } - a.End = ts.Add(resolveDuration) + a.End = currentTime.Add(resolveDuration) if a.State == notifier.StateInactive { a.End = a.ResolvedAt } - a.LastSent = ts + a.LastSent = currentTime alerts = append(alerts, *a) } return alerts diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go index 9c779d738..ac29dfafc 100644 --- a/app/vmalert/alerting_test.go +++ b/app/vmalert/alerting_test.go @@ -892,7 +892,7 @@ func TestAlertsToSend(t *testing.T) { for i, a := range alerts { ar.alerts[uint64(i)] = a } - gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay) + gotAlerts := ar.alertsToSend(resolveDuration, resendDelay) if gotAlerts == nil && expAlerts == nil { return } @@ -908,60 +908,36 @@ func TestAlertsToSend(t *testing.T) { }) for i, exp := range expAlerts { got := gotAlerts[i] - if got.LastSent != exp.LastSent { - t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent) - } - if got.End != exp.End { - t.Fatalf("expected End to be %v; got %v", exp.End, got.End) + if got.Name != exp.Name { + t.Fatalf("expected Name to be %v; got %v", exp.Name, got.Name) } } } - f( // send firing alert with custom resolve time - 
[]*notifier.Alert{{State: notifier.StateFiring}}, - []*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}}, + f( // check if firing alerts need to be sent with non-zero resendDelay + []*notifier.Alert{ + {Name: "a", State: notifier.StateFiring, Start: ts}, + // no need to resend firing + {Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)}, + // last message is for resolved, send firing message this time + {Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)}, + // resend firing + {Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)}, + }, + []*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}}, 5*time.Minute, time.Minute, ) - f( // resolve inactive alert at the current timestamp - []*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}}, - []*notifier.Alert{{LastSent: ts, End: ts}}, - time.Minute, time.Minute, - ) - f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting - []*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}}, - []*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}}, + f( // check if resolved alerts need to be sent with non-zero resendDelay + []*notifier.Alert{ + {Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)}, + // no need to resend resolved + {Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts}, + // resend resolved + {Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)}, + }, + []*notifier.Alert{{Name: "a"}, {Name: "c"}}, 5*time.Minute, time.Minute, ) - f( // mixed case of pending and resolved alerts. 
Names are added for deterministic sorting - []*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}}, - []*notifier.Alert{{Name: "b", LastSent: ts, End: ts}}, - 5*time.Minute, time.Minute, - ) - f( // attempt to send alert that was already sent in the resendDelay interval - []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}}, - nil, - time.Minute, time.Minute, - ) - f( // attempt to send alert that was sent out of the resendDelay interval - []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}}, - []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}}, - time.Minute, time.Minute, - ) - f( // alert must be sent even if resendDelay interval is 0 - []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}}, - []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}}, - time.Minute, 0, - ) - f( // inactive alert which has been sent already - []*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}}, - nil, - time.Minute, time.Minute, - ) - f( // inactive alert which has been resolved after last send - []*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}}, - []*notifier.Alert{{LastSent: ts, End: ts}}, - time.Minute, time.Minute, - ) } func newTestRuleWithLabels(name string, labels ...string) *AlertingRule { diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 3ccc01269..2fc498834 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -512,7 +512,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur return nil } - alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay) + alerts := ar.alertsToSend(resolveDuration, *resendDelay) if len(alerts) < 1 { return nil } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index e06c6899a..365552a25 100644 --- 
a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -13,6 +13,9 @@ The following `tip` changes can be tested by building VictoriaMetrics components * SECURITY: upgrade Go builder from Go1.21.7 to Go1.22.1. See [the list of issues addressed in Go1.22.1](https://github.com/golang/go/issues?q=milestone%3AGo1.22.1+label%3ACherryPickApproved). +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): set the correct `endsAt` value in notifications sent to the Alertmanager. Previously, a rule with an evaluation interval lower than 10s could never be triggered. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5995) for details. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly account for `-rule.resendDelay` for alerting rules that are constantly switching state from inactive to firing. Previously, notifications for such rules could be skipped if the state changed more often than `-rule.resendDelay`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6028) for details. + ## [v1.93.13](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.13) Released at 2024-03-01