vmalert: fix sending alert messages (#6028)

* vmalert: fix sending alert messages
1. fix `endsAt` field in messages that send to alertmanager, previously rule with small interval could never be triggered;
2. fix behavior of `-rule.resendDelay`, before it could prevent sending firing message when rule state is volatile.

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>

(cherry picked from commit d7224b2d1c)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Hui Wang 2024-03-28 08:55:10 +01:00 committed by hagen1778
parent 405d2c30d4
commit a1f3207e93
No known key found for this signature in database
GPG key ID: 3BF75F3741CA9640
4 changed files with 36 additions and 53 deletions

View file

@ -708,15 +708,19 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts ti
// alertsToSend walks through the current alerts of AlertingRule // alertsToSend walks through the current alerts of AlertingRule
// and returns only those which should be sent to notifier. // and returns only those which should be sent to notifier.
// Isn't concurrent safe. // Isn't concurrent safe.
func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert { func (ar *AlertingRule) alertsToSend(resolveDuration, resendDelay time.Duration) []notifier.Alert {
currentTime := time.Now()
needsSending := func(a *notifier.Alert) bool { needsSending := func(a *notifier.Alert) bool {
if a.State == notifier.StatePending { if a.State == notifier.StatePending {
return false return false
} }
if a.ResolvedAt.After(a.LastSent) { if a.State == notifier.StateFiring && a.End.Before(a.LastSent) {
return true return true
} }
return a.LastSent.Add(resendDelay).Before(ts) if a.State == notifier.StateInactive && a.ResolvedAt.After(a.LastSent) {
return true
}
return a.LastSent.Add(resendDelay).Before(currentTime)
} }
var alerts []notifier.Alert var alerts []notifier.Alert
@ -724,11 +728,11 @@ func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay
if !needsSending(a) { if !needsSending(a) {
continue continue
} }
a.End = ts.Add(resolveDuration) a.End = currentTime.Add(resolveDuration)
if a.State == notifier.StateInactive { if a.State == notifier.StateInactive {
a.End = a.ResolvedAt a.End = a.ResolvedAt
} }
a.LastSent = ts a.LastSent = currentTime
alerts = append(alerts, *a) alerts = append(alerts, *a)
} }
return alerts return alerts

View file

@ -892,7 +892,7 @@ func TestAlertsToSend(t *testing.T) {
for i, a := range alerts { for i, a := range alerts {
ar.alerts[uint64(i)] = a ar.alerts[uint64(i)] = a
} }
gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay) gotAlerts := ar.alertsToSend(resolveDuration, resendDelay)
if gotAlerts == nil && expAlerts == nil { if gotAlerts == nil && expAlerts == nil {
return return
} }
@ -908,60 +908,36 @@ func TestAlertsToSend(t *testing.T) {
}) })
for i, exp := range expAlerts { for i, exp := range expAlerts {
got := gotAlerts[i] got := gotAlerts[i]
if got.LastSent != exp.LastSent { if got.Name != exp.Name {
t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent) t.Fatalf("expected Name to be %v; got %v", exp.Name, got.Name)
}
if got.End != exp.End {
t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
} }
} }
} }
f( // send firing alert with custom resolve time f( // check if firing alerts need to be sent with non-zero resendDelay
[]*notifier.Alert{{State: notifier.StateFiring}}, []*notifier.Alert{
[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}}, {Name: "a", State: notifier.StateFiring, Start: ts},
// no need to resend firing
{Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)},
// last message is for resolved, send firing message this time
{Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)},
// resend firing
{Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)},
},
[]*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}},
5*time.Minute, time.Minute, 5*time.Minute, time.Minute,
) )
f( // resolve inactive alert at the current timestamp f( // check if resolved alerts need to be sent with non-zero resendDelay
[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}}, []*notifier.Alert{
[]*notifier.Alert{{LastSent: ts, End: ts}}, {Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)},
time.Minute, time.Minute, // no need to resend resolved
) {Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts},
f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting // resend resolved
[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}}, {Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)},
[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}}, },
[]*notifier.Alert{{Name: "a"}, {Name: "c"}},
5*time.Minute, time.Minute, 5*time.Minute, time.Minute,
) )
f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
5*time.Minute, time.Minute,
)
f( // attempt to send alert that was already sent in the resendDelay interval
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
nil,
time.Minute, time.Minute,
)
f( // attempt to send alert that was sent out of the resendDelay interval
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
time.Minute, time.Minute,
)
f( // alert must be sent even if resendDelay interval is 0
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
time.Minute, 0,
)
f( // inactive alert which has been sent already
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
nil,
time.Minute, time.Minute,
)
f( // inactive alert which has been resolved after last send
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
[]*notifier.Alert{{LastSent: ts, End: ts}},
time.Minute, time.Minute,
)
} }
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule { func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {

View file

@ -512,7 +512,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return nil return nil
} }
alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay) alerts := ar.alertsToSend(resolveDuration, *resendDelay)
if len(alerts) < 1 { if len(alerts) < 1 {
return nil return nil
} }

View file

@ -13,6 +13,9 @@ The following `tip` changes can be tested by building VictoriaMetrics components
* SECURITY: upgrade Go builder from Go1.21.7 to Go1.22.1. See [the list of issues addressed in Go1.22.1](https://github.com/golang/go/issues?q=milestone%3AGo1.22.1+label%3ACherryPickApproved). * SECURITY: upgrade Go builder from Go1.21.7 to Go1.22.1. See [the list of issues addressed in Go1.22.1](https://github.com/golang/go/issues?q=milestone%3AGo1.22.1+label%3ACherryPickApproved).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): set correct `endsAt` value in notifications sent to the Alertmanager. Previously, a rule with evaluation intervals lower than 10s could never be triggered. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5995) for details.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly account for `-rule.resendDelay` for alerting rules that are constantly switching state from inactive to firing. Before, notifications for such rules could have been skipped if state change happened more often than `-rule.resendDelay`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6028) for details.
## [v1.93.13](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.13) ## [v1.93.13](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.13)
Released at 2024-03-01 Released at 2024-03-01