vmalert: fix sending alert messages (#6028)

* vmalert: fix sending alert messages
1. fix the `endsAt` field in messages sent to Alertmanager; previously, a rule with a small evaluation interval could never be triggered in Alertmanager;
2. fix the behavior of `-rule.resendDelay`; previously, it could prevent sending a firing message when the rule state is volatile (a sketch of the new send logic follows the `alertsToSend` diff below).

* docs: update changelog notes

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
Authored by Hui Wang on 2024-03-28 15:55:10 +08:00, committed by Aliaksandr Valialkin
parent 93b389eb70
commit fdb6eb1071
4 changed files with 57 additions and 66 deletions


@@ -655,15 +655,19 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti
 // alertsToSend walks through the current alerts of AlertingRule
 // and returns only those which should be sent to notifier.
 // Isn't concurrent safe.
-func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
+func (ar *AlertingRule) alertsToSend(resolveDuration, resendDelay time.Duration) []notifier.Alert {
+	currentTime := time.Now()
 	needsSending := func(a *notifier.Alert) bool {
 		if a.State == notifier.StatePending {
 			return false
 		}
-		if a.ResolvedAt.After(a.LastSent) {
+		if a.State == notifier.StateFiring && a.End.Before(a.LastSent) {
 			return true
 		}
-		return a.LastSent.Add(resendDelay).Before(ts)
+		if a.State == notifier.StateInactive && a.ResolvedAt.After(a.LastSent) {
+			return true
+		}
+		return a.LastSent.Add(resendDelay).Before(currentTime)
 	}
 
 	var alerts []notifier.Alert
@@ -671,11 +675,11 @@ func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay
 		if !needsSending(a) {
 			continue
 		}
-		a.End = ts.Add(resolveDuration)
+		a.End = currentTime.Add(resolveDuration)
 		if a.State == notifier.StateInactive {
 			a.End = a.ResolvedAt
 		}
-		a.LastSent = ts
+		a.LastSent = currentTime
 		alerts = append(alerts, *a)
 	}
 	return alerts
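
To summarize the change above: the send decision now depends on the alert's current state, and `endsAt`/`LastSent` are derived from wall-clock time rather than the (possibly stale) evaluation timestamp `ts`. The following self-contained Go sketch restates that logic. The `state` and `alert` types and the `prepareToSend` helper are simplified stand-ins for illustration only (the real code uses `notifier.Alert` and does this work inline in `alertsToSend`); only the branching mirrors the diff.

package main

import (
	"fmt"
	"time"
)

// state and alert are simplified stand-ins for vmalert's notifier.Alert,
// used only to illustrate the branching above.
type state int

const (
	statePending state = iota
	stateFiring
	stateInactive
)

type alert struct {
	State      state
	End        time.Time // endsAt advertised in the last message sent to Alertmanager
	ResolvedAt time.Time
	LastSent   time.Time
}

// needsSending mirrors the needsSending closure from the diff:
// pending alerts are never sent; a firing alert is re-sent if the last
// message was a resolve (its endsAt precedes the send time); a resolved
// alert is sent if it resolved after the last message; otherwise the
// alert is re-sent only once resendDelay has elapsed.
func needsSending(a alert, now time.Time, resendDelay time.Duration) bool {
	if a.State == statePending {
		return false
	}
	if a.State == stateFiring && a.End.Before(a.LastSent) {
		return true
	}
	if a.State == stateInactive && a.ResolvedAt.After(a.LastSent) {
		return true
	}
	return a.LastSent.Add(resendDelay).Before(now)
}

// prepareToSend mirrors the loop body: endsAt is derived from the current
// wall-clock time plus resolveDuration (not from the evaluation timestamp),
// so Alertmanager no longer receives messages that are already expired for
// rules with small intervals.
func prepareToSend(a *alert, now time.Time, resolveDuration time.Duration) {
	a.End = now.Add(resolveDuration)
	if a.State == stateInactive {
		a.End = a.ResolvedAt
	}
	a.LastSent = now
}

func main() {
	now := time.Now()
	// Firing alert whose last message was a resolve: it must be re-announced
	// even though resendDelay has not elapsed yet.
	a := alert{State: stateFiring, End: now.Add(-time.Minute), LastSent: now.Add(-30 * time.Second)}
	fmt.Println(needsSending(a, now, time.Minute)) // true
	prepareToSend(&a, now, 5*time.Minute)
	fmt.Println(a.End.After(now)) // true: fresh endsAt in the future
}

This corresponds to case "c" in the updated TestAlertsToSend below: the last message advertised an endsAt in the past, so a firing message is sent again even within the resendDelay window.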


@@ -43,11 +43,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					"job":      "foo",
 					"instance": "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__": alertMetricName,
@@ -66,11 +68,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					alertStateLabel: "foo",
 					"__name__":      "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__": alertMetricName,
@@ -572,25 +576,33 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				},
 			},
 			[]*notifier.Alert{
-				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 					Labels: map[string]string{
 						"source": "vm",
-					}},
-				{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
+					},
+				},
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
 					Labels: map[string]string{
 						"source": "vm",
-					}},
+					},
+				},
 				//
-				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 					Labels: map[string]string{
 						"foo":    "bar",
 						"source": "vm",
-					}},
-				{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
+					},
+				},
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
 					Labels: map[string]string{
 						"foo":    "bar",
 						"source": "vm",
-					}},
+					},
+				},
 			},
 			nil,
 		},
@@ -1042,7 +1054,7 @@ func TestAlertsToSend(t *testing.T) {
 		for i, a := range alerts {
 			ar.alerts[uint64(i)] = a
 		}
-		gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
+		gotAlerts := ar.alertsToSend(resolveDuration, resendDelay)
 		if gotAlerts == nil && expAlerts == nil {
 			return
 		}
@@ -1058,60 +1070,36 @@ func TestAlertsToSend(t *testing.T) {
 		})
 		for i, exp := range expAlerts {
 			got := gotAlerts[i]
-			if got.LastSent != exp.LastSent {
-				t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
-			}
-			if got.End != exp.End {
-				t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
+			if got.Name != exp.Name {
+				t.Fatalf("expected Name to be %v; got %v", exp.Name, got.Name)
 			}
 		}
 	}
-	f( // send firing alert with custom resolve time
-		[]*notifier.Alert{{State: notifier.StateFiring}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
+	f( // check if firing alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateFiring, Start: ts},
+			// no need to resend firing
+			{Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)},
+			// last message is for resolved, send firing message this time
+			{Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)},
+			// resend firing
+			{Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)},
+		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // resolve inactive alert at the current timestamp
-		[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
-	f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
+	f( // check if resolved alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)},
+			// no need to resend resolved
+			{Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts},
+			// resend resolved
+			{Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)},
		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
-		5*time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was already sent in the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was sent out of the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, time.Minute,
-	)
-	f( // alert must be sent even if resendDelay interval is 0
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, 0,
-	)
-	f( // inactive alert which has been sent already
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // inactive alert which has been resolved after last send
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
 }
 
 func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {


@@ -704,7 +704,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDurati
 		return nil
 	}
 
-	alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay)
+	alerts := ar.alertsToSend(resolveDuration, *resendDelay)
 	if len(alerts) < 1 {
 		return nil
 	}


@@ -77,7 +77,6 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly account for `-rule.resendDelay` for alerting rules that are constantly switching state from inactive to firing. Before, notifications for such rules could have been skipped if state change happened more often than `-rule.resendDelay`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6028) for details.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): respect `-remoteWrite.maxBatchSize` at shutdown period. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6025). Thanks to @jiekun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6039).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fixed response body and headers for AWS Firehose HTTP Destination.
-* BUGFIX: properly wait for force merge to be completed during the shutdown. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5944) for the details.
 
 ## [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)