vmalert: fix sending alert messages (#6028)
* vmalert: fix sending alert messages

  1. fix the `endsAt` field in messages sent to Alertmanager; previously, a rule with a small interval could never be triggered;
  2. fix the behavior of `-rule.resendDelay`; before, it could prevent sending a firing message when the rule state is volatile.

* docs: update changelog notes

Signed-off-by: hagen1778 <roman@victoriametrics.com>

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent 77eca6bb37
commit d7224b2d1c
4 changed files with 60 additions and 66 deletions
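The behavioral change is easiest to see in the `needsSending` decision inside `alertsToSend` (first diff hunk below): pending alerts are never sent; a firing alert whose previously sent message carried an `endsAt` older than the last send time (i.e. the last notification effectively told Alertmanager the alert was resolved) is re-sent immediately; an inactive alert that resolved after the last send gets one resolve message; everything else is throttled by `resendDelay`. The following is a minimal, self-contained sketch of that decision, not the vmalert code itself — the `alert` type and state constants are simplified stand-ins for `notifier.Alert`:

// Sketch of the resend decision introduced by this commit (illustrative only).
package main

import (
	"fmt"
	"time"
)

type state int

const (
	statePending state = iota
	stateFiring
	stateInactive
)

type alert struct {
	Name       string
	State      state
	End        time.Time // endsAt carried by the previously sent notification
	LastSent   time.Time // when the alert was last pushed to Alertmanager
	ResolvedAt time.Time // when the alert switched to inactive
}

// needsSending mirrors the logic added in alertsToSend.
func needsSending(a alert, now time.Time, resendDelay time.Duration) bool {
	if a.State == statePending {
		return false // pending alerts are never sent
	}
	if a.State == stateFiring && a.End.Before(a.LastSent) {
		// the last message effectively resolved the alert,
		// so a new firing message must go out right away
		return true
	}
	if a.State == stateInactive && a.ResolvedAt.After(a.LastSent) {
		return true // alert resolved after the last notification
	}
	// otherwise throttle by resendDelay
	return a.LastSent.Add(resendDelay).Before(now)
}

func main() {
	now := time.Now()
	resendDelay := time.Minute

	alerts := []alert{
		{Name: "pending", State: statePending},
		{Name: "firing-throttled", State: stateFiring, LastSent: now.Add(-30 * time.Second), End: now.Add(5 * time.Minute)},
		{Name: "firing-after-resolve", State: stateFiring, LastSent: now.Add(-30 * time.Second), End: now.Add(-time.Minute)},
		{Name: "resolved", State: stateInactive, ResolvedAt: now, LastSent: now.Add(-30 * time.Second)},
	}
	for _, a := range alerts {
		fmt.Printf("%-22s send=%v\n", a.Name, needsSending(a, now, resendDelay))
	}
}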
@@ -655,15 +655,19 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti
 // alertsToSend walks through the current alerts of AlertingRule
 // and returns only those which should be sent to notifier.
 // Isn't concurrent safe.
-func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
+func (ar *AlertingRule) alertsToSend(resolveDuration, resendDelay time.Duration) []notifier.Alert {
+	currentTime := time.Now()
 	needsSending := func(a *notifier.Alert) bool {
 		if a.State == notifier.StatePending {
 			return false
 		}
-		if a.ResolvedAt.After(a.LastSent) {
+		if a.State == notifier.StateFiring && a.End.Before(a.LastSent) {
 			return true
 		}
-		return a.LastSent.Add(resendDelay).Before(ts)
+		if a.State == notifier.StateInactive && a.ResolvedAt.After(a.LastSent) {
+			return true
+		}
+		return a.LastSent.Add(resendDelay).Before(currentTime)
 	}

 	var alerts []notifier.Alert

@@ -671,11 +675,11 @@ func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay
 		if !needsSending(a) {
 			continue
 		}
-		a.End = ts.Add(resolveDuration)
+		a.End = currentTime.Add(resolveDuration)
 		if a.State == notifier.StateInactive {
 			a.End = a.ResolvedAt
 		}
-		a.LastSent = ts
+		a.LastSent = currentTime
 		alerts = append(alerts, *a)
 	}
 	return alerts
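The second hunk above changes how the outgoing message is stamped: `End` (the `endsAt` field Alertmanager uses to auto-resolve an alert) and `LastSent` are now derived from the current wall-clock time instead of the rule evaluation timestamp `ts`, and for an inactive alert `End` is pinned to its actual resolve time. A small self-contained sketch of that stamping step under simplified assumptions (the `resolved`/`resolvedAt` parameters stand in for the alert state, and the duration in `main` is an illustrative value only):

// Sketch of the endsAt computation described above (illustrative only).
package main

import (
	"fmt"
	"time"
)

// computeEndsAt sketches how the patched code derives the endsAt value sent
// to Alertmanager: current wall-clock time plus resolveDuration for firing
// alerts, and the actual resolve time for resolved alerts.
func computeEndsAt(now time.Time, resolveDuration time.Duration, resolved bool, resolvedAt time.Time) time.Time {
	if resolved {
		return resolvedAt // resolved alerts end exactly when they resolved
	}
	return now.Add(resolveDuration) // firing alerts stay active until the next refresh
}

func main() {
	now := time.Now()
	resolveDuration := 4 * time.Minute // illustrative value, not the vmalert default

	// A firing alert refreshed now stays active for resolveDuration; Alertmanager
	// auto-resolves it only if vmalert stops refreshing it before endsAt passes.
	fmt.Println("firing endsAt:  ", computeEndsAt(now, resolveDuration, false, time.Time{}))

	// A resolved alert carries its real resolve time, so Alertmanager marks it
	// resolved at that exact moment.
	fmt.Println("resolved endsAt:", computeEndsAt(now, resolveDuration, true, now.Add(-30*time.Second)))
}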
@@ -43,11 +43,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					"job":      "foo",
 					"instance": "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__": alertMetricName,

@@ -66,11 +68,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					alertStateLabel: "foo",
 					"__name__":      "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__": alertMetricName,

@@ -572,25 +576,33 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				},
 			},
 			[]*notifier.Alert{
-				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 					Labels: map[string]string{
 						"source": "vm",
-					}},
-				{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
+					},
+				},
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
 					Labels: map[string]string{
 						"source": "vm",
-					}},
+					},
+				},
 				//
-				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 					Labels: map[string]string{
 						"foo":    "bar",
 						"source": "vm",
-					}},
-				{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
+					},
+				},
+				{
+					State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
 					Labels: map[string]string{
 						"foo":    "bar",
 						"source": "vm",
-					}},
+					},
+				},
 			},
 			nil,
 		},

@@ -1042,7 +1054,7 @@ func TestAlertsToSend(t *testing.T) {
 		for i, a := range alerts {
 			ar.alerts[uint64(i)] = a
 		}
-		gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
+		gotAlerts := ar.alertsToSend(resolveDuration, resendDelay)
 		if gotAlerts == nil && expAlerts == nil {
 			return
 		}

@@ -1058,60 +1070,36 @@ func TestAlertsToSend(t *testing.T) {
 		})
 		for i, exp := range expAlerts {
 			got := gotAlerts[i]
-			if got.LastSent != exp.LastSent {
-				t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
-			}
-			if got.End != exp.End {
-				t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
+			if got.Name != exp.Name {
+				t.Fatalf("expected Name to be %v; got %v", exp.Name, got.Name)
 			}
 		}
 	}

-	f( // send firing alert with custom resolve time
-		[]*notifier.Alert{{State: notifier.StateFiring}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
+	f( // check if firing alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateFiring, Start: ts},
+			// no need to resend firing
+			{Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)},
+			// last message is for resolved, send firing message this time
+			{Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)},
+			// resend firing
+			{Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)},
+		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // resolve inactive alert at the current timestamp
-		[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
-	f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
+	f( // check if resolved alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)},
+			// no need to resend resolved
+			{Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts},
+			// resend resolved
+			{Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)},
+		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
-		5*time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was already sent in the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was sent out of the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, time.Minute,
-	)
-	f( // alert must be sent even if resendDelay interval is 0
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, 0,
-	)
-	f( // inactive alert which has been sent already
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // inactive alert which has been resolved after last send
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
 }

 func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
@@ -704,7 +704,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDurati
 		return nil
 	}

-	alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay)
+	alerts := ar.alertsToSend(resolveDuration, *resendDelay)
 	if len(alerts) < 1 {
 		return nil
 	}
@@ -68,8 +68,10 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix VictoriaLogs UI query handling to correctly apply `_time` filter across all queries. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5920).
 * BUGFIX: [vmselect](https://docs.victoriametrics.com/): make vmselect resilient to absence of cache folder. If cache folder was mistakenly deleted by user or OS, vmselect will try re-creating it first. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5985).
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): limit duration of requests to /api/v1/labels, /api/v1/label/.../values or /api/v1/series with `-search.maxLabelsAPIDuration` duration. Before, `-search.maxExportDuration` value was used by mistake. Thanks to @kbweave for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5992).
-* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fixed response body and headers for AWS Firehose HTTP Destination.
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fix response body and headers for AWS Firehose HTTP Destination.
 * BUGFIX: properly wait for force merge to be completed during the shutdown. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5944) for the details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): set correct `endsAt` value in notifications sent to the Alertmanager. Previously, a rule with evaluation intervals lower than 10s could never be triggered. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5995) for details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly account for `-rule.resendDelay` for alerting rules that are constantly switching state from inactive to firing. Before, notifications for such rules could have been skipped if state change happened more often than `-rule.resendDelay`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6028) for details.

 ## [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)
