diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go
index da013ec106..82bc56f6e8 100644
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -655,15 +655,19 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti
 // alertsToSend walks through the current alerts of AlertingRule
 // and returns only those which should be sent to notifier.
 // Isn't concurrent safe.
-func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
+func (ar *AlertingRule) alertsToSend(resolveDuration, resendDelay time.Duration) []notifier.Alert {
+	currentTime := time.Now()
 	needsSending := func(a *notifier.Alert) bool {
 		if a.State == notifier.StatePending {
 			return false
 		}
-		if a.ResolvedAt.After(a.LastSent) {
+		if a.State == notifier.StateFiring && a.End.Before(a.LastSent) {
 			return true
 		}
-		return a.LastSent.Add(resendDelay).Before(ts)
+		if a.State == notifier.StateInactive && a.ResolvedAt.After(a.LastSent) {
+			return true
+		}
+		return a.LastSent.Add(resendDelay).Before(currentTime)
 	}
 
 	var alerts []notifier.Alert
@@ -671,11 +675,11 @@ func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay
 		if !needsSending(a) {
 			continue
 		}
-		a.End = ts.Add(resolveDuration)
+		a.End = currentTime.Add(resolveDuration)
 		if a.State == notifier.StateInactive {
 			a.End = a.ResolvedAt
 		}
-		a.LastSent = ts
+		a.LastSent = currentTime
 		alerts = append(alerts, *a)
 	}
 	return alerts
diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go
index 95539efd49..07dda4273b 100644
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -43,11 +43,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					"job":      "foo",
 					"instance": "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
@@ -66,11 +68,13 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+			&notifier.Alert{
+				State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
 				Labels: map[string]string{
 					alertStateLabel: "foo",
 					"__name__":      "bar",
-				}},
+				},
+			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
@@ -572,25 +576,33 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+			{
+				State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 				Labels: map[string]string{
 					"source": "vm",
-				}},
-			{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
+				},
+			},
+			{
+				State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
 				Labels: map[string]string{
 					"source": "vm",
-				}},
+				},
+			},
 			//
-			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+			{
+				State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
 				Labels: map[string]string{
 					"foo":    "bar",
 					"source": "vm",
-				}},
-			{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
+				},
+			},
+			{
+				State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
 				Labels: map[string]string{
 					"foo":    "bar",
 					"source": "vm",
-				}},
+				},
+			},
 		},
 		nil,
 	},
@@ -1042,7 +1054,7 @@ func TestAlertsToSend(t *testing.T) {
 		for i, a := range alerts {
 			ar.alerts[uint64(i)] = a
 		}
-		gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
+		gotAlerts := ar.alertsToSend(resolveDuration, resendDelay)
 		if gotAlerts == nil && expAlerts == nil {
 			return
 		}
@@ -1058,60 +1070,36 @@ func TestAlertsToSend(t *testing.T) {
 		})
 		for i, exp := range expAlerts {
 			got := gotAlerts[i]
-			if got.LastSent != exp.LastSent {
-				t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
-			}
-			if got.End != exp.End {
-				t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
+			if got.Name != exp.Name {
+				t.Fatalf("expected Name to be %v; got %v", exp.Name, got.Name)
 			}
 		}
 	}
 
-	f( // send firing alert with custom resolve time
-		[]*notifier.Alert{{State: notifier.StateFiring}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
+	f( // check if firing alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateFiring, Start: ts},
+			// no need to resend firing
+			{Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)},
+			// last message is for resolved, send firing message this time
+			{Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)},
+			// resend firing
+			{Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)},
+		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // resolve inactive alert at the current timestamp
-		[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
-	f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
+	f( // check if resolved alerts need to be sent with non-zero resendDelay
+		[]*notifier.Alert{
+			{Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)},
+			// no need to resend resolved
+			{Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts},
+			// resend resolved
+			{Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)},
+		},
+		[]*notifier.Alert{{Name: "a"}, {Name: "c"}},
 		5*time.Minute, time.Minute,
 	)
-	f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
-		[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
-		[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
-		5*time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was already sent in the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // attempt to send alert that was sent out of the resendDelay interval
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, time.Minute,
-	)
-	f( // alert must be sent even if resendDelay interval is 0
-		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
-		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
-		time.Minute, 0,
-	)
-	f( // inactive alert which has been sent already
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
-		nil,
-		time.Minute, time.Minute,
-	)
-	f( // inactive alert which has been resolved after last send
-		[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
-		[]*notifier.Alert{{LastSent: ts, End: ts}},
-		time.Minute, time.Minute,
-	)
 }
 
 func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
diff --git a/app/vmalert/rule/group.go b/app/vmalert/rule/group.go
index b635c1fa33..bf0c24e0db 100644
--- a/app/vmalert/rule/group.go
+++ b/app/vmalert/rule/group.go
@@ -704,7 +704,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDurati
 		return nil
 	}
 
-	alerts := ar.alertsToSend(ts, resolveDuration, *resendDelay)
+	alerts := ar.alertsToSend(resolveDuration, *resendDelay)
 	if len(alerts) < 1 {
 		return nil
 	}
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index ae778bb4e4..9d40dc9ade 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -68,8 +68,10 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix VictoriaLogs UI query handling to correctly apply `_time` filter across all queries. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5920).
 * BUGFIX: [vmselect](https://docs.victoriametrics.com/): make vmselect resilient to absence of cache folder. If cache folder was mistakenly deleted by user or OS, vmselect will try re-creating it first. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5985).
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): limit duration of requests to /api/v1/labels, /api/v1/label/.../values or /api/v1/series with `-search.maxLabelsAPIDuration` duration. Before, `-search.maxExportDuration` value was used by mistake. Thanks to @kbweave for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5992).
-* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fixed response body and headers for AWS Firehose HTTP Destination.
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fix response body and headers for AWS Firehose HTTP Destination.
 * BUGFIX: properly wait for force merge to be completed during the shutdown. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5944) for the details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): set correct `endsAt` value in notifications sent to the Alertmanager. Previously, a rule with an evaluation interval lower than 10s could never be triggered. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5995) for details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly account for `-rule.resendDelay` for alerting rules that are constantly switching state from inactive to firing. Before, notifications for such rules could have been skipped if state changes happened more often than `-rule.resendDelay`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6028) for details.
 
 ## [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)
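
For reviewers, here is a minimal, self-contained Go sketch of the resend decision implemented by the patched `alertsToSend`. The `alert` struct and its string-valued `state` field are simplified stand-ins for vmalert's `notifier.Alert` and its state constants; only the fields consulted by the decision are kept, and the sample names are hypothetical:

```go
package main

import (
	"fmt"
	"time"
)

// alert is a trimmed-down stand-in for notifier.Alert: only the fields
// consulted by the resend decision are kept.
type alert struct {
	name       string
	state      string    // "pending", "firing" or "inactive"
	end        time.Time // end of the alert as reported in the last notification
	resolvedAt time.Time // when the alert switched to inactive
	lastSent   time.Time // when the last notification for this alert was sent
}

// needsSending mirrors the decision order of the patched helper:
//  1. pending alerts are never sent;
//  2. a firing alert whose previous notification was a resolve
//     (End before LastSent) is re-sent immediately;
//  3. an inactive alert resolved after its last notification is sent;
//  4. otherwise the resendDelay throttle applies.
func needsSending(a alert, now time.Time, resendDelay time.Duration) bool {
	if a.state == "pending" {
		return false
	}
	if a.state == "firing" && a.end.Before(a.lastSent) {
		return true
	}
	if a.state == "inactive" && a.resolvedAt.After(a.lastSent) {
		return true
	}
	return a.lastSent.Add(resendDelay).Before(now)
}

func main() {
	now := time.Now()
	alerts := []alert{
		// never notified before: zero lastSent is older than now minus resendDelay
		{name: "never-sent-firing", state: "firing"},
		// notified 30s ago and still firing: throttled by resendDelay
		{name: "recently-sent-firing", state: "firing", lastSent: now.Add(-30 * time.Second), end: now.Add(5 * time.Minute)},
		// last notification was a resolve, but the alert fired again: send now
		{name: "flapped-back-to-firing", state: "firing", lastSent: now.Add(-30 * time.Second), end: now.Add(-time.Minute)},
		// resolved after the last notification: send the resolve
		{name: "just-resolved", state: "inactive", resolvedAt: now, lastSent: now.Add(-30 * time.Second)},
	}
	for _, a := range alerts {
		fmt.Printf("%-22s send=%v\n", a.name, needsSending(a, now, time.Minute))
	}
}
```

The "flapped-back-to-firing" case corresponds to alert `c` in the updated `TestAlertsToSend`: when the previous notification was a resolve (its `End` precedes `LastSent`), the firing notification goes out immediately instead of waiting out `resendDelay`, which is the flapping scenario the second CHANGELOG entry describes.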