Added resendDelay for alerts (#2296)

* vmalert: add support for the `resendDelay` flag for alerts

Co-authored-by: dmitryk-dk <dmitry.kozlov@brightlocal.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
Dmytro Kozlov 2022-03-16 17:26:33 +02:00 committed by GitHub
parent 8ae9825bb4
commit 11ae1ae924
9 changed files with 158 additions and 28 deletions

View file

@@ -774,6 +774,8 @@ The shortlist of configuration flags is the following:
      Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
  -rule.maxResolveDuration duration
      Limits the maximum duration for automatic alert expiration, which is by default equal to 4 evaluation intervals of the parent group.
  -rule.resendDelay duration
      Minimum amount of time to wait before resending an alert to notifier
  -rule.validateExpressions
      Whether to validate rules expressions via MetricsQL engine (default true)
  -rule.validateTemplates
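
For illustration, a minimal, self-contained sketch of what the new resend semantics mean in practice (the `shouldResend` helper and the sample values are assumptions for this sketch, not vmalert code; the real check lives in `alertsToSend` further down):

package main

import (
	"fmt"
	"time"
)

// shouldResend reports whether a firing alert that was last sent at lastSent
// may be sent to the notifier again at now, given the configured resend delay.
func shouldResend(now, lastSent time.Time, resendDelay time.Duration) bool {
	return now.Sub(lastSent) >= resendDelay
}

func main() {
	now := time.Now()
	// With -rule.resendDelay=5m, an alert sent 1 minute ago is skipped on this evaluation...
	fmt.Println(shouldResend(now, now.Add(-time.Minute), 5*time.Minute)) // false
	// ...while an alert sent 10 minutes ago is re-sent.
	fmt.Println(shouldResend(now, now.Add(-10*time.Minute), 5*time.Minute)) // true
}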

View file

@@ -550,3 +550,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
	}
	return nil
}

// alertsToSend walks through the current alerts of AlertingRule
// and returns only those which should be sent to notifier.
// Isn't concurrent safe.
func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
	var alerts []notifier.Alert
	for _, a := range ar.alerts {
		switch a.State {
		case notifier.StateFiring:
			if time.Since(a.LastSent) < resendDelay {
				continue
			}
			a.End = ts.Add(resolveDuration)
			a.LastSent = ts
			alerts = append(alerts, *a)
		case notifier.StateInactive:
			a.End = ts
			a.LastSent = ts
			alerts = append(alerts, *a)
		}
	}
	return alerts
}
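
Concretely, matching the test cases in the next file: with resendDelay = 1m, a firing alert whose LastSent is only 1s old is skipped, a firing alert last sent 2m ago is returned with End = ts + resolveDuration and LastSent = ts, pending alerts are never returned, and inactive alerts are returned with End = LastSent = ts so that the notifier can resolve them.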

View file

@@ -4,6 +4,7 @@ import (
	"context"
	"errors"
	"reflect"
	"sort"
	"strings"
	"testing"
	"time"
@@ -781,6 +782,76 @@ func TestAlertingRule_Template(t *testing.T) {
	}
}

func TestAlertsToSend(t *testing.T) {
	ts := time.Now()
	f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) {
		t.Helper()
		ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)}
		for i, a := range alerts {
			ar.alerts[uint64(i)] = a
		}
		gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
		if gotAlerts == nil && expAlerts == nil {
			return
		}
		if len(gotAlerts) != len(expAlerts) {
			t.Fatalf("expected to get %d alerts; got %d instead",
				len(expAlerts), len(gotAlerts))
		}
		sort.Slice(expAlerts, func(i, j int) bool {
			return expAlerts[i].Name < expAlerts[j].Name
		})
		sort.Slice(gotAlerts, func(i, j int) bool {
			return gotAlerts[i].Name < gotAlerts[j].Name
		})
		for i, exp := range expAlerts {
			got := gotAlerts[i]
			if got.LastSent != exp.LastSent {
				t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
			}
			if got.End != exp.End {
				t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
			}
		}
	}

	f( // send firing alert with custom resolve time
		[]*notifier.Alert{{State: notifier.StateFiring}},
		[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
		5*time.Minute, time.Minute,
	)
	f( // resolve inactive alert at the current timestamp
		[]*notifier.Alert{{State: notifier.StateInactive}},
		[]*notifier.Alert{{LastSent: ts, End: ts}},
		time.Minute, time.Minute,
	)
	f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
		[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive}},
		[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
		5*time.Minute, time.Minute,
	)
	f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
		[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive}},
		[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
		5*time.Minute, time.Minute,
	)
	f( // attempt to send alert that was already sent in the resendDelay interval
		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
		nil,
		time.Minute, time.Minute,
	)
	f( // attempt to send alert that was sent out of the resendDelay interval
		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
		time.Minute, time.Minute,
	)
	f( // alert must be sent even if resendDelay interval is 0
		[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
		[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
		time.Minute, 0,
	)
}

func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
	r := newTestAlertingRule(name, 0)
	r.Labels = make(map[string]string)

View file

@@ -0,0 +1,12 @@
groups:
  - name: groupTest
    interval: 1s
    rules:
      - alert: VMRows
        for: 2s
        expr: sum(rate(vm_http_request_errors_total[2s])) > 0
        labels:
          label: bar
          host: "{{ $labels.instance }}"
        annotations:
          summary: "{{ $value }}"

View file

@@ -277,8 +277,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
			g.metrics.iterationTotal.Inc()
			iterationStart := time.Now()
			if len(g.Rules) > 0 {
				resolveDuration := getResolveDuration(g.Interval)
				errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
				errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, getResolveDuration(g.Interval))
				for err := range errs {
					if err != nil {
						logger.Errorf("group %q: %s", g.Name, err)
@@ -291,15 +290,18 @@ }
	}
}

// resolveDuration for alerts is equal to 3 interval evaluations
// so in case if vmalert stops sending updates for some reason,
// notifier could automatically resolve the alert.
// getResolveDuration returns the duration after which firing alert
// can be considered as resolved.
func getResolveDuration(groupInterval time.Duration) time.Duration {
	resolveInterval := groupInterval * 3
	if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) {
		return *maxResolveDuration
	delta := *resendDelay
	if groupInterval > delta {
		delta = groupInterval
	}
	return resolveInterval
	resolveDuration := delta * 4
	if *maxResolveDuration > 0 && resolveDuration > *maxResolveDuration {
		resolveDuration = *maxResolveDuration
	}
	return resolveDuration
}

type executor struct {
@@ -370,19 +372,8 @@ func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Dur
	if !ok {
		return nil
	}

	var alerts []notifier.Alert
	for _, a := range ar.alerts {
		switch a.State {
		case notifier.StateFiring:
			a.End = now.Add(resolveDuration)
			alerts = append(alerts, *a)
		case notifier.StateInactive:
			// set End to execStart to notify
			// that it was just resolved
			a.End = now
			alerts = append(alerts, *a)
		}
	}
	alerts := ar.alertsToSend(now, resolveDuration, *resendDelay)
	if len(alerts) < 1 {
		return nil
	}
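
To make the resolve-duration formula above concrete, here is a small standalone re-implementation (illustrative only; `resolveDuration` is a stand-in for `getResolveDuration`, with the flag values passed as plain arguments). The printed results match rows of the TestResolveDuration table below:

package main

import (
	"fmt"
	"time"
)

// resolveDuration mirrors the logic above: take the larger of the group's
// evaluation interval and the resend delay, multiply it by 4, and cap the
// result at maxResolveDuration when that limit is set.
func resolveDuration(groupInterval, resendDelay, maxResolveDuration time.Duration) time.Duration {
	delta := resendDelay
	if groupInterval > delta {
		delta = groupInterval
	}
	d := delta * 4
	if maxResolveDuration > 0 && d > maxResolveDuration {
		d = maxResolveDuration
	}
	return d
}

func main() {
	fmt.Println(resolveDuration(time.Minute, 0, 0))                         // 4m0s: 4 evaluation intervals by default
	fmt.Println(resolveDuration(time.Minute, 2*time.Minute, 0))             // 8m0s: resendDelay exceeds the interval
	fmt.Println(resolveDuration(2*time.Minute, 2*time.Minute, time.Minute)) // 1m0s: capped by maxResolveDuration
}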

View file

@@ -158,10 +158,11 @@ func TestGroupStart(t *testing.T) {
	if err != nil {
		t.Fatalf("failed to parse rules: %s", err)
	}

	const evalInterval = time.Millisecond
	fs := &fakeQuerier{}
	fn := &fakeNotifier{}

	const evalInterval = time.Millisecond
	g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
	g.Concurrency = 2
@@ -223,6 +224,12 @@ func TestGroupStart(t *testing.T) {
	expectedAlerts := []notifier.Alert{*alert1, *alert2}
	compareAlerts(t, expectedAlerts, gotAlerts)

	gotAlertsNum := fn.getCounter()
	if gotAlertsNum < len(expectedAlerts)*2 {
		t.Fatalf("expected to receive at least %d alerts; got %d instead",
			len(expectedAlerts)*2, gotAlertsNum)
	}
	// reset previous data
	fs.reset()
	// and set only one datapoint for response
@@ -243,18 +250,29 @@ func TestResolveDuration(t *testing.T) {
	testCases := []struct {
		groupInterval time.Duration
		maxDuration   time.Duration
		resendDelay   time.Duration
		expected      time.Duration
	}{
		{time.Minute, 0, 3 * time.Minute},
		{3 * time.Minute, 0, 9 * time.Minute},
		{time.Minute, 2 * time.Minute, 2 * time.Minute},
		{0, 0, 0},
		{time.Minute, 0, 0, 4 * time.Minute},
		{time.Minute, 0, 2 * time.Minute, 8 * time.Minute},
		{time.Minute, 4 * time.Minute, 4 * time.Minute, 4 * time.Minute},
		{2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute},
		{time.Minute, 2 * time.Minute, 1 * time.Minute, 2 * time.Minute},
		{2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute},
		{0, 0, 0, 0},
	}

	defaultResolveDuration := *maxResolveDuration
	defer func() { *maxResolveDuration = defaultResolveDuration }()
	defaultResendDelay := *resendDelay
	defer func() {
		*maxResolveDuration = defaultResolveDuration
		*resendDelay = defaultResendDelay
	}()

	for _, tc := range testCases {
		t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) {
			*maxResolveDuration = tc.maxDuration
			*resendDelay = tc.resendDelay
			got := getResolveDuration(tc.groupInterval)
			if got != tc.expected {
				t.Errorf("expected to have %v; got %v", tc.expected, got)
View file

@@ -61,6 +61,8 @@ func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric,
type fakeNotifier struct {
	sync.Mutex
	alerts []notifier.Alert
	// records number of received alerts in total
	counter int
}

func (*fakeNotifier) Close() {}
@@ -68,10 +70,17 @@ func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
	fn.Lock()
	defer fn.Unlock()
	fn.counter += len(alerts)
	fn.alerts = alerts
	return nil
}

func (fn *fakeNotifier) getCounter() int {
	fn.Lock()
	defer fn.Unlock()
	return fn.counter
}

func (fn *fakeNotifier) getAlerts() []notifier.Alert {
	fn.Lock()
	defer fn.Unlock()
View file

@@ -47,6 +47,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
	validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
	maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
		"which is by default equal to 4 evaluation intervals of the parent group.")
	resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
	externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
	externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
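
As a standalone illustration of how such a duration flag behaves with the standard library (assumed example value; this is not vmalert's startup code):

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Stand-in for the -rule.resendDelay definition above: a time.Duration flag
	// defaulting to 0, which keeps the previous behaviour of sending firing alerts
	// on every evaluation.
	resendDelay := flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
	// A value such as -rule.resendDelay=30s parses into 30 * time.Second.
	if err := flag.CommandLine.Parse([]string{"-rule.resendDelay=30s"}); err != nil {
		panic(err)
	}
	fmt.Println(*resendDelay) // 30s
}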

View file

@@ -30,6 +30,8 @@ type Alert struct {
	Start time.Time
	// End defines the moment of time when Alert supposed to expire
	End time.Time
	// LastSent defines the moment when Alert was sent last time
	LastSent time.Time
	// Value stores the value returned from evaluating expression from Expr field
	Value float64
	// ID is the unique identifier for the Alert
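
A short sketch of how a receiver may use the End timestamp (assumed receiver-side behaviour, following the note above that the notifier can auto-resolve an alert if vmalert stops sending updates; `isAutoResolved` is a hypothetical helper):

package main

import (
	"fmt"
	"time"
)

// isAutoResolved treats an alert as resolved once its End timestamp has passed
// without a fresher notification arriving.
func isAutoResolved(end, now time.Time) bool {
	return now.After(end)
}

func main() {
	sentAt := time.Now()
	end := sentAt.Add(4 * time.Minute) // e.g. a resolve duration of 4 evaluation intervals
	fmt.Println(isAutoResolved(end, sentAt.Add(2*time.Minute))) // false: alert still considered firing
	fmt.Println(isAutoResolved(end, sentAt.Add(6*time.Minute))) // true: no update received before End
}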