Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git
Added resendDelay for alerts (#2296)

vmalert: add support of `resendDelay` flag for alerts

Co-authored-by: dmitryk-dk <dmitry.kozlov@brightlocal.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>

Commit 2f350c200a (parent f0e96a84db)
9 changed files with 158 additions and 28 deletions
@@ -774,6 +774,8 @@ The shortlist of configuration flags is the following:
     Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead
   -rule.maxResolveDuration duration
     Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group.
+  -rule.resendDelay duration
+    Minimum amount of time to wait before resending an alert to notifier
   -rule.validateExpressions
     Whether to validate rules expressions via MetricsQL engine (default true)
   -rule.validateTemplates
@@ -550,3 +550,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
     }
     return nil
 }
+
+// alertsToSend walks through the current alerts of AlertingRule
+// and returns only those which should be sent to notifier.
+// Isn't concurrent safe.
+func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
+    var alerts []notifier.Alert
+    for _, a := range ar.alerts {
+        switch a.State {
+        case notifier.StateFiring:
+            if time.Since(a.LastSent) < resendDelay {
+                continue
+            }
+            a.End = ts.Add(resolveDuration)
+            a.LastSent = ts
+            alerts = append(alerts, *a)
+        case notifier.StateInactive:
+            a.End = ts
+            a.LastSent = ts
+            alerts = append(alerts, *a)
+        }
+    }
+    return alerts
+}
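The heart of the change is the resend gate above: a firing alert is skipped while its last notification is still fresher than the configured delay, and LastSent is refreshed whenever the alert is handed over. A minimal standalone sketch of that rule follows; shouldResend is a hypothetical helper, not part of vmalert, and the sample calls mirror the test cases further down.

package main

import (
    "fmt"
    "time"
)

// shouldResend reports whether a firing alert last notified at lastSent
// should be sent again now, given the configured resend delay.
// It mirrors the `time.Since(a.LastSent) < resendDelay` check in alertsToSend.
func shouldResend(lastSent time.Time, resendDelay time.Duration) bool {
    return time.Since(lastSent) >= resendDelay
}

func main() {
    d := time.Minute
    fmt.Println(shouldResend(time.Now().Add(-time.Second), d))   // false: still within the resend delay
    fmt.Println(shouldResend(time.Now().Add(-2*time.Minute), d)) // true: last notification is older than the delay
    fmt.Println(shouldResend(time.Now().Add(-time.Second), 0))   // true: a zero delay never suppresses resending
}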
@@ -4,6 +4,7 @@ import (
     "context"
     "errors"
     "reflect"
+    "sort"
     "strings"
     "testing"
     "time"
@@ -781,6 +782,76 @@ func TestAlertingRule_Template(t *testing.T) {
         }
 }
+
+func TestAlertsToSend(t *testing.T) {
+    ts := time.Now()
+    f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) {
+        t.Helper()
+        ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)}
+        for i, a := range alerts {
+            ar.alerts[uint64(i)] = a
+        }
+        gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
+        if gotAlerts == nil && expAlerts == nil {
+            return
+        }
+        if len(gotAlerts) != len(expAlerts) {
+            t.Fatalf("expected to get %d alerts; got %d instead",
+                len(expAlerts), len(gotAlerts))
+        }
+        sort.Slice(expAlerts, func(i, j int) bool {
+            return expAlerts[i].Name < expAlerts[j].Name
+        })
+        sort.Slice(gotAlerts, func(i, j int) bool {
+            return gotAlerts[i].Name < gotAlerts[j].Name
+        })
+        for i, exp := range expAlerts {
+            got := gotAlerts[i]
+            if got.LastSent != exp.LastSent {
+                t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
+            }
+            if got.End != exp.End {
+                t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
+            }
+        }
+    }
+
+    f( // send firing alert with custom resolve time
+        []*notifier.Alert{{State: notifier.StateFiring}},
+        []*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
+        5*time.Minute, time.Minute,
+    )
+    f( // resolve inactive alert at the current timestamp
+        []*notifier.Alert{{State: notifier.StateInactive}},
+        []*notifier.Alert{{LastSent: ts, End: ts}},
+        time.Minute, time.Minute,
+    )
+    f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
+        []*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive}},
+        []*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
+        5*time.Minute, time.Minute,
+    )
+    f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
+        []*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive}},
+        []*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
+        5*time.Minute, time.Minute,
+    )
+    f( // attempt to send alert that was already sent in the resendDelay interval
+        []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
+        nil,
+        time.Minute, time.Minute,
+    )
+    f( // attempt to send alert that was sent out of the resendDelay interval
+        []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
+        []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
+        time.Minute, time.Minute,
+    )
+    f( // alert must be sent even if resendDelay interval is 0
+        []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
+        []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
+        time.Minute, 0,
+    )
+}
 
 func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
     r := newTestAlertingRule(name, 0)
     r.Labels = make(map[string]string)
app/vmalert/config/testdata/rules_interval_good.rules (vendored, new file, 12 lines added)
@@ -0,0 +1,12 @@
+groups:
+  - name: groupTest
+    interval: 1s
+    rules:
+      - alert: VMRows
+        for: 2s
+        expr: sum(rate(vm_http_request_errors_total[2s])) > 0
+        labels:
+          label: bar
+          host: "{{ $labels.instance }}"
+        annotations:
+          summary: "{{ $value }}"
@@ -277,8 +277,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
             g.metrics.iterationTotal.Inc()
             iterationStart := time.Now()
             if len(g.Rules) > 0 {
-                resolveDuration := getResolveDuration(g.Interval)
-                errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
+                errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, getResolveDuration(g.Interval))
                 for err := range errs {
                     if err != nil {
                         logger.Errorf("group %q: %s", g.Name, err)
@@ -291,15 +290,18 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
         }
     }
 
-// resolveDuration for alerts is equal to 3 interval evaluations
-// so in case if vmalert stops sending updates for some reason,
-// notifier could automatically resolve the alert.
+// getResolveDuration returns the duration after which firing alert
+// can be considered as resolved.
 func getResolveDuration(groupInterval time.Duration) time.Duration {
-    resolveInterval := groupInterval * 3
-    if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) {
-        return *maxResolveDuration
+    delta := *resendDelay
+    if groupInterval > delta {
+        delta = groupInterval
     }
-    return resolveInterval
+    resolveDuration := delta * 4
+    if *maxResolveDuration > 0 && resolveDuration > *maxResolveDuration {
+        resolveDuration = *maxResolveDuration
+    }
+    return resolveDuration
 }
 
 type executor struct {
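Read as a pure function of its inputs, the new rule is: take the larger of the group's evaluation interval and the resend delay, allow four such periods before the notifier may auto-resolve, and cap the result with -rule.maxResolveDuration when that flag is set. The sketch below restates it with the flag values passed as parameters (resolveDurationFor is an illustrative name, not vmalert code); the sample values match rows of the updated TestResolveDuration table.

package main

import (
    "fmt"
    "time"
)

// resolveDurationFor restates getResolveDuration with the two flag values
// passed explicitly instead of being read from package-level flags.
func resolveDurationFor(groupInterval, resendDelay, maxResolveDuration time.Duration) time.Duration {
    delta := resendDelay
    if groupInterval > delta {
        delta = groupInterval // use the slower of the two cadences
    }
    d := delta * 4 // give the notifier four periods to receive a refresh
    if maxResolveDuration > 0 && d > maxResolveDuration {
        d = maxResolveDuration // hard cap, when configured
    }
    return d
}

func main() {
    fmt.Println(resolveDurationFor(time.Minute, 0, 0))                         // 4m0s
    fmt.Println(resolveDurationFor(time.Minute, 2*time.Minute, 0))             // 8m0s
    fmt.Println(resolveDurationFor(2*time.Minute, 2*time.Minute, time.Minute)) // 1m0s (capped)
}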
@@ -370,19 +372,8 @@ func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Dur
     if !ok {
         return nil
     }
-    var alerts []notifier.Alert
-    for _, a := range ar.alerts {
-        switch a.State {
-        case notifier.StateFiring:
-            a.End = now.Add(resolveDuration)
-            alerts = append(alerts, *a)
-        case notifier.StateInactive:
-            // set End to execStart to notify
-            // that it was just resolved
-            a.End = now
-            alerts = append(alerts, *a)
-        }
-    }
+
+    alerts := ar.alertsToSend(now, resolveDuration, *resendDelay)
     if len(alerts) < 1 {
         return nil
     }
@@ -158,10 +158,11 @@ func TestGroupStart(t *testing.T) {
     if err != nil {
         t.Fatalf("failed to parse rules: %s", err)
     }
-    const evalInterval = time.Millisecond
 
     fs := &fakeQuerier{}
     fn := &fakeNotifier{}
 
+    const evalInterval = time.Millisecond
     g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
     g.Concurrency = 2
@@ -223,6 +224,12 @@
     expectedAlerts := []notifier.Alert{*alert1, *alert2}
     compareAlerts(t, expectedAlerts, gotAlerts)
+
+    gotAlertsNum := fn.getCounter()
+    if gotAlertsNum < len(expectedAlerts)*2 {
+        t.Fatalf("expected to receive at least %d alerts; got %d instead",
+            len(expectedAlerts)*2, gotAlertsNum)
+    }
 
     // reset previous data
     fs.reset()
     // and set only one datapoint for response
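The new counter assertion presumably exercises the resend path end to end: with the default -rule.resendDelay of 0 and the millisecond evaluation interval, each firing alert should be delivered on every evaluation, so by the time the check runs every expected alert must have been sent at least twice.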
@@ -243,18 +250,29 @@ func TestResolveDuration(t *testing.T) {
     testCases := []struct {
         groupInterval time.Duration
         maxDuration   time.Duration
+        resendDelay   time.Duration
         expected      time.Duration
     }{
-        {time.Minute, 0, 3 * time.Minute},
-        {3 * time.Minute, 0, 9 * time.Minute},
-        {time.Minute, 2 * time.Minute, 2 * time.Minute},
-        {0, 0, 0},
+        {time.Minute, 0, 0, 4 * time.Minute},
+        {time.Minute, 0, 2 * time.Minute, 8 * time.Minute},
+        {time.Minute, 4 * time.Minute, 4 * time.Minute, 4 * time.Minute},
+        {2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute},
+        {time.Minute, 2 * time.Minute, 1 * time.Minute, 2 * time.Minute},
+        {2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute},
+        {0, 0, 0, 0},
     }
+
     defaultResolveDuration := *maxResolveDuration
-    defer func() { *maxResolveDuration = defaultResolveDuration }()
+    defaultResendDelay := *resendDelay
+    defer func() {
+        *maxResolveDuration = defaultResolveDuration
+        *resendDelay = defaultResendDelay
+    }()
+
     for _, tc := range testCases {
         t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) {
             *maxResolveDuration = tc.maxDuration
+            *resendDelay = tc.resendDelay
             got := getResolveDuration(tc.groupInterval)
             if got != tc.expected {
                 t.Errorf("expected to have %v; got %v", tc.expected, got)
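Worked through the new formula: in the {2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute} case, delta = max(2m, 2m) = 2m and 4*delta = 8m, which the 1m maxResolveDuration cap reduces to the expected 1m; in {2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute}, delta = max(2m, 1m) = 2m gives 8m and stays uncapped because maxResolveDuration is 0.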
@@ -61,6 +61,8 @@ func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric,
 type fakeNotifier struct {
     sync.Mutex
     alerts []notifier.Alert
+    // records number of received alerts in total
+    counter int
 }
 
 func (*fakeNotifier) Close() {}
@@ -68,10 +70,17 @@ func (*fakeNotifier) Addr() string { return "" }
 func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
     fn.Lock()
     defer fn.Unlock()
+    fn.counter += len(alerts)
     fn.alerts = alerts
     return nil
 }
 
+func (fn *fakeNotifier) getCounter() int {
+    fn.Lock()
+    defer fn.Unlock()
+    return fn.counter
+}
+
 func (fn *fakeNotifier) getAlerts() []notifier.Alert {
     fn.Lock()
     defer fn.Unlock()
@@ -47,6 +47,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
     validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
     maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
         "which is by default equal to 3 evaluation intervals of the parent group.")
+    resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
+
     externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
     externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
         eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
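Both options follow the standard library flag.Duration pattern, so values like 30s or 5m parse directly from the command line. Below is a minimal standalone sketch of that pattern reusing the same flag names and zero defaults as the hunk above; it is an illustration, not vmalert's actual main.go.

package main

import (
    "flag"
    "fmt"
    "time"
)

var (
    maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration")
    resendDelay        = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
)

func main() {
    flag.Parse()
    // e.g. `go run . -rule.resendDelay=5m` prints "0s 5m0s"; a zero value means
    // no expiration cap / no resend suppression, matching the flag defaults above.
    fmt.Println(*maxResolveDuration, *resendDelay)
}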
@@ -30,6 +30,8 @@ type Alert struct {
     Start time.Time
     // End defines the moment of time when Alert supposed to expire
     End time.Time
+    // LastSent defines the moment when Alert was sent last time
+    LastSent time.Time
     // Value stores the value returned from evaluating expression from Expr field
     Value float64
     // ID is the unique identifer for the Alert
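LastSent is the state that makes the delay work: alertsToSend compares time.Since(LastSent) against -rule.resendDelay and refreshes the field whenever an alert is handed to the notifier, so a continuously firing alert is re-sent only once the delay has elapsed.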