vmalert: add flag to limit the max value for auto-resolve duration for alerts (#1609)

* vmalert: add flag to limit the max value for auto-resolve duration for alerts

The new flag `rule.maxResolveDuration` is supposed to limit the max value for
alert.End param, which is used by notifiers like Alertmanager for alerts auto resolve.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1586
This commit is contained in:
Roman Khavronenko 2021-09-13 15:48:18 +03:00 committed by GitHub
parent b9727a36dc
commit 5494bc02a6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 71 additions and 20 deletions

View file

@ -277,8 +277,8 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
case <-t.C: case <-t.C:
g.metrics.iterationTotal.Inc() g.metrics.iterationTotal.Inc()
iterationStart := time.Now() iterationStart := time.Now()
resolveDuration := getResolveDuration(g.Interval)
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval) errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
for err := range errs { for err := range errs {
if err != nil { if err != nil {
logger.Errorf("group %q: %s", g.Name, err) logger.Errorf("group %q: %s", g.Name, err)
@ -290,6 +290,17 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
} }
} }
// getResolveDuration returns the auto-resolve deadline offset for alerts:
// three evaluation intervals of the parent group, so the notifier can expire
// an alert on its own if vmalert stops sending updates for some reason.
// The result is capped by the `-rule.maxResolveDuration` flag when set.
func getResolveDuration(groupInterval time.Duration) time.Duration {
	d := 3 * groupInterval
	if limit := *maxResolveDuration; limit > 0 && d > limit {
		d = limit
	}
	return d
}
type executor struct { type executor struct {
notifiers []eNotifier notifiers []eNotifier
rw *remotewrite.Client rw *remotewrite.Client
@ -301,12 +312,12 @@ type eNotifier struct {
alertsSendErrors *counter alertsSendErrors *counter
} }
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error { func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, resolveDuration time.Duration) chan error {
res := make(chan error, len(rules)) res := make(chan error, len(rules))
if concurrency == 1 { if concurrency == 1 {
// fast path // fast path
for _, rule := range rules { for _, rule := range rules {
res <- e.exec(ctx, rule, interval) res <- e.exec(ctx, rule, resolveDuration)
} }
close(res) close(res)
return res return res
@ -319,7 +330,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
sem <- struct{}{} sem <- struct{}{}
wg.Add(1) wg.Add(1)
go func(r Rule) { go func(r Rule) {
res <- e.exec(ctx, r, interval) res <- e.exec(ctx, r, resolveDuration)
<-sem <-sem
wg.Done() wg.Done()
}(rule) }(rule)
@ -339,7 +350,7 @@ var (
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
) )
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error { func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Duration) error {
execTotal.Inc() execTotal.Inc()
tss, err := rule.Exec(ctx) tss, err := rule.Exec(ctx)
@ -365,10 +376,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
for _, a := range ar.alerts { for _, a := range ar.alerts {
switch a.State { switch a.State {
case notifier.StateFiring: case notifier.StateFiring:
// set End to execStart + 3 intervals a.End = time.Now().Add(resolveDuration)
// so notifier can resolve it automatically if `vmalert`
// won't be able to send resolve for some reason
a.End = time.Now().Add(3 * interval)
alerts = append(alerts, *a) alerts = append(alerts, *a)
case notifier.StateInactive: case notifier.StateInactive:
// set End to execStart to notify // set End to execStart to notify

View file

@ -2,6 +2,7 @@ package main
import ( import (
"context" "context"
"fmt"
"sort" "sort"
"testing" "testing"
"time" "time"
@ -235,3 +236,27 @@ func TestGroupStart(t *testing.T) {
g.close() g.close()
<-finished <-finished
} }
// TestResolveDuration checks that getResolveDuration returns three group
// intervals by default and honors the -rule.maxResolveDuration cap when set.
func TestResolveDuration(t *testing.T) {
	// Save and restore the package-level flag mutated by each case.
	origMax := *maxResolveDuration
	defer func() { *maxResolveDuration = origMax }()

	cases := []struct {
		groupInterval time.Duration
		maxDuration   time.Duration
		expected      time.Duration
	}{
		{time.Minute, 0, 3 * time.Minute},
		{3 * time.Minute, 0, 9 * time.Minute},
		{time.Minute, 2 * time.Minute, 2 * time.Minute},
		{0, 0, 0},
	}
	for _, c := range cases {
		name := fmt.Sprintf("%v-%v-%v", c.groupInterval, c.expected, c.maxDuration)
		t.Run(name, func(t *testing.T) {
			*maxResolveDuration = c.maxDuration
			if got := getResolveDuration(c.groupInterval); got != c.expected {
				t.Errorf("expected to have %v; got %v", c.expected, got)
			}
		})
	}
}

View file

@ -42,6 +42,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates") validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine") validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
"which is by default equal to 3 evaluation intervals of the parent group.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier") externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service. externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`) eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)

View file

@ -94,14 +94,18 @@ groups:
*rulesCheckInterval = 200 * time.Millisecond *rulesCheckInterval = 200 * time.Millisecond
*rulePath = []string{f.Name()} *rulePath = []string{f.Name()}
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
defer cancel()
m := &manager{ m := &manager{
querierBuilder: &fakeQuerier{}, querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*Group), groups: make(map[uint64]*Group),
labels: map[string]string{}, labels: map[string]string{},
} }
go configReload(ctx, m, nil)
syncCh := make(chan struct{})
go func() {
configReload(ctx, m, nil)
close(syncCh)
}()
lenLocked := func(m *manager) int { lenLocked := func(m *manager) int {
m.groupsMu.RLock() m.groupsMu.RLock()
@ -138,6 +142,9 @@ groups:
if groupsLen != 1 { // should remain unchanged if groupsLen != 1 { // should remain unchanged
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
} }
cancel()
<-syncCh
} }
func writeToFile(t *testing.T, file, b string) { func writeToFile(t *testing.T, file, b string) {

View file

@ -14,17 +14,26 @@ import (
// Alert the triggered alert // Alert the triggered alert
// TODO: Looks like alert name isn't unique // TODO: Looks like alert name isn't unique
type Alert struct { type Alert struct {
GroupID uint64 // GroupID contains the ID of the parent rules group
Name string GroupID uint64
Labels map[string]string // Name represents Alert name
Name string
// Labels is the list of label-value pairs attached to the Alert
Labels map[string]string
// Annotations is the list of annotations generated on Alert evaluation
Annotations map[string]string Annotations map[string]string
State AlertState // State represents the current state of the Alert
State AlertState
Expr string // Expr contains expression that was executed to generate the Alert
Expr string
// Start defines the moment of time when Alert has triggered
Start time.Time Start time.Time
End time.Time // End defines the moment of time when Alert supposed to expire
End time.Time
// Value stores the value returned from evaluating expression from Expr field
Value float64 Value float64
ID uint64 // ID is the unique identifier for the Alert
ID uint64
} }
// AlertState type indicates the Alert state // AlertState type indicates the Alert state