mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: add flag to limit the max value for auto-resolve duration for alerts (#1609)
* vmalert: add flag to limit the max value for auto-resolve duration for alerts. The new flag `rule.maxResolveDuration` is supposed to limit the max value of the alert.End param, which is used by notifiers like Alertmanager for auto-resolving alerts. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1586
This commit is contained in:
parent
b9727a36dc
commit
5494bc02a6
5 changed files with 71 additions and 20 deletions
|
@ -277,8 +277,8 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
||||||
case <-t.C:
|
case <-t.C:
|
||||||
g.metrics.iterationTotal.Inc()
|
g.metrics.iterationTotal.Inc()
|
||||||
iterationStart := time.Now()
|
iterationStart := time.Now()
|
||||||
|
resolveDuration := getResolveDuration(g.Interval)
|
||||||
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
|
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
|
||||||
for err := range errs {
|
for err := range errs {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("group %q: %s", g.Name, err)
|
logger.Errorf("group %q: %s", g.Name, err)
|
||||||
|
@ -290,6 +290,17 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getResolveDuration returns the auto-resolve duration for alerts of a group.
|
||||||
|
// It is equal to 3 evaluation intervals of the group, so if vmalert stops
|
||||||
|
// sending updates for some reason, the notifier could automatically resolve the alert.
// The result is limited by the maxResolveDuration flag when that flag is set to a positive value.
|
||||||
|
func getResolveDuration(groupInterval time.Duration) time.Duration {
|
||||||
|
resolveInterval := groupInterval * 3
|
||||||
|
if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) { // a non-positive flag value means no limit is applied
|
||||||
|
return *maxResolveDuration
|
||||||
|
}
|
||||||
|
return resolveInterval
|
||||||
|
}
|
||||||
|
|
||||||
type executor struct {
|
type executor struct {
|
||||||
notifiers []eNotifier
|
notifiers []eNotifier
|
||||||
rw *remotewrite.Client
|
rw *remotewrite.Client
|
||||||
|
@ -301,12 +312,12 @@ type eNotifier struct {
|
||||||
alertsSendErrors *counter
|
alertsSendErrors *counter
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, resolveDuration time.Duration) chan error {
|
||||||
res := make(chan error, len(rules))
|
res := make(chan error, len(rules))
|
||||||
if concurrency == 1 {
|
if concurrency == 1 {
|
||||||
// fast path
|
// fast path
|
||||||
for _, rule := range rules {
|
for _, rule := range rules {
|
||||||
res <- e.exec(ctx, rule, interval)
|
res <- e.exec(ctx, rule, resolveDuration)
|
||||||
}
|
}
|
||||||
close(res)
|
close(res)
|
||||||
return res
|
return res
|
||||||
|
@ -319,7 +330,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
|
||||||
sem <- struct{}{}
|
sem <- struct{}{}
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func(r Rule) {
|
go func(r Rule) {
|
||||||
res <- e.exec(ctx, r, interval)
|
res <- e.exec(ctx, r, resolveDuration)
|
||||||
<-sem
|
<-sem
|
||||||
wg.Done()
|
wg.Done()
|
||||||
}(rule)
|
}(rule)
|
||||||
|
@ -339,7 +350,7 @@ var (
|
||||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
|
func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Duration) error {
|
||||||
execTotal.Inc()
|
execTotal.Inc()
|
||||||
|
|
||||||
tss, err := rule.Exec(ctx)
|
tss, err := rule.Exec(ctx)
|
||||||
|
@ -365,10 +376,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
|
||||||
for _, a := range ar.alerts {
|
for _, a := range ar.alerts {
|
||||||
switch a.State {
|
switch a.State {
|
||||||
case notifier.StateFiring:
|
case notifier.StateFiring:
|
||||||
// set End to execStart + 3 intervals
|
a.End = time.Now().Add(resolveDuration)
|
||||||
// so notifier can resolve it automatically if `vmalert`
|
|
||||||
// won't be able to send resolve for some reason
|
|
||||||
a.End = time.Now().Add(3 * interval)
|
|
||||||
alerts = append(alerts, *a)
|
alerts = append(alerts, *a)
|
||||||
case notifier.StateInactive:
|
case notifier.StateInactive:
|
||||||
// set End to execStart to notify
|
// set End to execStart to notify
|
||||||
|
|
|
@ -2,6 +2,7 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"sort"
|
"sort"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
@ -235,3 +236,27 @@ func TestGroupStart(t *testing.T) {
|
||||||
g.close()
|
g.close()
|
||||||
<-finished
|
<-finished
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestResolveDuration checks that getResolveDuration returns 3 group intervals
// and that the result is capped by the maxResolveDuration flag when it is positive.
func TestResolveDuration(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
groupInterval time.Duration // evaluation interval passed to getResolveDuration
|
||||||
|
maxDuration time.Duration // value assigned to the maxResolveDuration flag for the case
|
||||||
|
expected time.Duration // duration getResolveDuration must return
|
||||||
|
}{
|
||||||
|
{time.Minute, 0, 3 * time.Minute}, // no limit: 3 intervals
|
||||||
|
{3 * time.Minute, 0, 9 * time.Minute}, // no limit: 3 intervals
|
||||||
|
{time.Minute, 2 * time.Minute, 2 * time.Minute}, // limit below 3 intervals caps the result
|
||||||
|
{0, 0, 0}, // zero interval and no limit yield zero
|
||||||
|
}
|
||||||
|
defaultResolveDuration := *maxResolveDuration // remember the flag value, since every case overwrites it
|
||||||
|
defer func() { *maxResolveDuration = defaultResolveDuration }() // restore the flag value once the test returns
|
||||||
|
for _, tc := range testCases {
|
||||||
|
// NOTE: the subtest name is built as interval-expected-max, which differs from the struct field order.
t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) {
|
||||||
|
*maxResolveDuration = tc.maxDuration
|
||||||
|
got := getResolveDuration(tc.groupInterval)
|
||||||
|
if got != tc.expected {
|
||||||
|
t.Errorf("expected to have %v; got %v", tc.expected, got)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -42,6 +42,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor
|
||||||
|
|
||||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||||
|
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
|
||||||
|
"which is by default equal to 3 evaluation intervals of the parent group.")
|
||||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
||||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
|
||||||
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
|
||||||
|
|
|
@ -94,14 +94,18 @@ groups:
|
||||||
*rulesCheckInterval = 200 * time.Millisecond
|
*rulesCheckInterval = 200 * time.Millisecond
|
||||||
*rulePath = []string{f.Name()}
|
*rulePath = []string{f.Name()}
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
m := &manager{
|
m := &manager{
|
||||||
querierBuilder: &fakeQuerier{},
|
querierBuilder: &fakeQuerier{},
|
||||||
groups: make(map[uint64]*Group),
|
groups: make(map[uint64]*Group),
|
||||||
labels: map[string]string{},
|
labels: map[string]string{},
|
||||||
}
|
}
|
||||||
go configReload(ctx, m, nil)
|
|
||||||
|
syncCh := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
configReload(ctx, m, nil)
|
||||||
|
close(syncCh)
|
||||||
|
}()
|
||||||
|
|
||||||
lenLocked := func(m *manager) int {
|
lenLocked := func(m *manager) int {
|
||||||
m.groupsMu.RLock()
|
m.groupsMu.RLock()
|
||||||
|
@ -138,6 +142,9 @@ groups:
|
||||||
if groupsLen != 1 { // should remain unchanged
|
if groupsLen != 1 { // should remain unchanged
|
||||||
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cancel()
|
||||||
|
<-syncCh
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeToFile(t *testing.T, file, b string) {
|
func writeToFile(t *testing.T, file, b string) {
|
||||||
|
|
|
@ -14,17 +14,26 @@ import (
|
||||||
// Alert the triggered alert
|
// Alert the triggered alert
|
||||||
// TODO: Looks like alert name isn't unique
|
// TODO: Looks like alert name isn't unique
|
||||||
type Alert struct {
|
type Alert struct {
|
||||||
GroupID uint64
|
// GroupID contains the ID of the parent rules group
|
||||||
Name string
|
GroupID uint64
|
||||||
Labels map[string]string
|
// Name represents Alert name
|
||||||
|
Name string
|
||||||
|
// Labels is the list of label-value pairs attached to the Alert
|
||||||
|
Labels map[string]string
|
||||||
|
// Annotations is the list of annotations generated on Alert evaluation
|
||||||
Annotations map[string]string
|
Annotations map[string]string
|
||||||
State AlertState
|
// State represents the current state of the Alert
|
||||||
|
State AlertState
|
||||||
Expr string
|
// Expr contains expression that was executed to generate the Alert
|
||||||
|
Expr string
|
||||||
|
// Start defines the moment of time when Alert has triggered
|
||||||
Start time.Time
|
Start time.Time
|
||||||
End time.Time
|
// End defines the moment of time when Alert is supposed to expire
|
||||||
|
End time.Time
|
||||||
|
// Value stores the value returned from evaluating expression from Expr field
|
||||||
Value float64
|
Value float64
|
||||||
ID uint64
|
// ID is the unique identifier for the Alert
|
||||||
|
ID uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
// AlertState type indicates the Alert state
|
// AlertState type indicates the Alert state
|
||||||
|
|
Loading…
Reference in a new issue