vmalert: add `keep_firing_for` field for alerting rule (#4669)
vmalert: support `keep_firing_for` field for alerting rule https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4529 --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com>
parent 7fc34aa1e6
commit ae0e4a8c90
13 changed files with 817 additions and 620 deletions
@@ -203,6 +203,10 @@ expr: <string>
# as firing once they return.
[ for: <duration> | default = 0s ]

# Alert will continue firing for this long even when the alerting expression no longer has results.
# This allows you to delay alert resolution.
[ keep_firing_for: <duration> | default = 0s ]

# Whether to print debug information into logs.
# Information includes alerts state changes and requests sent to the datasource.
# Please note, that if rule's query params contain sensitive
@@ -736,6 +740,7 @@ See full description for these flags in `./vmalert -help`.
* Graphite engine isn't supported yet;
* `query` template function is disabled for performance reasons (might be changed in future);
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).

## Unit Testing for Rules
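The documentation hunks above describe the new field. As a quick illustration of the documented timing semantics, the following self-contained Go sketch (not part of the commit; all names are illustrative) shows how `keep_firing_for` delays the moment an alert is considered resolved once its expression stops returning results:

```go
package main

import (
	"fmt"
	"time"
)

// shouldKeepFiring reports whether an alert whose expression stopped
// returning results at lastSeen should still be treated as firing at now,
// given the rule's keep_firing_for setting. This is a simplified
// illustration of the documented behavior, not vmalert's implementation.
func shouldKeepFiring(now, lastSeen time.Time, keepFiringFor time.Duration) bool {
	return now.Sub(lastSeen) <= keepFiringFor
}

func main() {
	lastSeen := time.Now().Add(-3 * time.Minute) // expression last matched 3 minutes ago
	fmt.Println(shouldKeepFiring(time.Now(), lastSeen, 5*time.Minute)) // true: resolution delayed
	fmt.Println(shouldKeepFiring(time.Now(), lastSeen, 1*time.Minute)) // false: alert resolves
}
```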
@@ -21,17 +21,18 @@ import (

// AlertingRule is basic alert entity
type AlertingRule struct {
    Type config.Type
    RuleID uint64
    Name string
    Expr string
    For time.Duration
    Labels map[string]string
    Annotations map[string]string
    GroupID uint64
    GroupName string
    EvalInterval time.Duration
    Debug bool
    Type config.Type
    RuleID uint64
    Name string
    Expr string
    For time.Duration
    KeepFiringFor time.Duration
    Labels map[string]string
    Annotations map[string]string
    GroupID uint64
    GroupName string
    EvalInterval time.Duration
    Debug bool

    q datasource.Querier
@@ -56,17 +57,18 @@ type alertingRuleMetrics struct {

func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
    ar := &AlertingRule{
        Type: group.Type,
        RuleID: cfg.ID,
        Name: cfg.Alert,
        Expr: cfg.Expr,
        For: cfg.For.Duration(),
        Labels: cfg.Labels,
        Annotations: cfg.Annotations,
        GroupID: group.ID(),
        GroupName: group.Name,
        EvalInterval: group.Interval,
        Debug: cfg.Debug,
        Type: group.Type,
        RuleID: cfg.ID,
        Name: cfg.Alert,
        Expr: cfg.Expr,
        For: cfg.For.Duration(),
        KeepFiringFor: cfg.KeepFiringFor.Duration(),
        Labels: cfg.Labels,
        Annotations: cfg.Annotations,
        GroupID: group.ID(),
        GroupName: group.Name,
        EvalInterval: group.Interval,
        Debug: cfg.Debug,
        q: qb.BuildWithParams(datasource.QuerierParams{
            DataSourceType: group.Type.String(),
            EvaluationInterval: group.Interval,
@@ -366,6 +368,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
            if err != nil {
                return nil, err
            }
            a.KeepFiringSince = time.Time{}
            continue
        }
        a, err := ar.newAlert(m, ls, start, qFn)
@@ -391,12 +394,24 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
            ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round")
            continue
        }
        // check if alert should keep StateFiring if rule has
        // `keep_firing_for` field
        if a.State == notifier.StateFiring {
            a.State = notifier.StateInactive
            a.ResolvedAt = ts
            ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
            if ar.KeepFiringFor > 0 {
                if a.KeepFiringSince.IsZero() {
                    a.KeepFiringSince = ts
                }
            }
            // alerts with ar.KeepFiringFor>0 may remain FIRING
            // even if their expression isn't true anymore
            if ts.Sub(a.KeepFiringSince) > ar.KeepFiringFor {
                a.State = notifier.StateInactive
                a.ResolvedAt = ts
                ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
                continue
            }
            ar.logDebugf(ts, a, "KEEP_FIRING: will keep firing for %fs since %v", ar.KeepFiringFor.Seconds(), a.KeepFiringSince)
        }
        continue
    }
    numActivePending++
    if a.State == notifier.StatePending && ts.Sub(a.ActiveAt) >= ar.For {
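To make the hunk above easier to follow, here is a stripped-down Go restatement of the bookkeeping it introduces: when a firing alert's series disappears, the timestamp is remembered once in a `keepFiringSince` field, and the alert only flips to inactive after `keep_firing_for` has elapsed. This is an illustrative sketch with simplified types, not vmalert's actual code.

```go
package main

import (
	"fmt"
	"time"
)

// alertState mirrors, in simplified form, the state the diff adds to alerts:
// the moment the expression stopped matching is recorded in keepFiringSince,
// and the alert only turns inactive once keepFiringFor has elapsed.
type alertState struct {
	firing          bool
	keepFiringSince time.Time
}

// onMissing is called on an evaluation round where the alert's series is absent.
func (a *alertState) onMissing(ts time.Time, keepFiringFor time.Duration) {
	if !a.firing {
		return
	}
	if keepFiringFor > 0 && a.keepFiringSince.IsZero() {
		a.keepFiringSince = ts // remember when the expression stopped matching
	}
	if ts.Sub(a.keepFiringSince) > keepFiringFor {
		a.firing = false // FIRING => INACTIVE
	}
}

func main() {
	a := &alertState{firing: true}
	start := time.Now()
	a.onMissing(start, 2*time.Minute)
	fmt.Println(a.firing) // true: still within keep_firing_for
	a.onMissing(start.Add(3*time.Minute), 2*time.Minute)
	fmt.Println(a.firing) // false: keep_firing_for exceeded
}
```

Note that with `keepFiringFor == 0` the alert resolves immediately, which matches the default documented behavior.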
@@ -436,6 +451,7 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
    }
    ar.Expr = nr.Expr
    ar.For = nr.For
    ar.KeepFiringFor = nr.KeepFiringFor
    ar.Labels = nr.Labels
    ar.Annotations = nr.Annotations
    ar.EvalInterval = nr.EvalInterval
@@ -508,6 +524,7 @@ func (ar *AlertingRule) ToAPI() APIRule {
        Name: ar.Name,
        Query: ar.Expr,
        Duration: ar.For.Seconds(),
        KeepFiringFor: ar.KeepFiringFor.Seconds(),
        Labels: ar.Labels,
        Annotations: ar.Annotations,
        LastEvaluation: lastState.time,
@@ -576,6 +593,9 @@ func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
    if alertURLGeneratorFn != nil {
        aa.SourceLink = alertURLGeneratorFn(a)
    }
    if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
        aa.Stabilizing = true
    }
    return aa
}
@@ -113,7 +113,7 @@ func TestAlertingRule_Exec(t *testing.T) {
    testCases := []struct {
        rule *AlertingRule
        steps [][]datasource.Metric
        expAlerts []testAlert
        expAlerts map[int][]testAlert
    }{
        {
            newTestAlertingRule("empty", 0),
@@ -125,50 +125,8 @@ func TestAlertingRule_Exec(t *testing.T) {
            [][]datasource.Metric{
                {datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}},
            },
            []testAlert{
                {alert: &notifier.Alert{State: notifier.StateFiring}},
            },
        },
        {
            newTestAlertingRule("single-firing", 0),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            },
        },
        {
            newTestAlertingRule("single-firing=>inactive", 0),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
            },
        },
        {
            newTestAlertingRule("single-firing=>inactive=>firing", 0),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {},
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            },
        },
        {
            newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {},
                {metricWithLabels(t, "name", "foo")},
                {},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
            map[int][]testAlert{
                0: {{alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
        {
@@ -180,12 +138,16 @@ func TestAlertingRule_Exec(t *testing.T) {
                {},
                {},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
            },
        },
        {
            newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
            newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive=>firing", 0),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {},
@@ -194,8 +156,13 @@ func TestAlertingRule_Exec(t *testing.T) {
                {},
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                5: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
        {
@@ -207,10 +174,12 @@ func TestAlertingRule_Exec(t *testing.T) {
                    metricWithLabels(t, "name", "foo2"),
                },
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                {labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            map[int][]testAlert{
                0: {
                    {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                    {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                    {labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                },
            },
        },
        {
@@ -223,10 +192,19 @@ func TestAlertingRule_Exec(t *testing.T) {
            // 1: fire first alert
            // 2: fire second alert, set first inactive
            // 3: fire third alert, set second inactive
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
                {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateInactive}},
                {labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            map[int][]testAlert{
                0: {
                    {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                },
                1: {
                    {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
                    {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                },
                2: {
                    {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
                    {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateInactive}},
                    {labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
                },
            },
        },
        {
@@ -234,8 +212,8 @@ func TestAlertingRule_Exec(t *testing.T) {
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
            },
        },
        {
@@ -244,8 +222,9 @@ func TestAlertingRule_Exec(t *testing.T) {
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
        {
@@ -253,34 +232,13 @@ func TestAlertingRule_Exec(t *testing.T) {
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to reset and delete pending alerts
                // empty step to delete pending alerts
                {},
            },
            nil,
        },
        {
            newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to reset pending alerts
                {},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
            },
        },
        {
            newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to reset pending alerts
                {},
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                2: {},
            },
        },
        {
@@ -288,13 +246,57 @@ func TestAlertingRule_Exec(t *testing.T) {
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to reset pending alerts
                // empty step to set alert inactive
                {},
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
            },
            []testAlert{
                {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
        {
            newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>firing", defaultStep, defaultStep),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to keep firing
                {},
                {metricWithLabels(t, "name", "foo")},
            },
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
        {
            newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>keepfiring=>inactive=>pending=>firing", defaultStep, 2*defaultStep),
            [][]datasource.Metric{
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
                // empty step to keep firing
                {},
                // another empty step to keep firing
                {},
                // empty step to set alert inactive
                {},
                {metricWithLabels(t, "name", "foo")},
                {metricWithLabels(t, "name", "foo")},
            },
            map[int][]testAlert{
                0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
                4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
                5: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
                6: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
            },
        },
    }
@@ -304,7 +306,7 @@ func TestAlertingRule_Exec(t *testing.T) {
            fq := &fakeQuerier{}
            tc.rule.q = fq
            tc.rule.GroupID = fakeGroup.ID()
            for _, step := range tc.steps {
            for i, step := range tc.steps {
                fq.reset()
                fq.add(step...)
                if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
@@ -312,28 +314,31 @@ func TestAlertingRule_Exec(t *testing.T) {
                }
                // artificial delay between applying steps
                time.Sleep(defaultStep)
            }
            if len(tc.rule.alerts) != len(tc.expAlerts) {
                t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
            }
            expAlerts := make(map[uint64]*notifier.Alert)
            for _, ta := range tc.expAlerts {
                labels := make(map[string]string)
                for i := 0; i < len(ta.labels); i += 2 {
                    k, v := ta.labels[i], ta.labels[i+1]
                    labels[k] = v
                if _, ok := tc.expAlerts[i]; !ok {
                    continue
                }
                labels[alertNameLabel] = tc.rule.Name
                h := hash(labels)
                expAlerts[h] = ta.alert
            }
            for key, exp := range expAlerts {
                got, ok := tc.rule.alerts[key]
                if !ok {
                    t.Fatalf("expected to have key %d", key)
                if len(tc.rule.alerts) != len(tc.expAlerts[i]) {
                    t.Fatalf("evalIndex %d: expected %d alerts; got %d", i, len(tc.expAlerts[i]), len(tc.rule.alerts))
                }
                if got.State != exp.State {
                    t.Fatalf("expected state %d; got %d", exp.State, got.State)
                expAlerts := make(map[uint64]*notifier.Alert)
                for _, ta := range tc.expAlerts[i] {
                    labels := make(map[string]string)
                    for i := 0; i < len(ta.labels); i += 2 {
                        k, v := ta.labels[i], ta.labels[i+1]
                        labels[k] = v
                    }
                    labels[alertNameLabel] = tc.rule.Name
                    h := hash(labels)
                    expAlerts[h] = ta.alert
                }
                for key, exp := range expAlerts {
                    got, ok := tc.rule.alerts[key]
                    if !ok {
                        t.Fatalf("evalIndex %d: expected to have key %d", i, key)
                    }
                    if got.State != exp.State {
                        t.Fatalf("evalIndex %d: expected state %d; got %d", i, exp.State, got.State)
                    }
                }
            }
        })
@@ -867,7 +872,6 @@ func TestAlertingRule_Template(t *testing.T) {
            gotAlert := tc.rule.alerts[hash]
            if gotAlert == nil {
                t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations)
                break
            }
            if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) {
                t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations)
@@ -970,11 +974,18 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
}

func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
    return &AlertingRule{
    rule := AlertingRule{
        Name: name,
        For: waitFor,
        EvalInterval: waitFor,
        alerts: make(map[uint64]*notifier.Alert),
        state: newRuleState(10),
    }
    return &rule
}

func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
    rule := newTestAlertingRule(name, waitFor)
    rule.KeepFiringFor = keepFiringFor
    return rule
}
@@ -105,14 +105,16 @@ func (g *Group) Validate(validateTplFn ValidateTplFn, validateExpressions bool)
// Rule describes entity that represent either
// recording rule or alerting rule.
type Rule struct {
    ID uint64
    Record string `yaml:"record,omitempty"`
    Alert string `yaml:"alert,omitempty"`
    Expr string `yaml:"expr"`
    For *promutils.Duration `yaml:"for,omitempty"`
    Labels map[string]string `yaml:"labels,omitempty"`
    Annotations map[string]string `yaml:"annotations,omitempty"`
    Debug bool `yaml:"debug,omitempty"`
    ID uint64
    Record string `yaml:"record,omitempty"`
    Alert string `yaml:"alert,omitempty"`
    Expr string `yaml:"expr"`
    For *promutils.Duration `yaml:"for,omitempty"`
    // Alert will continue firing for this long even when the alerting expression no longer has results.
    KeepFiringFor *promutils.Duration `yaml:"keep_firing_for,omitempty"`
    Labels map[string]string `yaml:"labels,omitempty"`
    Annotations map[string]string `yaml:"annotations,omitempty"`
    Debug bool `yaml:"debug,omitempty"`
    // UpdateEntriesLimit defines max number of rule's state updates stored in memory.
    // Overrides `-rule.updateEntriesLimit`.
    UpdateEntriesLimit *int `yaml:"update_entries_limit,omitempty"`
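The struct change above wires the new YAML key into rule files. A minimal sketch of how such a rule could be decoded, using plain strings and `gopkg.in/yaml.v2` instead of the real `config.Rule` and `promutils.Duration` types so the example stays self-contained, might look like this:

```go
package main

import (
	"fmt"
	"time"

	"gopkg.in/yaml.v2"
)

// rule is a pared-down stand-in for vmalert's config.Rule, used only to
// illustrate how the new keep_firing_for key is read from a rule file.
// The real struct uses promutils.Duration; strings are used here instead.
type rule struct {
	Alert         string `yaml:"alert"`
	Expr          string `yaml:"expr"`
	For           string `yaml:"for,omitempty"`
	KeepFiringFor string `yaml:"keep_firing_for,omitempty"`
}

func main() {
	data := []byte(`
alert: HighErrorRate
expr: rate(http_errors_total[5m]) > 0.05
for: 10m
keep_firing_for: 15m
`)
	var r rule
	if err := yaml.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	keep, _ := time.ParseDuration(r.KeepFiringFor)
	fmt.Printf("%s keeps firing for %s after the expression clears\n", r.Alert, keep)
}
```

The alert name, expression, and durations above are hypothetical and only serve to show the key being parsed.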
@@ -404,7 +404,7 @@ func TestHashRule(t *testing.T) {
            true,
        },
        {
            Rule{Alert: "alert", Expr: "up == 1", For: promutils.NewDuration(time.Minute)},
            Rule{Alert: "alert", Expr: "up == 1", For: promutils.NewDuration(time.Minute), KeepFiringFor: promutils.NewDuration(time.Minute)},
            Rule{Alert: "alert", Expr: "up == 1"},
            true,
        },
@@ -46,18 +46,36 @@ func TestUpdateWith(t *testing.T) {
                "summary": "{{ $value|humanize }}",
                "description": "{{$labels}}",
            },
        }},
        []config.Rule{{
            Alert: "foo",
            Expr: "up > 10",
            For: promutils.NewDuration(time.Second),
            Labels: map[string]string{
                "baz": "bar",
            },
            {
                Alert: "bar",
                Expr: "up > 0",
                For: promutils.NewDuration(time.Second),
                Labels: map[string]string{
                    "bar": "baz",
                },
            }},
        []config.Rule{
            {
                Alert: "foo",
                Expr: "up > 10",
                For: promutils.NewDuration(time.Second),
                Labels: map[string]string{
                    "baz": "bar",
                },
                Annotations: map[string]string{
                    "summary": "none",
                },
            },
            Annotations: map[string]string{
                "summary": "none",
            },
        }},
            {
                Alert: "bar",
                Expr: "up > 0",
                For: promutils.NewDuration(2 * time.Second),
                KeepFiringFor: promutils.NewDuration(time.Minute),
                Labels: map[string]string{
                    "bar": "baz",
                },
            }},
        },
        {
            "update recording rule",
@@ -272,6 +272,9 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
    if a.For != b.For {
        return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
    }
    if a.KeepFiringFor != b.KeepFiringFor {
        return fmt.Errorf("expected to have KeepFiringFor %q; got %q", a.KeepFiringFor, b.KeepFiringFor)
    }
    if !reflect.DeepEqual(a.Annotations, b.Annotations) {
        return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
    }
@@ -39,6 +39,8 @@ type Alert struct {
    ResolvedAt time.Time
    // LastSent defines the moment when Alert was sent last time
    LastSent time.Time
    // KeepFiringSince defines the moment when StateFiring was kept because of `keep_firing_for` instead of real alert
    KeepFiringSince time.Time
    // Value stores the value returned from evaluating expression from Expr field
    Value float64
    // ID is the unique identifier for the Alert
@@ -116,7 +116,11 @@ btn-primary
    <div class="row">
        <div class="col-12 mb-2">
            {% if r.Type == "alerting" %}
            {% if r.KeepFiringFor > 0 %}
            <b>alert:</b> {%s r.Name %} (for: {%v r.Duration %} seconds, keep_firing_for: {%v r.KeepFiringFor %} seconds)
            {% else %}
            <b>alert:</b> {%s r.Name %} (for: {%v r.Duration %} seconds)
            {% endif %}
            {% else %}
            <b>record:</b> {%s r.Name %}
            {% endif %}
@@ -225,6 +229,7 @@ btn-primary
    <td>
        {%s ar.ActiveAt.Format("2006-01-02T15:04:05Z07:00") %}
        {% if ar.Restored %}{%= badgeRestored() %}{% endif %}
        {% if ar.Stabilizing %}{%= badgeStabilizing() %}{% endif %}
    </td>
    <td>{%s ar.Value %}</td>
    <td>
@@ -442,6 +447,18 @@ btn-primary
            </div>
        </div>
    </div>
    {% if rule.KeepFiringFor > 0 %}
    <div class="container border-bottom p-2">
        <div class="row">
            <div class="col-2">
                Keep firing for
            </div>
            <div class="col">
                {%v rule.KeepFiringFor %} seconds
            </div>
        </div>
    </div>
    {% endif %}
    {% endif %}
    <div class="container border-bottom p-2">
        <div class="row">
@@ -561,6 +578,10 @@ btn-primary
    <span class="badge bg-warning text-dark" title="Alert state was restored after the service restart from remote storage">restored</span>
{% endfunc %}

{% func badgeStabilizing() %}
    <span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span>
{% endfunc %}

{% func seriesFetchedWarn(r APIRule) %}
    {% if isNoMatch(r) %}
    <svg xmlns="http://www.w3.org/2000/svg"
File diff suppressed because it is too large
@@ -32,6 +32,9 @@ type APIAlert struct {
    SourceLink string `json:"source"`
    // Restored shows whether Alert's state was restored on restart
    Restored bool `json:"restored"`
    // Stabilizing shows when firing state is kept because of
    // `keep_firing_for` instead of real alert
    Stabilizing bool `json:"stabilizing"`
}

// WebLink returns a link to the alert which can be used in UI.
@@ -96,9 +99,11 @@ type APIRule struct {
    // Query represents Rule's `expression` field
    Query string `json:"query"`
    // Duration represents Rule's `for` field
    Duration float64 `json:"duration"`
    Labels map[string]string `json:"labels,omitempty"`
    Annotations map[string]string `json:"annotations,omitempty"`
    Duration float64 `json:"duration"`
    // Alert will continue firing for this long even when the alerting expression no longer has results.
    KeepFiringFor float64 `json:"keep_firing_for"`
    Labels map[string]string `json:"labels,omitempty"`
    Annotations map[string]string `json:"annotations,omitempty"`
    // LastError contains the error faced while executing the rule.
    LastError string `json:"lastError"`
    // EvaluationTime is the time taken to completely evaluate the rule in float seconds.
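For reference, the `keep_firing_for` JSON tag added above means rule objects returned by vmalert's API gain one more numeric field (in seconds). A trimmed-down sketch of the payload shape, using only the fields visible in this hunk (the real APIRule carries many more fields), could look like this:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// apiRule keeps only the fields shown in the hunk above; it is not the
// real vmalert APIRule type, just an illustration of the payload shape.
type apiRule struct {
	Query         string  `json:"query"`
	Duration      float64 `json:"duration"`
	KeepFiringFor float64 `json:"keep_firing_for"`
}

func main() {
	b, _ := json.Marshal(apiRule{Query: "up == 0", Duration: 600, KeepFiringFor: 900})
	fmt.Println(string(b)) // {"query":"up == 0","duration":600,"keep_firing_for":900}
}
```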
@@ -66,7 +66,8 @@ The previous behavior can be restored in the following ways:
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow disabling of `step` param attached to [instant queries](https://docs.victoriametrics.com/keyConcepts.html#instant-query). This might be useful for using vmalert with datasources that do not support this param, unlike VictoriaMetrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4573) for details.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support option for "blackholing" alerting notifications if `-notifier.blackhole` cmd-line flag is set. Enable this flag if you want vmalert to evaluate alerting rules without sending any notifications to external receivers (e.g. alertmanager). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4122) for details. Thanks to @venkatbvc for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4639).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add unit tests for alerting and recording rules, see more [details](https://docs.victoriametrics.com/vmalert.html#unit-testing-for-rules). Thanks to @Haleygo for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4596).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow overriding default GET params for rules with `graphite` datasource type, in the same way as it happens for `prometheus` type. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4685).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support `keep_firing_for` field for alerting rules. See docs updated [here](https://docs.victoriametrics.com/vmalert.html#alerting-rules) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4529). Thanks to @Haleygo for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4669).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): expose `vmauth_user_request_duration_seconds` and `vmauth_unauthorized_user_request_duration_seconds` summary metrics for measuring requests latency per user.
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): show backup progress percentage in log during backup uploading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4460).
* FEATURE: [vmrestore](https://docs.victoriametrics.com/vmrestore.html): show restoring progress percentage in log during backup downloading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4460).
@@ -214,6 +214,10 @@ expr: <string>
# as firing once they return.
[ for: <duration> | default = 0s ]

# Alert will continue firing for this long even when the alerting expression no longer has results.
# This allows you to delay alert resolution.
[ keep_firing_for: <duration> | default = 0s ]

# Whether to print debug information into logs.
# Information includes alerts state changes and requests sent to the datasource.
# Please note, that if rule's query params contain sensitive
@@ -747,6 +751,7 @@ See full description for these flags in `./vmalert -help`.
* Graphite engine isn't supported yet;
* `query` template function is disabled for performance reasons (might be changed in future);
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).

## Unit Testing for Rules