Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git, synced 2024-11-21 14:44:00 +00:00
vmalert: avoid sending resolves for pending alerts (#498)
Before this change, a notification was sent to the notifier when one of the following conditions was met:
* the alert is in Firing state
* the alert is in Inactive state

Inactive notifications were sent in order to resolve alerts as soon as possible. Unfortunately, this also sent resolves for Pending alerts that became Inactive, which is wrong. With this change, an alert is deleted from the active list if it was Pending and became Inactive, so Inactive alerts can now exist only if they were in Firing state before. See the test change for an example.
Parent: e5f5342e18
Commit: c7f3e58032
3 changed files with 48 additions and 33 deletions
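
The notification side of the change, condensed into a small self-contained Go sketch. The types below are simplified stand-ins for vmalert's internal notifier.Alert and its states, and the remote-write push (pushToRW in the diff) is left out; this only illustrates the state handling and is not the actual vmalert code:

package main

import (
	"fmt"
	"time"
)

type alertState int

const (
	statePending alertState = iota
	stateFiring
	stateInactive
)

// toyAlert stands in for vmalert's notifier.Alert; only the fields involved
// in the End/State handling from the diff are kept.
type toyAlert struct {
	Name  string
	State alertState
	End   time.Time
}

// collectNotifications mirrors the switch added to Group.start: Firing alerts
// get End = execStart + 3*interval so the notifier can auto-resolve them if
// vmalert itself stops sending, Inactive alerts get End = execStart to signal
// an immediate resolve, and Pending alerts produce no notification at all.
func collectNotifications(alerts []*toyAlert, execStart time.Time, interval time.Duration) []toyAlert {
	var toSend []toyAlert
	for _, a := range alerts {
		switch a.State {
		case stateFiring:
			a.End = execStart.Add(3 * interval)
			toSend = append(toSend, *a)
		case stateInactive:
			a.End = execStart
			toSend = append(toSend, *a)
		case statePending:
			// not firing yet: nothing to send, nothing to resolve
		}
	}
	return toSend
}

func main() {
	alerts := []*toyAlert{
		{Name: "cpu", State: stateFiring},
		{Name: "disk", State: statePending},
		{Name: "mem", State: stateInactive},
	}
	for _, a := range collectNotifications(alerts, time.Now(), time.Minute) {
		fmt.Println(a.Name, a.End.Format(time.RFC3339))
	}
	// prints cpu and mem only; disk stays Pending and never looks resolved
}

Because Pending alerts never enter the outgoing batch, an alert that disappears while still Pending can no longer be mistaken for a resolved one.
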
@@ -69,7 +69,7 @@ func (g *Group) updateWith(newGroup Group) {
 
         // copy all significant fields.
         // alerts state isn't copied since
-        // it should be updated in next 2 Evals
+        // it should be updated in next 2 Execs
         or.For = nr.For
         or.Expr = nr.Expr
         or.Labels = nr.Labels

@@ -151,30 +151,47 @@ func (g *Group) start(ctx context.Context, interval time.Duration,
 
                 var alertsToSend []notifier.Alert
                 for _, a := range rule.alerts {
-                    if a.State != notifier.StatePending {
-                        alertsToSend = append(alertsToSend, *a)
-                    }
-                    if a.State == notifier.StateInactive || rw == nil {
-                        continue
-                    }
-                    tss := rule.AlertToTimeSeries(a, execStart)
-                    for _, ts := range tss {
-                        remoteWriteSent.Inc()
-                        if err := rw.Push(ts); err != nil {
-                            remoteWriteErrors.Inc()
-                            logger.Errorf("failed to push timeseries to remotewrite: %s", err)
-                        }
-                    }
+                    switch a.State {
+                    case notifier.StateFiring:
+                        // set End to execStart + 3 intervals
+                        // so notifier can resolve it automatically if `vmalert`
+                        // won't be able to send resolve for some reason
+                        a.End = execStart.Add(3 * interval)
+                        alertsToSend = append(alertsToSend, *a)
+                        pushToRW(rw, rule, a, execStart)
+                    case notifier.StatePending:
+                        pushToRW(rw, rule, a, execStart)
+                    case notifier.StateInactive:
+                        // set End to execStart to notify
+                        // that it was just resolved
+                        a.End = execStart
+                        alertsToSend = append(alertsToSend, *a)
+                    }
                 }
-                if len(alertsToSend) > 0 {
-                    alertsSent.Add(len(alertsToSend))
-                    if err := nr.Send(ctx, alertsToSend); err != nil {
-                        alertsSendErrors.Inc()
-                        logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
-                    }
+                if len(alertsToSend) == 0 {
+                    continue
+                }
+                alertsSent.Add(len(alertsToSend))
+                if err := nr.Send(ctx, alertsToSend); err != nil {
+                    alertsSendErrors.Inc()
+                    logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
                 }
             }
             iterationDuration.UpdateDuration(iterationStart)
         }
     }
 }
+
+func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
+    if rw == nil {
+        return
+    }
+    tss := rule.AlertToTimeSeries(a, timestamp)
+    remoteWriteSent.Add(len(tss))
+    for _, ts := range tss {
+        if err := rw.Push(ts); err != nil {
+            remoteWriteErrors.Inc()
+            logger.Errorf("failed to push timeseries to remotewrite: %s", err)
+        }
+    }
+}

@@ -71,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
     }
 
     for h, a := range r.alerts {
-        // cleanup inactive alerts from previous Eval
+        // cleanup inactive alerts from previous Exec
         if a.State == notifier.StateInactive {
            delete(r.alerts, h)
        }

@@ -109,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
         // if alert wasn't updated in this iteration
         // means it is resolved already
         if _, ok := updated[h]; !ok {
+            if a.State == notifier.StatePending {
+                // alert was in Pending state - it is not
+                // active anymore
+                delete(r.alerts, h)
+                continue
+            }
             a.State = notifier.StateInactive
-            // set endTime to last execution time
-            // so it can be sent by notifier on next step
-            a.End = r.lastExecTime
             continue
         }
         if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
             a.State = notifier.StateFiring
             alertsFired.Inc()
         }
-        if a.State == notifier.StateFiring {
-            a.End = r.lastExecTime.Add(3 * *evaluationInterval)
-        }
     }
     return nil
 }

@@ -293,7 +293,7 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time)
 
 // Restore restores the state of active alerts basing on previously written timeseries.
 // Restore restores only Start field. Field State will be always Pending and supposed
-// to be updated on next Eval, as well as Value field.
+// to be updated on next Exec, as well as Value field.
 func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
     if q == nil {
         return fmt.Errorf("querier is nil")

@@ -313,7 +313,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
         labels := m.Labels
         m.Labels = make([]datasource.Label, 0)
         // drop all extra labels, so hash key will
-        // be identical to timeseries received in Eval
+        // be identical to timeseries received in Exec
         for _, l := range labels {
             if l.Name == alertNameLabel {
                 continue

@@ -296,16 +296,14 @@ func TestRule_Exec(t *testing.T) {
             },
         },
         {
-            newTestRule("for-pending=>inactive", time.Millisecond),
+            newTestRule("for-pending=>empty", time.Second),
             [][]datasource.Metric{
                 {metricWithLabels(t, "name", "foo")},
                 {metricWithLabels(t, "name", "foo")},
-                // empty step to reset pending alerts
+                // empty step to reset and delete pending alerts
                 {},
             },
-            map[uint64]*notifier.Alert{
-                hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
-            },
+            map[uint64]*notifier.Alert{},
         },
         {
             newTestRule("for-pending=>firing=>inactive", time.Millisecond),
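
The commit message points to the test change as an example. Here is a hedged, self-contained sketch of that scenario with toy types (not vmalert's real Rule or its test helpers): a series appears for two evaluations, never satisfies the `for` duration, then disappears, and the active-alert map must end up empty instead of keeping an Inactive entry that would produce a resolve.

package main

import (
	"fmt"
	"time"
)

type alertState int

const (
	statePending alertState = iota
	stateFiring
	stateInactive
)

type toyAlert struct {
	State alertState
	Start time.Time
}

// toyRule is a stand-in for vmalert's Rule: holdFor plays the role of the
// rule's `for` duration, alerts is the active-alert map keyed by series hash.
type toyRule struct {
	holdFor time.Duration
	alerts  map[string]*toyAlert
}

// exec mimics one evaluation step; series lists the hashes returned by the query.
func (r *toyRule) exec(now time.Time, series []string) {
	updated := map[string]bool{}
	for _, h := range series {
		updated[h] = true
		if a, ok := r.alerts[h]; ok {
			if a.State == statePending && now.Sub(a.Start) >= r.holdFor {
				a.State = stateFiring
			}
			continue
		}
		r.alerts[h] = &toyAlert{State: statePending, Start: now}
	}
	for h, a := range r.alerts {
		if updated[h] {
			continue
		}
		if a.State == statePending {
			// the fix: a Pending alert that vanished is dropped outright,
			// so no resolve is ever sent for it
			delete(r.alerts, h)
			continue
		}
		// Firing alerts turn Inactive and get resolved on the next send
		a.State = stateInactive
	}
}

func main() {
	r := &toyRule{holdFor: time.Hour, alerts: map[string]*toyAlert{}}
	now := time.Now()
	r.exec(now, []string{"foo"})                  // alert created, Pending
	r.exec(now.Add(time.Minute), []string{"foo"}) // still Pending: holdFor not reached
	r.exec(now.Add(2*time.Minute), nil)           // empty result: Pending alert deleted
	fmt.Println(len(r.alerts)) // 0, matching the updated "for-pending=>empty" expectation
}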