2020-04-27 21:19:27 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2020-11-09 22:27:32 +00:00
|
|
|
"errors"
|
2022-09-14 12:04:24 +00:00
|
|
|
"sync"
|
2021-06-09 09:20:38 +00:00
|
|
|
"time"
|
2022-03-29 13:09:07 +00:00
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
2020-04-27 21:19:27 +00:00
|
|
|
)
|
|
|
|
|
2020-06-01 10:46:37 +00:00
|
|
|
// Rule represents alerting or recording rule
|
|
|
|
// that has unique ID, can be Executed and
|
|
|
|
// updated with other Rule.
|
|
|
|
type Rule interface {
|
2021-06-09 09:20:38 +00:00
|
|
|
// ID returns unique ID that may be used for
|
2020-06-01 10:46:37 +00:00
|
|
|
// identifying this Rule among others.
|
|
|
|
ID() uint64
|
2022-06-09 06:21:30 +00:00
|
|
|
// Exec executes the rule with given context at the given timestamp and limit.
|
|
|
|
// returns an err if number of resulting time series exceeds the limit.
|
|
|
|
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
2022-06-09 06:58:25 +00:00
|
|
|
// ExecRange executes the rule on the given time range.
|
|
|
|
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
2020-06-01 10:46:37 +00:00
|
|
|
// UpdateWith performs modification of current Rule
|
|
|
|
// with fields of the given Rule.
|
|
|
|
UpdateWith(Rule) error
|
2022-03-15 11:54:53 +00:00
|
|
|
// ToAPI converts Rule into APIRule
|
|
|
|
ToAPI() APIRule
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
// Close performs the shutdown procedures for rule
|
|
|
|
// such as metrics unregister
|
|
|
|
Close()
|
2020-05-04 21:51:22 +00:00
|
|
|
}
|
2020-11-09 22:27:32 +00:00
|
|
|
|
|
|
|
// errDuplicate is the error value reporting that a result contains metrics
// with the same labelset after rule labels have been applied.
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
|
2022-09-14 12:04:24 +00:00
|
|
|
|
|
|
|
// ruleState is a fixed-size ring buffer of ruleStateEntry records.
// The embedded RWMutex guards both entries and cur.
type ruleState struct {
	sync.RWMutex
	// entries is the ring buffer storage; unused slots hold zero-value entries.
	entries []ruleStateEntry
	// cur is the index in entries of the most recently added entry.
	cur int
}

// ruleStateEntry describes the outcome of a single rule.Exec call.
type ruleStateEntry struct {
	// stores last moment of time rule.Exec was called
	time time.Time
	// stores the timestamp with which rule.Exec was called
	at time.Time
	// stores the duration of the last rule.Exec call
	duration time.Duration
	// stores last error that happened in Exec func
	// resets on every successful Exec
	// may be used as Health ruleState
	err error
	// stores the number of samples returned during
	// the last evaluation
	samples int
	// stores the curl command reflecting the HTTP request used during rule.Exec
	curl string
}

// defaultStateEntriesLimit is the number of entries kept by a ruleState
// created via newRuleState.
const defaultStateEntriesLimit = 20

// newRuleState returns a ruleState with room for
// defaultStateEntriesLimit entries, all initially zero.
func newRuleState() *ruleState {
	return &ruleState{
		entries: make([]ruleStateEntry, defaultStateEntriesLimit),
	}
}

// getLast returns the most recently added entry.
// It returns a zero-value entry if the buffer has no storage yet
// (for example, a zero-value ruleState).
func (s *ruleState) getLast() ruleStateEntry {
	s.RLock()
	defer s.RUnlock()

	if len(s.entries) == 0 {
		return ruleStateEntry{}
	}
	return s.entries[s.cur]
}

// getAll returns all recorded entries ordered from the newest to the oldest.
// Slots that were never written (both time and at are zero) are skipped.
// The returned slice is always non-nil.
func (s *ruleState) getAll() []ruleStateEntry {
	s.RLock()
	defer s.RUnlock()

	n := len(s.entries)
	entries := make([]ruleStateEntry, 0, n)

	// Walk backwards from the newest slot, visiting every slot exactly once.
	cur := s.cur
	for i := 0; i < n; i++ {
		e := s.entries[cur]
		if !e.time.IsZero() || !e.at.IsZero() {
			entries = append(entries, e)
		}
		cur--
		if cur < 0 {
			cur = n - 1
		}
	}
	return entries
}

// add stores e in the slot following the current one, overwriting the
// oldest entry once the buffer is full.
func (s *ruleState) add(e ruleStateEntry) {
	s.Lock()
	defer s.Unlock()

	if len(s.entries) == 0 {
		// Zero-value ruleState: allocate lazily so that add never
		// indexes into an empty slice.
		s.entries = make([]ruleStateEntry, defaultStateEntriesLimit)
		s.cur = 0
	}
	// Use len (not cap) so the index always stays within the slice bounds.
	s.cur = (s.cur + 1) % len(s.entries)
	s.entries[s.cur] = e
}
|