VictoriaMetrics/app/vmalert/rule.go

package main

import (
	"context"
	"errors"
	"sync"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)

// Rule represents an alerting or recording rule
// that has a unique ID, can be executed and
// updated with another Rule.
type Rule interface {
	// ID returns a unique ID that may be used for
	// identifying this Rule among others.
	ID() uint64
	// Exec executes the rule with the given context at the given timestamp and limit.
	// It returns an error if the number of resulting time series exceeds the limit.
	Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
	// ExecRange executes the rule on the given time range,
	// bounded by start and end.
	ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
	// UpdateWith modifies the current Rule
	// with the fields of the given Rule.
	UpdateWith(Rule) error
	// ToAPI converts the Rule into an APIRule.
	ToAPI() APIRule
	// Close performs the shutdown procedures for the rule,
	// such as unregistering metrics.
	Close()
}

// errDuplicate is the sentinel error for a result that contains
// several metrics sharing one labelset once rule labels are applied.
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

// ruleState keeps a fixed-size list of ruleStateEntry values that is
// written in a circular manner by add and read by getLast and getAll.
//
// The embedded RWMutex guards both fields: readers take the read lock,
// add takes the write lock. entries is the backing storage and cur is
// the index of the entry that was stored most recently.
type ruleState struct {
	sync.RWMutex
	entries []ruleStateEntry
	cur     int
}

// ruleStateEntry is a single record kept in ruleState.
// Its fields describe one rule.Exec call.
type ruleStateEntry struct {
	// stores the last moment of time rule.Exec was called
	time time.Time
	// stores the timestamp with which rule.Exec was called
	at time.Time
	// stores the duration of the last rule.Exec call
	duration time.Duration
	// stores the last error that happened in the Exec func;
	// resets on every successful Exec;
	// may be used as the health indicator of the rule
	err error
	// stores the number of samples returned during
	// the last evaluation
	samples int
	// stores the curl command reflecting the HTTP request used during rule.Exec
	curl string
}

// defaultStateEntriesLimit is the number of ruleStateEntry slots
// allocated by newRuleState.
const defaultStateEntriesLimit = 20

// newRuleState allocates a ruleState whose storage holds
// defaultStateEntriesLimit zero-valued entries.
func newRuleState() *ruleState {
	st := new(ruleState)
	st.entries = make([]ruleStateEntry, defaultStateEntriesLimit)
	return st
}

// getLast returns the entry stored at the current position, i.e. the one
// most recently written by add.
//
// If the state has no storage (for example, a zero-value ruleState that
// was not created via newRuleState), the zero ruleStateEntry is returned.
func (s *ruleState) getLast() ruleStateEntry {
	s.RLock()
	defer s.RUnlock()

	// Indexing an empty slice would panic; report "nothing recorded" instead.
	if len(s.entries) == 0 {
		return ruleStateEntry{}
	}
	return s.entries[s.cur]
}

// getAll returns a copy of the recorded entries, ordered from the current
// position backwards (most recently stored first), wrapping around the
// storage once.
//
// Slots whose time and at fields are both zero are skipped. The returned
// slice is never nil; it is empty when nothing was recorded or when the
// state has no storage.
func (s *ruleState) getAll() []ruleStateEntry {
	s.RLock()
	defer s.RUnlock()

	// Use len, not cap: only the first len(s.entries) slots are addressable.
	n := len(s.entries)
	entries := make([]ruleStateEntry, 0, n)
	for i := 0; i < n; i++ {
		// Step i positions back from cur; adding n keeps the operand
		// non-negative so the modulo wraps to the tail of the storage.
		idx := (s.cur - i + n) % n
		e := s.entries[idx]
		if !e.time.IsZero() || !e.at.IsZero() {
			entries = append(entries, e)
		}
	}
	return entries
}

// add stores e in the slot following the current position and makes that
// slot the current one. When the end of the storage is reached the
// position wraps to index 0, so the oldest entry is overwritten.
//
// If the state has no storage (for example, a zero-value ruleState that
// was not created via newRuleState), there is nowhere to keep e and the
// call is a no-op.
func (s *ruleState) add(e ruleStateEntry) {
	s.Lock()
	defer s.Unlock()

	// Use len, not cap: only the first len(s.entries) slots are addressable.
	n := len(s.entries)
	if n == 0 {
		return
	}
	s.cur = (s.cur + 1) % n
	s.entries[s.cur] = e
}
app/vmalert: sync with master branch 2020-04-27 21:19:27 +00:00			`package main`

			`import (`
			`"context"`
vmalert: explicitly set extra labels to alert entities (#886) The previous implementation treated extra labels (global and rule labels) as separate label set to returned time series labels. Hence, time series always contained only original labels and alert ID was generated from sorted labels key-values. Extra labels didn't affect the generated ID and were applied on the following actions: - templating for Summary and Annotations; - persisting state via remote write; - restoring state via remote read. Such behaviour caused difficulties on restore procedure because extra labels had to be dropped before checking the alert ID, but that not always worked. Consider the case when expression returns the following time series `up{job="foo"}` and rule has extra label `job=bar`. This would mean that restored alert ID will be always different to the real time series because of collision. To solve the situation extra labels are now always applied beforehand and `vmalert` doesn't store original labels anymore. However, this could result into a new error situation. Consider the case when expression returns two time series `up{job="foo"}` and `up{job="baz"}`, while rule has extra label `job=bar`. In such case, applying extra labels will result into two identical time series and `vmalert` will return error: `result contains metrics with the same labelset after applying rule labels` https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870 2020-11-09 22:27:32 +00:00			`"errors"`
vmalert: add experimental feature of storing Rule's evaluation state (#3106) vmalert: add experimental feature of storing Rule's evaluation state The new feature keeps last 20 state changes of each Rule in memory. The state are available for view on the Rule's view page. The page can be opened by clicking on `Details` link next to Rule's name on the `/groups` page. States change suppose to help in investigating cases when Rule doesn't generate alerts or records. Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-09-14 12:04:24 +00:00			`"sync"`
vmalert: support rules backfilling (aka `replay`) (#1358) * vmalert: support rules backfilling (aka `replay`) vmalert can `replay` configured rules in the past and backfill results via remote write protocol. It supports MetricsQL/PromQL storage as data source, and can backfill data to remote write compatible storage. Supports recording and alerting rules `replay`. See more details in README. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/836 * vmalert: review fixes * vmalert: readme fixes 2021-06-09 09:20:38 +00:00			`"time"`
Vmalert compliance 2 (#2340) * vmalert: split alert's `Start` field into `ActiveAt` and `Start` The `ActiveAt` field identifies when alert becomes active for rules with `for > 0`. Previously, this value was stored in field `Start`. The field `Start` now identifies the moment alert became `FIRING`. The split is needed in order to distinguish these two moments in the API responses for alerts. Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: support specific moment of time for rules evaluation The Querier interface was extended to accept a new argument used as a timestamp at which evaluation should be made. It is needed to align rules execution time within the group. Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: mark disappeared series as stale Series generated by alerting rules, which were sent to remote write now will be marked as stale if they will disappear on the next evaluation. This would make ALERTS and ALERTS_FOR_TIME series more precise. Signed-off-by: hagen1778 <roman@victoriametrics.com> * wip Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: evaluate rules at fixed timestamp Before, time at which rules were evaluated was calculated right before rule execution. The change makes sure that timestamp is calculated only once per evalution round and all rules are using the same timestamp. It also updates the logic of resending of already resolved alert notification. Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: allow overridin `alertname` label value if it is present in response Previously, `alertname` was always equal to the Alerting Rule name. Now, its value can be overriden if series in response containt the different value for this label. The change is needed for improving compatibility with Prometheus. Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: align rules evaluation in time Now, evaluation timestamp for rules evaluates as if there was no delay in rules evaluation. 
It means, that rules will be evaluated at fixed timestamps+group_interval. This way provides more consistent evaluation results and improves compatibility with Prometheus, Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: add metric for missed iterations New metric `vmalert_iteration_missed_total` will show whether rules evaluation round was missed. Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: reduce delay before the initial rule evaluation in group Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: rollback alertname override According to the spec: ``` The alert name from the alerting rule (HighRequestLatency from the example above) MUST be added to the labels of the alert with the label name as alertname. It MUST override any existing alertname label. ``` https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#step-3 Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: throw err immediately on dedup detection ``` The execution of an alerting rule MUST error out immediately and MUST NOT send any alerts or add samples to samples receiver if there is more than one alert with the same labels ``` https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#step-4 Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: cleanup Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: use strings builder to reduce allocs Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-03-29 13:09:07 +00:00
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"`
app/vmalert: sync with master branch 2020-04-27 21:19:27 +00:00			`)`

vmalert: Add recording rules support. (#519) * vmalert: Add recording rules support. Recording rules support required additional service refactoring since it wasn't planned to support them from the very beginning. The list of changes is following: * new entity RecordingRule was added for writing results of MetricsQL expressions into remote storage; * interface Rule now unites both recording and alerting rules; * configuration parser was moved to separate package and now performs more strict validation; * new endpoint for listing all groups and rules in json format was added; * evaluation interval may be set to every particular group; * vmalert: uncomment tests * vmalert: rm outdated TODO * vmalert: fix typos in README 2020-06-01 10:46:37 +00:00			`// Rule represents alerting or recording rule`
			`// that has unique ID, can be Executed and`
			`// updated with other Rule.`
			`type Rule interface {`
vmalert: support rules backfilling (aka `replay`) (#1358) * vmalert: support rules backfilling (aka `replay`) vmalert can `replay` configured rules in the past and backfill results via remote write protocol. It supports MetricsQL/PromQL storage as data source, and can backfill data to remote write compatible storage. Supports recording and alerting rules `replay`. See more details in README. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/836 * vmalert: review fixes * vmalert: readme fixes 2021-06-09 09:20:38 +00:00			`// ID returns unique ID that may be used for`
vmalert: Add recording rules support. (#519) * vmalert: Add recording rules support. Recording rules support required additional service refactoring since it wasn't planned to support them from the very beginning. The list of changes is following: * new entity RecordingRule was added for writing results of MetricsQL expressions into remote storage; * interface Rule now unites both recording and alerting rules; * configuration parser was moved to separate package and now performs more strict validation; * new endpoint for listing all groups and rules in json format was added; * evaluation interval may be set to every particular group; * vmalert: uncomment tests * vmalert: rm outdated TODO * vmalert: fix typos in README 2020-06-01 10:46:37 +00:00			`// identifying this Rule among others.`
			`ID() uint64`
feat: rule limit (#2676) vmalert: support `limit` param in groups definition `limit` param limits number of time series samples produced by a single rule during execution. On reaching the limit rule will return an err. Signed-off-by: lihaowei <haoweili35@gmail.com> 2022-06-09 06:21:30 +00:00			`// Exec executes the rule with given context at the given timestamp and limit.`
			`// returns an err if number of resulting time series exceeds the limit.`
			`Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)`
vmalert: followup for https://github.com/VictoriaMetrics/VictoriaMetrics/commit/76f05f86707dea55282934d49a9abf5e0f62aa19 (#2706) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-06-09 06:58:25 +00:00			`// ExecRange executes the rule on the given time range.`
			`ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)`
vmalert: Add recording rules support. (#519) * vmalert: Add recording rules support. Recording rules support required additional service refactoring since it wasn't planned to support them from the very beginning. The list of changes is following: * new entity RecordingRule was added for writing results of MetricsQL expressions into remote storage; * interface Rule now unites both recording and alerting rules; * configuration parser was moved to separate package and now performs more strict validation; * new endpoint for listing all groups and rules in json format was added; * evaluation interval may be set to every particular group; * vmalert: uncomment tests * vmalert: rm outdated TODO * vmalert: fix typos in README 2020-06-01 10:46:37 +00:00			`// UpdateWith performs modification of current Rule`
			`// with fields of the given Rule.`
			`UpdateWith(Rule) error`
Vmalert compliance improvements (#2320) * vmalert: add support for `sortByLabel` template function * vmalert: update API according to Prometheus conformance program The changes to the API, field names and URL path has been made according to the Prometheus specification for `alert_generator` https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md * vmalert: fix the timestamp of the evaluated rules The timestamp used for alert's `EndsAt` was calculated before sending the notification. While the correct way is to use the timestamp taken right before rules evaluation. * vmalert: add `-datasource.queryTimeAlignment` flag The flag is supposed to provide ability to disable `time` param alignment when executing rules. By default, this flag is enabled, so it remains backward compatible. The flag was introduced to achieve better compatibility with Prometheus behaviour according to https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-03-15 11:54:53 +00:00			`// ToAPI converts Rule into APIRule`
			`ToAPI() APIRule`
app/vmalert: extend metrics set exported by `vmalert` #573 (#654) * app/vmalert: extend metrics set exported by `vmalert` #573 New metrics were added to improve observability: + vmalert_alerts_pending{alertname, group} - number of pending alerts per group per alert; + vmalert_alerts_acitve{alertname, group} - number of active alerts per group per alert; + vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error during prev execution, is 0 if no errors happened; + vmalert_recording_rules_error{recording, group} - is 1 if recording rule ended up with error during prev execution, is 0 if no errors happened; * vmalert_iteration_total{group, file} - now contains group and file name labels. This should improve control over specific groups; * vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups; Some collisions for alerts and recording rules are possible, because neither group name nor alert/recording rule name are unique for compatibility reasons. Commit contains list of TODOs for Unregistering metrics since groups and rules are ephemeral and could be removed without application restart. In order to unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13 * app/vmalert: extend metrics set exported by `vmalert` #573 The changes are following: * add an ID label to rules metrics, since `name` collisions within one group is a common case - see the k8s example alerts; * supports metrics unregistering on rule updates. Consider the case when one rule was added or removed from the group, or the whole group was added or removed. The change depends on https://github.com/VictoriaMetrics/metrics/pull/16 where race condition for Unregister method was fixed. 2020-08-09 06:41:29 +00:00			`// Close performs the shutdown procedures for rule`
			`// such as metrics unregister`
			`Close()`
app/vmalert: restore alerts state from datasource metrics (#461) * app/vmalert: restore alerts state from datasource metrics Vmalert will restore alerts state for rules that have `rule.For` > 0 from previously written timeseries via `remotewrite.url` flag. * app/vmalert: mention remotewerite and remoteread configuration in README 2020-05-04 21:51:22 +00:00			`}`
vmalert: explicitly set extra labels to alert entities (#886) The previous implementation treated extra labels (global and rule labels) as separate label set to returned time series labels. Hence, time series always contained only original labels and alert ID was generated from sorted labels key-values. Extra labels didn't affect the generated ID and were applied on the following actions: - templating for Summary and Annotations; - persisting state via remote write; - restoring state via remote read. Such behaviour caused difficulties on restore procedure because extra labels had to be dropped before checking the alert ID, but that not always worked. Consider the case when expression returns the following time series `up{job="foo"}` and rule has extra label `job=bar`. This would mean that restored alert ID will be always different to the real time series because of collision. To solve the situation extra labels are now always applied beforehand and `vmalert` doesn't store original labels anymore. However, this could result into a new error situation. Consider the case when expression returns two time series `up{job="foo"}` and `up{job="baz"}`, while rule has extra label `job=bar`. In such case, applying extra labels will result into two identical time series and `vmalert` will return error: `result contains metrics with the same labelset after applying rule labels` https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870 2020-11-09 22:27:32 +00:00
			`var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")`
vmalert: add experimental feature of storing Rule's evaluation state (#3106) vmalert: add experimental feature of storing Rule's evaluation state The new feature keeps last 20 state changes of each Rule in memory. The state are available for view on the Rule's view page. The page can be opened by clicking on `Details` link next to Rule's name on the `/groups` page. States change suppose to help in investigating cases when Rule doesn't generate alerts or records. Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-09-14 12:04:24 +00:00
			`type ruleState struct {`
			`sync.RWMutex`
			`entries []ruleStateEntry`
			`cur int`
			`}`

			`type ruleStateEntry struct {`
			`// stores last moment of time rule.Exec was called`
			`time time.Time`
			`// stores the timesteamp with which rule.Exec was called`
			`at time.Time`
			`// stores the duration of the last rule.Exec call`
			`duration time.Duration`
			`// stores last error that happened in Exec func`
			`// resets on every successful Exec`
			`// may be used as Health ruleState`
			`err error`
			`// stores the number of samples returned during`
			`// the last evaluation`
			`samples int`
vmalert: do not hold pointer to http.Request (#3467) http.Request was used as a part of state struct for generating the curl command when viewing the rule's state changes. It appears, that holding a referencing is far more expensive than generating the curl command immediately. On the test with 40k rules, this change reduces memory and CPU usage by 50%. Signed-off-by: hagen1778 <roman@victoriametrics.com> Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-12-09 15:13:29 +00:00			`// stores the curl command reflecting the HTTP request used during rule.Exec`
			`curl string`
vmalert: add experimental feature of storing Rule's evaluation state (#3106) vmalert: add experimental feature of storing Rule's evaluation state The new feature keeps last 20 state changes of each Rule in memory. The state are available for view on the Rule's view page. The page can be opened by clicking on `Details` link next to Rule's name on the `/groups` page. States change suppose to help in investigating cases when Rule doesn't generate alerts or records. Signed-off-by: hagen1778 <roman@victoriametrics.com> 2022-09-14 12:04:24 +00:00			`}`

			`const defaultStateEntriesLimit = 20`

			`func newRuleState() *ruleState {`
			`return &ruleState{`
			`entries: make([]ruleStateEntry, defaultStateEntriesLimit),`
			`}`
			`}`

			`func (s *ruleState) getLast() ruleStateEntry {`
			`s.RLock()`
			`defer s.RUnlock()`
			`return s.entries[s.cur]`
			`}`

			`func (s *ruleState) getAll() []ruleStateEntry {`
			`entries := make([]ruleStateEntry, 0)`

			`s.RLock()`
			`defer s.RUnlock()`

			`cur := s.cur`
			`for {`
			`e := s.entries[cur]`
			`if !e.time.IsZero() \|\| !e.at.IsZero() {`
			`entries = append(entries, e)`
			`}`
			`cur--`
			`if cur < 0 {`
			`cur = cap(s.entries) - 1`
			`}`
			`if cur == s.cur {`
			`return entries`
			`}`
			`}`
			`}`

			`func (s *ruleState) add(e ruleStateEntry) {`
			`s.Lock()`
			`defer s.Unlock()`

			`s.cur++`
			`if s.cur > cap(s.entries)-1 {`
			`s.cur = 0`
			`}`
			`s.entries[s.cur] = e`
			`}`