2023-10-13 11:54:33 +00:00
|
|
|
package rule
|
2020-06-01 10:46:37 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"sort"
|
2021-05-15 10:25:57 +00:00
|
|
|
"strings"
|
2020-06-01 10:46:37 +00:00
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
2022-02-02 12:11:41 +00:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
2020-06-01 10:46:37 +00:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
|
|
|
)
|
|
|
|
|
|
|
|
// RecordingRule is a Rule that supposed
|
|
|
|
// to evaluate configured Expression and
|
|
|
|
// return TimeSeries as result.
|
|
|
|
type RecordingRule struct {
|
2022-07-22 08:44:55 +00:00
|
|
|
Type config.Type
|
2020-06-15 19:15:47 +00:00
|
|
|
RuleID uint64
|
2020-06-01 10:46:37 +00:00
|
|
|
Name string
|
|
|
|
Expr string
|
|
|
|
Labels map[string]string
|
|
|
|
GroupID uint64
|
|
|
|
|
2021-04-28 20:41:15 +00:00
|
|
|
q datasource.Querier
|
|
|
|
|
2022-09-14 12:04:24 +00:00
|
|
|
// state stores recent state changes
|
|
|
|
// during evaluations
|
|
|
|
state *ruleState
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
|
|
|
|
metrics *recordingRuleMetrics
|
|
|
|
}
|
|
|
|
|
|
|
|
type recordingRuleMetrics struct {
|
2022-02-02 12:11:41 +00:00
|
|
|
errors *utils.Gauge
|
|
|
|
samples *utils.Gauge
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// String implements Stringer interface
|
|
|
|
func (rr *RecordingRule) String() string {
|
|
|
|
return rr.Name
|
|
|
|
}
|
|
|
|
|
|
|
|
// ID returns unique Rule ID
|
|
|
|
// within the parent Group.
|
|
|
|
func (rr *RecordingRule) ID() uint64 {
|
2020-06-15 19:15:47 +00:00
|
|
|
return rr.RuleID
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
// NewRecordingRule creates a new RecordingRule
|
|
|
|
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
rr := &RecordingRule{
|
2021-11-05 17:49:32 +00:00
|
|
|
Type: group.Type,
|
2020-06-15 19:15:47 +00:00
|
|
|
RuleID: cfg.ID,
|
2020-06-01 10:46:37 +00:00
|
|
|
Name: cfg.Record,
|
|
|
|
Expr: cfg.Expr,
|
|
|
|
Labels: cfg.Labels,
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
GroupID: group.ID(),
|
|
|
|
metrics: &recordingRuleMetrics{},
|
2021-04-30 06:46:03 +00:00
|
|
|
q: qb.BuildWithParams(datasource.QuerierParams{
|
2022-07-22 08:44:55 +00:00
|
|
|
DataSourceType: group.Type.String(),
|
2021-04-30 06:46:03 +00:00
|
|
|
EvaluationInterval: group.Interval,
|
2021-12-02 12:45:08 +00:00
|
|
|
QueryParams: group.Params,
|
2022-07-21 13:59:55 +00:00
|
|
|
Headers: group.Headers,
|
2021-04-30 06:46:03 +00:00
|
|
|
}),
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
2021-02-01 13:02:44 +00:00
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
entrySize := *ruleUpdateEntriesLimit
|
2022-12-29 11:36:44 +00:00
|
|
|
if cfg.UpdateEntriesLimit != nil {
|
2023-10-13 11:54:33 +00:00
|
|
|
entrySize = *cfg.UpdateEntriesLimit
|
|
|
|
}
|
|
|
|
if entrySize < 1 {
|
|
|
|
entrySize = 1
|
|
|
|
}
|
|
|
|
rr.state = &ruleState{
|
|
|
|
entries: make([]StateEntry, entrySize),
|
2022-12-29 11:36:44 +00:00
|
|
|
}
|
|
|
|
|
2023-11-02 15:01:31 +00:00
|
|
|
labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID())
|
2022-02-02 12:11:41 +00:00
|
|
|
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
func() float64 {
|
2022-09-14 12:04:24 +00:00
|
|
|
e := rr.state.getLast()
|
2023-10-13 11:54:33 +00:00
|
|
|
if e.Err == nil {
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
return 0
|
|
|
|
}
|
|
|
|
return 1
|
|
|
|
})
|
2022-02-02 12:11:41 +00:00
|
|
|
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
2021-08-05 06:59:46 +00:00
|
|
|
func() float64 {
|
2022-09-14 12:04:24 +00:00
|
|
|
e := rr.state.getLast()
|
2023-10-13 11:54:33 +00:00
|
|
|
return float64(e.Samples)
|
2021-08-05 06:59:46 +00:00
|
|
|
})
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 06:41:29 +00:00
|
|
|
return rr
|
|
|
|
}
|
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
// close unregisters rule metrics
|
|
|
|
func (rr *RecordingRule) close() {
|
2022-02-02 12:11:41 +00:00
|
|
|
rr.metrics.errors.Unregister()
|
|
|
|
rr.metrics.samples.Unregister()
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
// execRange executes recording rule on the given time range similarly to Exec.
|
2021-06-09 09:20:38 +00:00
|
|
|
// It doesn't update internal states of the Rule and meant to be used just
|
|
|
|
// to get time series for backfilling.
|
2023-10-13 11:54:33 +00:00
|
|
|
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
2023-05-08 07:36:39 +00:00
|
|
|
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
|
2021-06-09 09:20:38 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2023-05-08 07:36:39 +00:00
|
|
|
duplicates := make(map[string]struct{}, len(res.Data))
|
2021-06-09 09:20:38 +00:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
2023-05-08 07:36:39 +00:00
|
|
|
for _, s := range res.Data {
|
2021-06-09 09:20:38 +00:00
|
|
|
ts := rr.toTimeSeries(s)
|
|
|
|
key := stringifyLabels(ts)
|
|
|
|
if _, ok := duplicates[key]; ok {
|
|
|
|
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", s.Labels, key, errDuplicate)
|
|
|
|
}
|
|
|
|
duplicates[key] = struct{}{}
|
|
|
|
tss = append(tss, ts)
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
2021-06-09 09:20:38 +00:00
|
|
|
return tss, nil
|
|
|
|
}
|
2020-06-01 10:46:37 +00:00
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
// exec executes RecordingRule expression via the given Querier.
|
|
|
|
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
2022-09-14 12:04:24 +00:00
|
|
|
start := time.Now()
|
2023-05-08 07:36:39 +00:00
|
|
|
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
|
2023-10-13 11:54:33 +00:00
|
|
|
curState := StateEntry{
|
|
|
|
Time: start,
|
|
|
|
At: ts,
|
|
|
|
Duration: time.Since(start),
|
|
|
|
Samples: len(res.Data),
|
|
|
|
SeriesFetched: res.SeriesFetched,
|
|
|
|
Curl: requestToCurl(req),
|
2022-09-14 12:04:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
defer func() {
|
|
|
|
rr.state.add(curState)
|
|
|
|
}()
|
2020-06-01 10:46:37 +00:00
|
|
|
|
|
|
|
if err != nil {
|
2023-10-13 11:54:33 +00:00
|
|
|
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
|
|
|
return nil, curState.Err
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
2023-05-08 07:36:39 +00:00
|
|
|
qMetrics := res.Data
|
2022-06-09 06:21:30 +00:00
|
|
|
numSeries := len(qMetrics)
|
|
|
|
if limit > 0 && numSeries > limit {
|
2023-10-13 11:54:33 +00:00
|
|
|
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
|
|
|
return nil, curState.Err
|
2022-06-09 06:21:30 +00:00
|
|
|
}
|
|
|
|
|
2021-05-15 10:25:57 +00:00
|
|
|
duplicates := make(map[string]struct{}, len(qMetrics))
|
2020-06-01 10:46:37 +00:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
|
|
|
for _, r := range qMetrics {
|
2021-06-09 09:20:38 +00:00
|
|
|
ts := rr.toTimeSeries(r)
|
2021-05-15 10:25:57 +00:00
|
|
|
key := stringifyLabels(ts)
|
|
|
|
if _, ok := duplicates[key]; ok {
|
2023-10-13 11:54:33 +00:00
|
|
|
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
|
|
|
return nil, curState.Err
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
2021-05-15 10:25:57 +00:00
|
|
|
duplicates[key] = struct{}{}
|
2020-06-01 10:46:37 +00:00
|
|
|
tss = append(tss, ts)
|
|
|
|
}
|
|
|
|
return tss, nil
|
|
|
|
}
|
|
|
|
|
2021-05-15 10:25:57 +00:00
|
|
|
func stringifyLabels(ts prompbmarshal.TimeSeries) string {
|
2020-06-01 10:46:37 +00:00
|
|
|
labels := ts.Labels
|
2021-05-15 10:25:57 +00:00
|
|
|
if len(labels) > 1 {
|
|
|
|
sort.Slice(labels, func(i, j int) bool {
|
|
|
|
return labels[i].Name < labels[j].Name
|
|
|
|
})
|
|
|
|
}
|
|
|
|
b := strings.Builder{}
|
|
|
|
for i, l := range labels {
|
|
|
|
b.WriteString(l.Name)
|
|
|
|
b.WriteString("=")
|
|
|
|
b.WriteString(l.Value)
|
|
|
|
if i != len(labels)-1 {
|
|
|
|
b.WriteString(",")
|
|
|
|
}
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
2021-05-15 10:25:57 +00:00
|
|
|
return b.String()
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
2021-06-09 09:20:38 +00:00
|
|
|
func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSeries {
|
2020-06-01 10:46:37 +00:00
|
|
|
labels := make(map[string]string)
|
|
|
|
for _, l := range m.Labels {
|
|
|
|
labels[l.Name] = l.Value
|
|
|
|
}
|
|
|
|
labels["__name__"] = rr.Name
|
|
|
|
// override existing labels with configured ones
|
|
|
|
for k, v := range rr.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
2021-06-09 09:20:38 +00:00
|
|
|
return newTimeSeries(m.Values, m.Timestamps, labels)
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
|
2023-10-13 11:54:33 +00:00
|
|
|
// updateWith copies all significant fields.
|
|
|
|
func (rr *RecordingRule) updateWith(r Rule) error {
|
2020-06-01 10:46:37 +00:00
|
|
|
nr, ok := r.(*RecordingRule)
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
|
|
|
}
|
|
|
|
rr.Expr = nr.Expr
|
|
|
|
rr.Labels = nr.Labels
|
2021-05-22 21:26:01 +00:00
|
|
|
rr.q = nr.q
|
2020-06-01 10:46:37 +00:00
|
|
|
return nil
|
|
|
|
}
|