diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile index c25722668..e4ed5d128 100644 --- a/app/vmalert/Makefile +++ b/app/vmalert/Makefile @@ -88,7 +88,7 @@ run-vmalert-sd: vmalert -configCheckInterval=10s replay-vmalert: vmalert - ./bin/vmalert -rule=app/vmalert/config/testdata/rules-replay-good.rules \ + ./bin/vmalert -rule=app/vmalert/config/testdata/rules/rules-replay-good.rules \ -datasource.url=http://localhost:8428 \ -remoteWrite.url=http://localhost:8428 \ -external.label=cluster=east-1 \ diff --git a/app/vmalert/README.md b/app/vmalert/README.md index 297186cf2..8f12c5059 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -539,6 +539,7 @@ See full description for these flags in `./vmalert --help`. * Graphite engine isn't supported yet; * `query` template function is disabled for performance reasons (might be changed in future); +* `limit` group's param has no effect during replay (might be changed in future); ## Monitoring diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 1af935c31..3f629dab4 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -193,13 +193,12 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l // It doesn't update internal states of the Rule and meant to be used just // to get time series for backfilling. // It returns ALERT and ALERT_FOR_STATE time series as result. -func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error) { +func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) { series, err := ar.q.QueryRange(ctx, ar.Expr, start, end) if err != nil { return nil, err } var result []prompbmarshal.TimeSeries - timestamp2Series := make(map[int64][]prompbmarshal.TimeSeries, 0) qFn := func(query string) ([]datasource.Metric, error) { return nil, fmt.Errorf("`query` template isn't supported in replay mode") } @@ -211,14 +210,11 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, lim if ar.For == 0 { // if alert is instant a.State = notifier.StateFiring for i := range s.Values { - if limit > 0 { - timestamp2Series[s.Timestamps[i]] = append(timestamp2Series[s.Timestamps[i]], ar.alertToTimeSeries(a, s.Timestamps[i])...) - } else { - result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...) - } + result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...) } continue } + // if alert with For > 0 prevT := time.Time{} for i := range s.Values { @@ -232,28 +228,9 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, lim a.Start = at } prevT = at - if limit > 0 { - timestamp2Series[s.Timestamps[i]] = append(timestamp2Series[s.Timestamps[i]], ar.alertToTimeSeries(a, s.Timestamps[i])...) - } else { - result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...) - } + result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...) } } - if limit <= 0 { - return result, nil - } - sortedTimestamp := make([]int64, 0) - for timestamp := range timestamp2Series { - sortedTimestamp = append(sortedTimestamp, timestamp) - } - sort.Slice(sortedTimestamp, func(i, j int) bool { return sortedTimestamp[i] < sortedTimestamp[j] }) - for _, timestamp := range sortedTimestamp { - if len(timestamp2Series[timestamp]) > limit { - logger.Errorf("exec exceeded limit of %d with %d alerts", limit, len(timestamp2Series[timestamp])) - continue - } - result = append(result, timestamp2Series[timestamp]...) - } return result, nil } diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go index f0b1cc4eb..ab5be6392 100644 --- a/app/vmalert/alerting_test.go +++ b/app/vmalert/alerting_test.go @@ -3,7 +3,6 @@ package main import ( "context" "errors" - "fmt" "reflect" "sort" "strings" @@ -473,7 +472,7 @@ func TestAlertingRule_ExecRange(t *testing.T) { tc.rule.q = fq tc.rule.GroupID = fakeGroup.ID() fq.add(tc.data...) - gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now(), 0) + gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now()) if err != nil { t.Fatalf("unexpected err: %s", err) } @@ -691,15 +690,6 @@ func TestAlertingRuleLimit(t *testing.T) { t.Fatal(err) } } - for _, testCase := range testCases { - tss, err := ar.ExecRange(context.TODO(), timestamp, timestamp, testCase.limit) - if err != nil { - t.Fatal(err) - } - if len(tss) != testCase.tssNum { - t.Fatal(fmt.Errorf("tss len %d is not equal to supposed %d", len(tss), testCase.tssNum)) - } - } fq.reset() } diff --git a/app/vmalert/config/config_test.go b/app/vmalert/config/config_test.go index c81717ed0..f63950f17 100644 --- a/app/vmalert/config/config_test.go +++ b/app/vmalert/config/config_test.go @@ -489,6 +489,22 @@ rules: name: TestGroup params: nocache: ["0"] +rules: + - alert: foo + expr: sum by(job) (up == 1) +`) + }) + + t.Run("`limit` change", func(t *testing.T) { + f(t, ` +name: TestGroup +limit: 5 +rules: + - alert: foo + expr: sum by(job) (up == 1) +`, ` +name: TestGroup +limit: 10 rules: - alert: foo expr: sum by(job) (up == 1) diff --git a/app/vmalert/config/testdata/rules/rules-replay-good.rules b/app/vmalert/config/testdata/rules/rules-replay-good.rules index 134a238bc..a8138e5fc 100644 --- a/app/vmalert/config/testdata/rules/rules-replay-good.rules +++ b/app/vmalert/config/testdata/rules/rules-replay-good.rules @@ -2,6 +2,7 @@ groups: - name: ReplayGroup interval: 1m concurrency: 1 + limit: 1000 rules: - record: type:vm_cache_entries:rate5m expr: sum(rate(vm_cache_entries[5m])) by (type) diff --git a/app/vmalert/config/testdata/rules/rules2-good.rules b/app/vmalert/config/testdata/rules/rules2-good.rules index 0387a41cd..658adacfb 100644 --- a/app/vmalert/config/testdata/rules/rules2-good.rules +++ b/app/vmalert/config/testdata/rules/rules2-good.rules @@ -2,6 +2,7 @@ groups: - name: TestGroup interval: 2s concurrency: 2 + limit: 1000 params: denyPartialResponse: ["true"] extra_label: ["env=dev"] diff --git a/app/vmalert/group.go b/app/vmalert/group.go index a9983ce1c..747a4ca54 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -218,6 +218,7 @@ func (g *Group) updateWith(newGroup *Group) error { g.Concurrency = newGroup.Concurrency g.Params = newGroup.Params g.Labels = newGroup.Labels + g.Limit = newGroup.Limit g.Checksum = newGroup.Checksum g.Rules = newRules return nil diff --git a/app/vmalert/recording.go b/app/vmalert/recording.go index cb5ac8e2f..a22285906 100644 --- a/app/vmalert/recording.go +++ b/app/vmalert/recording.go @@ -104,7 +104,7 @@ func (rr *RecordingRule) Close() { // ExecRange executes recording rule on the given time range similarly to Exec. // It doesn't update internal states of the Rule and meant to be used just // to get time series for backfilling. -func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error) { +func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) { series, err := rr.q.QueryRange(ctx, rr.Expr, start, end) if err != nil { return nil, err @@ -120,9 +120,6 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time, li duplicates[key] = struct{}{} tss = append(tss, ts) } - if limit > 0 && len(tss) > limit { - return nil, fmt.Errorf("exec exceeded limit of %d with %d series", limit, len(tss)) - } return tss, nil } diff --git a/app/vmalert/recording_test.go b/app/vmalert/recording_test.go index 4fe94d8e8..5594c11ec 100644 --- a/app/vmalert/recording_test.go +++ b/app/vmalert/recording_test.go @@ -158,7 +158,7 @@ func TestRecordingRule_ExecRange(t *testing.T) { fq := &fakeQuerier{} fq.add(tc.metrics...) tc.rule.q = fq - tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now(), 0) + tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now()) if err != nil { t.Fatalf("unexpected Exec err: %s", err) } @@ -207,10 +207,6 @@ func TestRecordingRuleLimit(t *testing.T) { if err != nil && !strings.EqualFold(err.Error(), testCase.err) { t.Fatal(err) } - _, err = rule.ExecRange(context.TODO(), timestamp.Add(-2*time.Second), timestamp, testCase.limit) - if err != nil && !strings.EqualFold(err.Error(), testCase.err) { - t.Fatal(err) - } } } diff --git a/app/vmalert/replay.go b/app/vmalert/replay.go index 2ebfa6997..1adf253d9 100644 --- a/app/vmalert/replay.go +++ b/app/vmalert/replay.go @@ -88,6 +88,10 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int { "\nrequests to make: \t%d"+ "\nmax range per request: \t%v\n", g.Name, g.Interval, iterations, step) + if g.Limit > 0 { + fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n", + g.Limit) + } for _, rule := range g.Rules { fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID()) var bar *pb.ProgressBar @@ -96,7 +100,7 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int { } ri.reset() for ri.next() { - n, err := replayRule(rule, ri.s, ri.e, rw, g.Limit) + n, err := replayRule(rule, ri.s, ri.e, rw) if err != nil { logger.Fatalf("rule %q: %s", rule, err) } @@ -115,11 +119,11 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int { return total } -func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client, limit int) (int, error) { +func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) { var err error var tss []prompbmarshal.TimeSeries for i := 0; i < *replayRuleRetryAttempts; i++ { - tss, err = rule.ExecRange(context.Background(), start, end, limit) + tss, err = rule.ExecRange(context.Background(), start, end) if err == nil { break } diff --git a/app/vmalert/rule.go b/app/vmalert/rule.go index a1332005c..630947060 100644 --- a/app/vmalert/rule.go +++ b/app/vmalert/rule.go @@ -18,9 +18,8 @@ type Rule interface { // Exec executes the rule with given context at the given timestamp and limit. // returns an err if number of resulting time series exceeds the limit. Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) - // ExecRange executes the rule on the given time range and limit. - // returns an err if number of resulting time series exceeds the limit. - ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error) + // ExecRange executes the rule on the given time range. + ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) // UpdateWith performs modification of current Rule // with fields of the given Rule. UpdateWith(Rule) error diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index cdae4daf0..f8c61fef6 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -23,6 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: add support of `lowercase` and `uppercase` relabeling actions in the same way as [Prometheus 2.36.0 does](https://github.com/prometheus/prometheus/releases/tag/v2.36.0). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2664). * FEATURE: add ability to change the `indexdb` rotation timezone offset via `-retentionTimezoneOffset` command-line flag. Previously it was performed at 4am UTC time. This could lead to performance degradation in the middle of the day when VictoriaMetrics runs in time zones located too far from UTC. Thanks to @cnych for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2574). * FEATURE: limit the number of background merge threads on systems with big number of CPU cores by default. This increases the max size of parts, which can be created during background merge when `-storageDataPath` directory has limited free disk space. This may improve on-disk data compression efficiency and query performance. The limits can be tuned if needed with `-smallMergeConcurrency` and `-bigMergeConcurrency` command-line flags. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2673). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support `limit` param per-group for limiting number of produced samples per each rule. Thanks to @Howie59 for [implementation](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2676). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): remove dependency on Internet access at [web API pages](https://docs.victoriametrics.com/vmalert.html#web). Previously the functionality and the layout of these pages was broken without Internet access. See [shis issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2594). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): implement the `http://vmagent:8429/service-discovery` page in the same way as Prometheus does. This page shows the original labels for all the discovered targets alongside the resulting labels after the relabeling. This simplifies service discovery debugging. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): remove dependency on Internet access at `http://vmagent:8429/targets` page. Previously the page layout was broken without Internet access. See [shis issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2594). diff --git a/docs/vmalert.md b/docs/vmalert.md index 5e56a886c..7f8f3ad1b 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -543,6 +543,7 @@ See full description for these flags in `./vmalert --help`. * Graphite engine isn't supported yet; * `query` template function is disabled for performance reasons (might be changed in future); +* `limit` group's param has no effect during replay (might be changed in future); ## Monitoring