mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
vmalert: followup for 76f05f8670
(#2706)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
4afd7aa695
commit
3c09d25039
14 changed files with 39 additions and 54 deletions
|
@ -88,7 +88,7 @@ run-vmalert-sd: vmalert
|
|||
-configCheckInterval=10s
|
||||
|
||||
replay-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules-replay-good.rules \
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules/rules-replay-good.rules \
|
||||
-datasource.url=http://localhost:8428 \
|
||||
-remoteWrite.url=http://localhost:8428 \
|
||||
-external.label=cluster=east-1 \
|
||||
|
|
|
@ -539,6 +539,7 @@ See full description for these flags in `./vmalert --help`.
|
|||
|
||||
* Graphite engine isn't supported yet;
|
||||
* `query` template function is disabled for performance reasons (might be changed in future);
|
||||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
|
||||
## Monitoring
|
||||
|
||||
|
|
|
@ -193,13 +193,12 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
|
|||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
||||
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
series, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var result []prompbmarshal.TimeSeries
|
||||
timestamp2Series := make(map[int64][]prompbmarshal.TimeSeries, 0)
|
||||
qFn := func(query string) ([]datasource.Metric, error) {
|
||||
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
||||
}
|
||||
|
@ -211,14 +210,11 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, lim
|
|||
if ar.For == 0 { // if alert is instant
|
||||
a.State = notifier.StateFiring
|
||||
for i := range s.Values {
|
||||
if limit > 0 {
|
||||
timestamp2Series[s.Timestamps[i]] = append(timestamp2Series[s.Timestamps[i]], ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
} else {
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// if alert with For > 0
|
||||
prevT := time.Time{}
|
||||
for i := range s.Values {
|
||||
|
@ -232,28 +228,9 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time, lim
|
|||
a.Start = at
|
||||
}
|
||||
prevT = at
|
||||
if limit > 0 {
|
||||
timestamp2Series[s.Timestamps[i]] = append(timestamp2Series[s.Timestamps[i]], ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
} else {
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||
}
|
||||
}
|
||||
if limit <= 0 {
|
||||
return result, nil
|
||||
}
|
||||
sortedTimestamp := make([]int64, 0)
|
||||
for timestamp := range timestamp2Series {
|
||||
sortedTimestamp = append(sortedTimestamp, timestamp)
|
||||
}
|
||||
sort.Slice(sortedTimestamp, func(i, j int) bool { return sortedTimestamp[i] < sortedTimestamp[j] })
|
||||
for _, timestamp := range sortedTimestamp {
|
||||
if len(timestamp2Series[timestamp]) > limit {
|
||||
logger.Errorf("exec exceeded limit of %d with %d alerts", limit, len(timestamp2Series[timestamp]))
|
||||
continue
|
||||
}
|
||||
result = append(result, timestamp2Series[timestamp]...)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -3,7 +3,6 @@ package main
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
|
@ -473,7 +472,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.data...)
|
||||
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now(), 0)
|
||||
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
|
@ -691,15 +690,6 @@ func TestAlertingRuleLimit(t *testing.T) {
|
|||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
for _, testCase := range testCases {
|
||||
tss, err := ar.ExecRange(context.TODO(), timestamp, timestamp, testCase.limit)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(tss) != testCase.tssNum {
|
||||
t.Fatal(fmt.Errorf("tss len %d is not equal to supposed %d", len(tss), testCase.tssNum))
|
||||
}
|
||||
}
|
||||
fq.reset()
|
||||
}
|
||||
|
||||
|
|
|
@ -489,6 +489,22 @@ rules:
|
|||
name: TestGroup
|
||||
params:
|
||||
nocache: ["0"]
|
||||
rules:
|
||||
- alert: foo
|
||||
expr: sum by(job) (up == 1)
|
||||
`)
|
||||
})
|
||||
|
||||
t.Run("`limit` change", func(t *testing.T) {
|
||||
f(t, `
|
||||
name: TestGroup
|
||||
limit: 5
|
||||
rules:
|
||||
- alert: foo
|
||||
expr: sum by(job) (up == 1)
|
||||
`, `
|
||||
name: TestGroup
|
||||
limit: 10
|
||||
rules:
|
||||
- alert: foo
|
||||
expr: sum by(job) (up == 1)
|
||||
|
|
|
@ -2,6 +2,7 @@ groups:
|
|||
- name: ReplayGroup
|
||||
interval: 1m
|
||||
concurrency: 1
|
||||
limit: 1000
|
||||
rules:
|
||||
- record: type:vm_cache_entries:rate5m
|
||||
expr: sum(rate(vm_cache_entries[5m])) by (type)
|
||||
|
|
|
@ -2,6 +2,7 @@ groups:
|
|||
- name: TestGroup
|
||||
interval: 2s
|
||||
concurrency: 2
|
||||
limit: 1000
|
||||
params:
|
||||
denyPartialResponse: ["true"]
|
||||
extra_label: ["env=dev"]
|
||||
|
|
|
@ -218,6 +218,7 @@ func (g *Group) updateWith(newGroup *Group) error {
|
|||
g.Concurrency = newGroup.Concurrency
|
||||
g.Params = newGroup.Params
|
||||
g.Labels = newGroup.Labels
|
||||
g.Limit = newGroup.Limit
|
||||
g.Checksum = newGroup.Checksum
|
||||
g.Rules = newRules
|
||||
return nil
|
||||
|
|
|
@ -104,7 +104,7 @@ func (rr *RecordingRule) Close() {
|
|||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
series, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -120,9 +120,6 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time, li
|
|||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
}
|
||||
if limit > 0 && len(tss) > limit {
|
||||
return nil, fmt.Errorf("exec exceeded limit of %d with %d series", limit, len(tss))
|
||||
}
|
||||
return tss, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -158,7 +158,7 @@ func TestRecordingRule_ExecRange(t *testing.T) {
|
|||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now(), 0)
|
||||
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
|
@ -207,10 +207,6 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
_, err = rule.ExecRange(context.TODO(), timestamp.Add(-2*time.Second), timestamp, testCase.limit)
|
||||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -88,6 +88,10 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
|||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, iterations, step)
|
||||
if g.Limit > 0 {
|
||||
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
|
||||
g.Limit)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
var bar *pb.ProgressBar
|
||||
|
@ -96,7 +100,7 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
|||
}
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw, g.Limit)
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
|
@ -115,11 +119,11 @@ func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
|||
return total
|
||||
}
|
||||
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client, limit int) (int, error) {
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < *replayRuleRetryAttempts; i++ {
|
||||
tss, err = rule.ExecRange(context.Background(), start, end, limit)
|
||||
tss, err = rule.ExecRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
|
|
|
@ -18,9 +18,8 @@ type Rule interface {
|
|||
// Exec executes the rule with given context at the given timestamp and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
ExecRange(ctx context.Context, start, end time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range.
|
||||
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
|
|
|
@ -23,6 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
|||
* FEATURE: add support of `lowercase` and `uppercase` relabeling actions in the same way as [Prometheus 2.36.0 does](https://github.com/prometheus/prometheus/releases/tag/v2.36.0). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2664).
|
||||
* FEATURE: add ability to change the `indexdb` rotation timezone offset via `-retentionTimezoneOffset` command-line flag. Previously it was performed at 4am UTC time. This could lead to performance degradation in the middle of the day when VictoriaMetrics runs in time zones located too far from UTC. Thanks to @cnych for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2574).
|
||||
* FEATURE: limit the number of background merge threads on systems with big number of CPU cores by default. This increases the max size of parts, which can be created during background merge when `-storageDataPath` directory has limited free disk space. This may improve on-disk data compression efficiency and query performance. The limits can be tuned if needed with `-smallMergeConcurrency` and `-bigMergeConcurrency` command-line flags. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2673).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): support `limit` param per-group for limiting number of produced samples per each rule. Thanks to @Howie59 for [implementation](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2676).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): remove dependency on Internet access at [web API pages](https://docs.victoriametrics.com/vmalert.html#web). Previously the functionality and the layout of these pages was broken without Internet access. See [shis issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2594).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): implement the `http://vmagent:8429/service-discovery` page in the same way as Prometheus does. This page shows the original labels for all the discovered targets alongside the resulting labels after the relabeling. This simplifies service discovery debugging.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): remove dependency on Internet access at `http://vmagent:8429/targets` page. Previously the page layout was broken without Internet access. See [shis issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2594).
|
||||
|
|
|
@ -543,6 +543,7 @@ See full description for these flags in `./vmalert --help`.
|
|||
|
||||
* Graphite engine isn't supported yet;
|
||||
* `query` template function is disabled for performance reasons (might be changed in future);
|
||||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
|
||||
## Monitoring
|
||||
|
||||
|
|
Loading…
Reference in a new issue