diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile
index 64b9e27ad..35906236c 100644
--- a/app/vmalert/Makefile
+++ b/app/vmalert/Makefile
@@ -56,7 +56,10 @@ test-vmalert:
 
 run-vmalert: vmalert
 	./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
-		-datasource.url=http://localhost:8428 -notifier.url=http://localhost:9093 \
+		-datasource.url=http://localhost:8428 \
+		-notifier.url=http://localhost:9093 \
+		-remotewrite.url=http://localhost:8428 \
+		-remoteread.url=http://localhost:8428 \
 		-evaluationInterval=3s
 
 vmalert-amd64:
diff --git a/app/vmalert/README.md b/app/vmalert/README.md
index 1d96b112b..d949d5497 100644
--- a/app/vmalert/README.md
+++ b/app/vmalert/README.md
@@ -13,8 +13,6 @@ sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
 * Lightweight without extra dependencies.
 
 ### TODO:
-* Persist alerts state as timeseries in TSDB. Currently, alerts state is stored
-in process memory only and will be lost on restart;
 * Configuration hot reload.
 
 ### QuickStart
@@ -51,27 +49,52 @@ Rules in group evaluated one-by-one sequentially.
 Used as alert source in AlertManager.
 * `http:///metrics` - application metrics.
 
+`vmalert` may be configured with the `-remotewrite.url` flag to write the alerts state as timeseries
+via the remote write protocol. The alerts state is written as the `ALERTS` timeseries. These timeseries
+may be used to restore the alerts state after `vmalert` restarts if `-remoteread.url` is configured.
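+
+For example, a single VictoriaMetrics instance may serve as the datasource, the alerts state sink
+and the restore source at the same time. The invocation below is an illustrative sketch only: it mirrors
+the `run-vmalert` target from the Makefile, and the rules path and URLs are placeholders to adjust for your setup:
+```
+./bin/vmalert -rule=alert.rules \
+    -datasource.url=http://localhost:8428 \
+    -notifier.url=http://localhost:9093 \
+    -remotewrite.url=http://localhost:8428 \
+    -remoteread.url=http://localhost:8428
+```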
+
 ### Configuration
 
 The shortlist of configuration flags is the following:
 
 ```
 Usage of vmalert:
-  -datasource.url string
-    	Victoria Metrics or VMSelect url. Required parameter. e.g. http://127.0.0.1:8428
   -datasource.basicAuth.password string
-    	Optional basic auth password to use for -datasource.url
+    	Optional basic auth password for -datasource.url
   -datasource.basicAuth.username string
-    	Optional basic auth username to use for -datasource.url
+    	Optional basic auth username for -datasource.url
+  -datasource.url string
+    	Victoria Metrics or VMSelect URL. Required parameter. E.g. http://127.0.0.1:8428
+  -enableTCP6
+    	Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
   -evaluationInterval duration
    	How often to evaluate the rules. Default 1m (default 1m0s)
   -external.url string
    	External URL is used as alert's source for sent alerts to the notifier
+  -http.maxGracefulShutdownDuration duration
+    	The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
+  -httpAuth.password string
+    	Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
+  -httpAuth.username string
+    	Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
   -httpListenAddr string
    	Address to listen for http connections (default ":8880")
   -notifier.url string
    	Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
+  -remoteread.basicAuth.password string
+    	Optional basic auth password for -remoteread.url
+  -remoteread.basicAuth.username string
+    	Optional basic auth username for -remoteread.url
+  -remoteread.lookback duration
+    	Lookback defines how far to look into the past for alerts timeseries. For example, if lookback=1h then the range from now() to now()-1h will be scanned. (default 1h0m0s)
+  -remoteread.url string
+    	Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state.
+    	This configuration makes sense only if vmalert was configured with -remotewrite.url before and has successfully persisted its state. E.g. http://127.0.0.1:8428
+  -remotewrite.basicAuth.password string
+    	Optional basic auth password for -remotewrite.url
+  -remotewrite.basicAuth.username string
+    	Optional basic auth username for -remotewrite.url
   -remotewrite.url string
-    	Optional URL to remote-write compatible storage where to write timeseriesbased on active alerts. E.g. http://127.0.0.1:8428
+    	Optional URL to Victoria Metrics or VMInsert where to persist alerts state in the form of timeseries. E.g. http://127.0.0.1:8428
   -rule value
    	Path to the file with alert rules.
    	Supports patterns. Flag can be specified multiple times.
diff --git a/app/vmalert/config.go b/app/vmalert/config.go
index ba15956a1..59a10c187 100644
--- a/app/vmalert/config.go
+++ b/app/vmalert/config.go
@@ -46,7 +46,7 @@ func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
 					return nil, fmt.Errorf("invalid labels filepath:%s, group %s:%w", file, group.Name, err)
 				}
 			}
-			rule.group = &group
+			rule.group = group
 		}
 	}
 	groups = append(groups, gr...)
diff --git a/app/vmalert/main.go b/app/vmalert/main.go
index 7b54ba1df..f73659bac 100644
--- a/app/vmalert/main.go
+++ b/app/vmalert/main.go
@@ -32,18 +32,31 @@ Examples:
 		absolute path to all .yaml files in root.`)
 	validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
 	httpListenAddr    = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
-	datasourceURL     = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter. e.g. http://127.0.0.1:8428")
-	basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username to use for -datasource.url")
-	basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password to use for -datasource.url")
-	remoteWriteURL    = flag.String("remotewrite.url", "", "Optional URL to remote-write compatible storage where to write timeseries"+
-		"based on active alerts. E.g. http://127.0.0.1:8428")
-	evaluationInterval = flag.Duration("evaluationInterval", 1*time.Minute, "How often to evaluate the rules. Default 1m")
+
+	datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect URL. Required parameter."+
+		" E.g. http://127.0.0.1:8428")
+	basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
+	basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
+
+	remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
+		" in the form of timeseries. E.g. http://127.0.0.1:8428")
+	remoteWriteUsername = flag.String("remotewrite.basicAuth.username", "", "Optional basic auth username for -remotewrite.url")
+	remoteWritePassword = flag.String("remotewrite.basicAuth.password", "", "Optional basic auth password for -remotewrite.url")
+
+	remoteReadURL = flag.String("remoteread.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
+		" state. This configuration makes sense only if vmalert was configured with -remotewrite.url before and has successfully persisted its state."+
+		" E.g. http://127.0.0.1:8428")
http://127.0.0.1:8428") + remoteReadUsername = flag.String("remoteread.basicAuth.username", "", "Optional basic auth username for -remoteread.url") + remoteReadPassword = flag.String("remoteread.basicAuth.password", "", "Optional basic auth password for -remoteread.url") + remoteReadLookBack = flag.Duration("remoteread.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+ + " For example, if lookback=1h then range from now() to now()-1h will be scanned.") + + evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m") notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093") externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier") ) // TODO: hot configuration reload -// TODO: alerts state persistence func main() { envflag.Parse() buildinfo.Init() @@ -73,6 +86,8 @@ func main() { c, err := remotewrite.NewClient(ctx, remotewrite.Config{ Addr: *remoteWriteURL, FlushInterval: *evaluationInterval, + BasicAuthUser: *remoteWriteUsername, + BasicAuthPass: *remoteWritePassword, }) if err != nil { logger.Fatalf("failed to init remotewrite client: %s", err) @@ -80,13 +95,24 @@ func main() { w.rw = c } + var restoreDS *datasource.VMStorage + if *remoteReadURL != "" { + restoreDS = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{}) + } + wg := sync.WaitGroup{} - for i := range groups { + for _, g := range groups { + if restoreDS != nil { + err := g.Restore(ctx, restoreDS, *remoteReadLookBack) + if err != nil { + logger.Errorf("error while restoring state for group %q: %s", g.Name, err) + } + } wg.Add(1) go func(group Group) { w.run(ctx, group, *evaluationInterval) wg.Done() - }(groups[i]) + }(g) } go httpserver.Serve(*httpListenAddr, (&requestHandler{groups: groups}).handler) diff --git a/app/vmalert/rule.go b/app/vmalert/rule.go index 9d40a4442..c8bfb79c1 100644 --- a/app/vmalert/rule.go +++ b/app/vmalert/rule.go @@ -12,6 +12,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/metricsql" ) @@ -22,6 +23,19 @@ type Group struct { Rules []*Rule } +// Restore restores alerts state for all group rules with For > 0 +func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error { + for _, rule := range g.Rules { + if rule.For == 0 { + return nil + } + if err := rule.Restore(ctx, q, lookback); err != nil { + return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err) + } + } + return nil +} + // Rule is basic alert entity type Rule struct { Name string `yaml:"alert"` @@ -30,7 +44,7 @@ type Rule struct { Labels map[string]string `yaml:"labels"` Annotations map[string]string `yaml:"annotations"` - group *Group + group Group // guard status fields mu sync.RWMutex @@ -83,7 +97,9 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error { for _, m := range qMetrics { h := hash(m) updated[h] = struct{}{} - if _, ok := r.alerts[h]; ok { + if a, ok := r.alerts[h]; ok { + // update Value field with latest value + a.Value = m.Value continue } a, err := r.newAlert(m) @@ -125,6 +141,10 @@ func hash(m datasource.Metric) uint64 { return labels[i].Name < 
 		return labels[i].Name < labels[j].Name
 	})
 	for _, l := range labels {
+		// drop __name__ to be consistent with Prometheus alerting
+		if l.Name == "__name__" {
+			continue
+		}
 		hash.Write([]byte(l.Name))
 		hash.Write([]byte(l.Value))
 		hash.Write([]byte("\xff"))
@@ -144,6 +164,10 @@ func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
 
 	// 1. use data labels
 	for _, l := range m.Labels {
+		// drop __name__ to be consistent with Prometheus alerting
+		if l.Name == "__name__" {
+			continue
+		}
 		a.Labels[l.Name] = l.Value
 	}
 
@@ -194,7 +218,8 @@ func (r *Rule) AlertsAPI() []*APIAlert {
 
 func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
 	return &APIAlert{
-		ID:         a.ID,
+		// encode as string to avoid rounding
+		ID:         fmt.Sprintf("%d", a.ID),
 		Name:       a.Name,
 		Group:      a.Group,
 		Expression: r.Expr,
@@ -239,6 +264,8 @@ func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
 	return newTimeSeries(1, labels, timestamp)
 }
 
+// alertForToTimeSeries returns a timeseries that represents the
+// state of an active alert, where the value is the time when the alert became active
 func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
 	labels := make(map[string]string)
 	for k, v := range a.Labels {
@@ -268,3 +295,46 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
 	}
 	return ts
 }
+
+// Restore restores the state of active alerts based on previously written timeseries.
+// Restore restores only the Start field. The State field is always set to Pending and is
+// supposed to be updated on the next evaluation, as well as the Value field.
+func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
+	// Get the last datapoint in the range via MetricsQL `last_over_time`.
+	// We don't use plain PromQL since Prometheus doesn't support
+	// the remote write protocol which is used for state persistence in vmalert.
+	expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
+		alertForStateMetricName, r.Name, int(lookback.Seconds()))
+	qMetrics, err := q.Query(ctx, expr)
+	if err != nil {
+		return err
+	}
+
+	for _, m := range qMetrics {
+		labels := m.Labels
+		m.Labels = make([]datasource.Label, 0)
+		// drop all extra labels, so the hash key will
+		// be identical to the timeseries received during evaluation
+		for _, l := range labels {
+			if l.Name == alertNameLabel {
+				continue
+			}
+			// drop all overridden labels
+			if _, ok := r.Labels[l.Name]; ok {
+				continue
+			}
+			m.Labels = append(m.Labels, l)
+		}
+
+		a, err := r.newAlert(m)
+		if err != nil {
+			return fmt.Errorf("failed to create alert: %s", err)
+		}
+		a.ID = hash(m)
+		a.State = notifier.StatePending
+		a.Start = time.Unix(int64(m.Value), 0)
+		r.alerts[a.ID] = a
+		logger.Infof("alert %q.%q restored to state at %v", a.Group, a.Name, a.Start)
+	}
+	return nil
+}
diff --git a/app/vmalert/rule_test.go b/app/vmalert/rule_test.go
index f3ccded06..db3915dd9 100644
--- a/app/vmalert/rule_test.go
+++ b/app/vmalert/rule_test.go
@@ -157,53 +157,62 @@ func TestRule_Exec(t *testing.T) {
 			map[uint64]*notifier.Alert{},
 		},
 		{
-			newTestRule("single-firing", 0),
+			newTestRule("empty labels", 0),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
+				{datasource.Metric{}},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring},
+				hash(datasource.Metric{}): {State: notifier.StateFiring},
+			},
+		},
+		{
+			newTestRule("single-firing", 0),
+			[][]datasource.Metric{
+				{metricWithLabels(t, "name", "foo")},
+			},
+			map[uint64]*notifier.Alert{
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
 			},
 		},
 		{
 			newTestRule("single-firing=>inactive", 0),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				{},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateInactive},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
 			},
 		},
 		{
 			newTestRule("single-firing=>inactive=>firing", 0),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				{},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
 			},
 		},
 		{
 			newTestRule("single-firing=>inactive=>firing=>inactive", 0),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				{},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				{},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateInactive},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
 			},
 		},
 		{
 			newTestRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				{},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
"name", "foo")}, {}, {}, - {metricWithLabels(t, "__name__", "foo")}, + {metricWithLabels(t, "name", "foo")}, }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, }, }, { newTestRule("multiple-firing", 0), [][]datasource.Metric{ { - metricWithLabels(t, "__name__", "foo"), - metricWithLabels(t, "__name__", "foo1"), - metricWithLabels(t, "__name__", "foo2"), + metricWithLabels(t, "name", "foo"), + metricWithLabels(t, "name", "foo1"), + metricWithLabels(t, "name", "foo2"), }, }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring}, - hash(metricWithLabels(t, "__name__", "foo1")): {State: notifier.StateFiring}, - hash(metricWithLabels(t, "__name__", "foo2")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring}, }, }, { newTestRule("multiple-steps-firing", 0), [][]datasource.Metric{ - {metricWithLabels(t, "__name__", "foo")}, - {metricWithLabels(t, "__name__", "foo1")}, - {metricWithLabels(t, "__name__", "foo2")}, + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo1")}, + {metricWithLabels(t, "name", "foo2")}, }, // 1: fire first alert // 2: fire second alert, set first inactive // 3: fire third alert, set second inactive, delete first one map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo1")): {State: notifier.StateInactive}, - hash(metricWithLabels(t, "__name__", "foo2")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateInactive}, + hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring}, }, }, { @@ -258,93 +267,93 @@ func TestRule_Exec(t *testing.T) { [][]datasource.Metric{ { // metrics with the same labelset should result in one alert - metricWithLabels(t, "__name__", "foo", "type", "bar"), - metricWithLabels(t, "type", "bar", "__name__", "foo"), + metricWithLabels(t, "name", "foo", "type", "bar"), + metricWithLabels(t, "type", "bar", "name", "foo"), }, }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo", "type", "bar")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo", "type", "bar")): {State: notifier.StateFiring}, }, }, { newTestRule("for-pending", time.Minute), [][]datasource.Metric{ - {metricWithLabels(t, "__name__", "foo")}, + {metricWithLabels(t, "name", "foo")}, }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StatePending}, + hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending}, }, }, { newTestRule("for-fired", time.Millisecond), [][]datasource.Metric{ - {metricWithLabels(t, "__name__", "foo")}, - {metricWithLabels(t, "__name__", "foo")}, + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring}, + hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, }, }, { newTestRule("for-pending=>inactive", time.Millisecond), [][]datasource.Metric{ - {metricWithLabels(t, "__name__", "foo")}, - {metricWithLabels(t, "__name__", "foo")}, + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, // empty step to reset 
 				{},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateInactive},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
 			},
 		},
 		{
 			newTestRule("for-pending=>firing=>inactive", time.Millisecond),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				// empty step to reset pending alerts
 				{},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateInactive},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
 			},
 		},
 		{
 			newTestRule("for-pending=>firing=>inactive=>pending", time.Millisecond),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				// empty step to reset pending alerts
 				{},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StatePending},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
 			},
 		},
 		{
 			newTestRule("for-pending=>firing=>inactive=>pending=>firing", time.Millisecond),
 			[][]datasource.Metric{
-				{metricWithLabels(t, "__name__", "foo")},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 				// empty step to reset pending alerts
 				{},
-				{metricWithLabels(t, "__name__", "foo")},
-				{metricWithLabels(t, "__name__", "foo")},
+				{metricWithLabels(t, "name", "foo")},
+				{metricWithLabels(t, "name", "foo")},
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "__name__", "foo")): {State: notifier.StateFiring},
+				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
 			},
 		},
 	}
-	fakeGroup := &Group{Name: "TestRule_Exec"}
+	fakeGroup := Group{Name: "TestRule_Exec"}
 	for _, tc := range testCases {
 		t.Run(tc.rule.Name, func(t *testing.T) {
 			fq := &fakeQuerier{}
 			tc.rule.group = fakeGroup
 			for _, step := range tc.steps {
 				fq.reset()
-				fq.add(t, step...)
+				fq.add(step...)
 				if err := tc.rule.Exec(context.TODO(), fq); err != nil {
 					t.Fatalf("unexpected err: %s", err)
 				}
@@ -390,10 +399,137 @@ func (fq *fakeQuerier) reset() {
 	fq.metrics = fq.metrics[:0]
 }
 
-func (fq *fakeQuerier) add(t *testing.T, metrics ...datasource.Metric) {
+func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
 	fq.metrics = append(fq.metrics, metrics...)
 }
 
 func (fq fakeQuerier) Query(ctx context.Context, query string) ([]datasource.Metric, error) {
 	return fq.metrics, nil
 }
+
+func TestRule_Restore(t *testing.T) {
+	testCases := []struct {
+		rule      *Rule
+		metrics   []datasource.Metric
+		expAlerts map[uint64]*notifier.Alert
+	}{
+		{
+			newTestRuleWithLabels("no extra labels"),
+			[]datasource.Metric{
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					alertNameLabel, "",
+				),
+			},
+			map[uint64]*notifier.Alert{
+				hash(datasource.Metric{}): {State: notifier.StatePending,
+					Start: time.Now().Truncate(time.Hour)},
+			},
+		},
+		{
+			newTestRuleWithLabels("metric labels"),
+			[]datasource.Metric{
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					alertNameLabel, "",
+					"foo", "bar",
+					"namespace", "baz",
+				),
+			},
+			map[uint64]*notifier.Alert{
+				hash(metricWithLabels(t,
+					"foo", "bar",
+					"namespace", "baz",
+				)): {State: notifier.StatePending,
+					Start: time.Now().Truncate(time.Hour)},
+			},
+		},
+		{
+			newTestRuleWithLabels("rule labels", "source", "vm"),
+			[]datasource.Metric{
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					alertNameLabel, "",
+					"foo", "bar",
+					"namespace", "baz",
+					// the following pair is supposed to be dropped
+					"source", "vm",
+				),
+			},
+			map[uint64]*notifier.Alert{
+				hash(metricWithLabels(t,
+					"foo", "bar",
+					"namespace", "baz",
+				)): {State: notifier.StatePending,
+					Start: time.Now().Truncate(time.Hour)},
+			},
+		},
+		{
+			newTestRuleWithLabels("multiple alerts"),
+			[]datasource.Metric{
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					"host", "localhost-1",
+				),
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(2*time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					"host", "localhost-2",
+				),
+				metricWithValueAndLabels(t, float64(time.Now().Truncate(3*time.Hour).Unix()),
+					"__name__", alertForStateMetricName,
+					"host", "localhost-3",
+				),
+			},
+			map[uint64]*notifier.Alert{
+				hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
+					Start: time.Now().Truncate(time.Hour)},
+				hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
+					Start: time.Now().Truncate(2 * time.Hour)},
+				hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
+					Start: time.Now().Truncate(3 * time.Hour)},
+			},
+		},
+	}
+	fakeGroup := Group{Name: "TestRule_Exec"}
+	for _, tc := range testCases {
+		t.Run(tc.rule.Name, func(t *testing.T) {
+			fq := &fakeQuerier{}
+			tc.rule.group = fakeGroup
+			fq.add(tc.metrics...)
+			if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
+				t.Fatalf("unexpected err: %s", err)
+			}
+			if len(tc.rule.alerts) != len(tc.expAlerts) {
+				t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
+			}
+			for key, exp := range tc.expAlerts {
+				got, ok := tc.rule.alerts[key]
+				if !ok {
+					t.Fatalf("expected to have key %d", key)
+				}
+				if got.State != exp.State {
+					t.Fatalf("expected state %d; got %d", exp.State, got.State)
+				}
+				if got.Start != exp.Start {
+					t.Fatalf("expected Start %v; got %v", exp.Start, got.Start)
+				}
+			}
+		})
+	}
+}
+
+func newTestRuleWithLabels(name string, labels ...string) *Rule {
+	r := newTestRule(name, 0)
+	r.Labels = make(map[string]string)
+	for i := 0; i < len(labels); i += 2 {
+		r.Labels[labels[i]] = labels[i+1]
+	}
+	return r
+}
+
+func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
+	t.Helper()
+	m := metricWithLabels(t, labels...)
+	m.Value = value
+	return m
+}
diff --git a/app/vmalert/testdata/rules0-good.rules b/app/vmalert/testdata/rules0-good.rules
index ddb55a1d8..7d7df09b1 100644
--- a/app/vmalert/testdata/rules0-good.rules
+++ b/app/vmalert/testdata/rules0-good.rules
@@ -6,7 +6,7 @@ groups:
         expr: vm_rows > 0
         labels:
           label: bar
-          template: "{{ $value|humanize }}"
+          host: "{{ $labels.instance }}"
         annotations:
           summary: "{{ $value|humanize }}"
           description: "{{$labels}}"
diff --git a/app/vmalert/web.go b/app/vmalert/web.go
index 8269a35ba..732e0f612 100644
--- a/app/vmalert/web.go
+++ b/app/vmalert/web.go
@@ -14,7 +14,7 @@ import (
 
 // APIAlert has info for an alert.
 type APIAlert struct {
-	ID         uint64 `json:"id"`
+	ID         string `json:"id"`
 	Name       string `json:"name"`
 	Group      string `json:"group"`
 	Expression string `json:"expression"`
@@ -75,7 +75,7 @@ func (rh *requestHandler) list() ([]byte, error) {
 
 	// sort list of alerts for deterministic output
 	sort.Slice(lr.Data.Alerts, func(i, j int) bool {
-		return lr.Data.Alerts[i].Name < lr.Data.Alerts[j].Name
+		return lr.Data.Alerts[i].ID < lr.Data.Alerts[j].ID
 	})
 
 	b, err := json.Marshal(lr)
@@ -109,8 +109,8 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
 		if g.Name != group {
 			continue
 		}
-		for i := range g.Rules {
-			if apiAlert := g.Rules[i].AlertAPI(id); apiAlert != nil {
+		for _, rule := range g.Rules {
+			if apiAlert := rule.AlertAPI(id); apiAlert != nil {
 				return json.Marshal(apiAlert)
 			}
 		}