From 44fcdf0cf0f10bca0c6219e536ecf0de74f1623a Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 2 Nov 2023 22:22:13 +0800 Subject: [PATCH] vmalert: reduce restore query request for each alerting rule (#5265) reduce the number of queries for restoring alerts state on start-up. The change should speed up the restore process and reduce pressure on `remoteRead.url`. (cherry picked from commit 90d45574bf7cf63ae05ea5e4868c13866d225e4f) --- app/vmalert/main.go | 4 +- app/vmalert/rule/alerting.go | 67 +++++++++++++++---------------- app/vmalert/rule/alerting_test.go | 2 +- docs/CHANGELOG.md | 1 + 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/app/vmalert/main.go b/app/vmalert/main.go index bed190eff..697d35987 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -230,7 +230,9 @@ func newManager(ctx context.Context) (*manager, error) { if err != nil { return nil, fmt.Errorf("failed to init remoteWrite: %w", err) } - manager.rw = rw + if rw != nil { + manager.rw = rw + } rr, err := remoteread.Init() if err != nil { diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go index 2139a6f56..614755773 100644 --- a/app/vmalert/rule/alerting.go +++ b/app/vmalert/rule/alerting.go @@ -614,44 +614,41 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti return nil } - for _, a := range ar.alerts { + nameStr := fmt.Sprintf("%s=%q", alertNameLabel, ar.Name) + if !*disableAlertGroupLabel { + nameStr = fmt.Sprintf("%s=%q,%s=%q", alertGroupNameLabel, ar.GroupName, alertNameLabel, ar.Name) + } + var labelsFilter string + for k, v := range ar.Labels { + labelsFilter += fmt.Sprintf(",%s=%q", k, v) + } + expr := fmt.Sprintf("last_over_time(%s{%s%s}[%ds])", + alertForStateMetricName, nameStr, labelsFilter, int(lookback.Seconds())) + + res, _, err := q.Query(ctx, expr, ts) + if err != nil { + return fmt.Errorf("failed to execute restore query %q: %w ", expr, err) + } + + if len(res.Data) < 1 { + ar.logDebugf(ts, nil, "no response was received from restore query") + return nil + } + for _, series := range res.Data { + series.DelLabel("__name__") + labelSet := make(map[string]string, len(series.Labels)) + for _, v := range series.Labels { + labelSet[v.Name] = v.Value + } + id := hash(labelSet) + a, ok := ar.alerts[id] + if !ok { + continue + } if a.Restored || a.State != notifier.StatePending { continue } - - var labelsFilter []string - for k, v := range a.Labels { - labelsFilter = append(labelsFilter, fmt.Sprintf("%s=%q", k, v)) - } - sort.Strings(labelsFilter) - expr := fmt.Sprintf("last_over_time(%s{%s}[%ds])", - alertForStateMetricName, strings.Join(labelsFilter, ","), int(lookback.Seconds())) - - ar.logDebugf(ts, nil, "restoring alert state via query %q", expr) - - res, _, err := q.Query(ctx, expr, ts) - if err != nil { - return err - } - - qMetrics := res.Data - if len(qMetrics) < 1 { - ar.logDebugf(ts, nil, "no response was received from restore query") - continue - } - - // only one series expected in response - m := qMetrics[0] - // __name__ supposed to be alertForStateMetricName - m.DelLabel("__name__") - - // we assume that restore query contains all label matchers, - // so all received labels will match anyway if their number is equal. - if len(m.Labels) != len(a.Labels) { - ar.logDebugf(ts, nil, "state restore query returned not expected label-set %v", m.Labels) - continue - } - a.ActiveAt = time.Unix(int64(m.Values[0]), 0) + a.ActiveAt = time.Unix(int64(series.Values[0]), 0) a.Restored = true logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt) } diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go index 759ffb3fe..c220674e9 100644 --- a/app/vmalert/rule/alerting_test.go +++ b/app/vmalert/rule/alerting_test.go @@ -675,7 +675,7 @@ func TestGroup_Restore(t *testing.T) { // two rules, two active alerts, one with state restored ts = time.Now().Truncate(time.Hour) fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, - stateMetric("foo", ts)) + stateMetric("bar", ts)) fn( []config.Rule{ {Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}, diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index e7148f785..a52b290b3 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -50,6 +50,7 @@ The sandbox cluster installation is running under the constant load generated by See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `-rule.evalDelay` flag and `eval_delay` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups). The new flag and param can be used to adjust the `time` parameter for rule evaluation requests to match [intentional query delay](https://docs.victoriametrics.com/keyConcepts.html#query-latency) from the datasource. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow specifying full url in notifier static_configs target address, like `http://alertmanager:9093/test/api/v2/alerts`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5184). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): reduce the number of queries for restoring alerts state on start-up. The change should speed up the restore process and reduce pressure on `remoteRead.url`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5265). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.shardByURL.labels` command-line flag, which can be used for specifying a list of labels for sharding outgoing samples among the configured `-remoteWrite.url` destinations if `-remoteWrite.shardByURL` command-line flag is set. See [these docs](https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942) for details. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not exit on startup when [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) refer to non-existing or invalid files with auth configs, since these files may appear / updated later. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).