vmalert: reduce the number of restore queries issued for each alerting rule (#5265)

Reduce the number of queries for restoring alert state on start-up. The change should speed up the restore process and reduce pressure on `-remoteRead.url`. (cherry picked from commit 90d45574bf)
2025-01-30 15:22:07 +00:00 · 2023-11-02 22:22:13 +08:00 · 2023-11-02 22:22:13 +08:00 · 44fcdf0cf0
commit 44fcdf0cf0
parent 7fc5178a4b
4 changed files with 37 additions and 37 deletions
--- a/app/vmalert/main.go
+++ b/app/vmalert/main.go
@ -230,7 +230,9 @@ func newManager(ctx context.Context) (*manager, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
 	}
-	manager.rw = rw
+	if rw != nil {
+		manager.rw = rw
+	}

 	rr, err := remoteread.Init()
 	if err != nil {
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@ -614,44 +614,41 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti
 		return nil
 	}

-	for _, a := range ar.alerts {
+	nameStr := fmt.Sprintf("%s=%q", alertNameLabel, ar.Name)
+	if !*disableAlertGroupLabel {
+		nameStr = fmt.Sprintf("%s=%q,%s=%q", alertGroupNameLabel, ar.GroupName, alertNameLabel, ar.Name)
+	}
+	var labelsFilter string
+	for k, v := range ar.Labels {
+		labelsFilter += fmt.Sprintf(",%s=%q", k, v)
+	}
+	expr := fmt.Sprintf("last_over_time(%s{%s%s}[%ds])",
+		alertForStateMetricName, nameStr, labelsFilter, int(lookback.Seconds()))
+
+	res, _, err := q.Query(ctx, expr, ts)
+	if err != nil {
+		return fmt.Errorf("failed to execute restore query %q: %w ", expr, err)
+	}
+
+	if len(res.Data) < 1 {
+		ar.logDebugf(ts, nil, "no response was received from restore query")
+		return nil
+	}
+	for _, series := range res.Data {
+		series.DelLabel("__name__")
+		labelSet := make(map[string]string, len(series.Labels))
+		for _, v := range series.Labels {
+			labelSet[v.Name] = v.Value
+		}
+		id := hash(labelSet)
+		a, ok := ar.alerts[id]
+		if !ok {
+			continue
+		}
 		if a.Restored || a.State != notifier.StatePending {
 			continue
 		}
-
-		var labelsFilter []string
-		for k, v := range a.Labels {
-			labelsFilter = append(labelsFilter, fmt.Sprintf("%s=%q", k, v))
-		}
-		sort.Strings(labelsFilter)
-		expr := fmt.Sprintf("last_over_time(%s{%s}[%ds])",
-			alertForStateMetricName, strings.Join(labelsFilter, ","), int(lookback.Seconds()))
-
-		ar.logDebugf(ts, nil, "restoring alert state via query %q", expr)
-
-		res, _, err := q.Query(ctx, expr, ts)
-		if err != nil {
-			return err
-		}
-
-		qMetrics := res.Data
-		if len(qMetrics) < 1 {
-			ar.logDebugf(ts, nil, "no response was received from restore query")
-			continue
-		}
-
-		// only one series expected in response
-		m := qMetrics[0]
-		// __name__ supposed to be alertForStateMetricName
-		m.DelLabel("__name__")
-
-		// we assume that restore query contains all label matchers,
-		// so all received labels will match anyway if their number is equal.
-		if len(m.Labels) != len(a.Labels) {
-			ar.logDebugf(ts, nil, "state restore query returned not expected label-set %v", m.Labels)
-			continue
-		}
-		a.ActiveAt = time.Unix(int64(m.Values[0]), 0)
+		a.ActiveAt = time.Unix(int64(series.Values[0]), 0)
 		a.Restored = true
 		logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
 	}
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@ -675,7 +675,7 @@ func TestGroup_Restore(t *testing.T) {
 	// two rules, two active alerts, one with state restored
 	ts = time.Now().Truncate(time.Hour)
 	fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
-		stateMetric("foo", ts))
+		stateMetric("bar", ts))
 	fn(
 		[]config.Rule{
 			{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -50,6 +50,7 @@ The sandbox cluster installation is running under the constant load generated by
  See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `-rule.evalDelay` flag and `eval_delay` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups). The new flag and param can be used to adjust the `time` parameter for rule evaluation requests to match [intentional query delay](https://docs.victoriametrics.com/keyConcepts.html#query-latency) from the datasource. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5155).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow specifying full url in notifier static_configs target address, like `http://alertmanager:9093/test/api/v2/alerts`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5184).
+* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): reduce the number of queries for restoring alert state on start-up. The change should speed up the restore process and reduce pressure on `-remoteRead.url`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5265).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.shardByURL.labels` command-line flag, which can be used for specifying a list of labels for sharding outgoing samples among the configured `-remoteWrite.url` destinations if `-remoteWrite.shardByURL` command-line flag is set. See [these docs](https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942) for details.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not exit on startup when [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) refer to non-existing or invalid files with auth configs, since these files may appear / updated later. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).