vmalert: automatically add exported_ prefix for original evaluation… (#5398)

automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161 Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com>
2025-03-11 15:34:56 +00:00 · 2023-12-22 23:07:47 +08:00 · 2023-12-22 23:07:47 +08:00 · 1f477aba41
commit 1f477aba41
parent be20501376
6 changed files with 111 additions and 67 deletions
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@ -237,11 +237,30 @@ type labelSet struct {
 	origin map[string]string
 	// processed labels includes origin labels
 	// plus extra labels (group labels, service labels like alertNameLabel).
-	// in case of conflicts, extra labels are preferred.
+	// in case of key conflicts, origin labels are renamed with prefix `exported_` and extra labels are preferred.
+	// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161
 	// used as labels attached to notifier.Alert and ALERTS series written to remote storage.
 	processed map[string]string
 }

+// add adds a value v with key k to origin and processed label sets.
+// On k conflicts in processed set, the passed v is preferred.
+// On k conflicts in origin set, the original value is preferred and copied
+// to processed with `exported_%k` key. The copy happens only if passed v isn't equal to origin[k] value.
+func (ls *labelSet) add(k, v string) {
+	ls.processed[k] = v
+	ov, ok := ls.origin[k]
+	if !ok {
+		ls.origin[k] = v
+		return
+	}
+	if ov != v {
+		// copy value only if v and ov are different
+		key := fmt.Sprintf("exported_%s", k)
+		ls.processed[key] = ov
+	}
+}
+
 // toLabels converts labels from given Metric
 // to labelSet which contains original and processed labels.
 func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*labelSet, error) {
@ -267,24 +286,14 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
 		return nil, fmt.Errorf("failed to expand labels: %w", err)
 	}
 	for k, v := range extraLabels {
-		ls.processed[k] = v
-		if _, ok := ls.origin[k]; !ok {
-			ls.origin[k] = v
-		}
+		ls.add(k, v)
 	}
-
 	// set additional labels to identify group and rule name
 	if ar.Name != "" {
-		ls.processed[alertNameLabel] = ar.Name
-		if _, ok := ls.origin[alertNameLabel]; !ok {
-			ls.origin[alertNameLabel] = ar.Name
-		}
+		ls.add(alertNameLabel, ar.Name)
 	}
 	if !*disableAlertGroupLabel && ar.GroupName != "" {
-		ls.processed[alertGroupNameLabel] = ar.GroupName
-		if _, ok := ls.origin[alertGroupNameLabel]; !ok {
-			ls.origin[alertGroupNameLabel] = ar.GroupName
-		}
+		ls.add(alertGroupNameLabel, ar.GroupName)
 	}
 	return ls, nil
 }
@ -414,8 +423,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
 		}
 		h := hash(ls.processed)
 		if _, ok := updated[h]; ok {
-			// duplicate may be caused by extra labels
-			// conflicting with the metric labels
+			// duplicate may be caused the removal of `__name__` label
 			curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
 			return nil, curState.Err
 		}
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@ -768,14 +768,16 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
 	ar.q = fq

 	// successful attempt
+	// label `job` will be overridden by rule extra label, the original value will be reserved by "exported_job"
 	fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
+	fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
 	_, err := ar.exec(context.TODO(), time.Now(), 0)
 	if err != nil {
 		t.Fatal(err)
 	}

-	// label `job` will collide with rule extra label and will make both time series equal
-	fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
+	// label `__name__` will be omitted and get duplicated results here
+	fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo_1", "job", "bar"))
 	_, err = ar.exec(context.TODO(), time.Now(), 0)
 	if !errors.Is(err, errDuplicate) {
 		t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
@ -899,20 +901,22 @@ func TestAlertingRule_Template(t *testing.T) {
 				metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"),
 			},
 			map[uint64]*notifier.Alert{
-				hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
+				hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): {
 					Labels: map[string]string{
-						alertNameLabel: "override label",
-						"instance":     "foo",
+						alertNameLabel:       "override label",
+						"exported_alertname": "override",
+						"instance":           "foo",
 					},
 					Annotations: map[string]string{
 						"summary":     `first: Too high connection number for "foo"`,
 						"description": `override: It is 2 connections for "foo"`,
 					},
 				},
-				hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
+				hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "bar"}): {
 					Labels: map[string]string{
-						alertNameLabel: "override label",
-						"instance":     "bar",
+						alertNameLabel:       "override label",
+						"exported_alertname": "override",
+						"instance":           "bar",
 					},
 					Annotations: map[string]string{
 						"summary":     `second: Too high connection number for "bar"`,
@ -941,14 +945,18 @@ func TestAlertingRule_Template(t *testing.T) {
 			},
 			map[uint64]*notifier.Alert{
 				hash(map[string]string{
-					alertNameLabel:      "OriginLabels",
-					alertGroupNameLabel: "Testing",
-					"instance":          "foo",
+					alertNameLabel:        "OriginLabels",
+					"exported_alertname":  "originAlertname",
+					alertGroupNameLabel:   "Testing",
+					"exported_alertgroup": "originGroupname",
+					"instance":            "foo",
 				}): {
 					Labels: map[string]string{
-						alertNameLabel:      "OriginLabels",
-						alertGroupNameLabel: "Testing",
-						"instance":          "foo",
+						alertNameLabel:        "OriginLabels",
+						"exported_alertname":  "originAlertname",
+						alertGroupNameLabel:   "Testing",
+						"exported_alertgroup": "originGroupname",
+						"instance":            "foo",
 					},
 					Annotations: map[string]string{
 						"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
@ -1092,3 +1100,54 @@ func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.
 	rule.KeepFiringFor = keepFiringFor
 	return rule
 }
+
+func TestAlertingRule_ToLabels(t *testing.T) {
+	metric := datasource.Metric{
+		Labels: []datasource.Label{
+			{Name: "instance", Value: "0.0.0.0:8800"},
+			{Name: "group", Value: "vmalert"},
+			{Name: "alertname", Value: "ConfigurationReloadFailure"},
+		},
+		Values:     []float64{1},
+		Timestamps: []int64{time.Now().UnixNano()},
+	}
+
+	ar := &AlertingRule{
+		Labels: map[string]string{
+			"instance": "override", // this should override instance with new value
+			"group":    "vmalert",  // this shouldn't have effect since value in metric is equal
+		},
+		Expr:      "sum(vmalert_alerting_rules_error) by(instance, group, alertname) > 0",
+		Name:      "AlertingRulesError",
+		GroupName: "vmalert",
+	}
+
+	expectedOriginLabels := map[string]string{
+		"instance":   "0.0.0.0:8800",
+		"group":      "vmalert",
+		"alertname":  "ConfigurationReloadFailure",
+		"alertgroup": "vmalert",
+	}
+
+	expectedProcessedLabels := map[string]string{
+		"instance":           "override",
+		"exported_instance":  "0.0.0.0:8800",
+		"alertname":          "AlertingRulesError",
+		"exported_alertname": "ConfigurationReloadFailure",
+		"group":              "vmalert",
+		"alertgroup":         "vmalert",
+	}
+
+	ls, err := ar.toLabels(metric, nil)
+	if err != nil {
+		t.Fatalf("unexpected error: %s", err)
+	}
+
+	if !reflect.DeepEqual(ls.origin, expectedOriginLabels) {
+		t.Errorf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels)
+	}
+
+	if !reflect.DeepEqual(ls.processed, expectedProcessedLabels) {
+		t.Errorf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels)
+	}
+}
--- a/app/vmalert/rule/recording.go
+++ b/app/vmalert/rule/recording.go
@ -194,6 +194,9 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
 	labels["__name__"] = rr.Name
 	// override existing labels with configured ones
 	for k, v := range rr.Labels {
+		if _, ok := labels[k]; ok && labels[k] != v {
+			labels[fmt.Sprintf("exported_%s", k)] = labels[k]
+		}
 		labels[k] = v
 	}
 	return newTimeSeries(m.Values, m.Timestamps, labels)
@ -203,7 +206,7 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
 func (rr *RecordingRule) updateWith(r Rule) error {
 	nr, ok := r.(*RecordingRule)
 	if !ok {
-		return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
+		return fmt.Errorf("BUG: attempt to update recording rule with wrong type %#v", r)
 	}
 	rr.Expr = nr.Expr
 	rr.Labels = nr.Labels
--- a/app/vmalert/rule/recording_test.go
+++ b/app/vmalert/rule/recording_test.go
@ -61,7 +61,7 @@ func TestRecordingRule_Exec(t *testing.T) {
 			},
 			[]datasource.Metric{
 				metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
-				metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
+				metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar", "source", "origin"),
 			},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
@ -70,9 +70,10 @@ func TestRecordingRule_Exec(t *testing.T) {
 					"source":   "test",
 				}),
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
-					"__name__": "job:foo",
-					"job":      "bar",
-					"source":   "test",
+					"__name__":        "job:foo",
+					"job":             "bar",
+					"source":          "test",
+					"exported_source": "origin",
 				}),
 			},
 		},
@ -254,10 +255,7 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
 	fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))

 	_, err = rr.exec(context.TODO(), time.Now(), 0)
-	if err == nil {
-		t.Fatalf("expected to get err; got nil")
-	}
-	if !strings.Contains(err.Error(), errDuplicate.Error()) {
-		t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
+	if err != nil {
+		t.Fatal(err)
 	}
 }
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -44,6 +44,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped.
 * BUGFIX: `vmstorage`: added missing `-inmemoryDataFlushInterval` command-line flag, which was missing in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) after implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337) in [v1.85.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.0).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
 * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Before, retries happened only on writes into a network connection between source and destination. But errors returned by server after all the data was transmitted were logged, but not retried.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.
--- a/docs/vmalert.md
+++ b/docs/vmalert.md
@ -100,6 +100,8 @@ See the full list of configuration flags in [configuration](#configuration) sect

 If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
 to specify different `-external.label` command-line flags in order to define which `vmalert` generated rules or alerts.
+If rule result metrics have label that conflict with `-external.label`, `vmalert` will automatically rename
+it with prefix `exported_`.

 Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
 and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
@ -896,33 +898,6 @@ max(vmalert_alerting_rules_last_evaluation_series_fetched) by(group, alertname)
 See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
 This feature is available only if vmalert is using VictoriaMetrics v1.90 or higher as a datasource.

-### Series with the same labelset
-
-vmalert can produce the following error message during rules evaluation:
-```
-result contains metrics with the same labelset after applying rule labels
-```
-
-The error means there is a collision between [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
-after applying extra labels to result.
-
-For example, a rule with `expr: foo > 0` returns two distinct time series in response:
-```
-foo{bar="baz"} 1
-foo{bar="qux"} 2
-```
-
-If user configures `-external.label=bar=baz` cmd-line flag to enforce
-adding `bar="baz"` label-value pair, then time series won't be distinct anymore:
-```
-foo{bar="baz"} 1
-foo{bar="baz"} 2 # 'bar' label was overriden by `-external.label=bar=baz
-```
-
-The same issue can be caused by collision of configured `labels` on [Group](#groups) or [Rule](#rules) levels.
-To fix it one should avoid collisions by carefully picking label overrides in configuration.
-
-
 ## Security

 See general recommendations regarding security [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#security).