vmalert: fix labels and annotations processing for alerts (#2403)

To improve compatibility with Prometheus alerting, the order of
template processing has changed.
Previously, vmalert did all label processing beforehand. This meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating, and all collisions were resolved in
favour of the extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.

For example, consider an alerting rule triggered by time series
with an `alertname` label. Previously, vmalert would override this label
with the alerting rule's name everywhere: in alert labels, in annotations, etc.
In Prometheus, it is overridden in the alert's labels only, while the
original label value remains available in annotations.
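
To make the new behaviour concrete, here is a minimal, self-contained Go
sketch (illustrative only; the rule name `MyRule` and the label values are
made up) of the origin/processed label split introduced in the diff below:

package main

import "fmt"

// labelSet mirrors the split introduced by this commit: origin labels come
// from the series and are used for templating, while processed labels also
// carry rule metadata and become the alert's labels.
type labelSet struct {
	origin    map[string]string
	processed map[string]string
}

func main() {
	// a series that already carries an `alertname` label
	series := map[string]string{"alertname": "original", "instance": "foo"}

	ls := labelSet{
		origin:    make(map[string]string, len(series)),
		processed: make(map[string]string, len(series)),
	}
	for k, v := range series {
		ls.origin[k] = v
		ls.processed[k] = v
	}
	// the rule's own name overrides alertname in processed labels only
	ls.processed["alertname"] = "MyRule"

	fmt.Println(ls.origin["alertname"])    // "original": what annotations see
	fmt.Println(ls.processed["alertname"]) // "MyRule": what identifies the alert
}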

See more details at https://github.com/prometheus/compliance/issues/80

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Author: Roman Khavronenko, 2022-04-06 20:24:45 +02:00 (committed by Aliaksandr Valialkin)
commit 4de1b2b74a (parent cb319b15bb)
7 changed files with 142 additions and 138 deletions

@@ -141,6 +141,53 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
type labelSet struct {
// origin labels from series
// used for templating
origin map[string]string
// processed labels with additional data
// used as Alert labels
processed map[string]string
}
// toLabels converts labels from given Metric
// to labelSet which contains original and processed labels.
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn notifier.QueryFn) (*labelSet, error) {
ls := &labelSet{
origin: make(map[string]string, len(m.Labels)),
processed: make(map[string]string),
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
ls.origin[l.Name] = l.Value
ls.processed[l.Name] = l.Value
}
extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{
Labels: ls.origin,
Value: m.Values[0],
Expr: ar.Expr,
})
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
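// collisions are resolved in favour of extra labels; origin keeps the
// series' values, so templates still see the original labels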
for k, v := range extraLabels {
ls.processed[k] = v
}
// set additional labels to identify group and rule name
if ar.Name != "" {
ls.processed[alertNameLabel] = ar.Name
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
ls.processed[alertGroupNameLabel] = ar.GroupName
}
return ls, nil
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and is meant to be used just
// to get time series for backfilling.
@@ -155,24 +202,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range series {
// set additional labels to identify group and rule Name
if ar.Name != "" {
s.SetLabel(alertNameLabel, ar.Name)
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
s.SetLabel(alertGroupNameLabel, ar.GroupName)
}
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(s, qFn, ar)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
s.SetLabel(k, v)
}
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %s", err)
}
@@ -234,28 +264,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
// set additional labels to identify group and rule name
if ar.Name != "" {
m.SetLabel(alertNameLabel, ar.Name)
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
m.SetLabel(alertGroupNameLabel, ar.GroupName)
}
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(m, qFn, ar)
ls, err := ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
m.SetLabel(k, v)
}
h := hash(m)
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
ar.lastExecError = fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, ar.lastExecError
}
updated[h] = struct{}{}
@@ -272,14 +289,14 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
a.Value = m.Values[0]
// and re-exec template since Value can be used
// in annotations
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
if err != nil {
return nil, err
}
}
continue
}
a, err := ar.newAlert(m, ar.lastExecTime, qFn)
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
if err != nil {
ar.lastExecError = err
return nil, fmt.Errorf("failed to create alert: %w", err)
@@ -315,19 +332,6 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
return ar.toTimeSeries(ts.Unix()), nil
}
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
metricLabels := make(map[string]string)
for _, l := range m.Labels {
metricLabels[l.Name] = l.Value
}
tpl := notifier.AlertTplData{
Labels: metricLabels,
Value: m.Values[0],
Expr: ar.Expr,
}
return notifier.ExecTemplate(q, ar.Labels, tpl)
}
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts {
@@ -358,42 +362,43 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
}
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
func hash(labels map[string]string) uint64 {
hash := fnv.New64a()
labels := m.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
keys := make([]string, 0, len(labels))
for k := range labels {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
if k == "__name__" {
continue
}
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
name, value := k, labels[k]
hash.Write([]byte(name))
hash.Write([]byte(value))
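// a separator between pairs keeps label boundaries unambiguous in the hash input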
hash.Write([]byte("\xff"))
}
return hash.Sum64()
}
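
Since Go randomizes map iteration order, sorting the keys above is what
keeps this hash deterministic. A quick illustration (hypothetical labels,
reusing the hash function above):

// equal label sets hash equally regardless of map ordering
h1 := hash(map[string]string{"instance": "foo", "job": "vm"})
h2 := hash(map[string]string{"job": "vm", "instance": "foo"})
fmt.Println(h1 == h2) // true
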
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
var err error
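// ls may be nil when the caller has no pre-computed label set
// (e.g. ExecRange); compute it from the metric in that case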
if ls == nil {
ls, err = ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
}
a := &notifier.Alert{
GroupID: ar.GroupID,
Name: ar.Name,
Labels: map[string]string{},
Labels: ls.processed,
Value: m.Values[0],
ActiveAt: start,
Expr: ar.Expr,
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
a.Labels[l.Name] = l.Value
}
var err error
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
return a, err
}
@@ -560,15 +565,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
}
for _, m := range qMetrics {
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
ls := &labelSet{
origin: make(map[string]string, len(m.Labels)),
processed: make(map[string]string, len(m.Labels)),
}
for _, l := range m.Labels {
if l.Name == "__name__" {
continue
}
ls.origin[l.Name] = l.Value
ls.processed[l.Name] = l.Value
}
a, err := ar.newAlert(m, ls, time.Unix(int64(m.Values[0]), 0), qFn)
if err != nil {
return fmt.Errorf("failed to create alert: %w", err)
}
a.ID = hash(m)
a.ID = hash(ls.processed)
a.State = notifier.StatePending
a.Restored = true
ar.alerts[a.ID] = a
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
}
return nil
}

@@ -315,10 +315,13 @@ func TestAlertingRule_Exec(t *testing.T) {
}
expAlerts := make(map[uint64]*notifier.Alert)
for _, ta := range tc.expAlerts {
labels := ta.labels
labels = append(labels, alertNameLabel)
labels = append(labels, tc.rule.Name)
h := hash(metricWithLabels(t, labels...))
labels := make(map[string]string)
for i := 0; i < len(ta.labels); i += 2 {
k, v := ta.labels[i], ta.labels[i+1]
labels[k] = v
}
labels[alertNameLabel] = tc.rule.Name
h := hash(labels)
expAlerts[h] = ta.alert
}
for key, exp := range expAlerts {
@@ -513,7 +516,7 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(datasource.Metric{}): {State: notifier.StatePending,
hash(nil): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -529,12 +532,12 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t,
alertNameLabel, "metric labels",
alertGroupNameLabel, "groupID",
"foo", "bar",
"namespace", "baz",
)): {State: notifier.StatePending,
hash(map[string]string{
alertNameLabel: "metric labels",
alertGroupNameLabel: "groupID",
"foo": "bar",
"namespace": "baz",
}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -550,11 +553,11 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t,
"foo", "bar",
"namespace", "baz",
"source", "vm",
)): {State: notifier.StatePending,
hash(map[string]string{
"foo": "bar",
"namespace": "baz",
"source": "vm",
}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -575,11 +578,11 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-1"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-2"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(2 * time.Hour)},
hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-3"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(3 * time.Hour)},
},
},
@@ -659,7 +662,7 @@ func TestAlertingRule_Template(t *testing.T) {
metricWithValueAndLabels(t, 1, "instance", "bar"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): {
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): {
Annotations: map[string]string{},
Labels: map[string]string{
alertNameLabel: "common",
@@ -667,7 +670,7 @@
"instance": "foo",
},
},
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): {
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): {
Annotations: map[string]string{},
Labels: map[string]string{
alertNameLabel: "common",
@@ -682,11 +685,10 @@
Name: "override label",
Labels: map[string]string{
"instance": "{{ $labels.instance }}",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`,
"description": `It is {{ $value }} connections for "{{ $labels.instance }}"`,
"summary": `Too high connection number for "{{ $labels.instance }}"`,
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
},
alerts: make(map[uint64]*notifier.Alert),
},
@@ -695,64 +697,58 @@
metricWithValueAndLabels(t, 10, "instance", "bar", alertNameLabel, "override"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): {
hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "foo",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "foo" for region east`,
"description": `It is 2 connections for "foo"`,
"summary": `Too high connection number for "foo"`,
"description": `override: It is 2 connections for "foo"`,
},
},
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): {
hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "bar",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "bar" for region east`,
"description": `It is 10 connections for "bar"`,
"summary": `Too high connection number for "bar"`,
"description": `override: It is 10 connections for "bar"`,
},
},
},
},
{
&AlertingRule{
Name: "ExtraTemplating",
Name: "OriginLabels",
GroupName: "Testing",
Labels: map[string]string{
"name": "alert_{{ $labels.alertname }}",
"group": "group_{{ $labels.alertgroup }}",
"instance": "{{ $labels.instance }}",
},
Annotations: map[string]string{
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
"description": `Alert "{{ $labels.name }}({{ $labels.group }})" for instance {{ $labels.instance }}`,
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
},
alerts: make(map[uint64]*notifier.Alert),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 1, "instance", "foo"),
metricWithValueAndLabels(t, 1,
alertNameLabel, "originAlertname",
alertGroupNameLabel, "originGroupname",
"instance", "foo"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "ExtraTemplating",
"name", "alert_ExtraTemplating",
alertGroupNameLabel, "Testing",
"group", "group_Testing",
"instance", "foo")): {
hash(map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "ExtraTemplating",
"name": "alert_ExtraTemplating",
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"group": "group_Testing",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `Alert "ExtraTemplating(Testing)" for instance foo`,
"description": `Alert "alert_ExtraTemplating(group_Testing)" for instance foo`,
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
},
},
},

@@ -174,7 +174,7 @@ func TestGroupStart(t *testing.T) {
m2 := metricWithLabels(t, "instance", inst2, "job", job)
r := g.Rules[0].(*AlertingRule)
alert1, err := r.newAlert(m1, time.Now(), nil)
alert1, err := r.newAlert(m1, nil, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
@@ -187,13 +187,9 @@
// add service labels
alert1.Labels[alertNameLabel] = alert1.Name
alert1.Labels[alertGroupNameLabel] = g.Name
var labels1 []string
for k, v := range alert1.Labels {
labels1 = append(labels1, k, v)
}
alert1.ID = hash(metricWithLabels(t, labels1...))
alert1.ID = hash(alert1.Labels)
alert2, err := r.newAlert(m2, time.Now(), nil)
alert2, err := r.newAlert(m2, nil, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
@@ -206,11 +202,7 @@
// add service labels
alert2.Labels[alertNameLabel] = alert2.Name
alert2.Labels[alertGroupNameLabel] = g.Name
var labels2 []string
for k, v := range alert2.Labels {
labels2 = append(labels2, k, v)
}
alert2.ID = hash(metricWithLabels(t, labels2...))
alert2.ID = hash(alert2.Labels)
finished := make(chan struct{})
fs.add(m1)

@@ -243,7 +243,7 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
"tpl": externalAlertSource,
}
return func(alert notifier.Alert) string {
templated, err := alert.ExecTemplate(nil, m)
templated, err := alert.ExecTemplate(nil, nil, m)
if err != nil {
logger.Errorf("can not exec source template %s", err)
}

@@ -37,7 +37,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
g, ok := m.groups[gID]
if !ok {
return nil, fmt.Errorf("can't find group with id %q", gID)
return nil, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
@@ -48,7 +48,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
return apiAlert, nil
}
}
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
}
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {

@@ -88,8 +88,8 @@ var tplHeaders = []string{
// map of annotations.
// Every alert could have a different datasource, so the function
// requires a queryFunction as an argument.
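// Labels are passed explicitly so that templates operate on the original
// series labels rather than the processed alert labels.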
func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
func (a *Alert) ExecTemplate(q QueryFn, labels, annotations map[string]string) (map[string]string, error) {
tplData := AlertTplData{Value: a.Value, Labels: labels, Expr: a.Expr}
return templateAnnotations(annotations, tplData, funcsWithQuery(q))
}

@@ -130,7 +130,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations)
tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations)
if err != nil {
t.Fatal(err)
}