vmalert: explicitly set extra labels to alert entities (#886)
The previous implementation treated extra labels (global and rule labels) as a label set separate from the labels of the returned time series. Hence, a time series always contained only its original labels, and the alert ID was generated from the sorted label key-value pairs. Extra labels did not affect the generated ID and were applied only for the following actions:
- templating for Summary and Annotations;
- persisting state via remote write;
- restoring state via remote read.

Such behaviour caused difficulties during the restore procedure, because extra labels had to be dropped before checking the alert ID, and that did not always work. Consider the case when the expression returns the time series `up{job="foo"}` and the rule has the extra label `job=bar`. The restored alert ID would then always differ from the ID computed for the real time series because of the collision.

To solve this, extra labels are now always applied beforehand and `vmalert` no longer stores the original labels. However, this may lead to a new error situation. Consider the case when the expression returns two time series `up{job="foo"}` and `up{job="baz"}`, while the rule has the extra label `job=bar`. Applying the extra label turns them into two identical time series, and `vmalert` returns the error: `result contains metrics with the same labelset after applying rule labels`.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
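A minimal, self-contained sketch of this failure mode (the simplified metric type, the hash scheme, and the helper names below are illustrative only, not vmalert's actual code): applying the rule label job=bar to both up{job="foo"} and up{job="baz"} produces two identical label sets, so deduplication by a hash of the sorted labels has to report the error above.

package main

import (
    "fmt"
    "hash/fnv"
    "sort"
)

// metric is a simplified stand-in for a time series returned by the datasource.
type metric struct {
    labels map[string]string
}

// hashLabels builds a deterministic ID from the sorted label set,
// mirroring the idea of hashing labels after extra labels were applied.
func hashLabels(m metric) uint64 {
    keys := make([]string, 0, len(m.labels))
    for k := range m.labels {
        keys = append(keys, k)
    }
    sort.Strings(keys)
    h := fnv.New64a()
    for _, k := range keys {
        h.Write([]byte(k))
        h.Write([]byte(m.labels[k]))
    }
    return h.Sum64()
}

func main() {
    // two series that differ only in the `job` label
    series := []metric{
        {labels: map[string]string{"__name__": "up", "job": "foo"}},
        {labels: map[string]string{"__name__": "up", "job": "baz"}},
    }
    // extra label configured on the rule
    ruleLabels := map[string]string{"job": "bar"}

    seen := map[uint64]struct{}{}
    for _, m := range series {
        for k, v := range ruleLabels {
            m.labels[k] = v // extra labels override metric labels
        }
        h := hashLabels(m)
        if _, ok := seen[h]; ok {
            fmt.Println("error: result contains metrics with the same labelset after applying rule labels")
            return
        }
        seen[h] = struct{}{}
    }
    fmt.Println("no duplicates")
}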
This commit is contained in:
parent 0d1855f661
commit 3adf8c5a6f
7 changed files with 79 additions and 41 deletions
@@ -140,7 +140,17 @@ func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series b
    updated := make(map[uint64]struct{})
    // update list of active alerts
    for _, m := range qMetrics {
        for k, v := range ar.Labels {
            // apply extra labels
            m.SetLabel(k, v)
        }

        h := hash(m)
        if _, ok := updated[h]; ok {
            // duplicate may be caused by extra labels
            // conflicting with the metric labels
            return nil, fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
        }
        updated[h] = struct{}{}
        if a, ok := ar.alerts[h]; ok {
            if a.Value != m.Value {
@@ -258,25 +268,11 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time) (*notifie
}

func (ar *AlertingRule) template(a *notifier.Alert) error {
    // 1. template rule labels with data labels
    rLabels, err := a.ExecTemplate(ar.Labels)
    if err != nil {
        return err
    }

    // 2. merge data labels and rule labels
    // metric labels may be overridden by
    // rule labels
    for k, v := range rLabels {
        a.Labels[k] = v
    }

    // 3. template merged labels
    var err error
    a.Labels, err = a.ExecTemplate(a.Labels)
    if err != nil {
        return err
    }

    a.Annotations, err = a.ExecTemplate(ar.Annotations)
    return err
}

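As a rough, self-contained illustration of the numbered steps in template() above (the `$labels` convention and the plain text/template usage here are an approximation of vmalert's templating, not its actual helpers): a rule label value such as `host: "{{ $labels.instance }}"` is first expanded against the metric's own labels, and the expanded rule labels then override metric labels of the same name when merged.

package main

import (
    "bytes"
    "fmt"
    "text/template"
)

func main() {
    // labels taken from the time series behind the alert
    alertLabels := map[string]string{"instance": "node-1:9100", "job": "foo"}

    // rule labels; a value may reference data labels via templating
    ruleLabels := map[string]string{"job": "bar", "host": "{{ $labels.instance }}"}

    // step 1: template rule label values with the data labels
    for k, v := range ruleLabels {
        tmpl := template.Must(template.New("label").Parse("{{ $labels := .Labels }}" + v))
        var buf bytes.Buffer
        if err := tmpl.Execute(&buf, struct{ Labels map[string]string }{alertLabels}); err != nil {
            panic(err)
        }
        ruleLabels[k] = buf.String()
    }

    // step 2: merge; rule labels override metric labels with the same name
    for k, v := range ruleLabels {
        alertLabels[k] = v
    }

    fmt.Println(alertLabels) // map[host:node-1:9100 instance:node-1:9100 job:bar]
}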
@@ -419,14 +415,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
    // drop all extra labels, so hash key will
    // be identical to time series received in Exec
    for _, l := range labels {
        if l.Name == alertNameLabel {
            continue
        }
        if l.Name == alertGroupNameLabel {
            continue
        }
        // drop all overridden labels
        if _, ok := ar.Labels[l.Name]; ok {
        if l.Name == alertNameLabel || l.Name == alertGroupNameLabel {
            continue
        }
        m.Labels = append(m.Labels, l)

@@ -2,6 +2,8 @@ package main

import (
    "context"
    "errors"
    "strings"
    "testing"
    "time"

@@ -218,19 +220,6 @@ func TestAlertingRule_Exec(t *testing.T) {
                hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
            },
        },
        {
            newTestAlertingRule("duplicate", 0),
            [][]datasource.Metric{
                {
                    // metrics with the same labelset should result in one alert
                    metricWithLabels(t, "name", "foo", "type", "bar"),
                    metricWithLabels(t, "type", "bar", "name", "foo"),
                },
            },
            map[uint64]*notifier.Alert{
                hash(metricWithLabels(t, "name", "foo", "type", "bar")): {State: notifier.StateFiring},
            },
        },
        {
            newTestAlertingRule("for-pending", time.Minute),
            [][]datasource.Metric{
@@ -376,7 +365,7 @@ func TestAlertingRule_Restore(t *testing.T) {
                    alertNameLabel, "",
                    "foo", "bar",
                    "namespace", "baz",
                    // following pair supposed to be dropped
                    // extra labels set by rule
                    "source", "vm",
                ),
            },
@@ -384,6 +373,7 @@
                hash(metricWithLabels(t,
                    "foo", "bar",
                    "namespace", "baz",
                    "source", "vm",
                )): {State: notifier.StatePending,
                    Start: time.Now().Truncate(time.Hour)},
            },
@@ -442,6 +432,38 @@ func TestAlertingRule_Restore(t *testing.T) {
    }
}

func TestAlertingRule_Exec_Negative(t *testing.T) {
    fq := &fakeQuerier{}
    ar := newTestAlertingRule("test", 0)
    ar.Labels = map[string]string{"job": "test"}

    // successful attempt
    fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
    _, err := ar.Exec(context.TODO(), fq, false)
    if err != nil {
        t.Fatal(err)
    }

    // label `job` will collide with rule extra label and will make both time series equal
    fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
    _, err = ar.Exec(context.TODO(), fq, false)
    if !errors.Is(err, errDuplicate) {
        t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
    }

    fq.reset()

    expErr := "connection reset by peer"
    fq.setErr(errors.New(expErr))
    _, err = ar.Exec(context.TODO(), fq, false)
    if err == nil {
        t.Fatalf("expected to get err; got nil")
    }
    if !strings.Contains(err.Error(), expErr) {
        t.Fatalf("expected to get err %q; got %q instead", expErr, err)
    }
}

func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
    r := newTestAlertingRule(name, 0)
    r.Labels = make(map[string]string)

@@ -17,6 +17,23 @@ type Metric struct {
    Value float64
}

// SetLabel adds a label or updates an existing one
// with the given key and value
func (m *Metric) SetLabel(key, value string) {
    for i, l := range m.Labels {
        if l.Name == key {
            m.Labels[i].Value = value
            return
        }
    }
    m.AddLabel(key, value)
}

// AddLabel appends the given label to the label set
func (m *Metric) AddLabel(key, value string) {
    m.Labels = append(m.Labels, Label{Name: key, Value: value})
}

// Label represents metric's label
type Label struct {
    Name string

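A small usage sketch of the two helpers above (the surrounding main function, the trimmed-down types, and the sample values are illustrative, not part of the diff): SetLabel overwrites an existing label in place, while AddLabel always appends, which is why applying rule labels goes through SetLabel to avoid duplicate label names.

package main

import "fmt"

// Label and Metric mirror the types above just enough for a self-contained example.
type Label struct {
    Name  string
    Value string
}

type Metric struct {
    Labels []Label
}

func (m *Metric) AddLabel(key, value string) {
    m.Labels = append(m.Labels, Label{Name: key, Value: value})
}

func (m *Metric) SetLabel(key, value string) {
    for i, l := range m.Labels {
        if l.Name == key {
            m.Labels[i].Value = value
            return
        }
    }
    m.AddLabel(key, value)
}

func main() {
    m := Metric{}
    m.AddLabel("job", "foo")
    m.SetLabel("job", "bar")   // overrides the existing value
    m.SetLabel("source", "vm") // appended, since it does not exist yet
    fmt.Println(m.Labels)      // [{job bar} {source vm}]
}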
@@ -37,7 +37,7 @@ func (r response) metrics() ([]Metric, error) {
        }
        m.Labels = nil
        for k, v := range r.Data.Result[i].Labels {
            m.Labels = append(m.Labels, Label{Name: k, Value: v})
            m.AddLabel(k, v)
        }
        m.Timestamp = int64(res.TV[0].(float64))
        m.Value = f

@@ -172,6 +172,11 @@ func TestGroupStart(t *testing.T) {
        t.Fatalf("failed to create alert: %s", err)
    }
    alert1.State = notifier.StateFiring
    // add external label
    alert1.Labels["cluster"] = "east-1"
    // add rule labels - see config/testdata/rules1-good.rules
    alert1.Labels["label"] = "bar"
    alert1.Labels["host"] = inst1
    alert1.ID = hash(m1)

    alert2, err := r.newAlert(m2, time.Now())
@@ -179,6 +184,11 @@
        t.Fatalf("failed to create alert: %s", err)
    }
    alert2.State = notifier.StateFiring
    // add external label
    alert2.Labels["cluster"] = "east-1"
    // add rule labels - see config/testdata/rules1-good.rules
    alert2.Labels["label"] = "bar"
    alert2.Labels["host"] = inst2
    alert2.ID = hash(m2)

    finished := make(chan struct{})

@@ -2,7 +2,6 @@ package main

import (
    "context"
    "errors"
    "fmt"
    "hash/fnv"
    "sort"
@@ -79,8 +78,6 @@ func (rr *RecordingRule) Close() {
    metrics.UnregisterMetric(rr.metrics.errors.name)
}

var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
    if !series {

@@ -2,6 +2,7 @@ package main

import (
    "context"
    "errors"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
@@ -25,3 +26,5 @@ type Rule interface {
    // such as metrics unregister
    Close()
}

var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")