From 4de1b2b74a2ff01ecd593d5a29581546f35dac2b Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Wed, 6 Apr 2022 20:24:45 +0200
Subject: [PATCH] vmalert: fix labels and annotations processing for alerts (#2403)

To improve compatibility with Prometheus alerting, the order of template
processing has changed.

Before, vmalert did all labels processing up front. It meant all extra
labels (such as `alertname`, `alertgroup` or rule labels) were available
in templating. All collisions were resolved in favour of extra labels.

In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible. This change makes vmalert's
behaviour similar to Prometheus.

For example, consider an alerting rule which is triggered by a time series
with an `alertname` label. In vmalert, this label would be overridden by
the alerting rule's name everywhere: in alert labels, in annotations, etc.
In Prometheus, it would be overridden in the alert's labels only, while in
annotations the original label value would remain available.

See more details here: https://github.com/prometheus/compliance/issues/80

Signed-off-by: hagen1778
---
 app/vmalert/alerting.go            | 160 ++++++++++++++++-------
 app/vmalert/alerting_test.go       |  92 ++++++++---------
 app/vmalert/group_test.go          |  16 +--
 app/vmalert/main.go                |   2 +-
 app/vmalert/manager.go             |   4 +-
 app/vmalert/notifier/alert.go      |   4 +-
 app/vmalert/notifier/alert_test.go |   2 +-
 7 files changed, 142 insertions(+), 138 deletions(-)

diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go
index a5b929f86e..0f819cbd64 100644
--- a/app/vmalert/alerting.go
+++ b/app/vmalert/alerting.go
@@ -141,6 +141,53 @@ func (ar *AlertingRule) ID() uint64 {
 	return ar.RuleID
 }
 
+type labelSet struct {
+	// origin labels from series
+	// used for templating
+	origin map[string]string
+	// processed labels with additional data
+	// used as Alert labels
+	processed map[string]string
+}
+
+// toLabels converts labels from given Metric
+// to labelSet which contains original and processed labels.
+func (ar *AlertingRule) toLabels(m datasource.Metric, qFn notifier.QueryFn) (*labelSet, error) {
+	ls := &labelSet{
+		origin:    make(map[string]string, len(m.Labels)),
+		processed: make(map[string]string),
+	}
+	for _, l := range m.Labels {
+		// drop __name__ to be consistent with Prometheus alerting
+		if l.Name == "__name__" {
+			continue
+		}
+		ls.origin[l.Name] = l.Value
+		ls.processed[l.Name] = l.Value
+	}
+
+	extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{
+		Labels: ls.origin,
+		Value:  m.Values[0],
+		Expr:   ar.Expr,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to expand labels: %s", err)
+	}
+	for k, v := range extraLabels {
+		ls.processed[k] = v
+	}
+
+	// set additional labels to identify group and rule name
+	if ar.Name != "" {
+		ls.processed[alertNameLabel] = ar.Name
+	}
+	if !*disableAlertGroupLabel && ar.GroupName != "" {
+		ls.processed[alertGroupNameLabel] = ar.GroupName
+	}
+	return ls, nil
+}
+
 // ExecRange executes alerting rule on the given time range similarly to Exec.
 // It doesn't update internal states of the Rule and meant to be used just
 // to get time series for backfilling.
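The new toLabels helper above is the core of the change: origin keeps the series labels exactly as received (these are what `{{ $labels }}` sees in templates), while processed additionally carries the expanded rule labels and the alertname/alertgroup service labels (these become the alert's labels and its hash key). Below is a minimal standalone sketch of that split; the names and plain maps are illustrative only and are not part of the patch or of vmalert's API.

// sketch.go - illustrative only, not vmalert code
package main

import "fmt"

func main() {
	// labels of the series that triggered the alert; note it already
	// carries an `alertname` label of its own
	series := map[string]string{"alertname": "originAlertname", "instance": "foo"}
	ruleName := "MyRule" // hypothetical alerting rule name

	// origin: what {{ $labels.xxx }} sees in annotations after this change
	origin := make(map[string]string, len(series))
	// processed: what ends up as the alert's labels (and its hash key)
	processed := make(map[string]string, len(series))
	for k, v := range series {
		origin[k] = v
		processed[k] = v
	}
	// service labels override only the processed set, as in Prometheus
	processed["alertname"] = ruleName

	fmt.Println(origin["alertname"])    // originAlertname
	fmt.Println(processed["alertname"]) // MyRule
}

Running it prints originAlertname and then MyRule, mirroring the behaviour described in the commit message: templates keep seeing the original alertname, while the stored alert carries the rule's name.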
@@ -155,24 +202,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
 		return nil, fmt.Errorf("`query` template isn't supported in replay mode")
 	}
 	for _, s := range series {
-		// set additional labels to identify group and rule Name
-		if ar.Name != "" {
-			s.SetLabel(alertNameLabel, ar.Name)
-		}
-		if !*disableAlertGroupLabel && ar.GroupName != "" {
-			s.SetLabel(alertGroupNameLabel, ar.GroupName)
-		}
-		// extra labels could contain templates, so we expand them first
-		labels, err := expandLabels(s, qFn, ar)
-		if err != nil {
-			return nil, fmt.Errorf("failed to expand labels: %s", err)
-		}
-		for k, v := range labels {
-			// apply extra labels to datasource
-			// so the hash key will be consistent on restore
-			s.SetLabel(k, v)
-		}
-		a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
+		a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
 		if err != nil {
 			return nil, fmt.Errorf("failed to create alert: %s", err)
 		}
@@ -234,28 +264,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
 	updated := make(map[uint64]struct{})
 	// update list of active alerts
 	for _, m := range qMetrics {
-		// set additional labels to identify group and rule name
-		if ar.Name != "" {
-			m.SetLabel(alertNameLabel, ar.Name)
-		}
-		if !*disableAlertGroupLabel && ar.GroupName != "" {
-			m.SetLabel(alertGroupNameLabel, ar.GroupName)
-		}
-		// extra labels could contain templates, so we expand them first
-		labels, err := expandLabels(m, qFn, ar)
+		ls, err := ar.toLabels(m, qFn)
 		if err != nil {
 			return nil, fmt.Errorf("failed to expand labels: %s", err)
 		}
-		for k, v := range labels {
-			// apply extra labels to datasource
-			// so the hash key will be consistent on restore
-			m.SetLabel(k, v)
-		}
-		h := hash(m)
+		h := hash(ls.processed)
 		if _, ok := updated[h]; ok {
 			// duplicate may be caused by extra labels
 			// conflicting with the metric labels
-			ar.lastExecError = fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
+			ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
 			return nil, ar.lastExecError
 		}
 		updated[h] = struct{}{}
@@ -272,14 +289,14 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
 				a.Value = m.Values[0]
 				// and re-exec template since Value can be used
 				// in annotations
-				a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
+				a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
 				if err != nil {
 					return nil, err
 				}
 			}
 			continue
 		}
-		a, err := ar.newAlert(m, ar.lastExecTime, qFn)
+		a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
 		if err != nil {
 			ar.lastExecError = err
 			return nil, fmt.Errorf("failed to create alert: %w", err)
 		}
@@ -315,19 +332,6 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
 	return ar.toTimeSeries(ts.Unix()), nil
 }
 
-func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
-	metricLabels := make(map[string]string)
-	for _, l := range m.Labels {
-		metricLabels[l.Name] = l.Value
-	}
-	tpl := notifier.AlertTplData{
-		Labels: metricLabels,
-		Value:  m.Values[0],
-		Expr:   ar.Expr,
-	}
-	return notifier.ExecTemplate(q, ar.Labels, tpl)
-}
-
 func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
 	var tss []prompbmarshal.TimeSeries
 	for _, a := range ar.alerts {
@@ -358,42 +362,43 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
 }
 
 // TODO: consider hashing algorithm in VM
-func hash(m datasource.Metric) uint64 {
+func hash(labels map[string]string) uint64 {
 	hash := fnv.New64a()
-	labels := m.Labels
-	sort.Slice(labels, func(i, j int) bool {
-		return labels[i].Name < labels[j].Name
-	})
-	for _, l := range labels {
+	keys := make([]string, 0, len(labels))
+	for k := range labels {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
 		// drop __name__ to be consistent with Prometheus alerting
-		if l.Name == "__name__" {
+		if k == "__name__" {
 			continue
 		}
-		hash.Write([]byte(l.Name))
-		hash.Write([]byte(l.Value))
+		name, value := k, labels[k]
+		hash.Write([]byte(name))
+		hash.Write([]byte(value))
 		hash.Write([]byte("\xff"))
 	}
 	return hash.Sum64()
 }
 
-func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
+func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
+	var err error
+	if ls == nil {
+		ls, err = ar.toLabels(m, qFn)
+		if err != nil {
+			return nil, fmt.Errorf("failed to expand labels: %s", err)
+		}
+	}
 	a := &notifier.Alert{
 		GroupID:  ar.GroupID,
 		Name:     ar.Name,
-		Labels:   map[string]string{},
+		Labels:   ls.processed,
 		Value:    m.Values[0],
 		ActiveAt: start,
 		Expr:     ar.Expr,
 	}
-	for _, l := range m.Labels {
-		// drop __name__ to be consistent with Prometheus alerting
-		if l.Name == "__name__" {
-			continue
-		}
-		a.Labels[l.Name] = l.Value
-	}
-	var err error
-	a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
+	a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
 	return a, err
 }
 
@@ -560,15 +565,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
 	}
 
 	for _, m := range qMetrics {
-		a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
+		ls := &labelSet{
+			origin:    make(map[string]string, len(m.Labels)),
+			processed: make(map[string]string, len(m.Labels)),
+		}
+		for _, l := range m.Labels {
+			if l.Name == "__name__" {
+				continue
+			}
+			ls.origin[l.Name] = l.Value
+			ls.processed[l.Name] = l.Value
+		}
+		a, err := ar.newAlert(m, ls, time.Unix(int64(m.Values[0]), 0), qFn)
 		if err != nil {
 			return fmt.Errorf("failed to create alert: %w", err)
 		}
-		a.ID = hash(m)
+		a.ID = hash(ls.processed)
 		a.State = notifier.StatePending
 		a.Restored = true
 		ar.alerts[a.ID] = a
-		logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
+		logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
 	}
 	return nil
 }
diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go
index edca8254a8..6bbd2dff6f 100644
--- a/app/vmalert/alerting_test.go
+++ b/app/vmalert/alerting_test.go
@@ -315,10 +315,13 @@ func TestAlertingRule_Exec(t *testing.T) {
 			}
 			expAlerts := make(map[uint64]*notifier.Alert)
 			for _, ta := range tc.expAlerts {
-				labels := ta.labels
-				labels = append(labels, alertNameLabel)
-				labels = append(labels, tc.rule.Name)
-				h := hash(metricWithLabels(t, labels...))
+				labels := make(map[string]string)
+				for i := 0; i < len(ta.labels); i += 2 {
+					k, v := ta.labels[i], ta.labels[i+1]
+					labels[k] = v
+				}
+				labels[alertNameLabel] = tc.rule.Name
+				h := hash(labels)
 				expAlerts[h] = ta.alert
 			}
 			for key, exp := range expAlerts {
@@ -513,7 +516,7 @@ func TestAlertingRule_Restore(t *testing.T) {
 				),
 			},
 			map[uint64]*notifier.Alert{
-				hash(datasource.Metric{}): {State: notifier.StatePending,
+				hash(nil): {State: notifier.StatePending,
 					ActiveAt: time.Now().Truncate(time.Hour)},
 			},
 		},
@@ -529,12 +532,12 @@ func TestAlertingRule_Restore(t *testing.T) {
 				),
 			},
 			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t,
labels", - alertGroupNameLabel, "groupID", - "foo", "bar", - "namespace", "baz", - )): {State: notifier.StatePending, + hash(map[string]string{ + alertNameLabel: "metric labels", + alertGroupNameLabel: "groupID", + "foo": "bar", + "namespace": "baz", + }): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, }, }, @@ -550,11 +553,11 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, - "foo", "bar", - "namespace", "baz", - "source", "vm", - )): {State: notifier.StatePending, + hash(map[string]string{ + "foo": "bar", + "namespace": "baz", + "source": "vm", + }): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, }, }, @@ -575,11 +578,11 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-1"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, - hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-2"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(2 * time.Hour)}, - hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-3"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(3 * time.Hour)}, }, }, @@ -659,7 +662,7 @@ func TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 1, "instance", "bar"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): { + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): { Annotations: map[string]string{}, Labels: map[string]string{ alertNameLabel: "common", @@ -667,7 +670,7 @@ func TestAlertingRule_Template(t *testing.T) { "instance": "foo", }, }, - hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): { + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): { Annotations: map[string]string{}, Labels: map[string]string{ alertNameLabel: "common", @@ -682,11 +685,10 @@ func TestAlertingRule_Template(t *testing.T) { Name: "override label", Labels: map[string]string{ "instance": "{{ $labels.instance }}", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`, - "description": `It is {{ $value }} connections for "{{ $labels.instance }}"`, + "summary": `Too high connection number for "{{ $labels.instance }}"`, + "description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`, }, alerts: make(map[uint64]*notifier.Alert), }, @@ -695,64 +697,58 @@ func TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 10, "instance", "bar", alertNameLabel, "override"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): { + hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): { Labels: map[string]string{ alertNameLabel: "override label", "instance": "foo", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "foo" for region east`, - "description": `It is 2 connections for "foo"`, + "summary": `Too high connection number for "foo"`, + 
"description": `override: It is 2 connections for "foo"`, }, }, - hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): { + hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): { Labels: map[string]string{ alertNameLabel: "override label", "instance": "bar", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "bar" for region east`, - "description": `It is 10 connections for "bar"`, + "summary": `Too high connection number for "bar"`, + "description": `override: It is 10 connections for "bar"`, }, }, }, }, { &AlertingRule{ - Name: "ExtraTemplating", + Name: "OriginLabels", GroupName: "Testing", Labels: map[string]string{ - "name": "alert_{{ $labels.alertname }}", - "group": "group_{{ $labels.alertgroup }}", "instance": "{{ $labels.instance }}", }, Annotations: map[string]string{ - "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, - "description": `Alert "{{ $labels.name }}({{ $labels.group }})" for instance {{ $labels.instance }}`, + "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, }, alerts: make(map[uint64]*notifier.Alert), }, []datasource.Metric{ - metricWithValueAndLabels(t, 1, "instance", "foo"), + metricWithValueAndLabels(t, 1, + alertNameLabel, "originAlertname", + alertGroupNameLabel, "originGroupname", + "instance", "foo"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "ExtraTemplating", - "name", "alert_ExtraTemplating", - alertGroupNameLabel, "Testing", - "group", "group_Testing", - "instance", "foo")): { + hash(map[string]string{ + alertNameLabel: "OriginLabels", + alertGroupNameLabel: "Testing", + "instance": "foo"}): { Labels: map[string]string{ - alertNameLabel: "ExtraTemplating", - "name": "alert_ExtraTemplating", + alertNameLabel: "OriginLabels", alertGroupNameLabel: "Testing", - "group": "group_Testing", "instance": "foo", }, Annotations: map[string]string{ - "summary": `Alert "ExtraTemplating(Testing)" for instance foo`, - "description": `Alert "alert_ExtraTemplating(group_Testing)" for instance foo`, + "summary": `Alert "originAlertname(originGroupname)" for instance foo`, }, }, }, diff --git a/app/vmalert/group_test.go b/app/vmalert/group_test.go index d94838b609..8322e65d02 100644 --- a/app/vmalert/group_test.go +++ b/app/vmalert/group_test.go @@ -174,7 +174,7 @@ func TestGroupStart(t *testing.T) { m2 := metricWithLabels(t, "instance", inst2, "job", job) r := g.Rules[0].(*AlertingRule) - alert1, err := r.newAlert(m1, time.Now(), nil) + alert1, err := r.newAlert(m1, nil, time.Now(), nil) if err != nil { t.Fatalf("faield to create alert: %s", err) } @@ -187,13 +187,9 @@ func TestGroupStart(t *testing.T) { // add service labels alert1.Labels[alertNameLabel] = alert1.Name alert1.Labels[alertGroupNameLabel] = g.Name - var labels1 []string - for k, v := range alert1.Labels { - labels1 = append(labels1, k, v) - } - alert1.ID = hash(metricWithLabels(t, labels1...)) + alert1.ID = hash(alert1.Labels) - alert2, err := r.newAlert(m2, time.Now(), nil) + alert2, err := r.newAlert(m2, nil, time.Now(), nil) if err != nil { t.Fatalf("faield to create alert: %s", err) } @@ -206,11 +202,7 @@ func TestGroupStart(t *testing.T) { // add service labels alert2.Labels[alertNameLabel] = alert2.Name alert2.Labels[alertGroupNameLabel] = g.Name - var labels2 []string - for k, v := range alert2.Labels { - labels2 = append(labels2, k, v) - } 
-	alert2.ID = hash(metricWithLabels(t, labels2...))
+	alert2.ID = hash(alert2.Labels)
 
 	finished := make(chan struct{})
 	fs.add(m1)
diff --git a/app/vmalert/main.go b/app/vmalert/main.go
index c986503291..9366ad3500 100644
--- a/app/vmalert/main.go
+++ b/app/vmalert/main.go
@@ -243,7 +243,7 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
 		"tpl": externalAlertSource,
 	}
 	return func(alert notifier.Alert) string {
-		templated, err := alert.ExecTemplate(nil, m)
+		templated, err := alert.ExecTemplate(nil, nil, m)
 		if err != nil {
 			logger.Errorf("can not exec source template %s", err)
 		}
diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go
index 7152fd258d..3ab70c6f40 100644
--- a/app/vmalert/manager.go
+++ b/app/vmalert/manager.go
@@ -37,7 +37,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
 
 	g, ok := m.groups[gID]
 	if !ok {
-		return nil, fmt.Errorf("can't find group with id %q", gID)
+		return nil, fmt.Errorf("can't find group with id %d", gID)
 	}
 	for _, rule := range g.Rules {
 		ar, ok := rule.(*AlertingRule)
@@ -48,7 +48,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
 			return apiAlert, nil
 		}
 	}
-	return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
+	return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
 }
 
 func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
diff --git a/app/vmalert/notifier/alert.go b/app/vmalert/notifier/alert.go
index 44b80eb4a7..a1bcae7c17 100644
--- a/app/vmalert/notifier/alert.go
+++ b/app/vmalert/notifier/alert.go
@@ -88,8 +88,8 @@ var tplHeaders = []string{
 // map of annotations.
 // Every alert could have a different datasource, so function
 // requires a queryFunction as an argument.
-func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
-	tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
+func (a *Alert) ExecTemplate(q QueryFn, labels, annotations map[string]string) (map[string]string, error) {
+	tplData := AlertTplData{Value: a.Value, Labels: labels, Expr: a.Expr}
 	return templateAnnotations(annotations, tplData, funcsWithQuery(q))
 }
 
diff --git a/app/vmalert/notifier/alert_test.go b/app/vmalert/notifier/alert_test.go
index f8e0c77ee0..27b83aac62 100644
--- a/app/vmalert/notifier/alert_test.go
+++ b/app/vmalert/notifier/alert_test.go
@@ -130,7 +130,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations)
+			tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations)
 			if err != nil {
 				t.Fatal(err)
 			}
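For reference, the map-based label hashing the patch switches to relies only on the standard library. The standalone sketch below mirrors the new hash function shown in the diff (the helper name is illustrative, not vmalert's) and shows why the keys are sorted: identical label sets must always produce the same alert ID so that alerts restored via Restore keep matching their in-memory counterparts.

// hash_sketch.go - standalone illustration of the new label hashing
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

func hashLabels(labels map[string]string) uint64 {
	h := fnv.New64a()
	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	// sort keys so the hash doesn't depend on map iteration order
	sort.Strings(keys)
	for _, k := range keys {
		// drop __name__ to be consistent with Prometheus alerting
		if k == "__name__" {
			continue
		}
		h.Write([]byte(k))
		h.Write([]byte(labels[k]))
		h.Write([]byte("\xff"))
	}
	return h.Sum64()
}

func main() {
	// the same label set yields the same ID regardless of insertion order
	fmt.Println(hashLabels(map[string]string{"alertname": "foo", "instance": "bar"}))
	fmt.Println(hashLabels(map[string]string{"instance": "bar", "alertname": "foo"}))
}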