diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index a5b929f86..0f819cbd6 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -141,6 +141,53 @@ func (ar *AlertingRule) ID() uint64 { return ar.RuleID } +type labelSet struct { + // origin labels from series + // used for templating + origin map[string]string + // processed labels with additional data + // used as Alert labels + processed map[string]string +} + +// toLabels converts labels from given Metric +// to labelSet which contains original and processed labels. +func (ar *AlertingRule) toLabels(m datasource.Metric, qFn notifier.QueryFn) (*labelSet, error) { + ls := &labelSet{ + origin: make(map[string]string, len(m.Labels)), + processed: make(map[string]string), + } + for _, l := range m.Labels { + // drop __name__ to be consistent with Prometheus alerting + if l.Name == "__name__" { + continue + } + ls.origin[l.Name] = l.Value + ls.processed[l.Name] = l.Value + } + + extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{ + Labels: ls.origin, + Value: m.Values[0], + Expr: ar.Expr, + }) + if err != nil { + return nil, fmt.Errorf("failed to expand labels: %s", err) + } + for k, v := range extraLabels { + ls.processed[k] = v + } + + // set additional labels to identify group and rule name + if ar.Name != "" { + ls.processed[alertNameLabel] = ar.Name + } + if !*disableAlertGroupLabel && ar.GroupName != "" { + ls.processed[alertGroupNameLabel] = ar.GroupName + } + return ls, nil +} + // ExecRange executes alerting rule on the given time range similarly to Exec. // It doesn't update internal states of the Rule and meant to be used just // to get time series for backfilling. @@ -155,24 +202,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([] return nil, fmt.Errorf("`query` template isn't supported in replay mode") } for _, s := range series { - // set additional labels to identify group and rule Name - if ar.Name != "" { - s.SetLabel(alertNameLabel, ar.Name) - } - if !*disableAlertGroupLabel && ar.GroupName != "" { - s.SetLabel(alertGroupNameLabel, ar.GroupName) - } - // extra labels could contain templates, so we expand them first - labels, err := expandLabels(s, qFn, ar) - if err != nil { - return nil, fmt.Errorf("failed to expand labels: %s", err) - } - for k, v := range labels { - // apply extra labels to datasource - // so the hash key will be consistent on restore - s.SetLabel(k, v) - } - a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert + a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert if err != nil { return nil, fmt.Errorf("failed to create alert: %s", err) } @@ -234,28 +264,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal updated := make(map[uint64]struct{}) // update list of active alerts for _, m := range qMetrics { - // set additional labels to identify group and rule name - if ar.Name != "" { - m.SetLabel(alertNameLabel, ar.Name) - } - if !*disableAlertGroupLabel && ar.GroupName != "" { - m.SetLabel(alertGroupNameLabel, ar.GroupName) - } - // extra labels could contain templates, so we expand them first - labels, err := expandLabels(m, qFn, ar) + ls, err := ar.toLabels(m, qFn) if err != nil { return nil, fmt.Errorf("failed to expand labels: %s", err) } - for k, v := range labels { - // apply extra labels to datasource - // so the hash key will be consistent on restore - m.SetLabel(k, v) - } - h := hash(m) + h := hash(ls.processed) if _, ok := updated[h]; ok { // duplicate may be caused by extra labels // conflicting with the metric labels - ar.lastExecError = fmt.Errorf("labels %v: %w", m.Labels, errDuplicate) + ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate) return nil, ar.lastExecError } updated[h] = struct{}{} @@ -272,14 +289,14 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal a.Value = m.Values[0] // and re-exec template since Value can be used // in annotations - a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations) + a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations) if err != nil { return nil, err } } continue } - a, err := ar.newAlert(m, ar.lastExecTime, qFn) + a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn) if err != nil { ar.lastExecError = err return nil, fmt.Errorf("failed to create alert: %w", err) @@ -315,19 +332,6 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal return ar.toTimeSeries(ts.Unix()), nil } -func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) { - metricLabels := make(map[string]string) - for _, l := range m.Labels { - metricLabels[l.Name] = l.Value - } - tpl := notifier.AlertTplData{ - Labels: metricLabels, - Value: m.Values[0], - Expr: ar.Expr, - } - return notifier.ExecTemplate(q, ar.Labels, tpl) -} - func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries { var tss []prompbmarshal.TimeSeries for _, a := range ar.alerts { @@ -358,42 +362,43 @@ func (ar *AlertingRule) UpdateWith(r Rule) error { } // TODO: consider hashing algorithm in VM -func hash(m datasource.Metric) uint64 { +func hash(labels map[string]string) uint64 { hash := fnv.New64a() - labels := m.Labels - sort.Slice(labels, func(i, j int) bool { - return labels[i].Name < labels[j].Name - }) - for _, l := range labels { + keys := make([]string, 0, len(labels)) + for k := range labels { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { // drop __name__ to be consistent with Prometheus alerting - if l.Name == "__name__" { + if k == "__name__" { continue } - hash.Write([]byte(l.Name)) - hash.Write([]byte(l.Value)) + name, value := k, labels[k] + hash.Write([]byte(name)) + hash.Write([]byte(value)) hash.Write([]byte("\xff")) } return hash.Sum64() } -func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) { +func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) { + var err error + if ls == nil { + ls, err = ar.toLabels(m, qFn) + if err != nil { + return nil, fmt.Errorf("failed to expand labels: %s", err) + } + } a := ¬ifier.Alert{ GroupID: ar.GroupID, Name: ar.Name, - Labels: map[string]string{}, + Labels: ls.processed, Value: m.Values[0], ActiveAt: start, Expr: ar.Expr, } - for _, l := range m.Labels { - // drop __name__ to be consistent with Prometheus alerting - if l.Name == "__name__" { - continue - } - a.Labels[l.Name] = l.Value - } - var err error - a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations) + a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations) return a, err } @@ -560,15 +565,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb } for _, m := range qMetrics { - a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn) + ls := &labelSet{ + origin: make(map[string]string, len(m.Labels)), + processed: make(map[string]string, len(m.Labels)), + } + for _, l := range m.Labels { + if l.Name == "__name__" { + continue + } + ls.origin[l.Name] = l.Value + ls.processed[l.Name] = l.Value + } + a, err := ar.newAlert(m, ls, time.Unix(int64(m.Values[0]), 0), qFn) if err != nil { return fmt.Errorf("failed to create alert: %w", err) } - a.ID = hash(m) + a.ID = hash(ls.processed) a.State = notifier.StatePending a.Restored = true ar.alerts[a.ID] = a - logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start) + logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt) } return nil } diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go index edca8254a..6bbd2dff6 100644 --- a/app/vmalert/alerting_test.go +++ b/app/vmalert/alerting_test.go @@ -315,10 +315,13 @@ func TestAlertingRule_Exec(t *testing.T) { } expAlerts := make(map[uint64]*notifier.Alert) for _, ta := range tc.expAlerts { - labels := ta.labels - labels = append(labels, alertNameLabel) - labels = append(labels, tc.rule.Name) - h := hash(metricWithLabels(t, labels...)) + labels := make(map[string]string) + for i := 0; i < len(ta.labels); i += 2 { + k, v := ta.labels[i], ta.labels[i+1] + labels[k] = v + } + labels[alertNameLabel] = tc.rule.Name + h := hash(labels) expAlerts[h] = ta.alert } for key, exp := range expAlerts { @@ -513,7 +516,7 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(datasource.Metric{}): {State: notifier.StatePending, + hash(nil): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, }, }, @@ -529,12 +532,12 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, - alertNameLabel, "metric labels", - alertGroupNameLabel, "groupID", - "foo", "bar", - "namespace", "baz", - )): {State: notifier.StatePending, + hash(map[string]string{ + alertNameLabel: "metric labels", + alertGroupNameLabel: "groupID", + "foo": "bar", + "namespace": "baz", + }): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, }, }, @@ -550,11 +553,11 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, - "foo", "bar", - "namespace", "baz", - "source", "vm", - )): {State: notifier.StatePending, + hash(map[string]string{ + "foo": "bar", + "namespace": "baz", + "source": "vm", + }): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, }, }, @@ -575,11 +578,11 @@ func TestAlertingRule_Restore(t *testing.T) { ), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-1"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(time.Hour)}, - hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-2"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(2 * time.Hour)}, - hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending, + hash(map[string]string{"host": "localhost-3"}): {State: notifier.StatePending, ActiveAt: time.Now().Truncate(3 * time.Hour)}, }, }, @@ -659,7 +662,7 @@ func TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 1, "instance", "bar"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): { + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): { Annotations: map[string]string{}, Labels: map[string]string{ alertNameLabel: "common", @@ -667,7 +670,7 @@ func TestAlertingRule_Template(t *testing.T) { "instance": "foo", }, }, - hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): { + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): { Annotations: map[string]string{}, Labels: map[string]string{ alertNameLabel: "common", @@ -682,11 +685,10 @@ func TestAlertingRule_Template(t *testing.T) { Name: "override label", Labels: map[string]string{ "instance": "{{ $labels.instance }}", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`, - "description": `It is {{ $value }} connections for "{{ $labels.instance }}"`, + "summary": `Too high connection number for "{{ $labels.instance }}"`, + "description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`, }, alerts: make(map[uint64]*notifier.Alert), }, @@ -695,64 +697,58 @@ func TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 10, "instance", "bar", alertNameLabel, "override"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): { + hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): { Labels: map[string]string{ alertNameLabel: "override label", "instance": "foo", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "foo" for region east`, - "description": `It is 2 connections for "foo"`, + "summary": `Too high connection number for "foo"`, + "description": `override: It is 2 connections for "foo"`, }, }, - hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): { + hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): { Labels: map[string]string{ alertNameLabel: "override label", "instance": "bar", - "region": "east", }, Annotations: map[string]string{ - "summary": `Too high connection number for "bar" for region east`, - "description": `It is 10 connections for "bar"`, + "summary": `Too high connection number for "bar"`, + "description": `override: It is 10 connections for "bar"`, }, }, }, }, { &AlertingRule{ - Name: "ExtraTemplating", + Name: "OriginLabels", GroupName: "Testing", Labels: map[string]string{ - "name": "alert_{{ $labels.alertname }}", - "group": "group_{{ $labels.alertgroup }}", "instance": "{{ $labels.instance }}", }, Annotations: map[string]string{ - "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, - "description": `Alert "{{ $labels.name }}({{ $labels.group }})" for instance {{ $labels.instance }}`, + "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, }, alerts: make(map[uint64]*notifier.Alert), }, []datasource.Metric{ - metricWithValueAndLabels(t, 1, "instance", "foo"), + metricWithValueAndLabels(t, 1, + alertNameLabel, "originAlertname", + alertGroupNameLabel, "originGroupname", + "instance", "foo"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, alertNameLabel, "ExtraTemplating", - "name", "alert_ExtraTemplating", - alertGroupNameLabel, "Testing", - "group", "group_Testing", - "instance", "foo")): { + hash(map[string]string{ + alertNameLabel: "OriginLabels", + alertGroupNameLabel: "Testing", + "instance": "foo"}): { Labels: map[string]string{ - alertNameLabel: "ExtraTemplating", - "name": "alert_ExtraTemplating", + alertNameLabel: "OriginLabels", alertGroupNameLabel: "Testing", - "group": "group_Testing", "instance": "foo", }, Annotations: map[string]string{ - "summary": `Alert "ExtraTemplating(Testing)" for instance foo`, - "description": `Alert "alert_ExtraTemplating(group_Testing)" for instance foo`, + "summary": `Alert "originAlertname(originGroupname)" for instance foo`, }, }, }, diff --git a/app/vmalert/group_test.go b/app/vmalert/group_test.go index d94838b60..8322e65d0 100644 --- a/app/vmalert/group_test.go +++ b/app/vmalert/group_test.go @@ -174,7 +174,7 @@ func TestGroupStart(t *testing.T) { m2 := metricWithLabels(t, "instance", inst2, "job", job) r := g.Rules[0].(*AlertingRule) - alert1, err := r.newAlert(m1, time.Now(), nil) + alert1, err := r.newAlert(m1, nil, time.Now(), nil) if err != nil { t.Fatalf("faield to create alert: %s", err) } @@ -187,13 +187,9 @@ func TestGroupStart(t *testing.T) { // add service labels alert1.Labels[alertNameLabel] = alert1.Name alert1.Labels[alertGroupNameLabel] = g.Name - var labels1 []string - for k, v := range alert1.Labels { - labels1 = append(labels1, k, v) - } - alert1.ID = hash(metricWithLabels(t, labels1...)) + alert1.ID = hash(alert1.Labels) - alert2, err := r.newAlert(m2, time.Now(), nil) + alert2, err := r.newAlert(m2, nil, time.Now(), nil) if err != nil { t.Fatalf("faield to create alert: %s", err) } @@ -206,11 +202,7 @@ func TestGroupStart(t *testing.T) { // add service labels alert2.Labels[alertNameLabel] = alert2.Name alert2.Labels[alertGroupNameLabel] = g.Name - var labels2 []string - for k, v := range alert2.Labels { - labels2 = append(labels2, k, v) - } - alert2.ID = hash(metricWithLabels(t, labels2...)) + alert2.ID = hash(alert2.Labels) finished := make(chan struct{}) fs.add(m1) diff --git a/app/vmalert/main.go b/app/vmalert/main.go index c98650329..9366ad350 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -243,7 +243,7 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali "tpl": externalAlertSource, } return func(alert notifier.Alert) string { - templated, err := alert.ExecTemplate(nil, m) + templated, err := alert.ExecTemplate(nil, nil, m) if err != nil { logger.Errorf("can not exec source template %s", err) } diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go index 7152fd258..3ab70c6f4 100644 --- a/app/vmalert/manager.go +++ b/app/vmalert/manager.go @@ -37,7 +37,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) { g, ok := m.groups[gID] if !ok { - return nil, fmt.Errorf("can't find group with id %q", gID) + return nil, fmt.Errorf("can't find group with id %d", gID) } for _, rule := range g.Rules { ar, ok := rule.(*AlertingRule) @@ -48,7 +48,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) { return apiAlert, nil } } - return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name) + return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name) } func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error { diff --git a/app/vmalert/notifier/alert.go b/app/vmalert/notifier/alert.go index 44b80eb4a..a1bcae7c1 100644 --- a/app/vmalert/notifier/alert.go +++ b/app/vmalert/notifier/alert.go @@ -88,8 +88,8 @@ var tplHeaders = []string{ // map of annotations. // Every alert could have a different datasource, so function // requires a queryFunction as an argument. -func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) { - tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr} +func (a *Alert) ExecTemplate(q QueryFn, labels, annotations map[string]string) (map[string]string, error) { + tplData := AlertTplData{Value: a.Value, Labels: labels, Expr: a.Expr} return templateAnnotations(annotations, tplData, funcsWithQuery(q)) } diff --git a/app/vmalert/notifier/alert_test.go b/app/vmalert/notifier/alert_test.go index f8e0c77ee..27b83aac6 100644 --- a/app/vmalert/notifier/alert_test.go +++ b/app/vmalert/notifier/alert_test.go @@ -130,7 +130,7 @@ func TestAlert_ExecTemplate(t *testing.T) { } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations) + tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations) if err != nil { t.Fatal(err) }