vmalert: fix labels and annotations processing for alerts (#2403)

To improve compatibility with Prometheus alerting, the order of
template processing has changed.
Previously, vmalert did all label processing beforehand. This meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating, and all collisions were resolved in
favour of the extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.

For example, consider an alerting rule triggered by time series
with an `alertname` label. Previously, vmalert would override this label
with the alerting rule's name everywhere: in alert labels, in annotations, etc.
In Prometheus, it is overridden in the alert's labels only, while the
original label value remains available in annotations.
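
To make the new behaviour concrete, here is a minimal, self-contained Go
sketch (illustrative only; the rule name `MyRule` and the label values are
made up) of the origin/processed label split introduced in the diff below:

package main

import "fmt"

// labelSet mirrors the split introduced by this commit: origin labels come
// from the series and are used for templating, while processed labels also
// carry rule metadata and become the alert's labels.
type labelSet struct {
	origin    map[string]string
	processed map[string]string
}

func main() {
	// a series that already carries an `alertname` label
	series := map[string]string{"alertname": "original", "instance": "foo"}

	ls := labelSet{
		origin:    make(map[string]string, len(series)),
		processed: make(map[string]string, len(series)),
	}
	for k, v := range series {
		ls.origin[k] = v
		ls.processed[k] = v
	}
	// the rule's own name overrides alertname in processed labels only
	ls.processed["alertname"] = "MyRule"

	fmt.Println(ls.origin["alertname"])    // "original": what annotations see
	fmt.Println(ls.processed["alertname"]) // "MyRule": what identifies the alert
}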

See more details at https://github.com/prometheus/compliance/issues/80

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Author: Roman Khavronenko, 2022-04-06 20:24:45 +02:00 (committed by Aliaksandr Valialkin)
commit 4de1b2b74a (parent cb319b15bb)
7 changed files with 142 additions and 138 deletions

@@ -141,6 +141,53 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
type labelSet struct {
// origin labels from series
// used for templating
origin map[string]string
// processed labels with additional data
// used as Alert labels
processed map[string]string
}
// toLabels converts labels from given Metric
// to labelSet which contains original and processed labels.
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn notifier.QueryFn) (*labelSet, error) {
ls := &labelSet{
origin: make(map[string]string, len(m.Labels)),
processed: make(map[string]string),
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
ls.origin[l.Name] = l.Value
ls.processed[l.Name] = l.Value
}
extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{
Labels: ls.origin,
Value: m.Values[0],
Expr: ar.Expr,
})
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
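// collisions are resolved in favour of extra labels; origin keeps the
// series' values, so templates still see the original labels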
for k, v := range extraLabels {
ls.processed[k] = v
}
// set additional labels to identify group and rule name
if ar.Name != "" {
ls.processed[alertNameLabel] = ar.Name
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
ls.processed[alertGroupNameLabel] = ar.GroupName
}
return ls, nil
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and is meant to be used just
// to get time series for backfilling.
@@ -155,24 +202,7 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range series {
// set additional labels to identify group and rule Name
if ar.Name != "" {
s.SetLabel(alertNameLabel, ar.Name)
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
s.SetLabel(alertGroupNameLabel, ar.GroupName)
}
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(s, qFn, ar)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
s.SetLabel(k, v)
}
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %s", err)
}
@@ -234,28 +264,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
updated := make(map[uint64]struct{})
// update list of active alerts
for _, m := range qMetrics {
// set additional labels to identify group and rule name
if ar.Name != "" {
m.SetLabel(alertNameLabel, ar.Name)
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
m.SetLabel(alertGroupNameLabel, ar.GroupName)
}
// extra labels could contain templates, so we expand them first
labels, err := expandLabels(m, qFn, ar)
ls, err := ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
for k, v := range labels {
// apply extra labels to datasource
// so the hash key will be consistent on restore
m.SetLabel(k, v)
}
h := hash(m)
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
ar.lastExecError = fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, ar.lastExecError
}
updated[h] = struct{}{}
@@ -272,14 +289,14 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
a.Value = m.Values[0]
// and re-exec template since Value can be used
// in annotations
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
if err != nil {
return nil, err
}
}
continue
}
a, err := ar.newAlert(m, ar.lastExecTime, qFn)
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
if err != nil {
ar.lastExecError = err
return nil, fmt.Errorf("failed to create alert: %w", err)
@@ -315,19 +332,6 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time) ([]prompbmarshal
return ar.toTimeSeries(ts.Unix()), nil
}
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
metricLabels := make(map[string]string)
for _, l := range m.Labels {
metricLabels[l.Name] = l.Value
}
tpl := notifier.AlertTplData{
Labels: metricLabels,
Value: m.Values[0],
Expr: ar.Expr,
}
return notifier.ExecTemplate(q, ar.Labels, tpl)
}
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
var tss []prompbmarshal.TimeSeries
for _, a := range ar.alerts {
@@ -358,42 +362,43 @@ func (ar *AlertingRule) UpdateWith(r Rule) error {
}
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
func hash(labels map[string]string) uint64 {
hash := fnv.New64a()
labels := m.Labels
sort.Slice(labels, func(i, j int) bool {
return labels[i].Name < labels[j].Name
})
for _, l := range labels {
keys := make([]string, 0, len(labels))
for k := range labels {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
if k == "__name__" {
continue
}
hash.Write([]byte(l.Name))
hash.Write([]byte(l.Value))
name, value := k, labels[k]
hash.Write([]byte(name))
hash.Write([]byte(value))
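// a separator between pairs keeps label boundaries unambiguous in the hash input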
hash.Write([]byte("\xff"))
}
return hash.Sum64()
}
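
Since Go randomizes map iteration order, sorting the keys above is what
keeps this hash deterministic. A quick illustration (hypothetical labels,
reusing the hash function above):

// equal label sets hash equally regardless of map ordering
h1 := hash(map[string]string{"instance": "foo", "job": "vm"})
h2 := hash(map[string]string{"job": "vm", "instance": "foo"})
fmt.Println(h1 == h2) // true
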
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
var err error
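// ls may be nil when the caller has no pre-computed label set
// (e.g. ExecRange); compute it from the metric in that case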
if ls == nil {
ls, err = ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
}
a := &notifier.Alert{
GroupID: ar.GroupID,
Name: ar.Name,
Labels: map[string]string{},
Labels: ls.processed,
Value: m.Values[0],
ActiveAt: start,
Expr: ar.Expr,
}
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
continue
}
a.Labels[l.Name] = l.Value
}
var err error
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
return a, err
}
@@ -560,15 +565,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb
}
for _, m := range qMetrics {
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
ls := &labelSet{
origin: make(map[string]string, len(m.Labels)),
processed: make(map[string]string, len(m.Labels)),
}
for _, l := range m.Labels {
if l.Name == "__name__" {
continue
}
ls.origin[l.Name] = l.Value
ls.processed[l.Name] = l.Value
}
a, err := ar.newAlert(m, ls, time.Unix(int64(m.Values[0]), 0), qFn)
if err != nil {
return fmt.Errorf("failed to create alert: %w", err)
}
a.ID = hash(m)
a.ID = hash(ls.processed)
a.State = notifier.StatePending
a.Restored = true
ar.alerts[a.ID] = a
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
}
return nil
}

@@ -315,10 +315,13 @@ func TestAlertingRule_Exec(t *testing.T) {
}
expAlerts := make(map[uint64]*notifier.Alert)
for _, ta := range tc.expAlerts {
labels := ta.labels
labels = append(labels, alertNameLabel)
labels = append(labels, tc.rule.Name)
h := hash(metricWithLabels(t, labels...))
labels := make(map[string]string)
for i := 0; i < len(ta.labels); i += 2 {
k, v := ta.labels[i], ta.labels[i+1]
labels[k] = v
}
labels[alertNameLabel] = tc.rule.Name
h := hash(labels)
expAlerts[h] = ta.alert
}
for key, exp := range expAlerts {
@@ -513,7 +516,7 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(datasource.Metric{}): {State: notifier.StatePending,
hash(nil): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -529,12 +532,12 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t,
alertNameLabel, "metric labels",
alertGroupNameLabel, "groupID",
"foo", "bar",
"namespace", "baz",
)): {State: notifier.StatePending,
hash(map[string]string{
alertNameLabel: "metric labels",
alertGroupNameLabel: "groupID",
"foo": "bar",
"namespace": "baz",
}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -550,11 +553,11 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t,
"foo", "bar",
"namespace", "baz",
"source", "vm",
)): {State: notifier.StatePending,
hash(map[string]string{
"foo": "bar",
"namespace": "baz",
"source": "vm",
}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
},
},
@@ -575,11 +578,11 @@ func TestAlertingRule_Restore(t *testing.T) {
),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-1"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(time.Hour)},
hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-2"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(2 * time.Hour)},
hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
hash(map[string]string{"host": "localhost-3"}): {State: notifier.StatePending,
ActiveAt: time.Now().Truncate(3 * time.Hour)},
},
},
@@ -659,7 +662,7 @@ func TestAlertingRule_Template(t *testing.T) {
metricWithValueAndLabels(t, 1, "instance", "bar"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): {
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): {
Annotations: map[string]string{},
Labels: map[string]string{
alertNameLabel: "common",
@@ -667,7 +670,7 @@
"instance": "foo",
},
},
hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): {
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): {
Annotations: map[string]string{},
Labels: map[string]string{
alertNameLabel: "common",
@@ -682,11 +685,10 @@
Name: "override label",
Labels: map[string]string{
"instance": "{{ $labels.instance }}",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "{{ $labels.instance }}" for region {{ $labels.region }}`,
"description": `It is {{ $value }} connections for "{{ $labels.instance }}"`,
"summary": `Too high connection number for "{{ $labels.instance }}"`,
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
},
alerts: make(map[uint64]*notifier.Alert),
},
@@ -695,64 +697,58 @@
metricWithValueAndLabels(t, 10, "instance", "bar", alertNameLabel, "override"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): {
hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "foo",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "foo" for region east`,
"description": `It is 2 connections for "foo"`,
"summary": `Too high connection number for "foo"`,
"description": `override: It is 2 connections for "foo"`,
},
},
hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): {
hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "bar",
"region": "east",
},
Annotations: map[string]string{
"summary": `Too high connection number for "bar" for region east`,
"description": `It is 10 connections for "bar"`,
"summary": `Too high connection number for "bar"`,
"description": `override: It is 10 connections for "bar"`,
},
},
},
},
{
&AlertingRule{
Name: "ExtraTemplating",
Name: "OriginLabels",
GroupName: "Testing",
Labels: map[string]string{
"name": "alert_{{ $labels.alertname }}",
"group": "group_{{ $labels.alertgroup }}",
"instance": "{{ $labels.instance }}",
},
Annotations: map[string]string{
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
"description": `Alert "{{ $labels.name }}({{ $labels.group }})" for instance {{ $labels.instance }}`,
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
},
alerts: make(map[uint64]*notifier.Alert),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 1, "instance", "foo"),
metricWithValueAndLabels(t, 1,
alertNameLabel, "originAlertname",
alertGroupNameLabel, "originGroupname",
"instance", "foo"),
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, alertNameLabel, "ExtraTemplating",
"name", "alert_ExtraTemplating",
alertGroupNameLabel, "Testing",
"group", "group_Testing",
"instance", "foo")): {
hash(map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "ExtraTemplating",
"name": "alert_ExtraTemplating",
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"group": "group_Testing",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `Alert "ExtraTemplating(Testing)" for instance foo`,
"description": `Alert "alert_ExtraTemplating(group_Testing)" for instance foo`,
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
},
},
},

@@ -174,7 +174,7 @@ func TestGroupStart(t *testing.T) {
m2 := metricWithLabels(t, "instance", inst2, "job", job)
r := g.Rules[0].(*AlertingRule)
alert1, err := r.newAlert(m1, time.Now(), nil)
alert1, err := r.newAlert(m1, nil, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
@@ -187,13 +187,9 @@
// add service labels
alert1.Labels[alertNameLabel] = alert1.Name
alert1.Labels[alertGroupNameLabel] = g.Name
var labels1 []string
for k, v := range alert1.Labels {
labels1 = append(labels1, k, v)
}
alert1.ID = hash(metricWithLabels(t, labels1...))
alert1.ID = hash(alert1.Labels)
alert2, err := r.newAlert(m2, time.Now(), nil)
alert2, err := r.newAlert(m2, nil, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
@@ -206,11 +202,7 @@
// add service labels
alert2.Labels[alertNameLabel] = alert2.Name
alert2.Labels[alertGroupNameLabel] = g.Name
var labels2 []string
for k, v := range alert2.Labels {
labels2 = append(labels2, k, v)
}
alert2.ID = hash(metricWithLabels(t, labels2...))
alert2.ID = hash(alert2.Labels)
finished := make(chan struct{})
fs.add(m1)

@@ -243,7 +243,7 @@ func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, vali
"tpl": externalAlertSource,
}
return func(alert notifier.Alert) string {
templated, err := alert.ExecTemplate(nil, m)
templated, err := alert.ExecTemplate(nil, nil, m)
if err != nil {
logger.Errorf("can not exec source template %s", err)
}

@@ -37,7 +37,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
g, ok := m.groups[gID]
if !ok {
return nil, fmt.Errorf("can't find group with id %q", gID)
return nil, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
@@ -48,7 +48,7 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
return apiAlert, nil
}
}
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
}
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {

@@ -88,8 +88,8 @@ var tplHeaders = []string{
// map of annotations.
// Every alert could have a different datasource, so the function
// requires a queryFunction as an argument.
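// Labels are passed explicitly so that templates operate on the original
// series labels rather than the processed alert labels.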
func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
tplData := AlertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
func (a *Alert) ExecTemplate(q QueryFn, labels, annotations map[string]string) (map[string]string, error) {
tplData := AlertTplData{Value: a.Value, Labels: labels, Expr: a.Expr}
return templateAnnotations(annotations, tplData, funcsWithQuery(q))
}

@@ -130,7 +130,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tpl, err := tc.alert.ExecTemplate(qFn, tc.annotations)
tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations)
if err != nil {
t.Fatal(err)
}