vmalert: automatically add exported_ prefix for original evaluation… (#5398)

automatically add the `exported_` prefix to an original evaluation result label if it conflicts with an external or reserved one;
previously the original label was overridden.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161
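In short: when the rule's result labels are merged with extra labels (rule/group labels, alert metadata, external labels), a conflicting original value is now kept under an `exported_`-prefixed key instead of being dropped. A minimal, self-contained sketch of that rule (the `mergeWithExported` helper is illustrative only, not vmalert's API; the actual logic is the `labelSet.add` method in the diff below):

```go
package main

import "fmt"

// mergeWithExported is an illustrative stand-in for the behaviour introduced
// by this commit: on a key conflict the extra label wins, but the original
// value is preserved under an `exported_`-prefixed key instead of being lost.
func mergeWithExported(origin, extra map[string]string) map[string]string {
	processed := make(map[string]string, len(origin)+len(extra))
	for k, v := range origin {
		processed[k] = v
	}
	for k, v := range extra {
		if ov, ok := origin[k]; ok && ov != v {
			processed["exported_"+k] = ov // keep the original value
		}
		processed[k] = v // extra label is preferred
	}
	return processed
}

func main() {
	origin := map[string]string{"alertname": "ConfigReloadFailure", "instance": "0.0.0.0:8800"}
	extra := map[string]string{"alertname": "AlertingRulesError"} // e.g. the rule's own name
	fmt.Println(mergeWithExported(origin, extra))
	// map[alertname:AlertingRulesError exported_alertname:ConfigReloadFailure instance:0.0.0.0:8800]
}
```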

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit 1f477aba41)
This commit is contained in:
Hui Wang 2023-12-22 23:07:47 +08:00 committed by hagen1778
parent 372553479a
commit c14e229b20
No known key found for this signature in database
GPG key ID: 3BF75F3741CA9640
6 changed files with 111 additions and 67 deletions

View file

@@ -237,11 +237,30 @@ type labelSet struct {
origin map[string]string
// processed labels includes origin labels
// plus extra labels (group labels, service labels like alertNameLabel).
// in case of conflicts, extra labels are preferred.
// in case of key conflicts, origin labels are renamed with prefix `exported_` and extra labels are preferred.
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161
// used as labels attached to notifier.Alert and ALERTS series written to remote storage.
processed map[string]string
}
// add adds a value v with key k to origin and processed label sets.
// On k conflicts in processed set, the passed v is preferred.
// On k conflicts in origin set, the original value is preferred and copied
// to processed with `exported_%k` key. The copy happens only if passed v isn't equal to origin[k] value.
func (ls *labelSet) add(k, v string) {
ls.processed[k] = v
ov, ok := ls.origin[k]
if !ok {
ls.origin[k] = v
return
}
if ov != v {
// copy value only if v and ov are different
key := fmt.Sprintf("exported_%s", k)
ls.processed[key] = ov
}
}
// toLabels converts labels from given Metric
// to labelSet which contains original and processed labels.
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*labelSet, error) {
@@ -267,24 +286,14 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
return nil, fmt.Errorf("failed to expand labels: %w", err)
}
for k, v := range extraLabels {
ls.processed[k] = v
if _, ok := ls.origin[k]; !ok {
ls.origin[k] = v
}
ls.add(k, v)
}
// set additional labels to identify group and rule name
if ar.Name != "" {
ls.processed[alertNameLabel] = ar.Name
if _, ok := ls.origin[alertNameLabel]; !ok {
ls.origin[alertNameLabel] = ar.Name
}
ls.add(alertNameLabel, ar.Name)
}
if !*disableAlertGroupLabel && ar.GroupName != "" {
ls.processed[alertGroupNameLabel] = ar.GroupName
if _, ok := ls.origin[alertGroupNameLabel]; !ok {
ls.origin[alertGroupNameLabel] = ar.GroupName
}
ls.add(alertGroupNameLabel, ar.GroupName)
}
return ls, nil
}
@@ -414,8 +423,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
// duplicate may be caused by the removal of the `__name__` label
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.Err
}
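With conflicts now resolved via `exported_` labels, the duplicate error above is expected to come from results that differ only by metric name. A standalone, illustrative sketch (not vmalert code) of why dropping `__name__` collapses such results into one label set:

```go
package main

import (
	"fmt"
	"reflect"
)

func main() {
	// Two rule results that differ only by metric name.
	a := map[string]string{"__name__": "foo", "job": "bar"}
	b := map[string]string{"__name__": "foo_1", "job": "bar"}

	// Alerting rules drop __name__ from the processed label set,
	// so both results end up with identical labels and trip errDuplicate.
	delete(a, "__name__")
	delete(b, "__name__")
	fmt.Println(reflect.DeepEqual(a, b)) // true
}
```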

View file

@@ -768,14 +768,16 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
ar.q = fq
// successful attempt
// label `job` will be overridden by the rule's extra label, the original value will be preserved as "exported_job"
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatal(err)
}
// label `job` will collide with rule extra label and will make both time series equal
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
// label `__name__` will be omitted, so this metric duplicates the previous one
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo_1", "job", "bar"))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
@@ -899,20 +901,22 @@ func TestAlertingRule_Template(t *testing.T) {
metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"),
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "foo",
alertNameLabel: "override label",
"exported_alertname": "override",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `first: Too high connection number for "foo"`,
"description": `override: It is 2 connections for "foo"`,
},
},
hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "bar"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"instance": "bar",
alertNameLabel: "override label",
"exported_alertname": "override",
"instance": "bar",
},
Annotations: map[string]string{
"summary": `second: Too high connection number for "bar"`,
@@ -941,14 +945,18 @@ func TestAlertingRule_Template(t *testing.T) {
},
map[uint64]*notifier.Alert{
hash(map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"instance": "foo",
alertNameLabel: "OriginLabels",
"exported_alertname": "originAlertname",
alertGroupNameLabel: "Testing",
"exported_alertgroup": "originGroupname",
"instance": "foo",
}): {
Labels: map[string]string{
alertNameLabel: "OriginLabels",
alertGroupNameLabel: "Testing",
"instance": "foo",
alertNameLabel: "OriginLabels",
"exported_alertname": "originAlertname",
alertGroupNameLabel: "Testing",
"exported_alertgroup": "originGroupname",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
@@ -1092,3 +1100,54 @@ func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.
rule.KeepFiringFor = keepFiringFor
return rule
}
func TestAlertingRule_ToLabels(t *testing.T) {
metric := datasource.Metric{
Labels: []datasource.Label{
{Name: "instance", Value: "0.0.0.0:8800"},
{Name: "group", Value: "vmalert"},
{Name: "alertname", Value: "ConfigurationReloadFailure"},
},
Values: []float64{1},
Timestamps: []int64{time.Now().UnixNano()},
}
ar := &AlertingRule{
Labels: map[string]string{
"instance": "override", // this should override instance with new value
"group": "vmalert", // this shouldn't have effect since value in metric is equal
},
Expr: "sum(vmalert_alerting_rules_error) by(instance, group, alertname) > 0",
Name: "AlertingRulesError",
GroupName: "vmalert",
}
expectedOriginLabels := map[string]string{
"instance": "0.0.0.0:8800",
"group": "vmalert",
"alertname": "ConfigurationReloadFailure",
"alertgroup": "vmalert",
}
expectedProcessedLabels := map[string]string{
"instance": "override",
"exported_instance": "0.0.0.0:8800",
"alertname": "AlertingRulesError",
"exported_alertname": "ConfigurationReloadFailure",
"group": "vmalert",
"alertgroup": "vmalert",
}
ls, err := ar.toLabels(metric, nil)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
if !reflect.DeepEqual(ls.origin, expectedOriginLabels) {
t.Errorf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels)
}
if !reflect.DeepEqual(ls.processed, expectedProcessedLabels) {
t.Errorf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels)
}
}

View file

@@ -194,6 +194,9 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
labels["__name__"] = rr.Name
// override existing labels with configured ones
for k, v := range rr.Labels {
if _, ok := labels[k]; ok && labels[k] != v {
labels[fmt.Sprintf("exported_%s", k)] = labels[k]
}
labels[k] = v
}
return newTimeSeries(m.Values, m.Timestamps, labels)
@@ -203,7 +206,7 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
func (rr *RecordingRule) updateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
return fmt.Errorf("BUG: attempt to update recording rule with wrong type %#v", r)
}
rr.Expr = nr.Expr
rr.Labels = nr.Labels

View file

@@ -61,7 +61,7 @@ func TestRecordingRule_Exec(t *testing.T) {
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar", "source", "origin"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
@@ -70,9 +70,10 @@
"source": "test",
}),
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
"job": "bar",
"source": "test",
"__name__": "job:foo",
"job": "bar",
"source": "test",
"exported_source": "origin",
}),
},
},
@@ -254,10 +255,7 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
if !strings.Contains(err.Error(), errDuplicate.Error()) {
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
if err != nil {
t.Fatal(err)
}
}

View file

@@ -44,6 +44,7 @@ The sandbox cluster installation is running under the constant load generated by
* BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped.
* BUGFIX: `vmstorage`: added missing `-inmemoryDataFlushInterval` command-line flag, which was missing in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) after implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337) in [v1.85.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.0).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add the `exported_` prefix to an original evaluation result label if it conflicts with an external or reserved one; previously it was silently overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Before, retries happened only on writes into a network connection between source and destination. But errors returned by server after all the data was transmitted were logged, but not retried.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.

View file

@@ -100,6 +100,8 @@ See the full list of configuration flags in [configuration](#configuration) sect
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `-external.label` command-line flags in order to define which `vmalert` generated rules or alerts.
If rule result metrics have labels that conflict with `-external.label`, `vmalert` will automatically rename
the original label with the `exported_` prefix.
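For example (illustrative), with `-external.label=job=vmalert` a rule result keeps its original `job` value instead of losing it:
```
foo{job="node"} 1                          # rule result
foo{job="vmalert",exported_job="node"} 1   # after applying -external.label=job=vmalert
```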
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
@@ -896,33 +898,6 @@ max(vmalert_alerting_rules_last_evaluation_series_fetched) by(group, alertname)
See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
This feature is available only if vmalert is using VictoriaMetrics v1.90 or higher as a datasource.
### Series with the same labelset
vmalert can produce the following error message during rules evaluation:
```
result contains metrics with the same labelset after applying rule labels
```
The error means there is a collision between [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
after applying extra labels to result.
For example, a rule with `expr: foo > 0` returns two distinct time series in response:
```
foo{bar="baz"} 1
foo{bar="qux"} 2
```
If user configures `-external.label=bar=baz` cmd-line flag to enforce
adding `bar="baz"` label-value pair, then time series won't be distinct anymore:
```
foo{bar="baz"} 1
foo{bar="baz"} 2 # 'bar' label was overriden by `-external.label=bar=baz
```
The same issue can be caused by collision of configured `labels` on [Group](#groups) or [Rule](#rules) levels.
To fix it one should avoid collisions by carefully picking label overrides in configuration.
## Security
See general recommendations regarding security [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#security).