mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-03-11 15:34:56 +00:00
vmalert: automatically add exported_
prefix for original evaluation… (#5398)
automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161 Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
be20501376
commit
1f477aba41
6 changed files with 111 additions and 67 deletions
|
@ -237,11 +237,30 @@ type labelSet struct {
|
|||
origin map[string]string
|
||||
// processed labels includes origin labels
|
||||
// plus extra labels (group labels, service labels like alertNameLabel).
|
||||
// in case of conflicts, extra labels are preferred.
|
||||
// in case of key conflicts, origin labels are renamed with prefix `exported_` and extra labels are preferred.
|
||||
// see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161
|
||||
// used as labels attached to notifier.Alert and ALERTS series written to remote storage.
|
||||
processed map[string]string
|
||||
}
|
||||
|
||||
// add adds a value v with key k to origin and processed label sets.
|
||||
// On k conflicts in processed set, the passed v is preferred.
|
||||
// On k conflicts in origin set, the original value is preferred and copied
|
||||
// to processed with `exported_%k` key. The copy happens only if passed v isn't equal to origin[k] value.
|
||||
func (ls *labelSet) add(k, v string) {
|
||||
ls.processed[k] = v
|
||||
ov, ok := ls.origin[k]
|
||||
if !ok {
|
||||
ls.origin[k] = v
|
||||
return
|
||||
}
|
||||
if ov != v {
|
||||
// copy value only if v and ov are different
|
||||
key := fmt.Sprintf("exported_%s", k)
|
||||
ls.processed[key] = ov
|
||||
}
|
||||
}
|
||||
|
||||
// toLabels converts labels from given Metric
|
||||
// to labelSet which contains original and processed labels.
|
||||
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*labelSet, error) {
|
||||
|
@ -267,24 +286,14 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
|
|||
return nil, fmt.Errorf("failed to expand labels: %w", err)
|
||||
}
|
||||
for k, v := range extraLabels {
|
||||
ls.processed[k] = v
|
||||
if _, ok := ls.origin[k]; !ok {
|
||||
ls.origin[k] = v
|
||||
}
|
||||
ls.add(k, v)
|
||||
}
|
||||
|
||||
// set additional labels to identify group and rule name
|
||||
if ar.Name != "" {
|
||||
ls.processed[alertNameLabel] = ar.Name
|
||||
if _, ok := ls.origin[alertNameLabel]; !ok {
|
||||
ls.origin[alertNameLabel] = ar.Name
|
||||
}
|
||||
ls.add(alertNameLabel, ar.Name)
|
||||
}
|
||||
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
||||
ls.processed[alertGroupNameLabel] = ar.GroupName
|
||||
if _, ok := ls.origin[alertGroupNameLabel]; !ok {
|
||||
ls.origin[alertGroupNameLabel] = ar.GroupName
|
||||
}
|
||||
ls.add(alertGroupNameLabel, ar.GroupName)
|
||||
}
|
||||
return ls, nil
|
||||
}
|
||||
|
@ -414,8 +423,7 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
}
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
// duplicate may be caused the removal of `__name__` label
|
||||
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.Err
|
||||
}
|
||||
|
|
|
@ -768,14 +768,16 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
|||
ar.q = fq
|
||||
|
||||
// successful attempt
|
||||
// label `job` will be overridden by rule extra label, the original value will be reserved by "exported_job"
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err := ar.exec(context.TODO(), time.Now(), 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// label `job` will collide with rule extra label and will make both time series equal
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
// label `__name__` will be omitted and get duplicated results here
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo_1", "job", "bar"))
|
||||
_, err = ar.exec(context.TODO(), time.Now(), 0)
|
||||
if !errors.Is(err, errDuplicate) {
|
||||
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
|
||||
|
@ -899,20 +901,22 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"),
|
||||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{alertNameLabel: "override label", "instance": "foo"}): {
|
||||
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "override label",
|
||||
"instance": "foo",
|
||||
alertNameLabel: "override label",
|
||||
"exported_alertname": "override",
|
||||
"instance": "foo",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `first: Too high connection number for "foo"`,
|
||||
"description": `override: It is 2 connections for "foo"`,
|
||||
},
|
||||
},
|
||||
hash(map[string]string{alertNameLabel: "override label", "instance": "bar"}): {
|
||||
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "bar"}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "override label",
|
||||
"instance": "bar",
|
||||
alertNameLabel: "override label",
|
||||
"exported_alertname": "override",
|
||||
"instance": "bar",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `second: Too high connection number for "bar"`,
|
||||
|
@ -941,14 +945,18 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
},
|
||||
map[uint64]*notifier.Alert{
|
||||
hash(map[string]string{
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"instance": "foo",
|
||||
alertNameLabel: "OriginLabels",
|
||||
"exported_alertname": "originAlertname",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"exported_alertgroup": "originGroupname",
|
||||
"instance": "foo",
|
||||
}): {
|
||||
Labels: map[string]string{
|
||||
alertNameLabel: "OriginLabels",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"instance": "foo",
|
||||
alertNameLabel: "OriginLabels",
|
||||
"exported_alertname": "originAlertname",
|
||||
alertGroupNameLabel: "Testing",
|
||||
"exported_alertgroup": "originGroupname",
|
||||
"instance": "foo",
|
||||
},
|
||||
Annotations: map[string]string{
|
||||
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
|
||||
|
@ -1092,3 +1100,54 @@ func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.
|
|||
rule.KeepFiringFor = keepFiringFor
|
||||
return rule
|
||||
}
|
||||
|
||||
func TestAlertingRule_ToLabels(t *testing.T) {
|
||||
metric := datasource.Metric{
|
||||
Labels: []datasource.Label{
|
||||
{Name: "instance", Value: "0.0.0.0:8800"},
|
||||
{Name: "group", Value: "vmalert"},
|
||||
{Name: "alertname", Value: "ConfigurationReloadFailure"},
|
||||
},
|
||||
Values: []float64{1},
|
||||
Timestamps: []int64{time.Now().UnixNano()},
|
||||
}
|
||||
|
||||
ar := &AlertingRule{
|
||||
Labels: map[string]string{
|
||||
"instance": "override", // this should override instance with new value
|
||||
"group": "vmalert", // this shouldn't have effect since value in metric is equal
|
||||
},
|
||||
Expr: "sum(vmalert_alerting_rules_error) by(instance, group, alertname) > 0",
|
||||
Name: "AlertingRulesError",
|
||||
GroupName: "vmalert",
|
||||
}
|
||||
|
||||
expectedOriginLabels := map[string]string{
|
||||
"instance": "0.0.0.0:8800",
|
||||
"group": "vmalert",
|
||||
"alertname": "ConfigurationReloadFailure",
|
||||
"alertgroup": "vmalert",
|
||||
}
|
||||
|
||||
expectedProcessedLabels := map[string]string{
|
||||
"instance": "override",
|
||||
"exported_instance": "0.0.0.0:8800",
|
||||
"alertname": "AlertingRulesError",
|
||||
"exported_alertname": "ConfigurationReloadFailure",
|
||||
"group": "vmalert",
|
||||
"alertgroup": "vmalert",
|
||||
}
|
||||
|
||||
ls, err := ar.toLabels(metric, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(ls.origin, expectedOriginLabels) {
|
||||
t.Errorf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(ls.processed, expectedProcessedLabels) {
|
||||
t.Errorf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -194,6 +194,9 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
|
|||
labels["__name__"] = rr.Name
|
||||
// override existing labels with configured ones
|
||||
for k, v := range rr.Labels {
|
||||
if _, ok := labels[k]; ok && labels[k] != v {
|
||||
labels[fmt.Sprintf("exported_%s", k)] = labels[k]
|
||||
}
|
||||
labels[k] = v
|
||||
}
|
||||
return newTimeSeries(m.Values, m.Timestamps, labels)
|
||||
|
@ -203,7 +206,7 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
|
|||
func (rr *RecordingRule) updateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
return fmt.Errorf("BUG: attempt to update recording rule with wrong type %#v", r)
|
||||
}
|
||||
rr.Expr = nr.Expr
|
||||
rr.Labels = nr.Labels
|
||||
|
|
|
@ -61,7 +61,7 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar", "source", "origin"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
|
@ -70,9 +70,10 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
"source": "test",
|
||||
}),
|
||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
"__name__": "job:foo",
|
||||
"job": "bar",
|
||||
"source": "test",
|
||||
"exported_source": "origin",
|
||||
}),
|
||||
},
|
||||
},
|
||||
|
@ -254,10 +255,7 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
|
|||
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), errDuplicate.Error()) {
|
||||
t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
* BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped.
|
||||
* BUGFIX: `vmstorage`: added missing `-inmemoryDataFlushInterval` command-line flag, which was missing in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) after implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337) in [v1.85.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.0).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
|
||||
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Before, retries happened only on writes into a network connection between source and destination. But errors returned by server after all the data was transmitted were logged, but not retried.
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.
|
||||
|
|
|
@ -100,6 +100,8 @@ See the full list of configuration flags in [configuration](#configuration) sect
|
|||
|
||||
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
|
||||
to specify different `-external.label` command-line flags in order to define which `vmalert` generated rules or alerts.
|
||||
If rule result metrics have label that conflict with `-external.label`, `vmalert` will automatically rename
|
||||
it with prefix `exported_`.
|
||||
|
||||
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
|
||||
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
|
||||
|
@ -896,33 +898,6 @@ max(vmalert_alerting_rules_last_evaluation_series_fetched) by(group, alertname)
|
|||
See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4039).
|
||||
This feature is available only if vmalert is using VictoriaMetrics v1.90 or higher as a datasource.
|
||||
|
||||
### Series with the same labelset
|
||||
|
||||
vmalert can produce the following error message during rules evaluation:
|
||||
```
|
||||
result contains metrics with the same labelset after applying rule labels
|
||||
```
|
||||
|
||||
The error means there is a collision between [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series)
|
||||
after applying extra labels to result.
|
||||
|
||||
For example, a rule with `expr: foo > 0` returns two distinct time series in response:
|
||||
```
|
||||
foo{bar="baz"} 1
|
||||
foo{bar="qux"} 2
|
||||
```
|
||||
|
||||
If user configures `-external.label=bar=baz` cmd-line flag to enforce
|
||||
adding `bar="baz"` label-value pair, then time series won't be distinct anymore:
|
||||
```
|
||||
foo{bar="baz"} 1
|
||||
foo{bar="baz"} 2 # 'bar' label was overriden by `-external.label=bar=baz
|
||||
```
|
||||
|
||||
The same issue can be caused by collision of configured `labels` on [Group](#groups) or [Rule](#rules) levels.
|
||||
To fix it one should avoid collisions by carefully picking label overrides in configuration.
|
||||
|
||||
|
||||
## Security
|
||||
|
||||
See general recommendations regarding security [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#security).
|
||||
|
|
Loading…
Reference in a new issue