dashboards: add file label filter to vmalert dashboard panels (#7515)

Previously, metrics from groups with the same name but in different
files could be mixed in the results.

For example, the evaluation time shown
[here](https://grafana.maas.victoriametrics.com/d/LzldHAVnz/victoriametrics-vmalert?orgId=1&var-ds=PE8D8DB4BEE4E4B22&var-job=All&var-instance=All&var-file=%2Fetc%2Fvmalert%2Fconfig%2Fvm-per-tenant-rulefiles-0%2Fmaas-tenant-1011-maas-1011-vm-health.yaml&var-group=All&var-topk=5&editPanel=23)
is the total for multiple groups from different tenants.
This commit is contained in:
Hui Wang 2024-11-13 00:00:39 +08:00 committed by f41gh7
parent 9c1b3b63bc
commit 1bff6c1bbd
No known key found for this signature in database
GPG key ID: 4558311CF775EC72
4 changed files with 92 additions and 45 deletions

View file

@ -276,7 +276,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -338,7 +338,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -404,7 +404,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0))",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -910,9 +910,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n) by(job, instance, group)) \nby(job, group))",
"expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))",
"interval": "",
"legendFormat": "{{group}} ({{job}})",
"legendFormat": "({{job}}) {{group}}({{file}})",
"range": true,
"refId": "A"
}
@ -2292,9 +2292,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group) > 0",
"expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group, file) > 0",
"interval": "1m",
"legendFormat": "__auto",
"legendFormat": "({{job}}) {{group}}({{file}})",
"range": true,
"refId": "A"
}
@ -2517,9 +2517,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0)",
"expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0)",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -2619,9 +2619,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -2721,9 +2721,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
"expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -3050,9 +3050,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, instance, group, recording) > 0\n ) by(job, group, recording)\n)",
"expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3152,9 +3152,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"} < 1) by(job, group, recording)",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"} < 1) by(job, group, file, recording)",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3251,9 +3251,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, recording) > 0",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3749,6 +3749,29 @@
"sort": 0,
"type": "query"
},
{
"allValue": ".*",
"current": {},
"datasource": {
"type": "victoriametrics-datasource",
"uid": "$ds"
},
"definition": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "file",
"options": [],
"query": {
"query": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".*",
"current": {},

View file

@ -275,7 +275,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -337,7 +337,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -403,7 +403,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0))",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -909,9 +909,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n) by(job, instance, group)) \nby(job, group))",
"expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))",
"interval": "",
"legendFormat": "{{group}} ({{job}})",
"legendFormat": "({{job}}) {{group}}({{file}})",
"range": true,
"refId": "A"
}
@ -2291,9 +2291,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group) > 0",
"expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group, file) > 0",
"interval": "1m",
"legendFormat": "__auto",
"legendFormat": "({{job}}) {{group}}({{file}})",
"range": true,
"refId": "A"
}
@ -2516,9 +2516,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0)",
"expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0)",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -2618,9 +2618,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -2720,9 +2720,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
"expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})",
"range": true,
"refId": "A"
}
@ -3049,9 +3049,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, instance, group, recording) > 0\n ) by(job, group, recording)\n)",
"expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3151,9 +3151,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"} < 1) by(job, group, recording)",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"} < 1) by(job, group, file, recording)",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3250,9 +3250,9 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, recording) > 0",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})",
"range": true,
"refId": "A"
}
@ -3748,6 +3748,29 @@
"sort": 0,
"type": "query"
},
{
"allValue": ".*",
"current": {},
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"definition": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "file",
"options": [],
"query": {
"query": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".*",
"current": {},

View file

@ -23,9 +23,9 @@ groups:
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
description: "Alerting rules execution is failing for group \"{{ $labels.group }}\".
description: "Alerting rules execution is failing for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
- alert: RecordingRulesError
@ -34,9 +34,9 @@ groups:
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}"
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
description: "Recording rules execution is failing for group \"{{ $labels.group }}\".
description: "Recording rules execution is failing for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
- alert: RecordingRulesNoData
@ -45,9 +45,9 @@ groups:
labels:
severity: info
annotations:
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}"
dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data"
description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\"
description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\"
produces 0 samples over the last 30min. It might be caused by a misconfiguration
or incorrect query expression."
@ -58,7 +58,7 @@ groups:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\".
description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
The group evaluation time takes longer than the configured evaluation interval. This may result in missed
alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of
group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups.

View file

@ -24,6 +24,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068).
* BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details.
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth): properly check availability of all the backends before giving up when proxying requests. Previously, vmauth could return an error even if there were healthy backends available. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061) for details.
* BUGFIX: [dashboards](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards): add `file` label filter to vmalert dashboard panels. Previously, metrics from groups with the same name but different rule files could be mixed in the results.
## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0)