From 935bec447bb72d957bfb9ea83e9f98ffb7a0dfbd Mon Sep 17 00:00:00 2001 From: Dmytro Kozlov Date: Wed, 6 Dec 2023 19:39:35 +0100 Subject: [PATCH] app/vmalert: replace error metrics for gauges with counter metrics (#5217) See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160 Signed-off-by: hagen1778 Co-authored-by: hagen1778 --- app/vmalert/rule/alerting.go | 14 +++++--------- app/vmalert/rule/alerting_test.go | 5 +++++ app/vmalert/rule/recording.go | 14 +++++--------- app/vmalert/rule/recording_test.go | 16 +++++++++++++--- dashboards/vm/vmalert.json | 12 ++++++------ dashboards/vmalert.json | 12 ++++++------ deployment/docker/alerts-vmalert.yml | 4 ++-- docs/CHANGELOG.md | 3 +++ 8 files changed, 45 insertions(+), 35 deletions(-) diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go index 93584930d..e10405250 100644 --- a/app/vmalert/rule/alerting.go +++ b/app/vmalert/rule/alerting.go @@ -48,7 +48,7 @@ type AlertingRule struct { } type alertingRuleMetrics struct { - errors *utils.Gauge + errors *utils.Counter pending *utils.Gauge active *utils.Gauge samples *utils.Gauge @@ -118,14 +118,7 @@ func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule } return float64(num) }) - ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels), - func() float64 { - e := ar.state.getLast() - if e.Err == nil { - return 0 - } - return 1 - }) + ar.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{%s}`, labels)) ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels), func() float64 { e := ar.state.getLast() @@ -385,6 +378,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr defer func() { ar.state.add(curState) + if curState.Err != nil { + ar.metrics.errors.Inc() + } }() ar.alertsMu.Lock() diff --git a/app/vmalert/rule/alerting_test.go 
b/app/vmalert/rule/alerting_test.go index b6496c688..91d90e31e 100644 --- a/app/vmalert/rule/alerting_test.go +++ b/app/vmalert/rule/alerting_test.go @@ -3,6 +3,7 @@ package rule import ( "context" "errors" + "fmt" "reflect" "sort" "strings" @@ -13,6 +14,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" ) @@ -1078,6 +1080,9 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule { EvalInterval: waitFor, alerts: make(map[uint64]*notifier.Alert), state: &ruleState{entries: make([]StateEntry, 10)}, + metrics: &alertingRuleMetrics{ + errors: utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{alertname=%q}`, name)), + }, } return &rule } diff --git a/app/vmalert/rule/recording.go b/app/vmalert/rule/recording.go index 196f51456..08a69a8fe 100644 --- a/app/vmalert/rule/recording.go +++ b/app/vmalert/rule/recording.go @@ -36,7 +36,7 @@ type RecordingRule struct { } type recordingRuleMetrics struct { - errors *utils.Gauge + errors *utils.Counter samples *utils.Gauge } @@ -83,14 +83,7 @@ func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul } labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID()) - rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels), - func() float64 { - e := rr.state.getLast() - if e.Err == nil { - return 0 - } - return 1 - }) + rr.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_recording_rules_errors_total{%s}`, labels)) rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, 
labels), func() float64 { e := rr.state.getLast() @@ -142,6 +135,9 @@ func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]p defer func() { rr.state.add(curState) + if curState.Err != nil { + rr.metrics.errors.Inc() + } }() if err != nil { diff --git a/app/vmalert/rule/recording_test.go b/app/vmalert/rule/recording_test.go index 287bb065c..65b391f19 100644 --- a/app/vmalert/rule/recording_test.go +++ b/app/vmalert/rule/recording_test.go @@ -8,6 +8,7 @@ import ( "time" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" ) @@ -201,9 +202,15 @@ func TestRecordingRuleLimit(t *testing.T) { metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"), } - rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{ - "source": "test_limit", - }} + rule := &RecordingRule{Name: "job:foo", + state: &ruleState{entries: make([]StateEntry, 10)}, + Labels: map[string]string{ + "source": "test_limit", + }, + metrics: &recordingRuleMetrics{ + errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`), + }, + } var err error for _, testCase := range testCases { fq := &datasource.FakeQuerier{} @@ -223,6 +230,9 @@ func TestRecordingRule_ExecNegative(t *testing.T) { "job": "test", }, state: &ruleState{entries: make([]StateEntry, 10)}, + metrics: &recordingRuleMetrics{ + errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`), + }, } fq := &datasource.FakeQuerier{} expErr := "connection reset by peer" diff --git a/dashboards/vm/vmalert.json b/dashboards/vm/vmalert.json index 0c0a3c5c2..a2ec9b53f 100644 --- a/dashboards/vm/vmalert.json +++ b/dashboards/vm/vmalert.json @@ -81,7 +81,7 @@ } ] 
}, - "description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher", + "description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -273,7 +273,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -333,7 +333,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -397,7 +397,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))", + "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))", "interval": "", "legendFormat": "", "refId": "A" @@ -2240,7 +2240,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0", + "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0", "interval": "", "legendFormat": "{{group}}.{{alertname}} ({{job}})", "range": true, @@ 
-2872,7 +2872,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0", + "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0", "interval": "", "legendFormat": "{{group}}.{{recording}} ({{job}})", "range": true, diff --git a/dashboards/vmalert.json b/dashboards/vmalert.json index 054837139..5a9d53e82 100644 --- a/dashboards/vmalert.json +++ b/dashboards/vmalert.json @@ -80,7 +80,7 @@ } ] }, - "description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher", + "description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -272,7 +272,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -332,7 +332,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -396,7 +396,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))", + "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + 
\n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))", "interval": "", "legendFormat": "", "refId": "A" @@ -2239,7 +2239,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0", + "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0", "interval": "", "legendFormat": "{{group}}.{{alertname}} ({{job}})", "range": true, @@ -2871,7 +2871,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0", + "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0", "interval": "", "legendFormat": "{{group}}.{{recording}} ({{job}})", "range": true, diff --git a/deployment/docker/alerts-vmalert.yml b/deployment/docker/alerts-vmalert.yml index ef6a5a7d3..44af7ecaa 100644 --- a/deployment/docker/alerts-vmalert.yml +++ b/deployment/docker/alerts-vmalert.yml @@ -18,7 +18,7 @@ groups: Check vmalert's logs for detailed error message." - alert: AlertingRulesError - expr: sum(vmalert_alerting_rules_error) by(job, instance, group, file) > 0 + expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) by(job, instance, group, file) > 0 for: 5m labels: severity: warning @@ -29,7 +29,7 @@ groups: Check vmalert's logs for detailed error message." 
- alert: RecordingRulesError - expr: sum(vmalert_recording_rules_error) by(job, instance, group, file) > 0 + expr: sum(increase(vmalert_recording_rules_errors_total[5m])) by(job, instance, group, file) > 0 for: 5m labels: severity: warning diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2bdf9b563..540653c13 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -28,6 +28,8 @@ The sandbox cluster installation is running under the constant load generated by ## tip +**vmalert's metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` were replaced with `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details.** + * SECURITY: upgrade base docker image (Alpine) from 3.18.4 to 3.18.5. See [alpine 3.18.5 release notes](https://www.alpinelinux.org/posts/Alpine-3.15.11-3.16.8-3.17.6-3.18.5-released.html). * FEATURE: `vmselect`: allow opening [vmui](https://docs.victoriametrics.com/#vmui) and investigating [Top queries](https://docs.victoriametrics.com/#top-queries) and [Active queries](https://docs.victoriametrics.com/#active-queries) when the `vmselect` is overloaded with concurrent queries (e.g. when more than `-search.maxConcurrentRequests` concurrent queries are executed). Previously an attempt to open `Top queries` or `Active queries` at `vmui` could result in `couldn't start executing the request in ... seconds, since -search.maxConcurrentRequests=... concurrent requests are executed` error, which could complicate debugging of overloaded `vmselect` or single-node VictoriaMetrics. @@ -39,6 +41,7 @@ The sandbox cluster installation is running under the constant load generated by * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details. 
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): export `vm_promscrape_scrape_pool_targets` [metric](https://docs.victoriametrics.com/vmagent.html#monitoring) to track the number of targets each scrape job discovers. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5311). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): provide `/vmalert/api/v1/rule` and `/api/v1/rule` API endpoints to get the rule object in JSON format. See [these docs](https://docs.victoriametrics.com/vmalert.html#web) for details. +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): replace gauge metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` with `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total` counter metrics. The [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) metric type is more suitable for error counting, since it captures every failed evaluation, including those which happen between scrapes, while the gauge exposed only the state of the last evaluation. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details. * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/). * FEATURE: all VictoriaMetrics binaries: expose additional metrics at `/metrics` page, which may simplify debugging of VictoriaMetrics components (see [this feature request](https://github.com/VictoriaMetrics/metrics/issues/54)): * `go_sched_latencies_seconds` - the [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram), which shows the time goroutines have spent in runnable state before actually running. 
Big values point to the lack of CPU time for the current workload.