app/vmalert: replace error metrics for gauges with counter metrics (#5217)

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Dmytro Kozlov 2023-12-06 19:39:35 +01:00 committed by GitHub
parent 65bc460323
commit 935bec447b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 45 additions and 35 deletions

View file

@ -48,7 +48,7 @@ type AlertingRule struct {
}
type alertingRuleMetrics struct {
errors *utils.Gauge
errors *utils.Counter
pending *utils.Gauge
active *utils.Gauge
samples *utils.Gauge
@ -118,14 +118,7 @@ func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
}
return float64(num)
})
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.Err == nil {
return 0
}
return 1
})
ar.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{%s}`, labels))
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := ar.state.getLast()
@ -385,6 +378,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
defer func() {
ar.state.add(curState)
if curState.Err != nil {
ar.metrics.errors.Inc()
}
}()
ar.alertsMu.Lock()

View file

@ -3,6 +3,7 @@ package rule
import (
"context"
"errors"
"fmt"
"reflect"
"sort"
"strings"
@ -13,6 +14,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
@ -1078,6 +1080,9 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: &ruleState{entries: make([]StateEntry, 10)},
metrics: &alertingRuleMetrics{
errors: utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{alertname=%q}`, name)),
},
}
return &rule
}

View file

@ -36,7 +36,7 @@ type RecordingRule struct {
}
type recordingRuleMetrics struct {
errors *utils.Gauge
errors *utils.Counter
samples *utils.Gauge
}
@ -83,14 +83,7 @@ func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
}
labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
e := rr.state.getLast()
if e.Err == nil {
return 0
}
return 1
})
rr.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_recording_rules_errors_total{%s}`, labels))
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := rr.state.getLast()
@ -142,6 +135,9 @@ func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]p
defer func() {
rr.state.add(curState)
if curState.Err != nil {
rr.metrics.errors.Inc()
}
}()
if err != nil {

View file

@ -8,6 +8,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
@ -201,9 +202,15 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
}
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
rule := &RecordingRule{Name: "job:foo",
state: &ruleState{entries: make([]StateEntry, 10)},
Labels: map[string]string{
"source": "test_limit",
}}
},
metrics: &recordingRuleMetrics{
errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`),
},
}
var err error
for _, testCase := range testCases {
fq := &datasource.FakeQuerier{}
@ -223,6 +230,9 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
"job": "test",
},
state: &ruleState{entries: make([]StateEntry, 10)},
metrics: &recordingRuleMetrics{
errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`),
},
}
fq := &datasource.FakeQuerier{}
expErr := "connection reset by peer"

View file

@ -81,7 +81,7 @@
}
]
},
"description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher",
"description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@ -273,7 +273,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -333,7 +333,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -397,7 +397,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))",
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -2240,7 +2240,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"range": true,
@ -2872,7 +2872,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0",
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"range": true,

View file

@ -80,7 +80,7 @@
}
]
},
"description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher",
"description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@ -272,7 +272,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -332,7 +332,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -396,7 +396,7 @@
"uid": "$ds"
},
"exemplar": false,
"expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))",
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
"interval": "",
"legendFormat": "",
"refId": "A"
@ -2239,7 +2239,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
"interval": "",
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
"range": true,
@ -2871,7 +2871,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0",
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
"interval": "",
"legendFormat": "{{group}}.{{recording}} ({{job}})",
"range": true,

View file

@ -18,7 +18,7 @@ groups:
Check vmalert's logs for detailed error message."
- alert: AlertingRulesError
expr: sum(vmalert_alerting_rules_error) by(job, instance, group, file) > 0
expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) by(job, instance, group, file) > 0
for: 5m
labels:
severity: warning
@ -29,7 +29,7 @@ groups:
Check vmalert's logs for detailed error message."
- alert: RecordingRulesError
expr: sum(vmalert_recording_rules_error) by(job, instance, group, file) > 0
expr: sum(increase(vmalert_recording_rules_errors_total[5m])) by(job, instance, group, file) > 0
for: 5m
labels:
severity: warning

View file

@ -28,6 +28,8 @@ The sandbox cluster installation is running under the constant load generated by
## tip
**vmalert's metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` were replaced with `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details.**
* SECURITY: upgrade base docker image (Alpine) from 3.18.4 to 3.18.5. See [alpine 3.18.5 release notes](https://www.alpinelinux.org/posts/Alpine-3.15.11-3.16.8-3.17.6-3.18.5-released.html).
* FEATURE: `vmselect`: allow opening [vmui](https://docs.victoriametrics.com/#vmui) and investigating [Top queries](https://docs.victoriametrics.com/#top-queries) and [Active queries](https://docs.victoriametrics.com/#active-queries) when the `vmselect` is overloaded with concurrent queries (e.g. when more than `-search.maxConcurrentRequests` concurrent queries are executed). Previously an attempt to open `Top queries` or `Active queries` at `vmui` could result in `couldn't start executing the request in ... seconds, since -search.maxConcurrentRequests=... concurrent requests are executed` error, which could complicate debugging of overloaded `vmselect` or single-node VictoriaMetrics.
@ -39,6 +41,7 @@ The sandbox cluster installation is running under the constant load generated by
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): export `vm_promscrape_scrape_pool_targets` [metric](https://docs.victoriametrics.com/vmagent.html#monitoring) to track the number of targets each scrape job discovers. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5311).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): provide `/vmalert/api/v1/rule` and `/api/v1/rule` API endpoints to get the rule object in JSON format. See [these docs](https://docs.victoriametrics.com/vmalert.html#web) for details.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): deprecate process gauge metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` in favour of `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total` counter metrics. [Counter](https://docs.victoriametrics.com/keyConcepts.html#counter) metric type is more suitable for error counting as it preserves the state change between the scrapes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details.
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/).
* FEATURE: all VictoriaMetrics binaries: expose additional metrics at `/metrics` page, which may simplify debugging of VictoriaMetrics components (see [this feature request](https://github.com/VictoriaMetrics/metrics/issues/54)):
* `go_sched_latencies_seconds` - the [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram), which shows the time goroutines have spent in runnable state before actually running. Big values point to the lack of CPU time for the current workload.