mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
app/vmalert: replace error metrics for gauges with counter metrics (#5217)
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160 Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
65bc460323
commit
935bec447b
8 changed files with 45 additions and 35 deletions
|
@ -48,7 +48,7 @@ type AlertingRule struct {
|
|||
}
|
||||
|
||||
type alertingRuleMetrics struct {
|
||||
errors *utils.Gauge
|
||||
errors *utils.Counter
|
||||
pending *utils.Gauge
|
||||
active *utils.Gauge
|
||||
samples *utils.Gauge
|
||||
|
@ -118,14 +118,7 @@ func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
}
|
||||
return float64(num)
|
||||
})
|
||||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
ar.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{%s}`, labels))
|
||||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
|
@ -385,6 +378,9 @@ func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
|
||||
defer func() {
|
||||
ar.state.add(curState)
|
||||
if curState.Err != nil {
|
||||
ar.metrics.errors.Inc()
|
||||
}
|
||||
}()
|
||||
|
||||
ar.alertsMu.Lock()
|
||||
|
|
|
@ -3,6 +3,7 @@ package rule
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
|
@ -13,6 +14,7 @@ import (
|
|||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
@ -1078,6 +1080,9 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
|||
EvalInterval: waitFor,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
metrics: &alertingRuleMetrics{
|
||||
errors: utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{alertname=%q}`, name)),
|
||||
},
|
||||
}
|
||||
return &rule
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ type RecordingRule struct {
|
|||
}
|
||||
|
||||
type recordingRuleMetrics struct {
|
||||
errors *utils.Gauge
|
||||
errors *utils.Counter
|
||||
samples *utils.Gauge
|
||||
}
|
||||
|
||||
|
@ -83,14 +83,7 @@ func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
}
|
||||
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, file=%q, id="%d"`, rr.Name, group.Name, group.File, rr.ID())
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
rr.metrics.errors = utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_recording_rules_errors_total{%s}`, labels))
|
||||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
|
@ -142,6 +135,9 @@ func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]p
|
|||
|
||||
defer func() {
|
||||
rr.state.add(curState)
|
||||
if curState.Err != nil {
|
||||
rr.metrics.errors.Inc()
|
||||
}
|
||||
}()
|
||||
|
||||
if err != nil {
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
|
@ -201,9 +202,15 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
}
|
||||
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
}}
|
||||
rule := &RecordingRule{Name: "job:foo",
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
},
|
||||
metrics: &recordingRuleMetrics{
|
||||
errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`),
|
||||
},
|
||||
}
|
||||
var err error
|
||||
for _, testCase := range testCases {
|
||||
fq := &datasource.FakeQuerier{}
|
||||
|
@ -223,6 +230,9 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
|
|||
"job": "test",
|
||||
},
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
metrics: &recordingRuleMetrics{
|
||||
errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`),
|
||||
},
|
||||
}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
|
|
|
@ -81,7 +81,7 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
"description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher",
|
||||
"description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
|
@ -273,7 +273,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -333,7 +333,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -397,7 +397,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))",
|
||||
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -2240,7 +2240,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
|
||||
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
|
||||
"interval": "",
|
||||
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
|
||||
"range": true,
|
||||
|
@ -2872,7 +2872,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0",
|
||||
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
|
||||
"interval": "",
|
||||
"legendFormat": "{{group}}.{{recording}} ({{job}})",
|
||||
"range": true,
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
"description": "Overview for VictoriaMetrics vmalert v1.83.0 or higher",
|
||||
"description": "Overview for VictoriaMetrics vmalert v1.96.0 or higher",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
|
@ -272,7 +272,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "count(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -332,7 +332,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "count(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -396,7 +396,7 @@
|
|||
"uid": "$ds"
|
||||
},
|
||||
"exemplar": false,
|
||||
"expr": "(sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0)) + \n(sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) or vector(0))",
|
||||
"expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
|
@ -2239,7 +2239,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "sum(vmalert_alerting_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0",
|
||||
"expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0",
|
||||
"interval": "",
|
||||
"legendFormat": "{{group}}.{{alertname}} ({{job}})",
|
||||
"range": true,
|
||||
|
@ -2871,7 +2871,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "sum(vmalert_recording_rules_error{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, recording) > 0",
|
||||
"expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0",
|
||||
"interval": "",
|
||||
"legendFormat": "{{group}}.{{recording}} ({{job}})",
|
||||
"range": true,
|
||||
|
|
|
@ -18,7 +18,7 @@ groups:
|
|||
Check vmalert's logs for detailed error message."
|
||||
|
||||
- alert: AlertingRulesError
|
||||
expr: sum(vmalert_alerting_rules_error) by(job, instance, group, file) > 0
|
||||
expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) by(job, instance, group, file) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -29,7 +29,7 @@ groups:
|
|||
Check vmalert's logs for detailed error message."
|
||||
|
||||
- alert: RecordingRulesError
|
||||
expr: sum(vmalert_recording_rules_error) by(job, instance, group, file) > 0
|
||||
expr: sum(increase(vmalert_recording_rules_errors_total[5m])) by(job, instance, group, file) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
@ -28,6 +28,8 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
|
||||
## tip
|
||||
|
||||
**vmalert's metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` were replaced with `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details.**
|
||||
|
||||
* SECURITY: upgrade base docker image (Alpine) from 3.18.4 to 3.18.5. See [alpine 3.18.5 release notes](https://www.alpinelinux.org/posts/Alpine-3.15.11-3.16.8-3.17.6-3.18.5-released.html).
|
||||
|
||||
* FEATURE: `vmselect`: allow opening [vmui](https://docs.victoriametrics.com/#vmui) and investigating [Top queries](https://docs.victoriametrics.com/#top-queries) and [Active queries](https://docs.victoriametrics.com/#active-queries) when the `vmselect` is overloaded with concurrent queries (e.g. when more than `-search.maxConcurrentRequests` concurrent queries are executed). Previously an attempt to open `Top queries` or `Active queries` at `vmui` could result in `couldn't start executing the request in ... seconds, since -search.maxConcurrentRequests=... concurrent requests are executed` error, which could complicate debugging of overloaded `vmselect` or single-node VictoriaMetrics.
|
||||
|
@ -39,6 +41,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): export `vm_promscrape_scrape_pool_targets` [metric](https://docs.victoriametrics.com/vmagent.html#monitoring) to track the number of targets each scrape job discovers. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5311).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): provide `/vmalert/api/v1/rule` and `/api/v1/rule` API endpoints to get the rule object in JSON format. See [these docs](https://docs.victoriametrics.com/vmalert.html#web) for details.
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): deprecate process gauge metrics `vmalert_alerting_rules_error` and `vmalert_recording_rules_error` in favour of `vmalert_alerting_rules_errors_total` and `vmalert_recording_rules_errors_total` counter metrics. [Counter](https://docs.victoriametrics.com/keyConcepts.html#counter) metric type is more suitable for error counting as it preserves the state change between the scrapes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5160) for details.
|
||||
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/).
|
||||
* FEATURE: all VictoriaMetrics binaries: expose additional metrics at `/metrics` page, which may simplify debugging of VictoriaMetrics components (see [this feature request](https://github.com/VictoriaMetrics/metrics/issues/54)):
|
||||
* `go_sched_latencies_seconds` - the [histogram](https://docs.victoriametrics.com/keyConcepts.html#histogram), which shows the time goroutines have spent in runnable state before actually running. Big values point to the lack of CPU time for the current workload.
|
||||
|
|
Loading…
Reference in a new issue