vmalert: expose new metrics for tracking number of produced samples during last evaluation (#1518)

* vmalert: expose new metrics for tracking number of produced samples during last evaluation

Two new metrics were added to track the number of samples produced during the last evaluation:
* vmalert_recording_rules_last_evaluation_samples
* vmalert_alerting_rules_last_evaluation_samples

The gauge type is used to remain consistent with Prometheus metric
`prometheus_rule_group_last_evaluation_samples` which is on the group level.
However, the counter type was considered as well.

Two metrics instead of one are used to make it easier to separate recording and
alerting rules. It is likely, number of samples produced by recording rules is
more important so people will refer to it more frequently.

The expected usage of the new metric is the following:
```
   - alert: RecordingRuleReturnsEmptyResults
        expr: sum(vmalert_recording_rules_last_evaluation_samples) by(recording) < 1
        annotations:
          summary: Recording rule {{$labels.recording}} returns empty results.
            Please verify expression correctness.
```

Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1494

* vmalert: rename `vmalert_alerts_error` to `vmalert_alerting_rules_error` to remain consistent with recording rules metrics
This commit is contained in:
Roman Khavronenko 2021-08-05 09:59:46 +03:00 committed by GitHub
parent d826352688
commit 7416fdaa8b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 26 deletions

View file

@ -42,6 +42,9 @@ type AlertingRule struct {
// resets on every successful Exec
// may be used as Health state
lastExecError error
// stores the number of samples returned during
// the last evaluation
lastExecSamples int
metrics *alertingRuleMetrics
}
@ -50,6 +53,7 @@ type alertingRuleMetrics struct {
errors *gauge
pending *gauge
active *gauge
samples *gauge
}
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
@ -76,8 +80,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
ar.mu.RLock()
defer ar.mu.RUnlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StatePending {
@ -88,8 +92,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
})
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
ar.mu.RLock()
defer ar.mu.RUnlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StateFiring {
@ -98,15 +102,21 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
}
return float64(num)
})
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels),
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
ar.mu.Lock()
defer ar.mu.Unlock()
ar.mu.RLock()
defer ar.mu.RUnlock()
if ar.lastExecError == nil {
return 0
}
return 1
})
ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
ar.mu.RLock()
defer ar.mu.RUnlock()
return float64(ar.lastExecSamples)
})
return ar
}
@ -115,6 +125,7 @@ func (ar *AlertingRule) Close() {
metrics.UnregisterMetric(ar.metrics.active.name)
metrics.UnregisterMetric(ar.metrics.pending.name)
metrics.UnregisterMetric(ar.metrics.errors.name)
metrics.UnregisterMetric(ar.metrics.samples.name)
}
// String implements Stringer interface
@ -194,6 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e
ar.lastExecError = err
ar.lastExecTime = time.Now()
ar.lastExecSamples = len(qMetrics)
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
@ -384,6 +396,7 @@ func (ar *AlertingRule) RuleAPI() APIAlertingRule {
Expression: ar.Expr,
For: ar.For.String(),
LastError: lastErr,
LastSamples: ar.lastExecSamples,
LastExec: ar.lastExecTime,
Labels: ar.Labels,
Annotations: ar.Annotations,

View file

@ -35,12 +35,16 @@ type RecordingRule struct {
// resets on every successful Exec
// may be used as Health state
lastExecError error
// stores the number of samples returned during
// the last evaluation
lastExecSamples int
metrics *recordingRuleMetrics
}
type recordingRuleMetrics struct {
errors *gauge
errors *gauge
samples *gauge
}
// String implements Stringer interface
@ -73,19 +77,26 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
rr.mu.Lock()
defer rr.mu.Unlock()
rr.mu.RLock()
defer rr.mu.RUnlock()
if rr.lastExecError == nil {
return 0
}
return 1
})
rr.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
rr.mu.RLock()
defer rr.mu.RUnlock()
return float64(rr.lastExecSamples)
})
return rr
}
// Close unregisters rule metrics
func (rr *RecordingRule) Close() {
metrics.UnregisterMetric(rr.metrics.errors.name)
metrics.UnregisterMetric(rr.metrics.samples.name)
}
// ExecRange executes recording rule on the given time range similarly to Exec.
@ -118,6 +129,7 @@ func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries,
rr.lastExecTime = time.Now()
rr.lastExecError = err
rr.lastExecSamples = len(qMetrics)
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
}
@ -190,13 +202,14 @@ func (rr *RecordingRule) RuleAPI() APIRecordingRule {
}
return APIRecordingRule{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
Name: rr.Name,
Type: rr.Type.String(),
Expression: rr.Expr,
LastError: lastErr,
LastExec: rr.lastExecTime,
Labels: rr.Labels,
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
Name: rr.Name,
Type: rr.Type.String(),
Expression: rr.Expr,
LastError: lastErr,
LastSamples: rr.lastExecSamples,
LastExec: rr.lastExecTime,
Labels: rr.Labels,
}
}

View file

@ -40,6 +40,7 @@ type APIAlertingRule struct {
Expression string `json:"expression"`
For string `json:"for"`
LastError string `json:"last_error"`
LastSamples int `json:"last_samples"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
@ -47,12 +48,13 @@ type APIAlertingRule struct {
// APIRecordingRule represents RecordingRule for WEB view
type APIRecordingRule struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
LastError string `json:"last_error"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
LastError string `json:"last_error"`
LastSamples int `json:"last_samples"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
}