From 7416fdaa8b60a46d1bfb66eb839f00e8ee44af5b Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Thu, 5 Aug 2021 09:59:46 +0300
Subject: [PATCH] vmalert: expose new metrics for tracking number of produced
 samples during last evaluation (#1518)

* vmalert: expose new metrics for tracking number of produced samples
during last evaluation

Two new metrics were added to track the number of samples produced
during the last evaluation:
* vmalert_recording_rules_last_evaluation_samples
* vmalert_alerting_rules_last_evaluation_samples

The gauge type is used to remain consistent with the Prometheus metric
`prometheus_rule_group_last_evaluation_samples`, which is exposed on the
group level. However, the counter type was considered as well.

Two metrics instead of one are used to make it easier to separate
recording and alerting rules. It is likely that the number of samples
produced by recording rules is more important, so people will refer to
it more frequently.

The expected usage of the new metrics is the following:
```
- alert: RecordingRuleReturnsEmptyResults
  expr: sum(vmalert_recording_rules_last_evaluation_samples) by(recording) < 1
  annotations:
    summary: Recording rule {{$labels.recording}} returns empty results.
      Please verify expression correctness.
```

Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1494

* vmalert: rename `vmalert_alerts_error` to `vmalert_alerting_rules_error`
to remain consistent with the recording rule metrics
---
 app/vmalert/alerting.go  | 27 ++++++++++++++++++++-------
 app/vmalert/recording.go | 35 ++++++++++++++++++++++++-----------
 app/vmalert/web_types.go | 18 ++++++++++--------
 3 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go
index 51b6807cc..c5490bd42 100644
--- a/app/vmalert/alerting.go
+++ b/app/vmalert/alerting.go
@@ -42,6 +42,9 @@ type AlertingRule struct {
 	// resets on every successful Exec
 	// may be used as Health state
 	lastExecError error
+	// stores the number of samples returned during
+	// the last evaluation
+	lastExecSamples int
 
 	metrics *alertingRuleMetrics
 }
@@ -50,6 +53,7 @@ type alertingRuleMetrics struct {
 	errors  *gauge
 	pending *gauge
 	active  *gauge
+	samples *gauge
 }
 
 func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
@@ -76,8 +80,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
 	labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
 	ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
 		func() float64 {
-			ar.mu.Lock()
-			defer ar.mu.Unlock()
+			ar.mu.RLock()
+			defer ar.mu.RUnlock()
 			var num int
 			for _, a := range ar.alerts {
 				if a.State == notifier.StatePending {
@@ -88,8 +92,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
 		})
 	ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
 		func() float64 {
-			ar.mu.Lock()
-			defer ar.mu.Unlock()
+			ar.mu.RLock()
+			defer ar.mu.RUnlock()
 			var num int
 			for _, a := range ar.alerts {
 				if a.State == notifier.StateFiring {
@@ -98,15 +102,21 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
 			}
 			return float64(num)
 		})
-	ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels),
+	ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
 		func() float64 {
-			ar.mu.Lock()
-			defer ar.mu.Unlock()
+			ar.mu.RLock()
+			defer ar.mu.RUnlock()
 			if ar.lastExecError == nil {
 				return 0
 			}
 			return 1
 		})
+	ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
+		func() float64 {
+			ar.mu.RLock()
+			defer ar.mu.RUnlock()
+			return float64(ar.lastExecSamples)
+		})
 	return ar
 }
 
@@ -115,6 +125,7 @@ func (ar *AlertingRule) Close() {
 	metrics.UnregisterMetric(ar.metrics.active.name)
 	metrics.UnregisterMetric(ar.metrics.pending.name)
 	metrics.UnregisterMetric(ar.metrics.errors.name)
+	metrics.UnregisterMetric(ar.metrics.samples.name)
 }
 
 // String implements Stringer interface
@@ -194,6 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e
 
 	ar.lastExecError = err
 	ar.lastExecTime = time.Now()
+	ar.lastExecSamples = len(qMetrics)
 	if err != nil {
 		return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
 	}
@@ -384,6 +396,7 @@ func (ar *AlertingRule) RuleAPI() APIAlertingRule {
 		Expression:  ar.Expr,
 		For:         ar.For.String(),
 		LastError:   lastErr,
+		LastSamples: ar.lastExecSamples,
 		LastExec:    ar.lastExecTime,
 		Labels:      ar.Labels,
 		Annotations: ar.Annotations,
diff --git a/app/vmalert/recording.go b/app/vmalert/recording.go
index c5c70db50..0907be124 100644
--- a/app/vmalert/recording.go
+++ b/app/vmalert/recording.go
@@ -35,12 +35,16 @@ type RecordingRule struct {
 	// resets on every successful Exec
 	// may be used as Health state
 	lastExecError error
+	// stores the number of samples returned during
+	// the last evaluation
+	lastExecSamples int
 
 	metrics *recordingRuleMetrics
 }
 
 type recordingRuleMetrics struct {
-	errors *gauge
+	errors  *gauge
+	samples *gauge
 }
 
 // String implements Stringer interface
@@ -73,19 +77,26 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
 	labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
 	rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
 		func() float64 {
-			rr.mu.Lock()
-			defer rr.mu.Unlock()
+			rr.mu.RLock()
+			defer rr.mu.RUnlock()
 			if rr.lastExecError == nil {
 				return 0
 			}
 			return 1
 		})
+	rr.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
+		func() float64 {
+			rr.mu.RLock()
+			defer rr.mu.RUnlock()
+			return float64(rr.lastExecSamples)
+		})
 	return rr
 }
 
 // Close unregisters rule metrics
 func (rr *RecordingRule) Close() {
 	metrics.UnregisterMetric(rr.metrics.errors.name)
+	metrics.UnregisterMetric(rr.metrics.samples.name)
 }
 
 // ExecRange executes recording rule on the given time range similarly to Exec.
@@ -118,6 +129,7 @@ func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries,
 
 	rr.lastExecTime = time.Now()
 	rr.lastExecError = err
+	rr.lastExecSamples = len(qMetrics)
 	if err != nil {
 		return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
 	}
@@ -190,13 +202,14 @@ func (rr *RecordingRule) RuleAPI() APIRecordingRule {
 	}
 	return APIRecordingRule{
 		// encode as strings to avoid rounding
-		ID:         fmt.Sprintf("%d", rr.ID()),
-		GroupID:    fmt.Sprintf("%d", rr.GroupID),
-		Name:       rr.Name,
-		Type:       rr.Type.String(),
-		Expression: rr.Expr,
-		LastError:  lastErr,
-		LastExec:   rr.lastExecTime,
-		Labels:     rr.Labels,
+		ID:          fmt.Sprintf("%d", rr.ID()),
+		GroupID:     fmt.Sprintf("%d", rr.GroupID),
+		Name:        rr.Name,
+		Type:        rr.Type.String(),
+		Expression:  rr.Expr,
+		LastError:   lastErr,
+		LastSamples: rr.lastExecSamples,
+		LastExec:    rr.lastExecTime,
+		Labels:      rr.Labels,
 	}
 }
diff --git a/app/vmalert/web_types.go b/app/vmalert/web_types.go
index 8586fb64c..6b677ef8c 100644
--- a/app/vmalert/web_types.go
+++ b/app/vmalert/web_types.go
@@ -40,6 +40,7 @@ type APIAlertingRule struct {
 	Expression  string            `json:"expression"`
 	For         string            `json:"for"`
 	LastError   string            `json:"last_error"`
+	LastSamples int               `json:"last_samples"`
 	LastExec    time.Time         `json:"last_exec"`
 	Labels      map[string]string `json:"labels"`
 	Annotations map[string]string `json:"annotations"`
@@ -47,12 +48,13 @@
 }
 
 // APIRecordingRule represents RecordingRule for WEB view
 type APIRecordingRule struct {
-	ID         string            `json:"id"`
-	Name       string            `json:"name"`
-	Type       string            `json:"type"`
-	GroupID    string            `json:"group_id"`
-	Expression string            `json:"expression"`
-	LastError  string            `json:"last_error"`
-	LastExec   time.Time         `json:"last_exec"`
-	Labels     map[string]string `json:"labels"`
+	ID          string            `json:"id"`
+	Name        string            `json:"name"`
+	Type        string            `json:"type"`
+	GroupID     string            `json:"group_id"`
+	Expression  string            `json:"expression"`
+	LastError   string            `json:"last_error"`
+	LastSamples int               `json:"last_samples"`
+	LastExec    time.Time         `json:"last_exec"`
+	Labels      map[string]string `json:"labels"`
 }
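
For reference, the gauge-with-callback pattern used throughout this patch comes from the
github.com/VictoriaMetrics/metrics package, which vmalert's `getOrCreateGauge` helper appears
to wrap. The standalone sketch below is illustrative only: the `rule` type and the
`example_last_evaluation_samples` metric are made-up names, not vmalert code. It mirrors how
`lastExecSamples` is stored under a mutex and exposed through a read-locked callback:
```
package main

import (
	"os"
	"sync"

	"github.com/VictoriaMetrics/metrics"
)

// rule mimics the pattern from the patch: the number of samples returned by
// the last evaluation is kept under a mutex and exposed via a gauge whose
// value is computed by a callback at scrape time.
type rule struct {
	mu              sync.RWMutex
	lastExecSamples int
}

// exec simulates an evaluation that produced n samples.
func (r *rule) exec(n int) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.lastExecSamples = n
}

func main() {
	r := &rule{}

	// The callback runs on every scrape and only needs a read lock,
	// which is why the patch switches the metric callbacks to RLock/RUnlock.
	metrics.GetOrCreateGauge(`example_last_evaluation_samples{rule="demo"}`, func() float64 {
		r.mu.RLock()
		defer r.mu.RUnlock()
		return float64(r.lastExecSamples)
	})

	r.exec(42)

	// Print all registered metrics in Prometheus text exposition format.
	metrics.WritePrometheus(os.Stdout, false)
}
```
Because the callback recomputes the value on every scrape, the gauge reflects only the most
recent evaluation, which matches the "last evaluation samples" semantics discussed in the
commit message; a counter would instead accumulate samples across evaluations.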