diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 51b6807cc..c5490bd42 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -42,6 +42,9 @@ type AlertingRule struct { // resets on every successful Exec // may be used as Health state lastExecError error + // stores the number of samples returned during + // the last evaluation + lastExecSamples int metrics *alertingRuleMetrics } @@ -50,6 +53,7 @@ type alertingRuleMetrics struct { errors *gauge pending *gauge active *gauge + samples *gauge } func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule { @@ -76,8 +80,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID()) ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels), func() float64 { - ar.mu.Lock() - defer ar.mu.Unlock() + ar.mu.RLock() + defer ar.mu.RUnlock() var num int for _, a := range ar.alerts { if a.State == notifier.StatePending { @@ -88,8 +92,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule }) ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels), func() float64 { - ar.mu.Lock() - defer ar.mu.Unlock() + ar.mu.RLock() + defer ar.mu.RUnlock() var num int for _, a := range ar.alerts { if a.State == notifier.StateFiring { @@ -98,15 +102,21 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule } return float64(num) }) - ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels), + ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels), func() float64 { - ar.mu.Lock() - defer ar.mu.Unlock() + ar.mu.RLock() + defer ar.mu.RUnlock() if ar.lastExecError == nil { return 0 } return 1 }) + ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels), + func() float64 { + ar.mu.RLock() + defer ar.mu.RUnlock() + return float64(ar.lastExecSamples) + }) return ar } @@ -115,6 +125,7 @@ func (ar *AlertingRule) Close() { metrics.UnregisterMetric(ar.metrics.active.name) metrics.UnregisterMetric(ar.metrics.pending.name) metrics.UnregisterMetric(ar.metrics.errors.name) + metrics.UnregisterMetric(ar.metrics.samples.name) } // String implements Stringer interface @@ -194,6 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e ar.lastExecError = err ar.lastExecTime = time.Now() + ar.lastExecSamples = len(qMetrics) if err != nil { return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err) } @@ -384,6 +396,7 @@ func (ar *AlertingRule) RuleAPI() APIAlertingRule { Expression: ar.Expr, For: ar.For.String(), LastError: lastErr, + LastSamples: ar.lastExecSamples, LastExec: ar.lastExecTime, Labels: ar.Labels, Annotations: ar.Annotations, diff --git a/app/vmalert/recording.go b/app/vmalert/recording.go index c5c70db50..0907be124 100644 --- a/app/vmalert/recording.go +++ b/app/vmalert/recording.go @@ -35,12 +35,16 @@ type RecordingRule struct { // resets on every successful Exec // may be used as Health state lastExecError error + // stores the number of samples returned during + // the last evaluation + lastExecSamples int metrics *recordingRuleMetrics } type recordingRuleMetrics struct { - errors *gauge + errors *gauge + samples *gauge } // String implements Stringer interface @@ -73,19 +77,26 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID()) rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels), func() float64 { - rr.mu.Lock() - defer rr.mu.Unlock() + rr.mu.RLock() + defer rr.mu.RUnlock() if rr.lastExecError == nil { return 0 } return 1 }) + rr.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels), + func() float64 { + rr.mu.RLock() + defer rr.mu.RUnlock() + return float64(rr.lastExecSamples) + }) return rr } // Close unregisters rule metrics func (rr *RecordingRule) Close() { metrics.UnregisterMetric(rr.metrics.errors.name) + metrics.UnregisterMetric(rr.metrics.samples.name) } // ExecRange executes recording rule on the given time range similarly to Exec. @@ -118,6 +129,7 @@ func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, rr.lastExecTime = time.Now() rr.lastExecError = err + rr.lastExecSamples = len(qMetrics) if err != nil { return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err) } @@ -190,13 +202,14 @@ func (rr *RecordingRule) RuleAPI() APIRecordingRule { } return APIRecordingRule{ // encode as strings to avoid rounding - ID: fmt.Sprintf("%d", rr.ID()), - GroupID: fmt.Sprintf("%d", rr.GroupID), - Name: rr.Name, - Type: rr.Type.String(), - Expression: rr.Expr, - LastError: lastErr, - LastExec: rr.lastExecTime, - Labels: rr.Labels, + ID: fmt.Sprintf("%d", rr.ID()), + GroupID: fmt.Sprintf("%d", rr.GroupID), + Name: rr.Name, + Type: rr.Type.String(), + Expression: rr.Expr, + LastError: lastErr, + LastSamples: rr.lastExecSamples, + LastExec: rr.lastExecTime, + Labels: rr.Labels, } } diff --git a/app/vmalert/web_types.go b/app/vmalert/web_types.go index 8586fb64c..6b677ef8c 100644 --- a/app/vmalert/web_types.go +++ b/app/vmalert/web_types.go @@ -40,6 +40,7 @@ type APIAlertingRule struct { Expression string `json:"expression"` For string `json:"for"` LastError string `json:"last_error"` + LastSamples int `json:"last_samples"` LastExec time.Time `json:"last_exec"` Labels map[string]string `json:"labels"` Annotations map[string]string `json:"annotations"` @@ -47,12 +48,13 @@ type APIAlertingRule struct { // APIRecordingRule represents RecordingRule for WEB view type APIRecordingRule struct { - ID string `json:"id"` - Name string `json:"name"` - Type string `json:"type"` - GroupID string `json:"group_id"` - Expression string `json:"expression"` - LastError string `json:"last_error"` - LastExec time.Time `json:"last_exec"` - Labels map[string]string `json:"labels"` + ID string `json:"id"` + Name string `json:"name"` + Type string `json:"type"` + GroupID string `json:"group_id"` + Expression string `json:"expression"` + LastError string `json:"last_error"` + LastSamples int `json:"last_samples"` + LastExec time.Time `json:"last_exec"` + Labels map[string]string `json:"labels"` }