Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git (synced 2025-03-11 15:34:56 +00:00)
vmalert: add experimental feature of storing Rule's evaluation state (#3106)
vmalert: add experimental feature of storing Rule's evaluation state

The new feature keeps the last 20 state changes of each Rule in memory. The states are available for viewing on the Rule's page, which can be opened by clicking the `Details` link next to the Rule's name on the `/groups` page. State changes are supposed to help with investigating cases when a Rule doesn't generate alerts or records.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
parent 99bc18774c
commit 877940a131
15 changed files with 1131 additions and 486 deletions
@ -511,6 +511,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
  Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
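The `/vmalert/rule` page listed above is added by this commit. Purely as an illustration (not part of the diff), a minimal Go sketch for fetching it could look as follows; the address and the `group_id`/`rule_id` values are placeholders and would normally be taken from the `Details` links on the `/groups` page.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Placeholder address and IDs; real values come from the
	// `Details` links rendered on the /groups page.
	url := "http://localhost:8880/vmalert/rule?group_id=12345&rule_id=67890"

	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	page, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	// The endpoint serves the rule status page (HTML), including the
	// table of recent evaluation states.
	fmt.Printf("GET %s -> %s (%d bytes)\n", url, resp.Status, len(page))
}
```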
@ -35,21 +35,13 @@ type AlertingRule struct {
|
|||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
alertsMu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
lastExecSamples int
|
||||
|
||||
// state stores recent state changes
|
||||
// during evaluations
|
||||
state *ruleState
|
||||
|
||||
metrics *alertingRuleMetrics
|
||||
}
|
||||
|
@ -82,14 +74,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
Debug: cfg.Debug,
|
||||
}),
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
metrics: &alertingRuleMetrics{},
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var num int
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StatePending {
|
||||
|
@ -100,8 +93,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
})
|
||||
ar.metrics.active = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var num int
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateFiring {
|
||||
|
@ -112,18 +105,16 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
})
|
||||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
if ar.lastExecError == nil {
|
||||
e := ar.state.getLast()
|
||||
if e.err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
return float64(ar.lastExecSamples)
|
||||
e := ar.state.getLast()
|
||||
return float64(e.samples)
|
||||
})
|
||||
return ar
|
||||
}
|
||||
|
@ -274,18 +265,26 @@ const resolvedRetention = 15 * time.Minute
|
|||
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
qMetrics, err := ar.q.Query(ctx, ar.Expr, ts)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(qMetrics),
|
||||
err: err,
|
||||
}
|
||||
|
||||
defer func() {
|
||||
ar.state.add(curState)
|
||||
}()
|
||||
|
||||
ar.alertsMu.Lock()
|
||||
defer ar.alertsMu.Unlock()
|
||||
|
||||
ar.lastExecTime = start
|
||||
ar.lastExecDuration = time.Since(start)
|
||||
ar.lastExecError = err
|
||||
ar.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", ar.lastExecSamples, ar.lastExecDuration)
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
|
@ -301,14 +300,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
for _, m := range qMetrics {
|
||||
ls, err := ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
curState.err = fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, curState.err
|
||||
}
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, ar.lastExecError
|
||||
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.err
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
|
@ -327,15 +327,16 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
// in annotations
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
curState.err = err
|
||||
return nil, curState.err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
|
||||
a, err := ar.newAlert(m, ls, start, qFn)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
curState.err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.err
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
|
@ -372,7 +373,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
}
|
||||
if limit > 0 && numActivePending > limit {
|
||||
ar.alerts = map[uint64]*notifier.Alert{}
|
||||
return nil, fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.err
|
||||
}
|
||||
return ar.toTimeSeries(ts.Unix()), nil
|
||||
}
|
||||
|
@ -449,8 +451,8 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
|||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
|
@ -458,9 +460,10 @@ func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
|||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// ToAPI returns Rule representation in form
|
||||
// of APIRule
|
||||
// ToAPI returns Rule representation in form of APIRule
|
||||
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
|
||||
func (ar *AlertingRule) ToAPI() APIRule {
|
||||
lastState := ar.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
|
@ -469,19 +472,20 @@ func (ar *AlertingRule) ToAPI() APIRule {
|
|||
Duration: ar.For.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: ar.lastExecTime,
|
||||
EvaluationTime: ar.lastExecDuration.Seconds(),
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ar.AlertsToAPI(),
|
||||
LastSamples: ar.lastExecSamples,
|
||||
LastSamples: lastState.samples,
|
||||
Updates: ar.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if ar.lastExecError != nil {
|
||||
r.LastError = ar.lastExecError.Error()
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
// satisfy APIRule.State logic
|
||||
|
@ -501,14 +505,14 @@ func (ar *AlertingRule) ToAPI() APIRule {
|
|||
// AlertsToAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
ar.alertsMu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
ar.alertsMu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
|
|
|
@ -735,6 +735,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"),
|
||||
|
@ -774,6 +775,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1,
|
||||
|
@ -915,5 +917,11 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
|||
}
|
||||
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor, EvalInterval: waitFor}
|
||||
return &AlertingRule{
|
||||
Name: name,
|
||||
For: waitFor,
|
||||
EvalInterval: waitFor,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,23 @@ type manager struct {
|
|||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// RuleAPI generates APIRule object from a rule by its ID(hash)
|
||||
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if rule.ID() == rID {
|
||||
return rule.ToAPI(), nil
|
||||
}
|
||||
}
|
||||
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
|
@ -70,9 +87,9 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
|
|||
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
||||
if err != nil {
|
||||
if !*remoteReadIgnoreRestoreErrors {
|
||||
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
||||
return fmt.Errorf("failed to restore ruleState for group %q: %w", group.Name, err)
|
||||
}
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
logger.Errorf("error while restoring ruleState for group %q: %s", group.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ import (
|
|||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
|
@ -27,19 +26,9 @@ type RecordingRule struct {
|
|||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
lastExecSamples int
|
||||
// state stores recent state changes
|
||||
// during evaluations
|
||||
state *ruleState
|
||||
|
||||
metrics *recordingRuleMetrics
|
||||
}
|
||||
|
@ -69,6 +58,7 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
Labels: cfg.Labels,
|
||||
GroupID: group.ID(),
|
||||
metrics: &recordingRuleMetrics{},
|
||||
state: newRuleState(),
|
||||
q: qb.BuildWithParams(datasource.QuerierParams{
|
||||
DataSourceType: group.Type.String(),
|
||||
EvaluationInterval: group.Interval,
|
||||
|
@ -80,18 +70,16 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
if rr.lastExecError == nil {
|
||||
e := rr.state.getLast()
|
||||
if e.err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
return float64(rr.lastExecSamples)
|
||||
e := rr.state.getLast()
|
||||
return float64(e.samples)
|
||||
})
|
||||
return rr
|
||||
}
|
||||
|
@ -126,21 +114,28 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
|
|||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
qMetrics, err := rr.q.Query(ctx, rr.Expr, ts)
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(qMetrics),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
rr.state.add(curState)
|
||||
}()
|
||||
|
||||
rr.lastExecTime = ts
|
||||
rr.lastExecDuration = time.Since(ts)
|
||||
rr.lastExecError = err
|
||||
rr.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.err
|
||||
}
|
||||
|
||||
numSeries := len(qMetrics)
|
||||
if limit > 0 && numSeries > limit {
|
||||
return nil, fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.err
|
||||
}
|
||||
|
||||
duplicates := make(map[string]struct{}, len(qMetrics))
|
||||
|
@ -149,8 +144,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
|||
ts := rr.toTimeSeries(r)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.err
|
||||
}
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
|
@ -205,23 +200,25 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
|||
// ToAPI returns Rule's representation in form
|
||||
// of APIRule
|
||||
func (rr *RecordingRule) ToAPI() APIRule {
|
||||
lastState := rr.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: rr.lastExecTime,
|
||||
EvaluationTime: rr.lastExecDuration.Seconds(),
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: rr.lastExecSamples,
|
||||
LastSamples: lastState.samples,
|
||||
Updates: rr.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
|
||||
if rr.lastExecError != nil {
|
||||
r.LastError = rr.lastExecError.Error()
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
|
|
|
@ -19,7 +19,7 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
&RecordingRule{Name: "foo", state: newRuleState()},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
|
@ -30,7 +30,7 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
&RecordingRule{Name: "foobarbaz", state: newRuleState()},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
|
@ -52,9 +52,12 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
&RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(),
|
||||
Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
|
@ -195,7 +198,7 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
}
|
||||
rule := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
rule := &RecordingRule{Name: "job:foo", state: newRuleState(), Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
}}
|
||||
var err error
|
||||
|
@ -211,9 +214,13 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestRecordingRule_ExecNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
rr := &RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(),
|
||||
Labels: map[string]string{
|
||||
"job": "test",
|
||||
},
|
||||
}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
|
|
|
@ -3,6 +3,7 @@ package main

import (
    "context"
    "errors"
    "sync"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"

@ -31,3 +32,72 @@ type Rule interface {
}

var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

type ruleState struct {
    sync.RWMutex
    entries []ruleStateEntry
    cur     int
}

type ruleStateEntry struct {
    // stores last moment of time rule.Exec was called
    time time.Time
    // stores the timestamp with which rule.Exec was called
    at time.Time
    // stores the duration of the last rule.Exec call
    duration time.Duration
    // stores last error that happened in Exec func
    // resets on every successful Exec
    // may be used as Health ruleState
    err error
    // stores the number of samples returned during
    // the last evaluation
    samples int
}

const defaultStateEntriesLimit = 20

func newRuleState() *ruleState {
    return &ruleState{
        entries: make([]ruleStateEntry, defaultStateEntriesLimit),
    }
}

func (s *ruleState) getLast() ruleStateEntry {
    s.RLock()
    defer s.RUnlock()
    return s.entries[s.cur]
}

func (s *ruleState) getAll() []ruleStateEntry {
    entries := make([]ruleStateEntry, 0)

    s.RLock()
    defer s.RUnlock()

    cur := s.cur
    for {
        e := s.entries[cur]
        if !e.time.IsZero() || !e.at.IsZero() {
            entries = append(entries, e)
        }
        cur--
        if cur < 0 {
            cur = cap(s.entries) - 1
        }
        if cur == s.cur {
            return entries
        }
    }
}

func (s *ruleState) add(e ruleStateEntry) {
    s.Lock()
    defer s.Unlock()

    s.cur++
    if s.cur > cap(s.entries)-1 {
        s.cur = 0
    }
    s.entries[s.cur] = e
}
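A brief usage sketch of the `ruleState` ring buffer introduced above (illustrative only, not part of the diff). It relies solely on the `newRuleState`, `add`, `getLast` and `getAll` functions shown in this hunk: the buffer holds `defaultStateEntriesLimit` (20) entries, `add` overwrites the oldest slot once the buffer is full, `getLast` returns the most recent entry, and `getAll` returns the non-empty entries newest first.

```go
// Illustrative only: exercising the ruleState ring buffer defined above.
func exampleRuleStateUsage() {
	s := newRuleState()

	// Record more evaluations than the buffer can hold, so the
	// oldest five entries get overwritten.
	for i := 0; i < 25; i++ {
		s.add(ruleStateEntry{
			time:     time.Now(),
			at:       time.Now(),
			duration: 10 * time.Millisecond,
			samples:  i,
		})
	}

	last := s.getLast() // the 25th entry (samples == 24)
	all := s.getAll()   // 20 entries, ordered newest to oldest

	_ = last
	_ = all
}
```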
app/vmalert/rule_test.go (new file, 81 additions)
|
@ -0,0 +1,81 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRule_state(t *testing.T) {
|
||||
state := newRuleState()
|
||||
e := state.getLast()
|
||||
if !e.at.IsZero() {
|
||||
t.Fatalf("expected entry to be zero")
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
state.add(ruleStateEntry{at: now})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now)
|
||||
}
|
||||
|
||||
time.Sleep(time.Millisecond)
|
||||
now2 := time.Now()
|
||||
state.add(ruleStateEntry{at: now2})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now2 {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now2)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != 2 {
|
||||
t.Fatalf("expected for state to have 2 entries only; got %d",
|
||||
len(state.getAll()),
|
||||
)
|
||||
}
|
||||
|
||||
var last time.Time
|
||||
for i := 0; i < defaultStateEntriesLimit*2; i++ {
|
||||
last = time.Now()
|
||||
state.add(ruleStateEntry{at: last})
|
||||
}
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != last {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, last)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != defaultStateEntriesLimit {
|
||||
t.Fatalf("expected for state to have %d entries only; got %d",
|
||||
defaultStateEntriesLimit, len(state.getAll()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRule_stateConcurrent is supposed to test concurrent
|
||||
// execution of state updates.
|
||||
// Should be executed with -race flag
|
||||
func TestRule_stateConcurrent(t *testing.T) {
|
||||
state := newRuleState()
|
||||
|
||||
const workers = 50
|
||||
const iterations = 100
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
state.getAll()
|
||||
state.getLast()
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
|
@ -85,6 +85,14 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
|||
}
|
||||
WriteAlert(w, r, alert)
|
||||
return true
|
||||
case "/vmalert/rule":
|
||||
rule, err := rh.getRule(r)
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
WriteRuleDetails(w, r, rule)
|
||||
return true
|
||||
case "/vmalert/groups":
|
||||
WriteListGroups(w, r, rh.groups())
|
||||
return true
|
||||
|
@ -168,8 +176,25 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
|||
const (
|
||||
paramGroupID = "group_id"
|
||||
paramAlertID = "alert_id"
|
||||
paramRuleID = "rule_id"
|
||||
)
|
||||
|
||||
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
|
||||
}
|
||||
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
|
||||
}
|
||||
rule, err := rh.m.RuleAPI(groupID, ruleID)
|
||||
if err != nil {
|
||||
return APIRule{}, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return rule, nil
|
||||
}
|
||||
|
||||
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
if err != nil {
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
{% endfunc %}
|
||||
|
||||
{% func ListGroups(r *http.Request, groups []APIGroup) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Groups") %}
|
||||
{% if len(groups) > 0 %}
|
||||
{%code
|
||||
|
@ -85,6 +86,7 @@
|
|||
{% else %}
|
||||
<b>record:</b> {%s r.Name %}
|
||||
{% endif %}
|
||||
| <span><a target="_blank" href="{%s prefix+r.WebLink() %}">Details</a></span>
|
||||
</div>
|
||||
<div class="col-12">
|
||||
<code><pre>{%s r.Query %}</pre></code>
|
||||
|
@ -116,7 +118,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No groups...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -204,7 +206,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No active alerts...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -260,7 +262,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No targets...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -284,7 +286,7 @@
|
|||
}
|
||||
sort.Strings(annotationKeys)
|
||||
%}
|
||||
<div class="display-6 pb-3 mb-3">{%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
|
||||
<div class="display-6 pb-3 mb-3">Alert: {%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
|
@ -354,6 +356,117 @@
|
|||
|
||||
{% endfunc %}
|
||||
|
||||
|
||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "") %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range rule.Labels {
|
||||
labelKeys = append(labelKeys, k)
|
||||
}
|
||||
sort.Strings(labelKeys)
|
||||
|
||||
var annotationKeys []string
|
||||
for k := range rule.Annotations {
|
||||
annotationKeys = append(annotationKeys, k)
|
||||
}
|
||||
sort.Strings(annotationKeys)
|
||||
%}
|
||||
<div class="display-6 pb-3 mb-3">Rule: {%s rule.Name %}<span class="ms-2 badge {% if rule.Health!="ok" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s rule.Health %}</span></div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Expr
|
||||
</div>
|
||||
<div class="col">
|
||||
<code><pre>{%s rule.Query %}</pre></code>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
For
|
||||
</div>
|
||||
<div class="col">
|
||||
{%v rule.Duration %} seconds
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Labels
|
||||
</div>
|
||||
<div class="col">
|
||||
{% for _, k := range labelKeys %}
|
||||
<span class="m-1 badge bg-primary">{%s k %}={%s rule.Labels[k] %}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Annotations
|
||||
</div>
|
||||
<div class="col">
|
||||
{% for _, k := range annotationKeys %}
|
||||
<b>{%s k %}:</b><br>
|
||||
<p>{%s rule.Annotations[k] %}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Group
|
||||
</div>
|
||||
<div class="col">
|
||||
<a target="_blank" href="{%s prefix %}groups#group-{%s rule.GroupID %}">{%s rule.GroupID %}</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
<div class="display-6 pb-3">Last {%d len(rule.Updates) %} updates</span>:</div>
|
||||
<table class="table table-striped table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th scope="col" style="width: 20%" title="The time when event was created">Updated at</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="How many samples were returned">Samples</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="How many seconds request took">Duration</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="Time used for rule execution">Executed at</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
||||
{% for _, u := range rule.Updates %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
<td>
|
||||
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
|
||||
</td>
|
||||
<td class="text-center">{%d u.samples %}</td>
|
||||
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
|
||||
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
|
||||
</tr>
|
||||
</li>
|
||||
{% if u.err != nil %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
<td colspan="4">
|
||||
<span class="alert-danger">{%v u.err %}</span>
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{%= tpl.Footer(r) %}
|
||||
{% endfunc %}
|
||||
|
||||
|
||||
|
||||
{% func badgeState(state string) %}
|
||||
{%code
|
||||
badgeClass := "bg-warning text-dark"
|
||||
|
|
File diff suppressed because it is too large
@ -17,6 +17,7 @@ func TestHandler(t *testing.T) {
|
|||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {State: notifier.StateFiring},
|
||||
},
|
||||
state: newRuleState(),
|
||||
}
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
|
@ -52,6 +53,22 @@ func TestHandler(t *testing.T) {
|
|||
t.Run("/", func(t *testing.T) {
|
||||
getResp(ts.URL, nil, 200)
|
||||
getResp(ts.URL+"/vmalert", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/alerts", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/groups", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/notifiers", nil, 200)
|
||||
getResp(ts.URL+"/rules", nil, 200)
|
||||
})
|
||||
|
||||
t.Run("/vmalert/rule", func(t *testing.T) {
|
||||
a := ar.ToAPI()
|
||||
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
|
||||
})
|
||||
t.Run("/vmalert/rule?badParam", func(t *testing.T) {
|
||||
params := fmt.Sprintf("?%s=0&%s=1", paramGroupID, paramRuleID)
|
||||
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
|
||||
|
||||
params = fmt.Sprintf("?%s=1&%s=0", paramGroupID, paramRuleID)
|
||||
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
|
||||
})
|
||||
|
||||
t.Run("/api/v1/alerts", func(t *testing.T) {
|
||||
|
|
|
@ -5,11 +5,11 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents a notifier.AlertingRule state
|
||||
// APIAlert represents a notifier.AlertingRule ruleState
|
||||
// for WEB view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIAlert struct {
|
||||
State string `json:"state"`
|
||||
State string `json:"ruleState"`
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
|
@ -30,7 +30,7 @@ type APIAlert struct {
|
|||
// SourceLink contains a link to a system which should show
|
||||
// why Alert was generated
|
||||
SourceLink string `json:"source"`
|
||||
// Restored shows whether Alert's state was restored on restart
|
||||
// Restored shows whether Alert's ruleState was restored on restart
|
||||
Restored bool `json:"restored"`
|
||||
}
|
||||
|
||||
|
@ -86,10 +86,10 @@ type GroupAlerts struct {
|
|||
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIRule struct {
|
||||
// State must be one of these under following scenarios
|
||||
// "pending": at least 1 alert in the rule in pending state and no other alert in firing state.
|
||||
// "firing": at least 1 alert in the rule in firing state.
|
||||
// "inactive": no alert in the rule in firing or pending state.
|
||||
State string `json:"state"`
|
||||
// "pending": at least 1 alert in the rule in pending ruleState and no other alert in firing ruleState.
|
||||
// "firing": at least 1 alert in the rule in firing ruleState.
|
||||
// "inactive": no alert in the rule in firing or pending ruleState.
|
||||
State string `json:"ruleState"`
|
||||
Name string `json:"name"`
|
||||
// Query represents Rule's `expression` field
|
||||
Query string `json:"query"`
|
||||
|
@ -116,8 +116,17 @@ type APIRule struct {
|
|||
// Type of the rule: recording or alerting
|
||||
DatasourceType string `json:"datasourceType"`
|
||||
LastSamples int `json:"lastSamples"`
|
||||
// ID is an unique Alert's ID within a group
|
||||
// ID is a unique Alert's ID within a group
|
||||
ID string `json:"id"`
|
||||
// GroupID is an unique Group's ID
|
||||
GroupID string `json:"group_id"`
|
||||
|
||||
// TODO:
|
||||
Updates []ruleStateEntry `json:"updates"`
|
||||
}
|
||||
|
||||
// WebLink returns a link to the alert which can be used in UI.
|
||||
func (ar APIRule) WebLink() string {
|
||||
return fmt.Sprintf("rule?%s=%s&%s=%s",
|
||||
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): minimize the time needed for reading large responses from scrape targets in [stream parsing mode](https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode). This should reduce scrape durations for such targets as [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) running in a big Kubernetes cluster.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add experimental feature for displaying last 10 states of the rule (recording or alerting) evaluation. The state is available on the Rule page, which can be opened by clicking on `Details` link next to Rule's name on the `/groups` page.

* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).
@ -515,6 +515,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
  Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.