Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git (synced 2025-03-11 15:34:56 +00:00)
vmalert: add experimental feature of storing Rule's evaluation state (#3106)
vmalert: add experimental feature of storing Rule's evaluation state

The new feature keeps the last 20 state changes of each Rule in memory. The states are available for viewing on the Rule's page, which can be opened by clicking the `Details` link next to the Rule's name on the `/groups` page. State changes are supposed to help with investigating cases when a Rule doesn't generate alerts or records.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
parent 99bc18774c
commit 877940a131
15 changed files with 1131 additions and 486 deletions
@ -511,6 +511,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
  Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
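The `/vmalert/rule` page listed above is added by this commit. Purely as an illustration (not part of the diff), a minimal Go sketch for fetching it could look as follows; the address and the `group_id`/`rule_id` values are placeholders and would normally be taken from the `Details` links on the `/groups` page.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Placeholder address and IDs; real values come from the
	// `Details` links rendered on the /groups page.
	url := "http://localhost:8880/vmalert/rule?group_id=12345&rule_id=67890"

	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	page, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	// The endpoint serves the rule status page (HTML), including the
	// table of recent evaluation states.
	fmt.Printf("GET %s -> %s (%d bytes)\n", url, resp.Status, len(page))
}
```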
@ -35,21 +35,13 @@ type AlertingRule struct {
|
|||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
alertsMu sync.RWMutex
|
||||
// stores list of active alerts
|
||||
alerts map[uint64]*notifier.Alert
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
lastExecSamples int
|
||||
|
||||
// state stores recent state changes
|
||||
// during evaluations
|
||||
state *ruleState
|
||||
|
||||
metrics *alertingRuleMetrics
|
||||
}
|
||||
|
@ -82,14 +74,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
Debug: cfg.Debug,
|
||||
}),
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
metrics: &alertingRuleMetrics{},
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var num int
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StatePending {
|
||||
|
@ -100,8 +93,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
})
|
||||
ar.metrics.active = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var num int
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateFiring {
|
||||
|
@ -112,18 +105,16 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
})
|
||||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
if ar.lastExecError == nil {
|
||||
e := ar.state.getLast()
|
||||
if e.err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
return float64(ar.lastExecSamples)
|
||||
e := ar.state.getLast()
|
||||
return float64(e.samples)
|
||||
})
|
||||
return ar
|
||||
}
|
||||
|
@ -274,18 +265,26 @@ const resolvedRetention = 15 * time.Minute
|
|||
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
qMetrics, err := ar.q.Query(ctx, ar.Expr, ts)
|
||||
ar.mu.Lock()
|
||||
defer ar.mu.Unlock()
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(qMetrics),
|
||||
err: err,
|
||||
}
|
||||
|
||||
defer func() {
|
||||
ar.state.add(curState)
|
||||
}()
|
||||
|
||||
ar.alertsMu.Lock()
|
||||
defer ar.alertsMu.Unlock()
|
||||
|
||||
ar.lastExecTime = start
|
||||
ar.lastExecDuration = time.Since(start)
|
||||
ar.lastExecError = err
|
||||
ar.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", ar.lastExecSamples, ar.lastExecDuration)
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
|
@ -301,14 +300,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
for _, m := range qMetrics {
|
||||
ls, err := ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
||||
curState.err = fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, curState.err
|
||||
}
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, ar.lastExecError
|
||||
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.err
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
|
@ -327,15 +327,16 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
// in annotations
|
||||
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
curState.err = err
|
||||
return nil, curState.err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
|
||||
a, err := ar.newAlert(m, ls, start, qFn)
|
||||
if err != nil {
|
||||
ar.lastExecError = err
|
||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||
curState.err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.err
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
|
@ -372,7 +373,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
}
|
||||
if limit > 0 && numActivePending > limit {
|
||||
ar.alerts = map[uint64]*notifier.Alert{}
|
||||
return nil, fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.err
|
||||
}
|
||||
return ar.toTimeSeries(ts.Unix()), nil
|
||||
}
|
||||
|
@ -449,8 +451,8 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
|||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.mu.RLock()
|
||||
defer ar.mu.RUnlock()
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
|
@ -458,9 +460,10 @@ func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
|||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// ToAPI returns Rule representation in form
|
||||
// of APIRule
|
||||
// ToAPI returns Rule representation in form of APIRule
|
||||
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
|
||||
func (ar *AlertingRule) ToAPI() APIRule {
|
||||
lastState := ar.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
|
@ -469,19 +472,20 @@ func (ar *AlertingRule) ToAPI() APIRule {
|
|||
Duration: ar.For.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: ar.lastExecTime,
|
||||
EvaluationTime: ar.lastExecDuration.Seconds(),
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ar.AlertsToAPI(),
|
||||
LastSamples: ar.lastExecSamples,
|
||||
LastSamples: lastState.samples,
|
||||
Updates: ar.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if ar.lastExecError != nil {
|
||||
r.LastError = ar.lastExecError.Error()
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
// satisfy APIRule.State logic
|
||||
|
@ -501,14 +505,14 @@ func (ar *AlertingRule) ToAPI() APIRule {
|
|||
// AlertsToAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.mu.RLock()
|
||||
ar.alertsMu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.mu.RUnlock()
|
||||
ar.alertsMu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
|
|
|
@ -735,6 +735,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"),
|
||||
|
@ -774,6 +775,7 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
|
||||
},
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1,
|
||||
|
@ -915,5 +917,11 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
|
|||
}
|
||||
|
||||
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
||||
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor, EvalInterval: waitFor}
|
||||
return &AlertingRule{
|
||||
Name: name,
|
||||
For: waitFor,
|
||||
EvalInterval: waitFor,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,23 @@ type manager struct {
|
|||
groups map[uint64]*Group
|
||||
}
|
||||
|
||||
// RuleAPI generates APIRule object from a rule by its ID(hash)
|
||||
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if rule.ID() == rID {
|
||||
return rule.ToAPI(), nil
|
||||
}
|
||||
}
|
||||
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
|
@ -70,9 +87,9 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
|
|||
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
||||
if err != nil {
|
||||
if !*remoteReadIgnoreRestoreErrors {
|
||||
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
||||
return fmt.Errorf("failed to restore ruleState for group %q: %w", group.Name, err)
|
||||
}
|
||||
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
||||
logger.Errorf("error while restoring ruleState for group %q: %s", group.Name, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ import (
|
|||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
|
@ -27,19 +26,9 @@ type RecordingRule struct {
|
|||
|
||||
q datasource.Querier
|
||||
|
||||
// guard status fields
|
||||
mu sync.RWMutex
|
||||
// stores last moment of time Exec was called
|
||||
lastExecTime time.Time
|
||||
// stores the duration of the last Exec call
|
||||
lastExecDuration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health state
|
||||
lastExecError error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
lastExecSamples int
|
||||
// state stores recent state changes
|
||||
// during evaluations
|
||||
state *ruleState
|
||||
|
||||
metrics *recordingRuleMetrics
|
||||
}
|
||||
|
@ -69,6 +58,7 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
Labels: cfg.Labels,
|
||||
GroupID: group.ID(),
|
||||
metrics: &recordingRuleMetrics{},
|
||||
state: newRuleState(),
|
||||
q: qb.BuildWithParams(datasource.QuerierParams{
|
||||
DataSourceType: group.Type.String(),
|
||||
EvaluationInterval: group.Interval,
|
||||
|
@ -80,18 +70,16 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
if rr.lastExecError == nil {
|
||||
e := rr.state.getLast()
|
||||
if e.err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
})
|
||||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
rr.mu.RLock()
|
||||
defer rr.mu.RUnlock()
|
||||
return float64(rr.lastExecSamples)
|
||||
e := rr.state.getLast()
|
||||
return float64(e.samples)
|
||||
})
|
||||
return rr
|
||||
}
|
||||
|
@ -126,21 +114,28 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
|
|||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
qMetrics, err := rr.q.Query(ctx, rr.Expr, ts)
|
||||
rr.mu.Lock()
|
||||
defer rr.mu.Unlock()
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(qMetrics),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
rr.state.add(curState)
|
||||
}()
|
||||
|
||||
rr.lastExecTime = ts
|
||||
rr.lastExecDuration = time.Since(ts)
|
||||
rr.lastExecError = err
|
||||
rr.lastExecSamples = len(qMetrics)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.err
|
||||
}
|
||||
|
||||
numSeries := len(qMetrics)
|
||||
if limit > 0 && numSeries > limit {
|
||||
return nil, fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.err
|
||||
}
|
||||
|
||||
duplicates := make(map[string]struct{}, len(qMetrics))
|
||||
|
@ -149,8 +144,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
|||
ts := rr.toTimeSeries(r)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
rr.lastExecError = errDuplicate
|
||||
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.err
|
||||
}
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
|
@ -205,23 +200,25 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
|||
// ToAPI returns Rule's representation in form
|
||||
// of APIRule
|
||||
func (rr *RecordingRule) ToAPI() APIRule {
|
||||
lastState := rr.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: rr.lastExecTime,
|
||||
EvaluationTime: rr.lastExecDuration.Seconds(),
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: rr.lastExecSamples,
|
||||
LastSamples: lastState.samples,
|
||||
Updates: rr.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
|
||||
if rr.lastExecError != nil {
|
||||
r.LastError = rr.lastExecError.Error()
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
|
|
|
@ -19,7 +19,7 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
expTS []prompbmarshal.TimeSeries
|
||||
}{
|
||||
{
|
||||
&RecordingRule{Name: "foo"},
|
||||
&RecordingRule{Name: "foo", state: newRuleState()},
|
||||
[]datasource.Metric{metricWithValueAndLabels(t, 10,
|
||||
"__name__", "bar",
|
||||
)},
|
||||
|
@ -30,7 +30,7 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "foobarbaz"},
|
||||
&RecordingRule{Name: "foobarbaz", state: newRuleState()},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
|
||||
|
@ -52,9 +52,12 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
},
|
||||
},
|
||||
{
|
||||
&RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
&RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(),
|
||||
Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
|
@ -195,7 +198,7 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
}
|
||||
rule := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
rule := &RecordingRule{Name: "job:foo", state: newRuleState(), Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
}}
|
||||
var err error
|
||||
|
@ -211,9 +214,13 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestRecordingRule_ExecNegative(t *testing.T) {
|
||||
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
|
||||
"job": "test",
|
||||
}}
|
||||
rr := &RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(),
|
||||
Labels: map[string]string{
|
||||
"job": "test",
|
||||
},
|
||||
}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
|
|
|
@ -3,6 +3,7 @@ package main

import (
    "context"
    "errors"
    "sync"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"

@ -31,3 +32,72 @@ type Rule interface {
}

var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

type ruleState struct {
    sync.RWMutex
    entries []ruleStateEntry
    cur     int
}

type ruleStateEntry struct {
    // stores last moment of time rule.Exec was called
    time time.Time
    // stores the timestamp with which rule.Exec was called
    at time.Time
    // stores the duration of the last rule.Exec call
    duration time.Duration
    // stores last error that happened in Exec func
    // resets on every successful Exec
    // may be used as Health ruleState
    err error
    // stores the number of samples returned during
    // the last evaluation
    samples int
}

const defaultStateEntriesLimit = 20

func newRuleState() *ruleState {
    return &ruleState{
        entries: make([]ruleStateEntry, defaultStateEntriesLimit),
    }
}

func (s *ruleState) getLast() ruleStateEntry {
    s.RLock()
    defer s.RUnlock()
    return s.entries[s.cur]
}

func (s *ruleState) getAll() []ruleStateEntry {
    entries := make([]ruleStateEntry, 0)

    s.RLock()
    defer s.RUnlock()

    cur := s.cur
    for {
        e := s.entries[cur]
        if !e.time.IsZero() || !e.at.IsZero() {
            entries = append(entries, e)
        }
        cur--
        if cur < 0 {
            cur = cap(s.entries) - 1
        }
        if cur == s.cur {
            return entries
        }
    }
}

func (s *ruleState) add(e ruleStateEntry) {
    s.Lock()
    defer s.Unlock()

    s.cur++
    if s.cur > cap(s.entries)-1 {
        s.cur = 0
    }
    s.entries[s.cur] = e
}
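A brief usage sketch of the `ruleState` ring buffer introduced above (illustrative only, not part of the diff). It relies solely on the `newRuleState`, `add`, `getLast` and `getAll` functions shown in this hunk: the buffer holds `defaultStateEntriesLimit` (20) entries, `add` overwrites the oldest slot once the buffer is full, `getLast` returns the most recent entry, and `getAll` returns the non-empty entries newest first.

```go
// Illustrative only: exercising the ruleState ring buffer defined above.
func exampleRuleStateUsage() {
	s := newRuleState()

	// Record more evaluations than the buffer can hold, so the
	// oldest five entries get overwritten.
	for i := 0; i < 25; i++ {
		s.add(ruleStateEntry{
			time:     time.Now(),
			at:       time.Now(),
			duration: 10 * time.Millisecond,
			samples:  i,
		})
	}

	last := s.getLast() // the 25th entry (samples == 24)
	all := s.getAll()   // 20 entries, ordered newest to oldest

	_ = last
	_ = all
}
```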
app/vmalert/rule_test.go (new file, 81 additions)
|
@ -0,0 +1,81 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRule_state(t *testing.T) {
|
||||
state := newRuleState()
|
||||
e := state.getLast()
|
||||
if !e.at.IsZero() {
|
||||
t.Fatalf("expected entry to be zero")
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
state.add(ruleStateEntry{at: now})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now)
|
||||
}
|
||||
|
||||
time.Sleep(time.Millisecond)
|
||||
now2 := time.Now()
|
||||
state.add(ruleStateEntry{at: now2})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now2 {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now2)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != 2 {
|
||||
t.Fatalf("expected for state to have 2 entries only; got %d",
|
||||
len(state.getAll()),
|
||||
)
|
||||
}
|
||||
|
||||
var last time.Time
|
||||
for i := 0; i < defaultStateEntriesLimit*2; i++ {
|
||||
last = time.Now()
|
||||
state.add(ruleStateEntry{at: last})
|
||||
}
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != last {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, last)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != defaultStateEntriesLimit {
|
||||
t.Fatalf("expected for state to have %d entries only; got %d",
|
||||
defaultStateEntriesLimit, len(state.getAll()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRule_stateConcurrent is supposed to test concurrent
|
||||
// execution of state updates.
|
||||
// Should be executed with -race flag
|
||||
func TestRule_stateConcurrent(t *testing.T) {
|
||||
state := newRuleState()
|
||||
|
||||
const workers = 50
|
||||
const iterations = 100
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
state.getAll()
|
||||
state.getLast()
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
|
@ -85,6 +85,14 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
|||
}
|
||||
WriteAlert(w, r, alert)
|
||||
return true
|
||||
case "/vmalert/rule":
|
||||
rule, err := rh.getRule(r)
|
||||
if err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
return true
|
||||
}
|
||||
WriteRuleDetails(w, r, rule)
|
||||
return true
|
||||
case "/vmalert/groups":
|
||||
WriteListGroups(w, r, rh.groups())
|
||||
return true
|
||||
|
@ -168,8 +176,25 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
|||
const (
|
||||
paramGroupID = "group_id"
|
||||
paramAlertID = "alert_id"
|
||||
paramRuleID = "rule_id"
|
||||
)
|
||||
|
||||
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
|
||||
}
|
||||
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
|
||||
}
|
||||
rule, err := rh.m.RuleAPI(groupID, ruleID)
|
||||
if err != nil {
|
||||
return APIRule{}, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return rule, nil
|
||||
}
|
||||
|
||||
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
if err != nil {
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
{% endfunc %}
|
||||
|
||||
{% func ListGroups(r *http.Request, groups []APIGroup) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Groups") %}
|
||||
{% if len(groups) > 0 %}
|
||||
{%code
|
||||
|
@ -85,6 +86,7 @@
|
|||
{% else %}
|
||||
<b>record:</b> {%s r.Name %}
|
||||
{% endif %}
|
||||
| <span><a target="_blank" href="{%s prefix+r.WebLink() %}">Details</a></span>
|
||||
</div>
|
||||
<div class="col-12">
|
||||
<code><pre>{%s r.Query %}</pre></code>
|
||||
|
@ -116,7 +118,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No groups...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -204,7 +206,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No active alerts...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -260,7 +262,7 @@
|
|||
|
||||
{% else %}
|
||||
<div>
|
||||
<p>No items...</p>
|
||||
<p>No targets...</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
@ -284,7 +286,7 @@
|
|||
}
|
||||
sort.Strings(annotationKeys)
|
||||
%}
|
||||
<div class="display-6 pb-3 mb-3">{%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
|
||||
<div class="display-6 pb-3 mb-3">Alert: {%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
|
@ -354,6 +356,117 @@
|
|||
|
||||
{% endfunc %}
|
||||
|
||||
|
||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "") %}
|
||||
{%code
|
||||
var labelKeys []string
|
||||
for k := range rule.Labels {
|
||||
labelKeys = append(labelKeys, k)
|
||||
}
|
||||
sort.Strings(labelKeys)
|
||||
|
||||
var annotationKeys []string
|
||||
for k := range rule.Annotations {
|
||||
annotationKeys = append(annotationKeys, k)
|
||||
}
|
||||
sort.Strings(annotationKeys)
|
||||
%}
|
||||
<div class="display-6 pb-3 mb-3">Rule: {%s rule.Name %}<span class="ms-2 badge {% if rule.Health!="ok" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s rule.Health %}</span></div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Expr
|
||||
</div>
|
||||
<div class="col">
|
||||
<code><pre>{%s rule.Query %}</pre></code>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
For
|
||||
</div>
|
||||
<div class="col">
|
||||
{%v rule.Duration %} seconds
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Labels
|
||||
</div>
|
||||
<div class="col">
|
||||
{% for _, k := range labelKeys %}
|
||||
<span class="m-1 badge bg-primary">{%s k %}={%s rule.Labels[k] %}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Annotations
|
||||
</div>
|
||||
<div class="col">
|
||||
{% for _, k := range annotationKeys %}
|
||||
<b>{%s k %}:</b><br>
|
||||
<p>{%s rule.Annotations[k] %}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container border-bottom p-2">
|
||||
<div class="row">
|
||||
<div class="col-2">
|
||||
Group
|
||||
</div>
|
||||
<div class="col">
|
||||
<a target="_blank" href="{%s prefix %}groups#group-{%s rule.GroupID %}">{%s rule.GroupID %}</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
<div class="display-6 pb-3">Last {%d len(rule.Updates) %} updates</span>:</div>
|
||||
<table class="table table-striped table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th scope="col" style="width: 20%" title="The time when event was created">Updated at</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="How many samples were returned">Samples</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="How many seconds request took">Duration</th>
|
||||
<th scope="col" style="width: 20%" class="text-center" title="Time used for rule execution">Executed at</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
||||
{% for _, u := range rule.Updates %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
<td>
|
||||
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
|
||||
</td>
|
||||
<td class="text-center">{%d u.samples %}</td>
|
||||
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
|
||||
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
|
||||
</tr>
|
||||
</li>
|
||||
{% if u.err != nil %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
<td colspan="4">
|
||||
<span class="alert-danger">{%v u.err %}</span>
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{%= tpl.Footer(r) %}
|
||||
{% endfunc %}
|
||||
|
||||
|
||||
|
||||
{% func badgeState(state string) %}
|
||||
{%code
|
||||
badgeClass := "bg-warning text-dark"
|
||||
|
|
File diff suppressed because it is too large
@ -17,6 +17,7 @@ func TestHandler(t *testing.T) {
|
|||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {State: notifier.StateFiring},
|
||||
},
|
||||
state: newRuleState(),
|
||||
}
|
||||
g := &Group{
|
||||
Name: "group",
|
||||
|
@ -52,6 +53,22 @@ func TestHandler(t *testing.T) {
|
|||
t.Run("/", func(t *testing.T) {
|
||||
getResp(ts.URL, nil, 200)
|
||||
getResp(ts.URL+"/vmalert", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/alerts", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/groups", nil, 200)
|
||||
getResp(ts.URL+"/vmalert/notifiers", nil, 200)
|
||||
getResp(ts.URL+"/rules", nil, 200)
|
||||
})
|
||||
|
||||
t.Run("/vmalert/rule", func(t *testing.T) {
|
||||
a := ar.ToAPI()
|
||||
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
|
||||
})
|
||||
t.Run("/vmalert/rule?badParam", func(t *testing.T) {
|
||||
params := fmt.Sprintf("?%s=0&%s=1", paramGroupID, paramRuleID)
|
||||
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
|
||||
|
||||
params = fmt.Sprintf("?%s=1&%s=0", paramGroupID, paramRuleID)
|
||||
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
|
||||
})
|
||||
|
||||
t.Run("/api/v1/alerts", func(t *testing.T) {
|
||||
|
|
|
@ -5,11 +5,11 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// APIAlert represents a notifier.AlertingRule state
|
||||
// APIAlert represents a notifier.AlertingRule ruleState
|
||||
// for WEB view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIAlert struct {
|
||||
State string `json:"state"`
|
||||
State string `json:"ruleState"`
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
|
@ -30,7 +30,7 @@ type APIAlert struct {
|
|||
// SourceLink contains a link to a system which should show
|
||||
// why Alert was generated
|
||||
SourceLink string `json:"source"`
|
||||
// Restored shows whether Alert's state was restored on restart
|
||||
// Restored shows whether Alert's ruleState was restored on restart
|
||||
Restored bool `json:"restored"`
|
||||
}
|
||||
|
||||
|
@ -86,10 +86,10 @@ type GroupAlerts struct {
|
|||
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIRule struct {
|
||||
// State must be one of these under following scenarios
|
||||
// "pending": at least 1 alert in the rule in pending state and no other alert in firing state.
|
||||
// "firing": at least 1 alert in the rule in firing state.
|
||||
// "inactive": no alert in the rule in firing or pending state.
|
||||
State string `json:"state"`
|
||||
// "pending": at least 1 alert in the rule in pending ruleState and no other alert in firing ruleState.
|
||||
// "firing": at least 1 alert in the rule in firing ruleState.
|
||||
// "inactive": no alert in the rule in firing or pending ruleState.
|
||||
State string `json:"ruleState"`
|
||||
Name string `json:"name"`
|
||||
// Query represents Rule's `expression` field
|
||||
Query string `json:"query"`
|
||||
|
@ -116,8 +116,17 @@ type APIRule struct {
|
|||
// Type of the rule: recording or alerting
|
||||
DatasourceType string `json:"datasourceType"`
|
||||
LastSamples int `json:"lastSamples"`
|
||||
// ID is an unique Alert's ID within a group
|
||||
// ID is a unique Alert's ID within a group
|
||||
ID string `json:"id"`
|
||||
// GroupID is an unique Group's ID
|
||||
GroupID string `json:"group_id"`
|
||||
|
||||
// TODO:
|
||||
Updates []ruleStateEntry `json:"updates"`
|
||||
}
|
||||
|
||||
// WebLink returns a link to the alert which can be used in UI.
|
||||
func (ar APIRule) WebLink() string {
|
||||
return fmt.Sprintf("rule?%s=%s&%s=%s",
|
||||
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): minimize the time needed for reading large responses from scrape targets in [stream parsing mode](https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode). This should reduce scrape durations for such targets as [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) running in a big Kubernetes cluster.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add experimental feature for displaying last 10 states of the rule (recording or alerting) evaluation. The state is available on the Rule page, which can be opened by clicking on `Details` link next to Rule's name on the `/groups` page.

* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).
@ -515,6 +515,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
  Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.