vmalert: add experimental feature of storing Rule's evaluation state (#3106)

vmalert: add experimental feature of storing Rule's evaluation state

The new feature keeps the last 20 state changes of each Rule
in memory. The states are available for viewing on the Rule's
view page. The page can be opened by clicking the `Details`
link next to the Rule's name on the `/groups` page.

State changes are supposed to help with investigating cases when a Rule
doesn't produce alerts or recordings.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
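A minimal sketch of how the new state buffer behaves, assuming it is compiled inside the vmalert package where the `ruleState` type from app/vmalert/rule.go below is defined (the sample values are made up):

package main

import (
	"fmt"
	"time"
)

func exampleRuleState() {
	s := newRuleState() // ring buffer holding the last 20 evaluation states
	// record two evaluations: a successful one and a failed one
	s.add(ruleStateEntry{at: time.Now(), samples: 10, duration: time.Second})
	s.add(ruleStateEntry{at: time.Now(), err: fmt.Errorf("query failed")})

	last := s.getLast() // the most recent evaluation, i.e. the failed one
	all := s.getAll()   // up to 20 entries, most recent first
	fmt.Println(last.err, len(all))
}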
Roman Khavronenko 2022-09-14 14:04:24 +02:00 committed by Aliaksandr Valialkin
parent d071e39694
commit 6ae4f3526b
15 changed files with 1131 additions and 486 deletions


@ -511,6 +511,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
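For example, assuming vmalert listens on localhost:8880 (its default) and using hypothetical IDs, the rule status page would be available at `http://localhost:8880/vmalert/rule?group_id=12345&rule_id=67890`.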


@ -35,21 +35,13 @@ type AlertingRule struct {
q datasource.Querier
// guard status fields
mu sync.RWMutex
alertsMu sync.RWMutex
// stores list of active alerts
alerts map[uint64]*notifier.Alert
// stores last moment of time Exec was called
lastExecTime time.Time
// stores the duration of the last Exec call
lastExecDuration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
// stores the number of samples returned during
// the last evaluation
lastExecSamples int
// state stores recent state changes
// during evaluations
state *ruleState
metrics *alertingRuleMetrics
}
@ -82,14 +74,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
Debug: cfg.Debug,
}),
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(),
metrics: &alertingRuleMetrics{},
}
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
func() float64 {
ar.mu.RLock()
defer ar.mu.RUnlock()
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StatePending {
@ -100,8 +93,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
})
ar.metrics.active = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
func() float64 {
ar.mu.RLock()
defer ar.mu.RUnlock()
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
var num int
for _, a := range ar.alerts {
if a.State == notifier.StateFiring {
@ -112,18 +105,16 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
})
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
ar.mu.RLock()
defer ar.mu.RUnlock()
if ar.lastExecError == nil {
e := ar.state.getLast()
if e.err == nil {
return 0
}
return 1
})
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
ar.mu.RLock()
defer ar.mu.RUnlock()
return float64(ar.lastExecSamples)
e := ar.state.getLast()
return float64(e.samples)
})
return ar
}
@ -274,18 +265,26 @@ const resolvedRetention = 15 * time.Minute
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
qMetrics, err := ar.q.Query(ctx, ar.Expr, ts)
ar.mu.Lock()
defer ar.mu.Unlock()
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(qMetrics),
err: err,
}
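// store the collected state entry on every exit path, including early returns on error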
defer func() {
ar.state.add(curState)
}()
ar.alertsMu.Lock()
defer ar.alertsMu.Unlock()
ar.lastExecTime = start
ar.lastExecDuration = time.Since(start)
ar.lastExecError = err
ar.lastExecSamples = len(qMetrics)
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", ar.lastExecSamples, ar.lastExecDuration)
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
@ -301,14 +300,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
for _, m := range qMetrics {
ls, err := ar.toLabels(m, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
curState.err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.err
}
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
ar.lastExecError = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, ar.lastExecError
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.err
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
@ -327,15 +327,16 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
// in annotations
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
if err != nil {
return nil, err
curState.err = err
return nil, curState.err
}
}
continue
}
a, err := ar.newAlert(m, ls, ar.lastExecTime, qFn)
a, err := ar.newAlert(m, ls, start, qFn)
if err != nil {
ar.lastExecError = err
return nil, fmt.Errorf("failed to create alert: %w", err)
curState.err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.err
}
a.ID = h
a.State = notifier.StatePending
@ -372,7 +373,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
if limit > 0 && numActivePending > limit {
ar.alerts = map[uint64]*notifier.Alert{}
return nil, fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.err
}
return ar.toTimeSeries(ts.Unix()), nil
}
@ -449,8 +451,8 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
// AlertAPI generates APIAlert object from alert by its id(hash)
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
ar.mu.RLock()
defer ar.mu.RUnlock()
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
a, ok := ar.alerts[id]
if !ok {
return nil
@ -458,9 +460,10 @@ func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
return ar.newAlertAPI(*a)
}
// ToAPI returns Rule representation in form
// of APIRule
// ToAPI returns Rule representation in form of APIRule
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
func (ar *AlertingRule) ToAPI() APIRule {
lastState := ar.state.getLast()
r := APIRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
@ -469,19 +472,20 @@ func (ar *AlertingRule) ToAPI() APIRule {
Duration: ar.For.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: ar.lastExecTime,
EvaluationTime: ar.lastExecDuration.Seconds(),
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: ar.lastExecSamples,
LastSamples: lastState.samples,
Updates: ar.state.getAll(),
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if ar.lastExecError != nil {
r.LastError = ar.lastExecError.Error()
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
// satisfy APIRule.State logic
@ -501,14 +505,14 @@ func (ar *AlertingRule) ToAPI() APIRule {
// AlertsToAPI generates list of APIAlert objects from existing alerts
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
var alerts []*APIAlert
ar.mu.RLock()
ar.alertsMu.RLock()
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, ar.newAlertAPI(*a))
}
ar.mu.RUnlock()
ar.alertsMu.RUnlock()
return alerts
}


@ -735,6 +735,7 @@ func TestAlertingRule_Template(t *testing.T) {
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
},
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"),
@ -774,6 +775,7 @@ func TestAlertingRule_Template(t *testing.T) {
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
},
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 1,
@ -915,5 +917,11 @@ func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
}
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
return &AlertingRule{Name: name, alerts: make(map[uint64]*notifier.Alert), For: waitFor, EvalInterval: waitFor}
return &AlertingRule{
Name: name,
For: waitFor,
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(),
}
}


@ -30,6 +30,23 @@ type manager struct {
groups map[uint64]*Group
}
// RuleAPI generates APIRule object from a rule by its ID (hash)
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
g, ok := m.groups[gID]
if !ok {
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
if rule.ID() == rID {
return rule.ToAPI(), nil
}
}
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
}
// AlertAPI generates APIAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
m.groupsMu.RLock()
@ -70,9 +87,9 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) er
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
if !*remoteReadIgnoreRestoreErrors {
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
return fmt.Errorf("failed to restore ruleState for group %q: %w", group.Name, err)
}
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
logger.Errorf("error while restoring ruleState for group %q: %s", group.Name, err)
}
}


@ -5,7 +5,6 @@ import (
"fmt"
"sort"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
@ -27,19 +26,9 @@ type RecordingRule struct {
q datasource.Querier
// guard status fields
mu sync.RWMutex
// stores last moment of time Exec was called
lastExecTime time.Time
// stores the duration of the last Exec call
lastExecDuration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
// stores the number of samples returned during
// the last evaluation
lastExecSamples int
// state stores recent state changes
// during evaluations
state *ruleState
metrics *recordingRuleMetrics
}
@ -69,6 +58,7 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
Labels: cfg.Labels,
GroupID: group.ID(),
metrics: &recordingRuleMetrics{},
state: newRuleState(),
q: qb.BuildWithParams(datasource.QuerierParams{
DataSourceType: group.Type.String(),
EvaluationInterval: group.Interval,
@ -80,18 +70,16 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
rr.mu.RLock()
defer rr.mu.RUnlock()
if rr.lastExecError == nil {
e := rr.state.getLast()
if e.err == nil {
return 0
}
return 1
})
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
rr.mu.RLock()
defer rr.mu.RUnlock()
return float64(rr.lastExecSamples)
e := rr.state.getLast()
return float64(e.samples)
})
return rr
}
@ -126,21 +114,28 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
qMetrics, err := rr.q.Query(ctx, rr.Expr, ts)
rr.mu.Lock()
defer rr.mu.Unlock()
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(qMetrics),
}
defer func() {
rr.state.add(curState)
}()
rr.lastExecTime = ts
rr.lastExecDuration = time.Since(ts)
rr.lastExecError = err
rr.lastExecSamples = len(qMetrics)
if err != nil {
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.err
}
numSeries := len(qMetrics)
if limit > 0 && numSeries > limit {
return nil, fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.err
}
duplicates := make(map[string]struct{}, len(qMetrics))
@ -149,8 +144,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
ts := rr.toTimeSeries(r)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
rr.lastExecError = errDuplicate
return nil, fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.err
}
duplicates[key] = struct{}{}
tss = append(tss, ts)
@ -205,23 +200,25 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
// ToAPI returns Rule's representation in form
// of APIRule
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: rr.lastExecTime,
EvaluationTime: rr.lastExecDuration.Seconds(),
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: rr.lastExecSamples,
LastSamples: lastState.samples,
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if rr.lastExecError != nil {
r.LastError = rr.lastExecError.Error()
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
return r


@ -19,7 +19,7 @@ func TestRecordingRule_Exec(t *testing.T) {
expTS []prompbmarshal.TimeSeries
}{
{
&RecordingRule{Name: "foo"},
&RecordingRule{Name: "foo", state: newRuleState()},
[]datasource.Metric{metricWithValueAndLabels(t, 10,
"__name__", "bar",
)},
@ -30,7 +30,7 @@ func TestRecordingRule_Exec(t *testing.T) {
},
},
{
&RecordingRule{Name: "foobarbaz"},
&RecordingRule{Name: "foobarbaz", state: newRuleState()},
[]datasource.Metric{
metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
@ -52,9 +52,12 @@ func TestRecordingRule_Exec(t *testing.T) {
},
},
{
&RecordingRule{Name: "job:foo", Labels: map[string]string{
"source": "test",
}},
&RecordingRule{
Name: "job:foo",
state: newRuleState(),
Labels: map[string]string{
"source": "test",
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
@ -195,7 +198,7 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
}
rule := &RecordingRule{Name: "job:foo", Labels: map[string]string{
rule := &RecordingRule{Name: "job:foo", state: newRuleState(), Labels: map[string]string{
"source": "test_limit",
}}
var err error
@ -211,9 +214,13 @@ func TestRecordingRuleLimit(t *testing.T) {
}
func TestRecordingRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{Name: "job:foo", Labels: map[string]string{
"job": "test",
}}
rr := &RecordingRule{
Name: "job:foo",
state: newRuleState(),
Labels: map[string]string{
"job": "test",
},
}
fq := &fakeQuerier{}
expErr := "connection reset by peer"


@ -3,6 +3,7 @@ package main
import (
"context"
"errors"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
@ -31,3 +32,72 @@ type Rule interface {
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")
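// ruleState is a fixed-size ring buffer which keeps the results
// of the most recent rule evaluations; cur points to the latest written entry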
type ruleState struct {
sync.RWMutex
entries []ruleStateEntry
cur int
}
type ruleStateEntry struct {
// stores last moment of time rule.Exec was called
time time.Time
// stores the timestamp with which rule.Exec was called
at time.Time
// stores the duration of the last rule.Exec call
duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
err error
// stores the number of samples returned during
// the last evaluation
samples int
}
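// defaultStateEntriesLimit is the number of recent evaluation states kept per rule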
const defaultStateEntriesLimit = 20
func newRuleState() *ruleState {
return &ruleState{
entries: make([]ruleStateEntry, defaultStateEntriesLimit),
}
}
func (s *ruleState) getLast() ruleStateEntry {
s.RLock()
defer s.RUnlock()
return s.entries[s.cur]
}
func (s *ruleState) getAll() []ruleStateEntry {
entries := make([]ruleStateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
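// walk the ring backwards from the most recent entry, skipping unused (zero) slots, until a full circle is made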
for {
e := s.entries[cur]
if !e.time.IsZero() || !e.at.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e ruleStateEntry) {
s.Lock()
defer s.Unlock()
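// advance the cursor with wrap-around so that the oldest entry is overwritten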
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}

app/vmalert/rule_test.go (new file, 81 lines)

@ -0,0 +1,81 @@
package main
import (
"sync"
"testing"
"time"
)
func TestRule_state(t *testing.T) {
state := newRuleState()
e := state.getLast()
if !e.at.IsZero() {
t.Fatalf("expected entry to be zero")
}
now := time.Now()
state.add(ruleStateEntry{at: now})
e = state.getLast()
if e.at != now {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now)
}
time.Sleep(time.Millisecond)
now2 := time.Now()
state.add(ruleStateEntry{at: now2})
e = state.getLast()
if e.at != now2 {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now2)
}
if len(state.getAll()) != 2 {
t.Fatalf("expected for state to have 2 entries only; got %d",
len(state.getAll()),
)
}
var last time.Time
for i := 0; i < defaultStateEntriesLimit*2; i++ {
last = time.Now()
state.add(ruleStateEntry{at: last})
}
e = state.getLast()
if e.at != last {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, last)
}
if len(state.getAll()) != defaultStateEntriesLimit {
t.Fatalf("expected for state to have %d entries only; got %d",
defaultStateEntriesLimit, len(state.getAll()),
)
}
}
// TestRule_stateConcurrent is supposed to test concurrent
// execution of state updates.
// Should be executed with the -race flag.
func TestRule_stateConcurrent(t *testing.T) {
state := newRuleState()
const workers = 50
const iterations = 100
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
state.add(ruleStateEntry{at: time.Now()})
state.getAll()
state.getLast()
}
}()
}
wg.Wait()
}


@ -85,6 +85,14 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
}
WriteAlert(w, r, alert)
return true
case "/vmalert/rule":
rule, err := rh.getRule(r)
if err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
WriteRuleDetails(w, r, rule)
return true
case "/vmalert/groups":
WriteListGroups(w, r, rh.groups())
return true
@ -168,8 +176,25 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
const (
paramGroupID = "group_id"
paramAlertID = "alert_id"
paramRuleID = "rule_id"
)
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
}
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
}
rule, err := rh.m.RuleAPI(groupID, ruleID)
if err != nil {
return APIRule{}, errResponse(err, http.StatusNotFound)
}
return rule, nil
}
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
if err != nil {


@ -26,6 +26,7 @@
{% endfunc %}
{% func ListGroups(r *http.Request, groups []APIGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups") %}
{% if len(groups) > 0 %}
{%code
@ -85,6 +86,7 @@
{% else %}
<b>record:</b> {%s r.Name %}
{% endif %}
| <span><a target="_blank" href="{%s prefix+r.WebLink() %}">Details</a></span>
</div>
<div class="col-12">
<code><pre>{%s r.Query %}</pre></code>
@ -116,7 +118,7 @@
{% else %}
<div>
<p>No items...</p>
<p>No groups...</p>
</div>
{% endif %}
@ -204,7 +206,7 @@
{% else %}
<div>
<p>No items...</p>
<p>No active alerts...</p>
</div>
{% endif %}
@ -260,7 +262,7 @@
{% else %}
<div>
<p>No items...</p>
<p>No targets...</p>
</div>
{% endif %}
@ -284,7 +286,7 @@
}
sort.Strings(annotationKeys)
%}
<div class="display-6 pb-3 mb-3">{%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
<div class="display-6 pb-3 mb-3">Alert: {%s alert.Name %}<span class="ms-2 badge {% if alert.State=="firing" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s alert.State %}</span></div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
@ -354,6 +356,117 @@
{% endfunc %}
{% func RuleDetails(r *http.Request, rule APIRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "") %}
{%code
var labelKeys []string
for k := range rule.Labels {
labelKeys = append(labelKeys, k)
}
sort.Strings(labelKeys)
var annotationKeys []string
for k := range rule.Annotations {
annotationKeys = append(annotationKeys, k)
}
sort.Strings(annotationKeys)
%}
<div class="display-6 pb-3 mb-3">Rule: {%s rule.Name %}<span class="ms-2 badge {% if rule.Health!="ok" %}bg-danger{% else %} bg-warning text-dark{% endif %}">{%s rule.Health %}</span></div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
Expr
</div>
<div class="col">
<code><pre>{%s rule.Query %}</pre></code>
</div>
</div>
</div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
For
</div>
<div class="col">
{%v rule.Duration %} seconds
</div>
</div>
</div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
Labels
</div>
<div class="col">
{% for _, k := range labelKeys %}
<span class="m-1 badge bg-primary">{%s k %}={%s rule.Labels[k] %}</span>
{% endfor %}
</div>
</div>
</div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
Annotations
</div>
<div class="col">
{% for _, k := range annotationKeys %}
<b>{%s k %}:</b><br>
<p>{%s rule.Annotations[k] %}</p>
{% endfor %}
</div>
</div>
</div>
<div class="container border-bottom p-2">
<div class="row">
<div class="col-2">
Group
</div>
<div class="col">
<a target="_blank" href="{%s prefix %}groups#group-{%s rule.GroupID %}">{%s rule.GroupID %}</a>
</div>
</div>
</div>
<br>
<div class="display-6 pb-3">Last {%d len(rule.Updates) %} updates</span>:</div>
<table class="table table-striped table-hover table-sm">
<thead>
<tr>
<th scope="col" style="width: 20%" title="The time when event was created">Updated at</th>
<th scope="col" style="width: 20%" class="text-center" title="How many samples were returned">Samples</th>
<th scope="col" style="width: 20%" class="text-center" title="How many seconds request took">Duration</th>
<th scope="col" style="width: 20%" class="text-center" title="Time used for rule execution">Executed at</th>
</tr>
</thead>
<tbody>
{% for _, u := range rule.Updates %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
</td>
<td class="text-center">{%d u.samples %}</td>
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
</tr>
</li>
{% if u.err != nil %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
<td colspan="4">
<span class="alert-danger">{%v u.err %}</span>
</td>
</tr>
{% endif %}
{% endfor %}
</tbody>
</table>
{%= tpl.Footer(r) %}
{% endfunc %}
{% func badgeState(state string) %}
{%code
badgeClass := "bg-warning text-dark"

File diff suppressed because it is too large.


@ -17,6 +17,7 @@ func TestHandler(t *testing.T) {
alerts: map[uint64]*notifier.Alert{
0: {State: notifier.StateFiring},
},
state: newRuleState(),
}
g := &Group{
Name: "group",
@ -52,6 +53,22 @@ func TestHandler(t *testing.T) {
t.Run("/", func(t *testing.T) {
getResp(ts.URL, nil, 200)
getResp(ts.URL+"/vmalert", nil, 200)
getResp(ts.URL+"/vmalert/alerts", nil, 200)
getResp(ts.URL+"/vmalert/groups", nil, 200)
getResp(ts.URL+"/vmalert/notifiers", nil, 200)
getResp(ts.URL+"/rules", nil, 200)
})
t.Run("/vmalert/rule", func(t *testing.T) {
a := ar.ToAPI()
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
})
t.Run("/vmalert/rule?badParam", func(t *testing.T) {
params := fmt.Sprintf("?%s=0&%s=1", paramGroupID, paramRuleID)
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
params = fmt.Sprintf("?%s=1&%s=0", paramGroupID, paramRuleID)
getResp(ts.URL+"/vmalert/rule"+params, nil, 404)
})
t.Run("/api/v1/alerts", func(t *testing.T) {


@ -5,11 +5,11 @@ import (
"time"
)
// APIAlert represents a notifier.AlertingRule state
// for WEB view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIAlert struct {
State string `json:"state"`
Name string `json:"name"`
Value string `json:"value"`
Labels map[string]string `json:"labels,omitempty"`
@ -30,7 +30,7 @@ type APIAlert struct {
// SourceLink contains a link to a system which should show
// why Alert was generated
SourceLink string `json:"source"`
// Restored shows whether Alert's state was restored on restart
Restored bool `json:"restored"`
}
@ -86,10 +86,10 @@ type GroupAlerts struct {
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIRule struct {
// State must be one of these under following scenarios
// "pending": at least 1 alert in the rule in pending state and no other alert in firing state.
// "firing": at least 1 alert in the rule in firing state.
// "inactive": no alert in the rule in firing or pending state.
State string `json:"state"`
// "pending": at least 1 alert in the rule in pending ruleState and no other alert in firing ruleState.
// "firing": at least 1 alert in the rule in firing ruleState.
// "inactive": no alert in the rule in firing or pending ruleState.
State string `json:"ruleState"`
Name string `json:"name"`
// Query represents Rule's `expression` field
Query string `json:"query"`
@ -116,8 +116,17 @@ type APIRule struct {
// Type of the rule: recording or alerting
DatasourceType string `json:"datasourceType"`
LastSamples int `json:"lastSamples"`
// ID is an unique Alert's ID within a group
// ID is a unique rule ID within a group
ID string `json:"id"`
// GroupID is a unique Group's ID
GroupID string `json:"group_id"`
// TODO:
Updates []ruleStateEntry `json:"updates"`
}
// WebLink returns a link to the rule which can be used in UI.
func (ar APIRule) WebLink() string {
return fmt.Sprintf("rule?%s=%s&%s=%s",
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
}
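As a quick illustration with hypothetical IDs, the generated link is relative to the vmalert web prefix and matches the `/vmalert/rule` endpoint documented in the README:

// prints "rule?group_id=1234&rule_id=5678"
func exampleRuleWebLink() {
	r := APIRule{GroupID: "1234", ID: "5678"}
	fmt.Println(r.WebLink())
}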


@ -24,6 +24,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): minimize the time needed for reading large responses from scrape targets in [stream parsing mode](https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode). This should reduce scrape durations for such targets as [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) running in a big Kubernetes cluster.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add experimental feature for displaying the last 20 evaluation states of a rule (recording or alerting). The states are available on the Rule page, which can be opened by clicking the `Details` link next to the Rule's name on the `/groups` page.
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).


@ -515,6 +515,7 @@ or time series modification via [relabeling](https://docs.victoriametrics.com/vm
* `http://<vmalert-addr>/vmalert/api/v1/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in JSON format.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/vmalert/alert?group_id=<group_id>&alert_id=<alert_id>` - get alert status in web UI.
* `http://<vmalert-addr>/vmalert/rule?group_id=<group_id>&rule_id=<rule_id>` - get rule status in web UI.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.