mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-03-11 15:34:56 +00:00
vmalert: add debug
mode for alerting rules (#3055)
* vmalert: add `debug` mode for alerting rules Debug information includes alerts state changes and requests sent to the datasource. Debug can be enabled only on rule's level. It might be useful for debugging unexpected behaviour of alerting rule. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3025 Signed-off-by: hagen1778 <roman@victoriametrics.com> * vmalert: review fixes Signed-off-by: hagen1778 <roman@victoriametrics.com> * Update app/vmalert/alerting.go Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com> * vmalert: go fmt Signed-off-by: hagen1778 <roman@victoriametrics.com> Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
This commit is contained in:
parent
6b5843eb66
commit
a887c1bc07
9 changed files with 84 additions and 2 deletions
|
@ -184,6 +184,13 @@ expr: <string>
|
||||||
# as firing once they return.
|
# as firing once they return.
|
||||||
[ for: <duration> | default = 0s ]
|
[ for: <duration> | default = 0s ]
|
||||||
|
|
||||||
|
# Whether to print debug information into logs.
|
||||||
|
# Information includes alerts state changes and requests sent to the datasource.
|
||||||
|
# Please note, that if rule's query params contain sensitive
|
||||||
|
# information - it will be printed to logs.
|
||||||
|
# Is applicable to alerting rules only.
|
||||||
|
[ debug: <bool> | default = false ]
|
||||||
|
|
||||||
# Labels to add or overwrite for each alert.
|
# Labels to add or overwrite for each alert.
|
||||||
labels:
|
labels:
|
||||||
[ <labelname>: <tmpl_string> ]
|
[ <labelname>: <tmpl_string> ]
|
||||||
|
|
|
@ -6,6 +6,7 @@ import (
|
||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -30,6 +31,7 @@ type AlertingRule struct {
|
||||||
GroupID uint64
|
GroupID uint64
|
||||||
GroupName string
|
GroupName string
|
||||||
EvalInterval time.Duration
|
EvalInterval time.Duration
|
||||||
|
Debug bool
|
||||||
|
|
||||||
q datasource.Querier
|
q datasource.Querier
|
||||||
|
|
||||||
|
@ -71,11 +73,13 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
||||||
GroupID: group.ID(),
|
GroupID: group.ID(),
|
||||||
GroupName: group.Name,
|
GroupName: group.Name,
|
||||||
EvalInterval: group.Interval,
|
EvalInterval: group.Interval,
|
||||||
|
Debug: cfg.Debug,
|
||||||
q: qb.BuildWithParams(datasource.QuerierParams{
|
q: qb.BuildWithParams(datasource.QuerierParams{
|
||||||
DataSourceType: group.Type.String(),
|
DataSourceType: group.Type.String(),
|
||||||
EvaluationInterval: group.Interval,
|
EvaluationInterval: group.Interval,
|
||||||
QueryParams: group.Params,
|
QueryParams: group.Params,
|
||||||
Headers: group.Headers,
|
Headers: group.Headers,
|
||||||
|
Debug: cfg.Debug,
|
||||||
}),
|
}),
|
||||||
alerts: make(map[uint64]*notifier.Alert),
|
alerts: make(map[uint64]*notifier.Alert),
|
||||||
metrics: &alertingRuleMetrics{},
|
metrics: &alertingRuleMetrics{},
|
||||||
|
@ -143,6 +147,32 @@ func (ar *AlertingRule) ID() uint64 {
|
||||||
return ar.RuleID
|
return ar.RuleID
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (ar *AlertingRule) logDebug(at time.Time, a *notifier.Alert, msg string) {
|
||||||
|
if !ar.Debug {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
prefix := fmt.Sprintf("DEBUG rule %q:%q (%d) at %v: ",
|
||||||
|
ar.GroupName, ar.Name, ar.RuleID, at.Format(time.RFC3339))
|
||||||
|
|
||||||
|
if a != nil {
|
||||||
|
labelKeys := make([]string, len(a.Labels))
|
||||||
|
var i int
|
||||||
|
for k := range a.Labels {
|
||||||
|
labelKeys[i] = k
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
sort.Strings(labelKeys)
|
||||||
|
var labels string
|
||||||
|
for _, l := range labelKeys {
|
||||||
|
labels += fmt.Sprintf("%s=%s,", l, a.Labels[l])
|
||||||
|
}
|
||||||
|
labels = strings.TrimRight(labels, ",")
|
||||||
|
prefix += fmt.Sprintf("alert %d (%s) ", a.ID, labels)
|
||||||
|
}
|
||||||
|
logger.Infof("%s", prefix+msg)
|
||||||
|
}
|
||||||
|
|
||||||
type labelSet struct {
|
type labelSet struct {
|
||||||
// origin labels from series
|
// origin labels from series
|
||||||
// used for templating
|
// used for templating
|
||||||
|
@ -255,9 +285,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ar.logDebug(ts, nil, fmt.Sprintf("query returned %d samples (elapsed: %s)",
|
||||||
|
ar.lastExecSamples, ar.lastExecDuration))
|
||||||
|
|
||||||
for h, a := range ar.alerts {
|
for h, a := range ar.alerts {
|
||||||
// cleanup inactive alerts from previous Exec
|
// cleanup inactive alerts from previous Exec
|
||||||
if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention {
|
if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention {
|
||||||
|
ar.logDebug(ts, a, "deleted as inactive")
|
||||||
delete(ar.alerts, h)
|
delete(ar.alerts, h)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -285,9 +319,10 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||||
// back to notifier.StatePending
|
// back to notifier.StatePending
|
||||||
a.State = notifier.StatePending
|
a.State = notifier.StatePending
|
||||||
a.ActiveAt = ts
|
a.ActiveAt = ts
|
||||||
|
ar.logDebug(ts, a, "INACTIVE => PENDING")
|
||||||
}
|
}
|
||||||
if a.Value != m.Values[0] {
|
if a.Value != m.Values[0] {
|
||||||
// update Value field with latest value
|
// update Value field with the latest value
|
||||||
a.Value = m.Values[0]
|
a.Value = m.Values[0]
|
||||||
// and re-exec template since Value can be used
|
// and re-exec template since Value can be used
|
||||||
// in annotations
|
// in annotations
|
||||||
|
@ -307,6 +342,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||||
a.State = notifier.StatePending
|
a.State = notifier.StatePending
|
||||||
a.ActiveAt = ts
|
a.ActiveAt = ts
|
||||||
ar.alerts[h] = a
|
ar.alerts[h] = a
|
||||||
|
ar.logDebug(ts, a, "created in state PENDING")
|
||||||
}
|
}
|
||||||
var numActivePending int
|
var numActivePending int
|
||||||
for h, a := range ar.alerts {
|
for h, a := range ar.alerts {
|
||||||
|
@ -317,11 +353,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||||
// alert was in Pending state - it is not
|
// alert was in Pending state - it is not
|
||||||
// active anymore
|
// active anymore
|
||||||
delete(ar.alerts, h)
|
delete(ar.alerts, h)
|
||||||
|
ar.logDebug(ts, a, "PENDING => DELETED: is absent in current evaluation round")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if a.State == notifier.StateFiring {
|
if a.State == notifier.StateFiring {
|
||||||
a.State = notifier.StateInactive
|
a.State = notifier.StateInactive
|
||||||
a.ResolvedAt = ts
|
a.ResolvedAt = ts
|
||||||
|
ar.logDebug(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -330,6 +368,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
||||||
a.State = notifier.StateFiring
|
a.State = notifier.StateFiring
|
||||||
a.Start = ts
|
a.Start = ts
|
||||||
alertsFired.Inc()
|
alertsFired.Inc()
|
||||||
|
ar.logDebug(ts, a, fmt.Sprintf("PENDING => FIRING: %s since becoming active at %v",
|
||||||
|
ts.Sub(a.ActiveAt), a.ActiveAt))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if limit > 0 && numActivePending > limit {
|
if limit > 0 && numActivePending > limit {
|
||||||
|
|
|
@ -113,6 +113,7 @@ type Rule struct {
|
||||||
For *promutils.Duration `yaml:"for,omitempty"`
|
For *promutils.Duration `yaml:"for,omitempty"`
|
||||||
Labels map[string]string `yaml:"labels,omitempty"`
|
Labels map[string]string `yaml:"labels,omitempty"`
|
||||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||||
|
Debug bool `yaml:"debug,omitempty"`
|
||||||
|
|
||||||
// Catches all undefined fields and must be empty after parsing.
|
// Catches all undefined fields and must be empty after parsing.
|
||||||
XXX map[string]interface{} `yaml:",inline"`
|
XXX map[string]interface{} `yaml:",inline"`
|
||||||
|
|
|
@ -535,6 +535,21 @@ headers:
|
||||||
rules:
|
rules:
|
||||||
- alert: foo
|
- alert: foo
|
||||||
expr: sum by(job) (up == 1)
|
expr: sum by(job) (up == 1)
|
||||||
|
`)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("`debug` change", func(t *testing.T) {
|
||||||
|
f(t, `
|
||||||
|
name: TestGroup
|
||||||
|
rules:
|
||||||
|
- alert: foo
|
||||||
|
expr: sum by(job) (up == 1)
|
||||||
|
`, `
|
||||||
|
name: TestGroup
|
||||||
|
rules:
|
||||||
|
- alert: foo
|
||||||
|
expr: sum by(job) (up == 1)
|
||||||
|
debug: true
|
||||||
`)
|
`)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
groups:
|
groups:
|
||||||
- name: TestGroup
|
- name: TestGroup
|
||||||
interval: 2s
|
interval: 5s
|
||||||
concurrency: 2
|
concurrency: 2
|
||||||
limit: 1000
|
limit: 1000
|
||||||
headers:
|
headers:
|
||||||
|
@ -11,6 +11,7 @@ groups:
|
||||||
- alert: Conns
|
- alert: Conns
|
||||||
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
expr: sum(vm_tcplistener_conns) by(instance) > 1
|
||||||
for: 3m
|
for: 3m
|
||||||
|
debug: true
|
||||||
annotations:
|
annotations:
|
||||||
summary: Too high connection number for {{$labels.instance}}
|
summary: Too high connection number for {{$labels.instance}}
|
||||||
{{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }}
|
{{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }}
|
||||||
|
|
|
@ -23,6 +23,7 @@ type QuerierParams struct {
|
||||||
EvaluationInterval time.Duration
|
EvaluationInterval time.Duration
|
||||||
QueryParams url.Values
|
QueryParams url.Values
|
||||||
Headers map[string]string
|
Headers map[string]string
|
||||||
|
Debug bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric is the basic entity which should be return by datasource
|
// Metric is the basic entity which should be return by datasource
|
||||||
|
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -39,6 +40,10 @@ type VMStorage struct {
|
||||||
evaluationInterval time.Duration
|
evaluationInterval time.Duration
|
||||||
extraParams url.Values
|
extraParams url.Values
|
||||||
extraHeaders []keyValue
|
extraHeaders []keyValue
|
||||||
|
|
||||||
|
// whether to print additional log messages
|
||||||
|
// for each sent request
|
||||||
|
debug bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type keyValue struct {
|
type keyValue struct {
|
||||||
|
@ -64,6 +69,7 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
|
||||||
s.dataSourceType = toDatasourceType(params.DataSourceType)
|
s.dataSourceType = toDatasourceType(params.DataSourceType)
|
||||||
s.evaluationInterval = params.EvaluationInterval
|
s.evaluationInterval = params.EvaluationInterval
|
||||||
s.extraParams = params.QueryParams
|
s.extraParams = params.QueryParams
|
||||||
|
s.debug = params.Debug
|
||||||
if params.Headers != nil {
|
if params.Headers != nil {
|
||||||
for key, value := range params.Headers {
|
for key, value := range params.Headers {
|
||||||
kv := keyValue{key: key, value: value}
|
kv := keyValue{key: key, value: value}
|
||||||
|
@ -151,6 +157,9 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
|
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
|
||||||
|
if s.debug {
|
||||||
|
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery)
|
||||||
|
}
|
||||||
resp, err := s.c.Do(req.WithContext(ctx))
|
resp, err := s.c.Do(req.WithContext(ctx))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)
|
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)
|
||||||
|
|
|
@ -22,6 +22,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
||||||
* FEATURE: set the `start` arg to `end - 5 minutes` if isn't passed explicitly to [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and [/api/v1/label/.../values](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3052).
|
* FEATURE: set the `start` arg to `end - 5 minutes` if isn't passed explicitly to [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and [/api/v1/label/.../values](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3052).
|
||||||
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `vm-native-step-interval` command line flag for `vm-native` mode. New option allows splitting the import process into chunks by time interval. This helps migrating data sets with high churn rate and provides better control over the process. See [feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2733).
|
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `vm-native-step-interval` command line flag for `vm-native` mode. New option allows splitting the import process into chunks by time interval. This helps migrating data sets with high churn rate and provides better control over the process. See [feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2733).
|
||||||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
|
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
|
||||||
|
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules)
|
||||||
|
|
||||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
|
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
|
||||||
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).
|
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).
|
||||||
|
|
|
@ -188,6 +188,13 @@ expr: <string>
|
||||||
# as firing once they return.
|
# as firing once they return.
|
||||||
[ for: <duration> | default = 0s ]
|
[ for: <duration> | default = 0s ]
|
||||||
|
|
||||||
|
# Whether to print debug information into logs.
|
||||||
|
# Information includes alerts state changes and requests sent to the datasource.
|
||||||
|
# Please note, that if rule's query params contain sensitive
|
||||||
|
# information - it will be printed to logs.
|
||||||
|
# Is applicable to alerting rules only.
|
||||||
|
[ debug: <bool> | default = false ]
|
||||||
|
|
||||||
# Labels to add or overwrite for each alert.
|
# Labels to add or overwrite for each alert.
|
||||||
labels:
|
labels:
|
||||||
[ <labelname>: <tmpl_string> ]
|
[ <labelname>: <tmpl_string> ]
|
||||||
|
|
Loading…
Reference in a new issue