From a887c1bc07129a1b4f7c8016b955bb2297acf5ef Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 13 Sep 2022 15:25:43 +0200 Subject: [PATCH] vmalert: add `debug` mode for alerting rules (#3055) * vmalert: add `debug` mode for alerting rules Debug information includes alerts state changes and requests sent to the datasource. Debug can be enabled only on rule's level. It might be useful for debugging unexpected behaviour of alerting rule. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3025 Signed-off-by: hagen1778 * vmalert: review fixes Signed-off-by: hagen1778 * Update app/vmalert/alerting.go Co-authored-by: Aliaksandr Valialkin * vmalert: go fmt Signed-off-by: hagen1778 Signed-off-by: hagen1778 Co-authored-by: Aliaksandr Valialkin --- app/vmalert/README.md | 7 ++++ app/vmalert/alerting.go | 42 ++++++++++++++++++- app/vmalert/config/config.go | 1 + app/vmalert/config/config_test.go | 15 +++++++ .../config/testdata/rules/rules2-good.rules | 3 +- app/vmalert/datasource/datasource.go | 1 + app/vmalert/datasource/vm.go | 9 ++++ docs/CHANGELOG.md | 1 + docs/vmalert.md | 7 ++++ 9 files changed, 84 insertions(+), 2 deletions(-) diff --git a/app/vmalert/README.md b/app/vmalert/README.md index c9cbb39bbf..c96f56659a 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -184,6 +184,13 @@ expr: # as firing once they return. [ for: | default = 0s ] +# Whether to print debug information into logs. +# Information includes alerts state changes and requests sent to the datasource. +# Please note, that if rule's query params contain sensitive +# information - it will be printed to logs. +# Is applicable to alerting rules only. +[ debug: | default = false ] + # Labels to add or overwrite for each alert. labels: [ : ] diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index df433538f2..25d0002778 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -6,6 +6,7 @@ import ( "hash/fnv" "sort" "strconv" + "strings" "sync" "time" @@ -30,6 +31,7 @@ type AlertingRule struct { GroupID uint64 GroupName string EvalInterval time.Duration + Debug bool q datasource.Querier @@ -71,11 +73,13 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule GroupID: group.ID(), GroupName: group.Name, EvalInterval: group.Interval, + Debug: cfg.Debug, q: qb.BuildWithParams(datasource.QuerierParams{ DataSourceType: group.Type.String(), EvaluationInterval: group.Interval, QueryParams: group.Params, Headers: group.Headers, + Debug: cfg.Debug, }), alerts: make(map[uint64]*notifier.Alert), metrics: &alertingRuleMetrics{}, @@ -143,6 +147,32 @@ func (ar *AlertingRule) ID() uint64 { return ar.RuleID } +func (ar *AlertingRule) logDebug(at time.Time, a *notifier.Alert, msg string) { + if !ar.Debug { + return + } + + prefix := fmt.Sprintf("DEBUG rule %q:%q (%d) at %v: ", + ar.GroupName, ar.Name, ar.RuleID, at.Format(time.RFC3339)) + + if a != nil { + labelKeys := make([]string, len(a.Labels)) + var i int + for k := range a.Labels { + labelKeys[i] = k + i++ + } + sort.Strings(labelKeys) + var labels string + for _, l := range labelKeys { + labels += fmt.Sprintf("%s=%s,", l, a.Labels[l]) + } + labels = strings.TrimRight(labels, ",") + prefix += fmt.Sprintf("alert %d (%s) ", a.ID, labels) + } + logger.Infof("%s", prefix+msg) +} + type labelSet struct { // origin labels from series // used for templating @@ -255,9 +285,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err) } + ar.logDebug(ts, nil, fmt.Sprintf("query returned %d samples (elapsed: %s)", + ar.lastExecSamples, ar.lastExecDuration)) + for h, a := range ar.alerts { // cleanup inactive alerts from previous Exec if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention { + ar.logDebug(ts, a, "deleted as inactive") delete(ar.alerts, h) } } @@ -285,9 +319,10 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr // back to notifier.StatePending a.State = notifier.StatePending a.ActiveAt = ts + ar.logDebug(ts, a, "INACTIVE => PENDING") } if a.Value != m.Values[0] { - // update Value field with latest value + // update Value field with the latest value a.Value = m.Values[0] // and re-exec template since Value can be used // in annotations @@ -307,6 +342,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr a.State = notifier.StatePending a.ActiveAt = ts ar.alerts[h] = a + ar.logDebug(ts, a, "created in state PENDING") } var numActivePending int for h, a := range ar.alerts { @@ -317,11 +353,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr // alert was in Pending state - it is not // active anymore delete(ar.alerts, h) + ar.logDebug(ts, a, "PENDING => DELETED: is absent in current evaluation round") continue } if a.State == notifier.StateFiring { a.State = notifier.StateInactive a.ResolvedAt = ts + ar.logDebug(ts, a, "FIRING => INACTIVE: is absent in current evaluation round") } continue } @@ -330,6 +368,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr a.State = notifier.StateFiring a.Start = ts alertsFired.Inc() + ar.logDebug(ts, a, fmt.Sprintf("PENDING => FIRING: %s since becoming active at %v", + ts.Sub(a.ActiveAt), a.ActiveAt)) } } if limit > 0 && numActivePending > limit { diff --git a/app/vmalert/config/config.go b/app/vmalert/config/config.go index b0847448aa..41b3afcfb2 100644 --- a/app/vmalert/config/config.go +++ b/app/vmalert/config/config.go @@ -113,6 +113,7 @@ type Rule struct { For *promutils.Duration `yaml:"for,omitempty"` Labels map[string]string `yaml:"labels,omitempty"` Annotations map[string]string `yaml:"annotations,omitempty"` + Debug bool `yaml:"debug,omitempty"` // Catches all undefined fields and must be empty after parsing. XXX map[string]interface{} `yaml:",inline"` diff --git a/app/vmalert/config/config_test.go b/app/vmalert/config/config_test.go index 949f5ed1b3..28b7f9db4b 100644 --- a/app/vmalert/config/config_test.go +++ b/app/vmalert/config/config_test.go @@ -535,6 +535,21 @@ headers: rules: - alert: foo expr: sum by(job) (up == 1) +`) + }) + + t.Run("`debug` change", func(t *testing.T) { + f(t, ` +name: TestGroup +rules: + - alert: foo + expr: sum by(job) (up == 1) +`, ` +name: TestGroup +rules: + - alert: foo + expr: sum by(job) (up == 1) + debug: true `) }) } diff --git a/app/vmalert/config/testdata/rules/rules2-good.rules b/app/vmalert/config/testdata/rules/rules2-good.rules index 94b15bea9f..615a085afa 100644 --- a/app/vmalert/config/testdata/rules/rules2-good.rules +++ b/app/vmalert/config/testdata/rules/rules2-good.rules @@ -1,6 +1,6 @@ groups: - name: TestGroup - interval: 2s + interval: 5s concurrency: 2 limit: 1000 headers: @@ -11,6 +11,7 @@ groups: - alert: Conns expr: sum(vm_tcplistener_conns) by(instance) > 1 for: 3m + debug: true annotations: summary: Too high connection number for {{$labels.instance}} {{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }} diff --git a/app/vmalert/datasource/datasource.go b/app/vmalert/datasource/datasource.go index 9000079410..8f86621253 100644 --- a/app/vmalert/datasource/datasource.go +++ b/app/vmalert/datasource/datasource.go @@ -23,6 +23,7 @@ type QuerierParams struct { EvaluationInterval time.Duration QueryParams url.Values Headers map[string]string + Debug bool } // Metric is the basic entity which should be return by datasource diff --git a/app/vmalert/datasource/vm.go b/app/vmalert/datasource/vm.go index 36547481fc..331a5827a8 100644 --- a/app/vmalert/datasource/vm.go +++ b/app/vmalert/datasource/vm.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" ) @@ -39,6 +40,10 @@ type VMStorage struct { evaluationInterval time.Duration extraParams url.Values extraHeaders []keyValue + + // whether to print additional log messages + // for each sent request + debug bool } type keyValue struct { @@ -64,6 +69,7 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage { s.dataSourceType = toDatasourceType(params.DataSourceType) s.evaluationInterval = params.EvaluationInterval s.extraParams = params.QueryParams + s.debug = params.Debug if params.Headers != nil { for key, value := range params.Headers { kv := keyValue{key: key, value: value} @@ -151,6 +157,9 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim } func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) { + if s.debug { + logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery) + } resp, err := s.c.Do(req.WithContext(ctx)) if err != nil { return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2a581305f8..1c16583c1f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -22,6 +22,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: set the `start` arg to `end - 5 minutes` if isn't passed explicitly to [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and [/api/v1/label/.../values](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3052). * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `vm-native-step-interval` command line flag for `vm-native` mode. New option allows splitting the import process into chunks by time interval. This helps migrating data sets with high churn rate and provides better control over the process. See [feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2733). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules) * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045). * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810). diff --git a/docs/vmalert.md b/docs/vmalert.md index 48df5d9c17..e13d039154 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -188,6 +188,13 @@ expr: # as firing once they return. [ for: | default = 0s ] +# Whether to print debug information into logs. +# Information includes alerts state changes and requests sent to the datasource. +# Please note, that if rule's query params contain sensitive +# information - it will be printed to logs. +# Is applicable to alerting rules only. +[ debug: | default = false ] + # Labels to add or overwrite for each alert. labels: [ : ]