diff --git a/app/vmalert/README.md b/app/vmalert/README.md index c9cbb39bbf..c96f56659a 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -184,6 +184,13 @@ expr: # as firing once they return. [ for: | default = 0s ] +# Whether to print debug information into logs. +# Information includes alerts state changes and requests sent to the datasource. +# Please note, that if rule's query params contain sensitive +# information - it will be printed to logs. +# Is applicable to alerting rules only. +[ debug: | default = false ] + # Labels to add or overwrite for each alert. labels: [ : ] diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index df433538f2..25d0002778 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -6,6 +6,7 @@ import ( "hash/fnv" "sort" "strconv" + "strings" "sync" "time" @@ -30,6 +31,7 @@ type AlertingRule struct { GroupID uint64 GroupName string EvalInterval time.Duration + Debug bool q datasource.Querier @@ -71,11 +73,13 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule GroupID: group.ID(), GroupName: group.Name, EvalInterval: group.Interval, + Debug: cfg.Debug, q: qb.BuildWithParams(datasource.QuerierParams{ DataSourceType: group.Type.String(), EvaluationInterval: group.Interval, QueryParams: group.Params, Headers: group.Headers, + Debug: cfg.Debug, }), alerts: make(map[uint64]*notifier.Alert), metrics: &alertingRuleMetrics{}, @@ -143,6 +147,32 @@ func (ar *AlertingRule) ID() uint64 { return ar.RuleID } +func (ar *AlertingRule) logDebug(at time.Time, a *notifier.Alert, msg string) { + if !ar.Debug { + return + } + + prefix := fmt.Sprintf("DEBUG rule %q:%q (%d) at %v: ", + ar.GroupName, ar.Name, ar.RuleID, at.Format(time.RFC3339)) + + if a != nil { + labelKeys := make([]string, len(a.Labels)) + var i int + for k := range a.Labels { + labelKeys[i] = k + i++ + } + sort.Strings(labelKeys) + var labels string + for _, l := range labelKeys { + labels += fmt.Sprintf("%s=%s,", l, a.Labels[l]) + } + labels = strings.TrimRight(labels, ",") + prefix += fmt.Sprintf("alert %d (%s) ", a.ID, labels) + } + logger.Infof("%s", prefix+msg) +} + type labelSet struct { // origin labels from series // used for templating @@ -255,9 +285,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err) } + ar.logDebug(ts, nil, fmt.Sprintf("query returned %d samples (elapsed: %s)", + ar.lastExecSamples, ar.lastExecDuration)) + for h, a := range ar.alerts { // cleanup inactive alerts from previous Exec if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention { + ar.logDebug(ts, a, "deleted as inactive") delete(ar.alerts, h) } } @@ -285,9 +319,10 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr // back to notifier.StatePending a.State = notifier.StatePending a.ActiveAt = ts + ar.logDebug(ts, a, "INACTIVE => PENDING") } if a.Value != m.Values[0] { - // update Value field with latest value + // update Value field with the latest value a.Value = m.Values[0] // and re-exec template since Value can be used // in annotations @@ -307,6 +342,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr a.State = notifier.StatePending a.ActiveAt = ts ar.alerts[h] = a + ar.logDebug(ts, a, "created in state PENDING") } var numActivePending int for h, a := range ar.alerts { @@ -317,11 +353,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr // alert was in Pending state - it is not // active anymore delete(ar.alerts, h) + ar.logDebug(ts, a, "PENDING => DELETED: is absent in current evaluation round") continue } if a.State == notifier.StateFiring { a.State = notifier.StateInactive a.ResolvedAt = ts + ar.logDebug(ts, a, "FIRING => INACTIVE: is absent in current evaluation round") } continue } @@ -330,6 +368,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr a.State = notifier.StateFiring a.Start = ts alertsFired.Inc() + ar.logDebug(ts, a, fmt.Sprintf("PENDING => FIRING: %s since becoming active at %v", + ts.Sub(a.ActiveAt), a.ActiveAt)) } } if limit > 0 && numActivePending > limit { diff --git a/app/vmalert/config/config.go b/app/vmalert/config/config.go index b0847448aa..41b3afcfb2 100644 --- a/app/vmalert/config/config.go +++ b/app/vmalert/config/config.go @@ -113,6 +113,7 @@ type Rule struct { For *promutils.Duration `yaml:"for,omitempty"` Labels map[string]string `yaml:"labels,omitempty"` Annotations map[string]string `yaml:"annotations,omitempty"` + Debug bool `yaml:"debug,omitempty"` // Catches all undefined fields and must be empty after parsing. XXX map[string]interface{} `yaml:",inline"` diff --git a/app/vmalert/config/config_test.go b/app/vmalert/config/config_test.go index 949f5ed1b3..28b7f9db4b 100644 --- a/app/vmalert/config/config_test.go +++ b/app/vmalert/config/config_test.go @@ -535,6 +535,21 @@ headers: rules: - alert: foo expr: sum by(job) (up == 1) +`) + }) + + t.Run("`debug` change", func(t *testing.T) { + f(t, ` +name: TestGroup +rules: + - alert: foo + expr: sum by(job) (up == 1) +`, ` +name: TestGroup +rules: + - alert: foo + expr: sum by(job) (up == 1) + debug: true `) }) } diff --git a/app/vmalert/config/testdata/rules/rules2-good.rules b/app/vmalert/config/testdata/rules/rules2-good.rules index 94b15bea9f..615a085afa 100644 --- a/app/vmalert/config/testdata/rules/rules2-good.rules +++ b/app/vmalert/config/testdata/rules/rules2-good.rules @@ -1,6 +1,6 @@ groups: - name: TestGroup - interval: 2s + interval: 5s concurrency: 2 limit: 1000 headers: @@ -11,6 +11,7 @@ groups: - alert: Conns expr: sum(vm_tcplistener_conns) by(instance) > 1 for: 3m + debug: true annotations: summary: Too high connection number for {{$labels.instance}} {{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }} diff --git a/app/vmalert/datasource/datasource.go b/app/vmalert/datasource/datasource.go index 9000079410..8f86621253 100644 --- a/app/vmalert/datasource/datasource.go +++ b/app/vmalert/datasource/datasource.go @@ -23,6 +23,7 @@ type QuerierParams struct { EvaluationInterval time.Duration QueryParams url.Values Headers map[string]string + Debug bool } // Metric is the basic entity which should be return by datasource diff --git a/app/vmalert/datasource/vm.go b/app/vmalert/datasource/vm.go index 36547481fc..331a5827a8 100644 --- a/app/vmalert/datasource/vm.go +++ b/app/vmalert/datasource/vm.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" ) @@ -39,6 +40,10 @@ type VMStorage struct { evaluationInterval time.Duration extraParams url.Values extraHeaders []keyValue + + // whether to print additional log messages + // for each sent request + debug bool } type keyValue struct { @@ -64,6 +69,7 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage { s.dataSourceType = toDatasourceType(params.DataSourceType) s.evaluationInterval = params.EvaluationInterval s.extraParams = params.QueryParams + s.debug = params.Debug if params.Headers != nil { for key, value := range params.Headers { kv := keyValue{key: key, value: value} @@ -151,6 +157,9 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim } func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) { + if s.debug { + logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery) + } resp, err := s.c.Do(req.WithContext(ctx)) if err != nil { return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2a581305f8..1c16583c1f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -22,6 +22,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: set the `start` arg to `end - 5 minutes` if isn't passed explicitly to [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and [/api/v1/label/.../values](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3052). * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `vm-native-step-interval` command line flag for `vm-native` mode. New option allows splitting the import process into chunks by time interval. This helps migrating data sets with high churn rate and provides better control over the process. See [feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2733). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules) * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045). * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810). diff --git a/docs/vmalert.md b/docs/vmalert.md index 48df5d9c17..e13d039154 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -188,6 +188,13 @@ expr: # as firing once they return. [ for: | default = 0s ] +# Whether to print debug information into logs. +# Information includes alerts state changes and requests sent to the datasource. +# Please note, that if rule's query params contain sensitive +# information - it will be printed to logs. +# Is applicable to alerting rules only. +[ debug: | default = false ] + # Labels to add or overwrite for each alert. labels: [ : ]