From 6ff1de89a9f7729498ed21e003dcc92f25492f80 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Fri, 22 Nov 2024 16:11:31 +0800 Subject: [PATCH] vmalert: fix alert states restoration (#7624) Previously, when the alert got resolved shortly before the vmalert process shuts down, this could result in false alerts. This change switches vmalert to use MetricsQL function during alerts state restore, which makes it incompatible for state restoration with PromQL. --------- Co-authored-by: Roman Khavronenko --- app/vmalert/remoteread/init.go | 2 +- app/vmalert/rule/alerting.go | 3 ++- app/vmalert/rule/alerting_test.go | 14 +++++++------- docs/VictoriaLogs/vmalert.md | 2 +- docs/changelog/CHANGELOG.md | 1 + docs/vmalert.md | 4 ++-- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/app/vmalert/remoteread/init.go b/app/vmalert/remoteread/init.go index b134dad42..b583bcfdb 100644 --- a/app/vmalert/remoteread/init.go +++ b/app/vmalert/remoteread/init.go @@ -14,7 +14,7 @@ import ( ) var ( - addr = flag.String("remoteRead.url", "", "Optional URL to datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect."+ + addr = flag.String("remoteRead.url", "", "Optional URL to datasource compatible with MetricsQL. It can be single node VictoriaMetrics or vmselect."+ "Remote read is used to restore alerts state."+ "This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state. "+ "Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. "+ diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go index 1c0e4b284..11daaa329 100644 --- a/app/vmalert/rule/alerting.go +++ b/app/vmalert/rule/alerting.go @@ -711,7 +711,8 @@ func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts ti for k, v := range ar.Labels { labelsFilter += fmt.Sprintf(",%s=%q", k, v) } - expr := fmt.Sprintf("last_over_time(%s{%s%s}[%ds])", + // use `default_rollup()` instead of `last_over_time()` here to accounts for possible staleness markers + expr := fmt.Sprintf("default_rollup(%s{%s%s}[%ds])", alertForStateMetricName, nameStr, labelsFilter, int(lookback.Seconds())) res, _, err := q.Query(ctx, expr, ts) diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go index d54aefd34..061cd2faf 100644 --- a/app/vmalert/rule/alerting_test.go +++ b/app/vmalert/rule/alerting_test.go @@ -791,7 +791,7 @@ func TestGroup_Restore(t *testing.T) { // one active alert with state restore ts := time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, stateMetric("foo", ts)) fn( []config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}}, @@ -804,7 +804,7 @@ func TestGroup_Restore(t *testing.T) { // two rules, two active alerts, one with state restored ts = time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, stateMetric("bar", ts)) fn( []config.Rule{ @@ -824,9 +824,9 @@ func TestGroup_Restore(t *testing.T) { // two rules, two active alerts, two with state restored ts = time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, stateMetric("foo", ts)) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, stateMetric("bar", ts)) fn( []config.Rule{ @@ -846,7 +846,7 @@ func TestGroup_Restore(t *testing.T) { // one active alert but wrong state restore ts = time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`, stateMetric("wrong alert", ts)) fn( []config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}}, @@ -859,7 +859,7 @@ func TestGroup_Restore(t *testing.T) { // one active alert with labels ts = time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, stateMetric("foo", ts, "env", "dev")) fn( []config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}}, @@ -872,7 +872,7 @@ func TestGroup_Restore(t *testing.T) { // one active alert with restore labels missmatch ts = time.Now().Truncate(time.Hour) - fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, + fqr.Set(`default_rollup(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, stateMetric("foo", ts, "env", "dev", "team", "foo")) fn( []config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}}, diff --git a/docs/VictoriaLogs/vmalert.md b/docs/VictoriaLogs/vmalert.md index 0654cec22..a1931d70d 100644 --- a/docs/VictoriaLogs/vmalert.md +++ b/docs/VictoriaLogs/vmalert.md @@ -58,7 +58,7 @@ The following are key flags related to integration with VictoriaLogs: -remoteWrite.url string Optional URL to VictoriaMetrics or vminsert where to persist alerts state and recording rules results in form of timeseries. Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. For example, if -remoteWrite.url=http://127.0.0.1:8428 is specified, then the alerts state will be written to http://127.0.0.1:8428/api/v1/write . See also -remoteWrite.disablePathAppend, '-remoteWrite.showURL'. -remoteRead.url string - Optional URL to datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect.Remote read is used to restore alerts state.This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. See also '-remoteRead.disablePathAppend', '-remoteRead.showURL'. + Optional URL to datasource compatible with MetricsQL. It can be single node VictoriaMetrics or vmselect.Remote read is used to restore alerts state.This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. See also '-remoteRead.disablePathAppend', '-remoteRead.showURL'. -rule array Path to the files or http url with alerting and/or recording rules in YAML format. Supports hierarchical patterns and regexpes. diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index 4b53cc626..afaabdec6 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -27,6 +27,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent): Properly return `200 OK` HTTP status code when importing data via [Pushgateway protocol](https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format) using [multitenant URL format](https://docs.victoriametrics.com/cluster-victoriametrics/#url-format). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3636) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7571). * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly return result for binary operation `^` aka pow at query requests for `NaN` values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7359) for details. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix rendering of isolated data points on the graph that are not connected to other points. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): improve the correctness of alert [state restoration](https://docs.victoriametrics.com/vmalert/#alerts-state-on-restarts). Previously, it could result in false-positive alerts if alert was resolved shortly before vmalert restart. ## [v1.106.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.1) diff --git a/docs/vmalert.md b/docs/vmalert.md index 1ef85b93b..49e916353 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -85,7 +85,7 @@ Then configure `vmalert` accordingly: -notifier.url=http://localhost:9093 \ # AlertManager URL (required if alerting rules are used) -notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL -remoteWrite.url=http://localhost:8428 \ # Remote write compatible storage to persist rules and alerts state info (required if recording rules are used) - -remoteRead.url=http://localhost:8428 \ # Prometheus HTTP API compatible datasource to restore alerts state from + -remoteRead.url=http://localhost:8428 \ # MetricsQL compatible datasource to restore alerts state from -external.label=cluster=east-1 \ # External label to be applied for each rule -external.label=replica=a # Multiple external labels may be set ``` @@ -1354,7 +1354,7 @@ The shortlist of configuration flags is the following: -remoteRead.tlsServerName string Optional TLS server name to use for connections to -remoteRead.url. By default, the server name from -remoteRead.url is used -remoteRead.url string - Optional URL to datasource compatible with Prometheus HTTP API. It can be single node VictoriaMetrics or vmselect.Remote read is used to restore alerts state.This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. See also '-remoteRead.disablePathAppend', '-remoteRead.showURL'. + Optional URL to datasource compatible with MetricsQL. It can be single node VictoriaMetrics or vmselect.Remote read is used to restore alerts state.This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. Supports address in the form of IP address with a port (e.g., http://127.0.0.1:8428) or DNS SRV record. See also '-remoteRead.disablePathAppend', '-remoteRead.showURL'. -remoteWrite.basicAuth.password string Optional basic auth password for -remoteWrite.url -remoteWrite.basicAuth.passwordFile string