From 84742f229a4203f6d9eddde0a30c42da14da912c Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 21 Nov 2022 14:45:45 +0100 Subject: [PATCH] vmalert: add default list of alerting rules (#3373) The default list of alerting rules contains the basic rules for checking vmalert's health state and is recommended to use for monitoring vmalert deployments. Signed-off-by: hagen1778 --- app/vmalert/README.md | 4 +- deployment/docker/alerts-vmalert.yml | 72 ++++++++++++++++++++ deployment/docker/docker-compose-cluster.yml | 1 + deployment/docker/docker-compose.yml | 1 + docs/CHANGELOG.md | 1 + docs/vmalert.md | 4 +- 6 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 deployment/docker/alerts-vmalert.yml diff --git a/app/vmalert/README.md b/app/vmalert/README.md index 347d724f6..6dcce9d67 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -673,10 +673,12 @@ See full description for these flags in `./vmalert -help`. ## Monitoring `vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. +The default list of alerting rules for these metrics can be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker). We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported metrics may be analyzed later. -Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. Graphs on this dashboard contain useful hints - hover the `i` icon at the top left corner of each graph in order to read it. +Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. +Graphs on this dashboard contain useful hints - hover the `i` icon in the top left corner of each graph in order to read it. If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard.
diff --git a/deployment/docker/alerts-vmalert.yml b/deployment/docker/alerts-vmalert.yml new file mode 100644 index 000000000..e5d48e096 --- /dev/null +++ b/deployment/docker/alerts-vmalert.yml @@ -0,0 +1,72 @@ +# File contains default list of alerts for vmalert service. +# The alerts below are just recommendations and may require some updates +# and threshold calibration according to every specific setup. +groups: + # Alerts group for vmalert assumes that Grafana dashboard + # https://grafana.com/grafana/dashboards/14950-victoriametrics-vmalert is installed. + # Pls update the `dashboard` annotation according to your setup. + - name: vmalert + interval: 5s + rules: + - alert: ConfigurationReloadFailure + expr: vmalert_config_last_reload_successful != 1 + labels: + severity: warning + annotations: + summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}" + description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}. + Check vmalert's logs for detailed error message." + + - alert: AlertingRulesError + expr: sum(vmalert_alerting_rules_error) by(job, instance, group) > 0 + for: 5m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" + summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" + description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". + Check vmalert's logs for detailed error message."
+ + - alert: RecordingRulesError + expr: sum(vmalert_recording_rules_error) by(job, instance, group) > 0 + for: 5m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" + summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" + description: "Recording rules execution is failing for group \"{{ $labels.group }}\". + Check vmalert's logs for detailed error message." + + - alert: RecordingRulesNoData + expr: sum(vmalert_recording_rules_last_evaluation_samples) by(job, group, recording) < 1 + for: 30m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" + summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" + description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" + produces 0 samples over the last 30min. It might be caused by a misconfiguration + or incorrect query expression." + + - alert: RemoteWriteErrors + expr: sum(increase(vmalert_remotewrite_errors_total[5m])) by(job, instance) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL" + description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting + or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message." + + - alert: AlertmanagerErrors + expr: sum(increase(vmalert_alerts_send_errors_total[5m])) by(job, instance, addr) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" + description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
+ Check vmalert's logs for detailed error message." diff --git a/deployment/docker/docker-compose-cluster.yml b/deployment/docker/docker-compose-cluster.yml index 2b6621e6f..1069e6674 100644 --- a/deployment/docker/docker-compose-cluster.yml +++ b/deployment/docker/docker-compose-cluster.yml @@ -91,6 +91,7 @@ services: - ./alerts-cluster.yml:/etc/alerts/alerts.yml - ./alerts-health.yml:/etc/alerts/alerts-health.yml - ./alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml + - ./alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml command: - '--datasource.url=http://vmselect:8481/select/0/prometheus' - '--remoteRead.url=http://vmselect:8481/select/0/prometheus' diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml index 3f1733403..3d6a8a47c 100644 --- a/deployment/docker/docker-compose.yml +++ b/deployment/docker/docker-compose.yml @@ -66,6 +66,7 @@ services: - ./alerts.yml:/etc/alerts/alerts.yml - ./alerts-health.yml:/etc/alerts/alerts-health.yml - ./alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml + - ./alerts-vmalert.yml:/etc/alerts/alerts-vmalert.yml command: - "--datasource.url=http://victoriametrics:8428/" - "--remoteRead.url=http://victoriametrics:8428/" diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index bc46cdba9..d8fd86897 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -21,6 +21,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to upload/paste JSON to investigate the trace. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3308) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3310). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reduce JS bundle size from 200Kb to 100Kb. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3298).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to hide results of a particular query by clicking the `eye` icon. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3359). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add default alert list for vmalert's metrics. See [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml). * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): properly display the requested graph on the requested time range when navigating from Prometheus URL in Grafana. * BUGFIX: reduce CPU usage spikes and memory usage spikes under high data ingestion rate introduced in [v1.83.0](https://docs.victoriametrics.com/CHANGELOG.html#v1830). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3343). diff --git a/docs/vmalert.md b/docs/vmalert.md index 5dcbe9ac8..60402d01d 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -677,10 +677,12 @@ See full description for these flags in `./vmalert -help`. ## Monitoring `vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. +The default list of alerting rules for these metrics can be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker). We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported metrics may be analyzed later. -Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. Graphs on this dashboard contain useful hints - hover the `i` icon at the top left corner of each graph in order to read it. +Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. +Graphs on this dashboard contain useful hints - hover the `i` icon in the top left corner of each graph in order to read it.
If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard.