diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml index 72dcd75d0..2994b6f9a 100644 --- a/deployment/docker/alerts-cluster.yml +++ b/deployment/docker/alerts-cluster.yml @@ -80,18 +80,6 @@ groups: description: "RPC errors are interconnection errors between cluster components.\n Possible reasons for errors are misconfiguration, overload, network blips or unreachable components." - - alert: ConcurrentFlushesHitTheLimit - expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity - for: 15m - labels: - severity: warning - show_at: dashboard - annotations: - dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}" - summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit" - description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n - When vmstorage constantly hits the limit it means that storage is overloaded and requires more CPU." - - alert: RowsRejectedOnIngestion expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 for: 15m diff --git a/deployment/docker/alerts-health.yml b/deployment/docker/alerts-health.yml index fde2de5ae..11ec22c1f 100644 --- a/deployment/docker/alerts-health.yml +++ b/deployment/docker/alerts-health.yml @@ -1,4 +1,5 @@ -# File contains default list of alerts for VM components. +# File contains default list of alerts for various VM components. +# The following alerts are recommended for use for any VM installation. # The alerts below are just recommendations and may require some updates # and threshold calibration according to every specific setup. 
groups: @@ -73,3 +74,16 @@ groups: description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).\n Make sure you're running VictoriaMetrics of v1.85.3 or higher.\n Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502" + + - alert: ConcurrentInsertsHitTheLimit + expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity + for: 15m + labels: + severity: warning + annotations: + summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit" + description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n + Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU. + In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients + making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then + it might be worth adjusting `-maxConcurrentInserts` cmd-line flag." diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 49cb4317e..62df9af9d 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -60,18 +60,6 @@ groups: description: "Requests to path {{ $labels.path }} are receiving errors. Please verify if clients are sending correct requests." 
- - alert: ConcurrentFlushesHitTheLimit - expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity - for: 15m - labels: - severity: warning - show_at: dashboard - annotations: - dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}" - summary: "VictoriaMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit" - description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n - When VictoriaMetrics constantly hits the limit it means that storage is overloaded and requires more CPU." - - alert: RowsRejectedOnIngestion expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 for: 15m diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 4ec806ba7..86152d893 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -33,6 +33,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): do not add `/api/v1/read` suffix to remote read storage address defined by `--remote-read-src-addr` if a `--remote-read-disable-path-append` command-line flag is set. It allows an overriding path for remote-read API via `--remote-read-src-addr`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4655). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add warning in query field of vmui for partial data responses. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4721). * FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add `Concurrent inserts` panel to vmagent's dasbhoard. The new panel supposed to show whether the number of concurrent inserts processed by vmagent isn't reaching the limit. 
+* FEATURE: [Alerting rules for VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#alerts): `ConcurrentFlushesHitTheLimit` alerting rule was renamed to `ConcurrentInsertsHitTheLimit` and moved from [single-server](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts.yml) and [cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-cluster.yml) alerts to the [list of "health" alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-health.yml) as it could be related to many VictoriaMetrics components. * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use local scrape timestamps for the scraped metrics unless `honor_timestamps: true` option is explicitly set at [scrape_config](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This fixes gaps for metrics collected from [cadvisor](https://github.com/google/cadvisor) or similar exporters, which export metrics with invalid timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1654614799) for details. The issue has been introduced in [v1.68.0](#v1680). * BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager.html): fix panic when creating a backup to a local filesystem on Windows. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4704).