VictoriaMetrics/deployment/docker/alerts.yml
Roman Khavronenko c6fc3fa94d
alerts: make alerting rule RPCErrors compatible with PromQL (#1204)
Original query can't be executed via PromQL which results in error
if expression is evaluated by Prometheus. The new expression is
compatible with both engines.
2021-04-13 08:10:23 +03:00

271 lines
13 KiB
YAML

# File contains default list of alerts for VM cluster and vmagent services.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
- name: serviceHealth
# note the `job` filter and update accordingly to your setup
rules:
- alert: TooManyRestarts
expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
labels:
severity: critical
annotations:
summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
It might be crashlooping."
- alert: ServiceDown
expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
for: 2m
labels:
severity: "critical"
annotations:
summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
# Alerts group for VM cluster assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/11176 is installed.
# Please, update the `dashboard` annotation according to your setup.
- name: vmcluster
interval: 30s
concurrency: 2
rules:
- alert: DiskRunsOutOfSpaceIn3Days
expr: |
vm_free_disk_space_bytes / ignoring(path) (
(
sum(rate(vm_rows_added_to_storage_total[1d])) -
sum(rate(vm_deduplicated_samples_total[1d])) without(type)
)
*
(
sum(vm_data_size_bytes{type!="indexdb"}) /
sum(vm_rows{type!="indexdb"})
)
) < 3 * 24 * 3600
for: 30m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} will run out of disk space in 3 days"
description: "Taking into account current ingestion rate, free disk space will be enough only
for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
- alert: DiskRunsOutOfSpace
expr: |
sum(vm_data_size_bytes) by(instance) /
(
sum(vm_free_disk_space_bytes) by(instance) +
sum(vm_data_size_bytes) by(instance)
) > 0.8
for: 30m
labels:
severity: critical
annotations:
dashboard: http://localhost:3000/d/oS7Bi_0Wz?viewPanel=110&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} will run out of disk space soon"
description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
Having less than 20% of free disk space could cripple merges processes and overall performance.
Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
- alert: RequestErrorsToAPI
expr: increase(vm_http_request_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=52&var-instance={{ $labels.instance }}"
summary: "Too many errors served for {{ $labels.job }} path {{ $labels.path }} (instance {{ $labels.instance }})"
description: "Requests to path {{ $labels.path }} are receiving errors.
Please verify if clients are sending correct requests."
- alert: RPCErrors
expr: |
(
sum(increase(vm_rpc_connection_errors_total[5m])) by(job, instance)
+
sum(increase(vm_rpc_dial_errors_total[5m])) by(job, instance)
+
sum(increase(vm_rpc_handshake_errors_total[5m])) by(job, instance)
+
sum(increase(vm_rpc_reroute_errors_total[5m])) by(job, instance)
) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=44&var-instance={{ $labels.instance }}"
summary: "Too many RPC errors for {{ $labels.job }} (instance {{ $labels.instance }})"
description: "RPC errors are interconnection errors between cluster components.\n
Possible reasons for errors are misconfiguration, overload, network blips or unreachable components."
- alert: ConcurrentFlushesHitTheLimit
expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}"
summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
When vmstorage constantly hits the limit it means that storage is overloaded and requires more CPU."
- alert: TooManyLogs
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=104&var-instance={{ $labels.instance }}"
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
Worth to check logs for specific error messages."
- alert: RowsRejectedOnIngestion
expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=135&var-instance={{ $labels.instance }}"
summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"
description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the
following reason: \"{{ $labels.reason }}\""
- alert: TooHighChurnRate
expr: |
(
sum(rate(vm_new_timeseries_created_total[5m]))
/
sum(rate(vm_rows_inserted_total[5m]))
) > 0.1
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
summary: "Churn rate is more than 10% for the last 15m"
description: "VM constantly creates new time series.\n
This effect is known as Churn Rate.\n
High Churn Rate tightly connected with database performance and may
result in unexpected OOM's or slow queries."
- alert: TooHighChurnRate24h
expr: |
sum(increase(vm_new_timeseries_created_total[24h]))
>
(sum(vm_cache_entries{type="storage/hour_metric_ids"})* 3)
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
summary: "Too high number of new series created over last 24h"
description: "The number of created new time series over last 24h is 3x times higher than
current number of active series.\n
This effect is known as Churn Rate.\n
High Churn Rate tightly connected with database performance and may
result in unexpected OOM's or slow queries."
- alert: TooHighSlowInsertsRate
expr: |
(
sum(rate(vm_slow_row_inserts_total[5m]))
/
sum(rate(vm_rows_inserted_total[5m]))
) > 0.05
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=108"
summary: "Percentage of slow inserts is more than 5% for the last 15m"
description: "High rate of slow inserts may be a sign of resource exhaustion
for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."
- alert: ProcessNearFDLimits
expr: (process_max_fds - process_open_fds) < 100
for: 5m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
Consider to increase the limit as fast as possible."
- alert: LabelsLimitExceededOnIngestion
expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=116&var-instance={{ $labels.instance }}"
summary: "Metrics ingested to vminsert on ({{ $labels.instance }}) are exceeding labels limit"
description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n
This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
correctly or that clients which send these metrics aren't misbehaving."
- alert: VminsertIsDroppingRows
expr: sum(rate(vm_rpc_rows_lost_total[5m])) by(instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=84&var-instance={{ $labels.instance }}"
summary: "VMinsert \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops rows due to RPC errors."
description: "VMinsert starts to drop rows if there are no healthy VMstorage nodes where it can route
insert requests to. Check the health state of VMstorage nodes and RPC metrics."
# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683 is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmagent
interval: 30s
concurrency: 2
rules:
- alert: PersistentQueueIsDroppingData
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
for: 10m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
on instance {{ $labels.instance }} for the last 10m."
- alert: TooManyScrapeErrors
expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
- alert: TooManyWriteErrors
expr: |
(sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
+
sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
- alert: TooManyRemoteWriteErrors
expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
Ensure that destination is up and reachable."