From 51faea5e4bb3d7ad645fac603d678f36a6c24113 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Mon, 5 Apr 2021 20:29:04 +0100
Subject: [PATCH] deployment: add vmalert+alertmanager services and list of
 default alerts for cluster version (#1187)

---
 deployment/docker/alertmanager.yml   |   5 +
 deployment/docker/alerts.yml         | 253 +++++++++++++++++++++++++++
 deployment/docker/docker-compose.yml |  32 ++++
 3 files changed, 290 insertions(+)
 create mode 100644 deployment/docker/alertmanager.yml
 create mode 100644 deployment/docker/alerts.yml

diff --git a/deployment/docker/alertmanager.yml b/deployment/docker/alertmanager.yml
new file mode 100644
index 000000000..4b68f7863
--- /dev/null
+++ b/deployment/docker/alertmanager.yml
@@ -0,0 +1,5 @@
+route:
+  receiver: blackhole
+
+receivers:
+  - name: blackhole
\ No newline at end of file
diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml
new file mode 100644
index 000000000..fd61a943e
--- /dev/null
+++ b/deployment/docker/alerts.yml
@@ -0,0 +1,253 @@
+# This file contains the default list of alerts for VM cluster and vmagent services.
+# The alerts below are just recommendations and may require some updates
+# and threshold calibration according to your specific setup.
+groups:
+  - name: serviceHealth
+    rules:
+      # note the `job` filter and update it according to your setup
+      - alert: TooManyRestarts
+        expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
+        labels:
+          severity: critical
+        annotations:
+          summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
+          description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
+            It might be crashlooping."
+
+  # Alerts group for VM cluster assumes that the Grafana dashboard
+  # https://grafana.com/grafana/dashboards/11176 is installed.
+  # Please update the `dashboard` annotation according to your setup.
+  - name: vmcluster
+    interval: 30s
+    concurrency: 2
+    rules:
+      - alert: DiskRunsOutOfSpaceIn3Days
+        expr: |
+          vm_free_disk_space_bytes / ignoring(path) (
+            (
+              sum(rate(vm_rows_added_to_storage_total[1d])) -
+              sum(rate(vm_deduplicated_samples_total[1d])) without(type)
+            )
+            *
+            (
+              sum(vm_data_size_bytes{type!="indexdb"}) /
+              sum(vm_rows{type!="indexdb"})
+            )
+          ) < 3 * 24 * 3600
+        for: 30m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} will run out of disk space in 3 days"
+          description: "Taking into account the current ingestion rate, free disk space will be enough only
+            for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
+            Consider limiting the ingestion rate, decreasing retention or scaling up the disk space if possible."
+
+      - alert: DiskRunsOutOfSpace
+        expr: |
+          sum(vm_data_size_bytes) by(instance) /
+          (
+            sum(vm_free_disk_space_bytes) by(instance) +
+            sum(vm_data_size_bytes) by(instance)
+          ) > 0.8
+        for: 30m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=110&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} will run out of disk space soon"
+          description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
+            Having less than 20% of free disk space could cripple merge processes and overall performance.
+            Consider limiting the ingestion rate, decreasing retention or scaling up the disk space if possible."
+
+      - alert: RequestErrorsToAPI
+        expr: increase(vm_http_request_errors_total[5m]) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=52&var-instance={{ $labels.instance }}"
+          summary: "Too many errors served for {{ $labels.job }} path {{ $labels.path }} (instance {{ $labels.instance }})"
+          description: "Requests to path {{ $labels.path }} are receiving errors.
+            Please verify whether clients are sending correct requests."
+
+      - alert: RPCErrors
+        expr: sum(increase({__name__=~"vm_rpc_.*_errors_total"}[5m])) by(job, instance) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=44&var-instance={{ $labels.instance }}"
+          summary: "Too many RPC errors for {{ $labels.job }} (instance {{ $labels.instance }})"
+          description: "RPC errors are interconnection errors between cluster components.\n
+            Possible reasons are misconfiguration, overload, network blips or unreachable components."
+
+      - alert: ConcurrentFlushesHitTheLimit
+        expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}"
+          summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting the concurrent flushes limit"
+          description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to the number of CPUs.\n
+            When vmstorage constantly hits the limit, the storage is overloaded and requires more CPU."
+
+      - alert: TooManyLogs
+        expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=104&var-instance={{ $labels.instance }}"
+          summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
+          description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for the last 15m.\n
+            It is worth checking the logs for specific error messages."
+
+      - alert: RowsRejectedOnIngestion
+        expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=135&var-instance={{ $labels.instance }}"
+          summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"
+          description: "VM rejects rows on \"{{ $labels.instance }}\" due to the
+            following reason: \"{{ $labels.reason }}\""
+
+      - alert: TooHighChurnRate
+        expr: |
+          (
+            sum(rate(vm_new_timeseries_created_total[5m]))
+            /
+            sum(rate(vm_rows_inserted_total[5m]))
+          ) > 0.1
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
+          summary: "Churn rate is more than 10% for the last 15m"
+          description: "VM constantly creates new time series.\n
+            This effect is known as Churn Rate.\n
+            A high Churn Rate is tightly connected with database performance and may
+            result in unexpected OOMs or slow queries."
+
+      - alert: TooHighChurnRate24h
+        expr: |
+          sum(increase(vm_new_timeseries_created_total[24h]))
+          >
+          (sum(vm_cache_entries{type="storage/hour_metric_ids"}) * 3)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
+          summary: "Too high number of new series created over the last 24h"
+          description: "The number of new time series created over the last 24h is 3x higher than
+            the current number of active series.\n
+            This effect is known as Churn Rate.\n
+            A high Churn Rate is tightly connected with database performance and may
+            result in unexpected OOMs or slow queries."
+
+      - alert: TooHighSlowInsertsRate
+        expr: |
+          (
+            sum(rate(vm_slow_row_inserts_total[5m]))
+            /
+            sum(rate(vm_rows_inserted_total[5m]))
+          ) > 0.05
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=108"
+          summary: "Percentage of slow inserts is more than 5% for the last 15m"
+          description: "A high rate of slow inserts may be a sign of resource exhaustion
+            for the current load. It is likely that more RAM is needed for optimal handling of the current number of active time series."
+
+      - alert: ProcessNearFDLimits
+        expr: (process_max_fds - process_open_fds) < 100
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
+          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\" (\"{{ $labels.instance }}\") for the last 5m"
+          description: "Exhausting the OS file descriptors limit can cause severe degradation of the process.
+            Consider increasing the limit as fast as possible."
+
+      - alert: LabelsLimitExceededOnIngestion
+        expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=116&var-instance={{ $labels.instance }}"
+          summary: "Metrics ingested to vminsert on ({{ $labels.instance }}) are exceeding the labels limit"
+          description: "VictoriaMetrics limits the number of labels per metric with the `-maxLabelsPerTimeseries` command-line flag.\n
+            This prevents ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
+            correctly or that the clients sending these metrics aren't misbehaving."
+
+      - alert: VminsertIsDroppingRows
+        expr: sum(rate(vm_rpc_rows_lost_total[5m])) by(instance) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=84&var-instance={{ $labels.instance }}"
+          summary: "Vminsert \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops rows due to RPC errors."
+          description: "Vminsert starts dropping rows if there are no healthy vmstorage nodes to which it can route
+            insert requests. Check the health state of the vmstorage nodes and the RPC metrics."
+
+
+  # Alerts group for vmagent assumes that the Grafana dashboard
+  # https://grafana.com/grafana/dashboards/12683 is installed.
+  # Please update the `dashboard` annotation according to your setup.
+  - name: vmagent
+    interval: 30s
+    concurrency: 2
+    rules:
+      - alert: PersistentQueueIsDroppingData
+        expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
+          summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
+          description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
+            on instance {{ $labels.instance }} for the last 10m."
+
+      - alert: TooManyScrapeErrors
+        expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
+          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for the last 15m"
+
+      - alert: TooManyWriteErrors
+        expr: |
+          (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
+          +
+          sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
+          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for the last 15m."
+
+      - alert: TooManyRemoteWriteErrors
+        expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
+          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
+          description: "Vmagent fails to push data via the remote write protocol to destination \"{{ $labels.url }}\".\n
+            Ensure that the destination is up and reachable."
diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml
index c0e319499..28571c736 100644
--- a/deployment/docker/docker-compose.yml
+++ b/deployment/docker/docker-compose.yml
@@ -64,6 +64,38 @@ services:
       - 8481:8481
     restart: always
 
+  vmalert:
+    container_name: vmalert
+    image: victoriametrics/vmalert
+    depends_on:
+      - "vmselect"
+    ports:
+      - 8880:8880
+    volumes:
+      - ./alerts.yml:/etc/alerts/alerts.yml
+    command:
+      - '--datasource.url=http://vmselect:8481/select/0/prometheus'
+      - '--remoteRead.url=http://vmselect:8481/select/0/prometheus'
+      - '--remoteWrite.url=http://vminsert:8480/insert/0/prometheus'
+      - '--notifier.url=http://alertmanager:9093/'
+      - '--rule=/etc/alerts/*.yml'
+      # display the source of alerts in Grafana
+      - '--external.url=http://127.0.0.1:3000' # Grafana outside the container
+      # when copy-pasting the line below, be aware of the '$$' escaping in '$expr'
+      - '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":"{{$$expr|quotesEscape|crlfEscape|queryEscape}}"},{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]'
+    restart: always
+
+  alertmanager:
+    container_name: alertmanager
+    image: prom/alertmanager
+    volumes:
+      - ./alertmanager.yml:/config/alertmanager.yml
+    command:
+      - '--config.file=/config/alertmanager.yml'
+    ports:
+      - 9093:9093
+    restart: always
+
 volumes:
   vmagentdata: {}
   strgdata: {}
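
Note: the alertmanager.yml added above routes every notification to a `blackhole` receiver, so alerts are evaluated but silently discarded. Below is a minimal sketch of how the file could be extended to deliver alerts to a real channel; the Slack webhook URL and channel name are placeholders and must be replaced with values from your own setup.

route:
  receiver: slack

receivers:
  - name: slack
    slack_configs:
      # placeholder webhook - replace with your own incoming webhook URL
      - api_url: https://hooks.slack.com/services/XXX/YYY/ZZZ
        channel: '#alerts'
        send_resolved: true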
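
It may also be worth validating alerts.yml before bringing the stack up. A hypothetical one-off compose service for this is sketched below; it assumes the vmalert version in use supports the `-dryRun` flag, which only checks the files referenced by `-rule` and exits without starting the service.

  vmalert-configcheck:
    image: victoriametrics/vmalert
    volumes:
      - ./alerts.yml:/etc/alerts/alerts.yml
    command:
      # validate the rule files and exit instead of running vmalert
      - '--rule=/etc/alerts/*.yml'
      - '--dryRun=true'

With such a service added under `services:`, running `docker-compose run --rm vmalert-configcheck` would surface syntax errors in the rule files before vmalert and alertmanager are started.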