# File contains default list of alerts for vm-single and vmagent services. # The alerts below are just recommendations and may require some updates # and threshold calibration according to every specific setup. groups: - name: vm-health # note the `job` filter and update accordingly to your setup rules: # note the `job` filter and update accordingly to your setup - alert: TooManyRestarts expr: changes(process_start_time_seconds{job=~"victoriametrics|vmagent|vmalert"}[15m]) > 2 labels: severity: critical annotations: summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" description: "Job {{ $labels.job }} has restarted more than twice in the last 15 minutes. It might be crashlooping." - alert: ServiceDown expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0 for: 2m labels: severity: critical annotations: summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." - alert: ProcessNearFDLimits expr: (process_max_fds - process_open_fds) < 100 for: 5m labels: severity: critical annotations: summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" description: "Exhausting OS file descriptors limit can cause severe degradation of the process. Consider to increase the limit as fast as possible." - alert: TooHighMemoryUsage expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9 for: 5m labels: severity: critical annotations: summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. Consider to either increase available memory or decrease the load on the process." - alert: TooHighCPUUsage expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 for: 5m labels: severity: critical annotations: summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. Consider to either increase available CPU resources or decrease the load on the process." # Alerts group for VM single assumes that Grafana dashboard # https://grafana.com/grafana/dashboards/10229 is installed. # Pls update the `dashboard` annotation according to your setup. - name: vmsingle interval: 30s concurrency: 2 rules: - alert: DiskRunsOutOfSpaceIn3Days expr: | vm_free_disk_space_bytes / ignoring(path) ( ( rate(vm_rows_added_to_storage_total[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d]) ) * scalar( sum(vm_data_size_bytes{type!="indexdb"}) / sum(vm_rows{type!="indexdb"}) ) ) < 3 * 24 * 3600 > 0 for: 30m labels: severity: critical annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=73&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} will run out of disk space soon" description: "Taking into account current ingestion rate, free disk space will be enough only for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." - alert: DiskRunsOutOfSpace expr: | sum(vm_data_size_bytes) by(instance) / ( sum(vm_free_disk_space_bytes) by(instance) + sum(vm_data_size_bytes) by(instance) ) > 0.8 for: 30m labels: severity: critical annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} will run out of disk space soon" description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." - alert: RequestErrorsToAPI expr: increase(vm_http_request_errors_total[5m]) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}" summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})" description: "Requests to path {{ $labels.path }} are receiving errors. Please verify if clients are sending correct requests." - alert: ConcurrentFlushesHitTheLimit expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}" summary: "VictoriaMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit" description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n When VictoriaMetrics constantly hits the limit it means that storage is overloaded and requires more CPU." - alert: TooManyLogs expr: sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}" summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n Worth to check logs for specific error messages." - alert: RowsRejectedOnIngestion expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=58&var-instance={{ $labels.instance }}" summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt" description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the following reason: \"{{ $labels.reason }}\"" - alert: TooHighChurnRate expr: | ( sum(rate(vm_new_timeseries_created_total[5m])) by(instance) / sum(rate(vm_rows_inserted_total[5m])) by (instance) ) > 0.1 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}" summary: "Churn rate is more than 10% on \"{{ $labels.instance }}\" for the last 15m" description: "VM constantly creates new time series on \"{{ $labels.instance }}\".\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries." - alert: TooHighChurnRate24h expr: | sum(increase(vm_new_timeseries_created_total[24h])) by(instance) > (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3) for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}" summary: "Too high number of new series on \"{{ $labels.instance }}\" created over last 24h" description: "The number of created new time series over last 24h is 3x times higher than current number of active series on \"{{ $labels.instance }}\".\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries." - alert: TooHighSlowInsertsRate expr: | ( sum(rate(vm_slow_row_inserts_total[5m])) by(instance) / sum(rate(vm_rows_inserted_total[5m])) by (instance) ) > 0.05 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}" summary: "Percentage of slow inserts is more than 5% on \"{{ $labels.instance }}\" for the last 15m" description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series." - alert: LabelsLimitExceededOnIngestion expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=74&var-instance={{ $labels.instance }}" summary: "Metrics ingested in ({{ $labels.instance }}) are exceeding labels limit" description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured correctly or that clients which send these metrics aren't misbehaving." # Alerts group for vmagent assumes that Grafana dashboard # https://grafana.com/grafana/dashboards/12683 is installed. # Pls update the `dashboard` annotation according to your setup. - name: vmagent interval: 30s concurrency: 2 rules: - alert: PersistentQueueIsDroppingData expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0 for: 10m labels: severity: critical annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} is dropping data from persistent queue" description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue on instance {{ $labels.instance }} for the last 10m." - alert: RejectedRemoteWriteDataBlocksAreDropped expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}" summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects." - alert: TooManyScrapeErrors expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}" summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m" - alert: TooManyWriteErrors expr: | (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance) + sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}" summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m." - alert: TooManyRemoteWriteErrors expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}" summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage" description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n Ensure that destination is up and reachable." - alert: RemoteWriteConnectionIsSaturated expr: | sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url) > 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url) for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}" summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated" description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" is saturated by more than 90% and vmagent won't be able to keep up.\n This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage." - alert: PersistentQueueForWritesIsSaturated expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}" summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated" description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk. In this case, consider to decrease load on the vmagent or improve the disk throughput." - alert: PersistentQueueForReadsIsSaturated expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9 for: 15m labels: severity: warning annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}" summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated" description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk. In this case, consider to decrease load on the vmagent or improve the disk throughput." - alert: SeriesLimitHourReached expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9 labels: severity: critical annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} reached 90% of the limit" description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems." - alert: SeriesLimitDayReached expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9 labels: severity: critical annotations: dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}" summary: "Instance {{ $labels.instance }} reached 90% of the limit" description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems."