mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-01 14:47:38 +00:00
e7119de7f7
* vmagent: expose metric `vmagent_remotewrite_queues` (#2871) The new metric `vmagent_remotewrite_queues` exports a static value of the number of configured remote write queues. This metric is useful to calculate total saturation per each configured URL with given number of queues. See corresponding changes to vmagent alerts and dashboard. Signed-off-by: hagen1778 <roman@victoriametrics.com> * Update dashboards/vmagent.json Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: Roman Khavronenko <roman@victoriametrics.com> Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
329 lines
17 KiB
YAML
329 lines
17 KiB
YAML
# This file contains the default list of alerts for the vm-single and vmagent services.
# The alerts below are just recommendations and may require some updates
# and threshold calibration for each specific setup.
|
|
groups:
|
|
- name: vm-health
  # NOTE: TooManyRestarts and ServiceDown select targets with a `job` regex
  # filter; update it according to the job names used in your setup.
  rules:
    # process_start_time_seconds changed more than twice within 15m.
    - alert: TooManyRestarts
      expr: changes(process_start_time_seconds{job=~"victoriametrics|vmagent|vmalert"}[15m]) > 2
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
        description: "Job {{ $labels.job }} has restarted more than twice in the last 15 minutes.
          It might be crashlooping."

    # `up` stayed at 0 for 2m for a matching job.
    - alert: ServiceDown
      expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
        description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

    # Fewer than 100 file descriptors left below process_max_fds for 5m.
    - alert: ProcessNearFDLimits
      expr: (process_max_fds - process_open_fds) < 100
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
        description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
          Consider to increase the limit as fast as possible."

    # Anonymous resident memory exceeds 90% of vm_available_memory_bytes for 5m.
    - alert: TooHighMemoryUsage
      expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
        description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
          Consider to either increase available memory or decrease the load on the process."

    # CPU seconds consumed per second exceed 90% of process_cpu_cores_available for 5m.
    - alert: TooHighCPUUsage
      expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
        description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
          Consider to either increase available CPU resources or decrease the load on the process."
|
|
|
|
|
|
# The alerts group for VM single assumes that the Grafana dashboard
# https://grafana.com/grafana/dashboards/10229 is installed.
# Please update the `dashboard` annotation according to your setup.
|
|
- name: vmsingle
  interval: 30s
  concurrency: 2
  rules:
    # Free disk space divided by the estimated on-disk growth per second
    # (rows added minus rows deduplicated on merge, multiplied by the average
    # bytes per row outside indexdb) is below 3 days, yet still positive.
    - alert: DiskRunsOutOfSpaceIn3Days
      expr: |
        vm_free_disk_space_bytes / ignoring(path)
        (
          (
            rate(vm_rows_added_to_storage_total[1d]) -
            ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d])
          )
          * scalar(
            sum(vm_data_size_bytes{type!="indexdb"}) /
            sum(vm_rows{type!="indexdb"})
          )
        ) < 3 * 24 * 3600 > 0
      for: 30m
      labels:
        severity: critical
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=73&var-instance={{ $labels.instance }}"
        summary: "Instance {{ $labels.instance }} will run out of disk space soon"
        description: "Taking into account current ingestion rate, free disk space will be enough only
          for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
          Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."

    # Stored data occupies more than 80% of (stored data + free space) per instance.
    - alert: DiskRunsOutOfSpace
      expr: |
        sum(vm_data_size_bytes) by(instance) /
        (
          sum(vm_free_disk_space_bytes) by(instance) +
          sum(vm_data_size_bytes) by(instance)
        ) > 0.8
      for: 30m
      labels:
        severity: critical
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}"
        summary: "Instance {{ $labels.instance }} will run out of disk space soon"
        description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
          Having less than 20% of free disk space could cripple merges processes and overall performance.
          Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."

    # vm_http_request_errors_total keeps growing (5m windows) for 15m.
    - alert: RequestErrorsToAPI
      expr: increase(vm_http_request_errors_total[5m]) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}"
        summary: "Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})"
        description: "Requests to path {{ $labels.path }} are receiving errors.
          Please verify if clients are sending correct requests."

    # The 1m average of concurrent addrows calls is at or above its capacity for 15m.
    - alert: ConcurrentFlushesHitTheLimit
      expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=59&var-instance={{ $labels.instance }}"
        summary: "VictoriaMetrics on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
        description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
          When VictoriaMetrics constantly hits the limit it means that storage is overloaded and requires more CPU."

    # Error-level log messages keep being emitted (5m windows) for 15m.
    - alert: TooManyLogs
      expr: sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=67&var-instance={{ $labels.instance }}"
        summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
        description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
          Worth to check logs for specific error messages."

    # Rows are being ignored on ingestion; grouped by instance and reason.
    - alert: RowsRejectedOnIngestion
      expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=58&var-instance={{ $labels.instance }}"
        summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"
        description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the
          following reason: \"{{ $labels.reason }}\""

    # New-series creation rate exceeds 10% of the row insertion rate.
    - alert: TooHighChurnRate
      expr: |
        (
          sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
          /
          sum(rate(vm_rows_inserted_total[5m])) by (instance)
        ) > 0.1
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
        summary: "Churn rate is more than 10% on \"{{ $labels.instance }}\" for the last 15m"
        description: "VM constantly creates new time series on \"{{ $labels.instance }}\".\n
          This effect is known as Churn Rate.\n
          High Churn Rate tightly connected with database performance and may
          result in unexpected OOM's or slow queries."

    # Series created over 24h exceed 3x the `storage/hour_metric_ids` cache
    # entry count (used here as the number of active series).
    - alert: TooHighChurnRate24h
      expr: |
        sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
        >
        (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}"
        summary: "Too high number of new series on \"{{ $labels.instance }}\" created over last 24h"
        description: "The number of created new time series over last 24h is 3x times higher than
          current number of active series on \"{{ $labels.instance }}\".\n
          This effect is known as Churn Rate.\n
          High Churn Rate tightly connected with database performance and may
          result in unexpected OOM's or slow queries."

    # Slow row inserts exceed 5% of all inserted rows.
    - alert: TooHighSlowInsertsRate
      expr: |
        (
          sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
          /
          sum(rate(vm_rows_inserted_total[5m])) by (instance)
        ) > 0.05
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}"
        summary: "Percentage of slow inserts is more than 5% on \"{{ $labels.instance }}\" for the last 15m"
        description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion
          for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."

    # Metrics keep arriving with more labels than allowed, so labels are dropped.
    - alert: LabelsLimitExceededOnIngestion
      expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=74&var-instance={{ $labels.instance }}"
        summary: "Metrics ingested in ({{ $labels.instance }}) are exceeding labels limit"
        description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n
          This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
          correctly or that clients which send these metrics aren't misbehaving."
|
|
|
|
# The alerts group for vmagent assumes that the Grafana dashboard
# https://grafana.com/grafana/dashboards/12683 is installed.
# Please update the `dashboard` annotation according to your setup.
|
|
- name: vmagent
  interval: 30s
  concurrency: 2
  rules:
    # Bytes are being dropped from the persistent queue (5m windows) for 10m.
    - alert: PersistentQueueIsDroppingData
      expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
      for: 10m
      labels:
        severity: critical
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
        summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
        description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
          on instance {{ $labels.instance }} for the last 10m."

    # Remote-write packets are being dropped (5m windows) for 15m.
    - alert: RejectedRemoteWriteDataBlocksAreDropped
      expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
        summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
          remote-write server data blocks. Check the logs to find the reason for rejects."

    # Scrapes keep failing (5m windows) for 15m.
    - alert: TooManyScrapeErrors
      expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
        summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"

    # Combined ingest-server and HTTP request errors keep growing for 15m.
    - alert: TooManyWriteErrors
      expr: |
        (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
        +
        sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
        summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."

    # Remote-write retries keep happening for a destination URL for 15m.
    - alert: TooManyRemoteWriteErrors
      expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
        summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
        description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
          Ensure that destination is up and reachable."

    # Seconds spent sending per second, summed per URL, exceed 90% of the
    # number of configured queues reported by vmagent_remotewrite_queues.
    - alert: RemoteWriteConnectionIsSaturated
      expr: |
        sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url)
        > 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url)
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
        summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
        description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
          is saturated by more than 90% and vmagent won't be able to keep up.\n
          This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
          the number of connections per each remote storage."

    # More than 0.9s per second is spent writing to the persistent queue.
    - alert: PersistentQueueForWritesIsSaturated
      expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
        summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
        description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
          are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
          In this case, consider to decrease load on the vmagent or improve the disk throughput."

    # More than 0.9s per second is spent reading from the persistent queue.
    - alert: PersistentQueueForReadsIsSaturated
      expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
      for: 15m
      labels:
        severity: warning
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
        summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
        description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
          are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
          In this case, consider to decrease load on the vmagent or improve the disk throughput."

    # Hourly series count is above 90% of its configured maximum (no `for` clause).
    - alert: SeriesLimitHourReached
      expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
      labels:
        severity: critical
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
        summary: "Instance {{ $labels.instance }} reached 90% of the limit"
        description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
          Then samples for new time series will be dropped instead of sending them to remote storage systems."

    # Daily series count is above 90% of its configured maximum (no `for` clause).
    - alert: SeriesLimitDayReached
      expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
      labels:
        severity: critical
      annotations:
        dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
        summary: "Instance {{ $labels.instance }} reached 90% of the limit"
        description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
          Then samples for new time series will be dropped instead of sending them to remote storage systems."
|