mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
docs/vmanomaly: preset guide (#6151)
- added instructions of how to run presets - description of Node Exporter indicators covered
This commit is contained in:
parent
a43823774b
commit
c32b8d39dd
12 changed files with 2487 additions and 0 deletions
|
@ -0,0 +1,237 @@
|
||||||
|
# This is a reduced version of awesome alerts 2023-12-01.1
|
||||||
|
groups:
|
||||||
|
- name: AwesomeAlerts
|
||||||
|
rules:
|
||||||
|
- alert: HostOutOfMemory
|
||||||
|
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of memory (instance {{ $labels.instance }})
|
||||||
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostMemoryIsUnderutilized
|
||||||
|
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 1w
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||||
|
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskReadRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskWriteRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostOutOfDiskSpace
|
||||||
|
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostDiskWillFillIn24Hours
|
||||||
|
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||||
|
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostOutOfInodes
|
||||||
|
expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostFilesystemDeviceError
|
||||||
|
expr: node_filesystem_device_error == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||||
|
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostInodesWillFillIn24Hours
|
||||||
|
expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||||
|
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskIo
|
||||||
|
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||||
|
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostSwapIsFillingUp
|
||||||
|
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||||
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostSystemdServiceCrashed
|
||||||
|
expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||||
|
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostPhysicalComponentTooHot
|
||||||
|
expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||||
|
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
|
expr: (node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||||
|
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRaidArrayGotInactive
|
||||||
|
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||||
|
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRaidDiskFailure
|
||||||
|
expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||||
|
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostKernelVersionDeviations
|
||||||
|
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+[.][0-9]+[.][0-9]+).*")) by (kernel)) > 1
|
||||||
|
for: 6h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||||
|
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostOomKillDetected
|
||||||
|
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||||
|
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostEdacCorrectableErrorsDetected
|
||||||
|
expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||||
|
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostEdacUncorrectableErrorsDetected
|
||||||
|
expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||||
|
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkInterfaceSaturated
|
||||||
|
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||||
|
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkBondDegraded
|
||||||
|
expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||||
|
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostConntrackLimit
|
||||||
|
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||||
|
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockSkew
|
||||||
|
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock skew (instance {{ $labels.instance }})
|
||||||
|
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockNotSynchronising
|
||||||
|
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||||
|
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRequiresReboot
|
||||||
|
expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
|
for: 4h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||||
|
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,41 @@
|
||||||
|
# an example of the config subset a user should provide to make the node_exporter preset fully functional
|
||||||
|
# commented fields are optional for the user, as they are already defined in the preset:
|
||||||
|
# ./presets/node_exporter/vmanomaly_config.yml
|
||||||
|
preset: "node-exporter"
|
||||||
|
|
||||||
|
# schedulers: # section is already defined
|
||||||
|
# models: # section is already defined
|
||||||
|
|
||||||
|
reader:
|
||||||
|
# queries: # already defined
|
||||||
|
# sampling_period: # already defined
|
||||||
|
|
||||||
|
datasource_url: "http://victoriametrics:8428/" # your datasource url
|
||||||
|
# tenant_id: '0:0' # specify for cluster version
|
||||||
|
# add any other field - https://docs.victoriametrics.com/anomaly-detection/components/reader/#vm-reader
|
||||||
|
|
||||||
|
writer:
|
||||||
|
# metric_format: # already defined
|
||||||
|
# __name__: "node_exporter_$VAR"
|
||||||
|
# for: "$QUERY_KEY"
|
||||||
|
# but you can override it or add other labels
|
||||||
|
|
||||||
|
datasource_url: "http://victoriametrics:8428/" # your datasource url
|
||||||
|
# tenant_id: '0:0' # specify for cluster version
|
||||||
|
|
||||||
|
# add any other field - https://docs.victoriametrics.com/anomaly-detection/components/writer/#vm-writer
|
||||||
|
|
||||||
|
# monitoring:
|
||||||
|
# # pull section is already defined
|
||||||
|
# # pull:
|
||||||
|
# # addr: "0.0.0.0"
|
||||||
|
# # port: 8080
|
||||||
|
|
||||||
|
# push: # most of the fields are already defined
|
||||||
|
# # extra_labels: # some labels are already defined, but you can add more
|
||||||
|
# # job: "vmanomaly"
|
||||||
|
# # config: "node_exporter"
|
||||||
|
# url: "http://victoriametrics:8428/" # your datasource url to push self-monitoring metrics
|
||||||
|
# # tenant_id: '0:0' # specify for cluster version
|
||||||
|
|
||||||
|
# # add any other field - https://docs.victoriametrics.com/anomaly-detection/components/monitoring
|
|
@ -0,0 +1,74 @@
|
||||||
|
groups:
|
||||||
|
- name: VMAnomaly
|
||||||
|
rules:
|
||||||
|
- alert: PageFaults
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="page_faults"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal Page Faults (instance {{ $labels.instance }}).
|
||||||
|
|
||||||
|
- alert: ReceiveBytes
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="receive_bytes"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Received Bytes (instance {{ $labels.instance }}, device {{$labels.device}}).
|
||||||
|
|
||||||
|
- alert: TransmitBytes
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="transmit_bytes"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Transmit bytes (instance {{ $labels.instance }}, device {{$labels.device}}).
|
||||||
|
|
||||||
|
- alert: ReadLatency
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="read_latency"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal Read latency (instance {{ $labels.instance }}, device {{$labels.device}}).
|
||||||
|
|
||||||
|
- alert: WriteLatency
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="write_latency"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal Write latency (instance {{ $labels.instance }}, device {{$labels.device}}).
|
||||||
|
|
||||||
|
- alert: CpuSecondsTotal
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="cpu_seconds_total"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Node CPU seconds (instance {{ $labels.instance }}).
|
||||||
|
|
||||||
|
- alert: ContextSwitch
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="context_switch"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Context Switches (instance {{ $labels.instance }}).
|
||||||
|
|
||||||
|
- alert: HostNetworkReceiveErrors
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="host_network_receive_errors"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Host Network Receive Errors (instance {{ $labels.instance }}, device {{$labels.device}}).
|
||||||
|
|
||||||
|
- alert: HostNetworkTransmitErrors
|
||||||
|
expr: min(anomaly_score{preset="node-exporter", for="host_network_transmit_errors"}) without (model_alias, scheduler_alias)>=1.0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Abnormal amount of Host Network Transmit Errors (instance {{ $labels.instance }}, device {{$labels.device}}).
|
|
@ -35,6 +35,8 @@ In addition to that, setting up alerting rules manually has been proven to be te
|
||||||
error-prone, while anomaly detection can be easier to set up, and use the same model for different
|
error-prone, while anomaly detection can be easier to set up, and use the same model for different
|
||||||
metrics.
|
metrics.
|
||||||
|
|
||||||
|
`vmanomaly` can be used as a helper to set up your own alerting. You can rely on the spikes you see in anomaly scores to form the metric queries for alerting rules.
|
||||||
|
|
||||||
> **Note: `vmanomaly` is a part of [enterprise package](https://docs.victoriametrics.com/enterprise/). You need to get a [free trial license](https://victoriametrics.com/products/enterprise/trial/) for evaluation.**
|
> **Note: `vmanomaly` is a part of [enterprise package](https://docs.victoriametrics.com/enterprise/). You need to get a [free trial license](https://victoriametrics.com/products/enterprise/trial/) for evaluation.**
|
||||||
|
|
||||||
## How?
|
## How?
|
||||||
|
|
153
docs/anomaly-detection/Presets.md
Normal file
153
docs/anomaly-detection/Presets.md
Normal file
|
@ -0,0 +1,153 @@
|
||||||
|
---
|
||||||
|
sort: 3
|
||||||
|
weight: 1
|
||||||
|
title: Presets
|
||||||
|
menu:
|
||||||
|
docs:
|
||||||
|
parent: "anomaly-detection"
|
||||||
|
weight: 1
|
||||||
|
title: Presets
|
||||||
|
---
|
||||||
|
# Anomaly Detection Presets
|
||||||
|
> Please check the [Quick Start Guide](/anomaly-detection/quickstart/) to install and run `vmanomaly`
|
||||||
|
|
||||||
|
> Presets are available from v1.13.0
|
||||||
|
|
||||||
|
Presets enable anomaly detection in indicators that are hard to monitor using alerts based on static thresholds.
|
||||||
|
This way, the anomaly detection alerting rules based on the [`anomaly_score`](https://docs.victoriametrics.com/anomaly-detection/faq/#what-is-anomaly-score) stay the same over time, while the anomaly scores themselves are generated by predefined machine learning models.
|
||||||
|
Models are continuously retrained on different time frames, which helps keep alerts up to date and account for underlying data patterns.
|
||||||
|
|
||||||
|
You can set up a simplified configuration file for `vmanomaly` by specifying only the type of preset and the data sources in the [`reader`](https://docs.victoriametrics.com/anomaly-detection/components/reader/) and [`writer`](https://docs.victoriametrics.com/anomaly-detection/components/writer/) sections of the config.
|
||||||
|
The rest of the parameters are already set up for you.
|
||||||
|
|
||||||
|
Available presets:
|
||||||
|
- [Node-Exporter](#node-exporter)
|
||||||
|
|
||||||
|
Here is an example config file to enable Node-Exporter preset:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
preset: "node-exporter"
|
||||||
|
reader:
|
||||||
|
datasource_url: "http://victoriametrics:8428/" # your datasource url
|
||||||
|
# tenant_id: '0:0' # specify for cluster version
|
||||||
|
writer:
|
||||||
|
datasource_url: "http://victoriametrics:8428/" # your datasource url
|
||||||
|
# tenant_id: '0:0' # specify for cluster version
|
||||||
|
```
|
||||||
|
Run the service using this config file with one of the [available options](/anomaly-detection/quickstart/#how-to-install-and-run-vmanomaly).
|
||||||
|
|
||||||
|
After you run `vmanomaly`, the available assets can be found here: `http://localhost:8490/presets/`
|
||||||
|
|
||||||
|
<img alt="preset-localhost" src="presets-localhost.webp">
|
||||||
|
|
||||||
|
## Node-Exporter
|
||||||
|
|
||||||
|
> **Note: Configurations for presets can be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/)**
|
||||||
|
|
||||||
|
To enable the Node-Exporter preset, use the `preset` parameter in the config file:
|
||||||
|
```yaml
|
||||||
|
preset: "node-exporter"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generated Anomaly scores
|
||||||
|
Machine learning models will be fit for each time series returned by the underlying [MetricsQL](https://docs.victoriametrics.com/metricsql/) queries.
|
||||||
|
Anomaly score metric labels will also contain underlying [model classes](/anomaly-detection/components/models/) and [schedulers](/anomaly-detection/components/scheduler/).
|
||||||
|
Here's an example of produced metrics:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="system", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.23451242720277776
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="user", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.2637952255694444
|
||||||
|
anomaly_score{for="page_faults", instance="node-exporter:9100", job="node-exporter", preset="node-exporter", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.00593712535
|
||||||
|
anomaly_score{for="read_latency", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.27773362795333334
|
||||||
|
anomaly_score{for="receive_bytes", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.037753486136666674
|
||||||
|
anomaly_score{for="transmit_bytes", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.17633085235
|
||||||
|
anomaly_score{for="write_latency", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.019314370926666668
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="mad", scheduler_alias="1d_1m"} 4.2323617935
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="mad", scheduler_alias="2w_1m"} 1.5261359215
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="prophet", scheduler_alias="2w_1m"} 0.5850743651
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="z-score", scheduler_alias="1d_1m"} 1.6496064663
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="z-score", scheduler_alias="2w_1m"} 0.924392581
|
||||||
|
anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="iowait", model_alias="mad", scheduler_alias="1d_1m"} 0.8571428657
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Alerts
|
||||||
|
> We recommend using [Awesome Prometheus alerts](https://github.com/samber/awesome-prometheus-alerts) for alerts not covered by presets.
|
||||||
|
|
||||||
|
Provided alerts are set to fire every time all models vote that the datapoint is anomalous.
|
||||||
|
|
||||||
|
You can find alerting rules here:
|
||||||
|
- `vmanomaly` [Anomaly Detection alerts](http://localhost:8490/presets/vmanomaly_alerts.yml): `http://localhost:8490/presets/vmanomaly_alerts.yml`
|
||||||
|
- [Modified Awesome Alerts](http://localhost:8490/presets/awesome_alerts.yml): `http://localhost:8490/presets/awesome_alerts.yml`
|
||||||
|
|
||||||
|
#### Awesome Alerts replaced by Machine Learning alerts
|
||||||
|
- HostMemoryUnderMemoryPressure
|
||||||
|
- HostContextSwitching
|
||||||
|
- HostHighCpuLoad
|
||||||
|
- HostCpuIsUnderutilized
|
||||||
|
- HostCpuStealNoisyNeighbor
|
||||||
|
- HostCpuHighIowait
|
||||||
|
- HostNetworkReceiveErrors
|
||||||
|
- HostNetworkTransmitErrors
|
||||||
|
- HostUnusualNetworkThroughputIn
|
||||||
|
- HostUnusualNetworkThroughputOut
|
||||||
|
|
||||||
|
### Grafana dashboard
|
||||||
|
Grafana dashboard `.json` file can be found [here](http://localhost:8490/presets/dashboard.json): `http://localhost:8490/presets/dashboard.json`
|
||||||
|
|
||||||
|
### Indicators monitored by preset
|
||||||
|
|
||||||
|
The produced anomaly scores will contain the label `for` with the name of the corresponding indicator.
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Indicator</th>
|
||||||
|
<th>Based on metrics</th>
|
||||||
|
<th>Description</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><code>page_faults</code></td>
|
||||||
|
<td><code>node_vmstat_pgmajfault</code></td>
|
||||||
|
<td>Number of major faults that have occurred since the last update. Major faults occur when a process tries to access a page in memory that is not currently mapped in the process's address space, and it requires loading data from the disk.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>context_switch</code></td>
|
||||||
|
<td><code>node_context_switches_total</code></td>
|
||||||
|
<td>This metric represents the total number of context switches across all CPUs.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>cpu_seconds_total</code></td>
|
||||||
|
<td><code>node_cpu_seconds_total</code></td>
|
||||||
|
<td>Total amount of CPU time consumed by the system in seconds by CPU processing mode (e.g., user, system, idle).</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>host_network_receive_errors</code> & <code>host_network_transmit_errors</code></td>
|
||||||
|
<td><code>node_network_receive_errs_total</code>, <code>node_network_receive_packets_total</code>, <code>node_network_transmit_errs_total</code>, <code>node_network_transmit_packets_total</code></td>
|
||||||
|
<td>Total number of errors encountered while receiving/transmitting packets on the network interfaces of a node.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>receive_bytes</code> & <code>transmit_bytes</code></td>
|
||||||
|
<td><code>node_network_receive_bytes_total</code>, <code>node_network_transmit_bytes_total</code></td>
|
||||||
|
<td>Total number of bytes received/transmitted on network interfaces of a node.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><code>read_latency</code> & <code>write_latency</code></td>
|
||||||
|
<td><code>node_disk_read_time_seconds_total</code>, <code>node_disk_reads_completed_total</code>, <code>node_disk_write_time_seconds_total</code>, <code>node_disk_writes_completed_total</code></td>
|
||||||
|
<td>Disk latency: the total read/write time spent (in seconds) divided by the total number of reads/writes completed successfully.</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
## Example
|
||||||
|
On the graph 'Percentage of Anomalies,' you can see a spike to 8.75% of anomalies at the timestamp '2024-06-03 10:35:00'. The graph 'Anomalies per Indicator' shows the indicators that were anomalous at the corresponding time.
|
||||||
|
<img alt="global" src="presets_global_percentage.webp">
|
||||||
|
At this timestamp, on the 'Number of Anomalous Indicators by Node' graph, we can identify the node that had the most anomalies: `10.142.0.27`
|
||||||
|
<img alt="by_node" src="presets_anomalies_by_node.webp">
|
||||||
|
For this node, starting from the timestamp `2024-06-03 10:35:00`, the CPU time spent handling software interrupts started to grow.
|
||||||
|
(`cpu_seconds_total{mode="softirq"}`)
|
||||||
|
<img alt="irq" src="presets_cpu_seconds_softirq.webp">
|
||||||
|
At the same time `cpu_seconds_total` for `steal` mode started to grow as well.
|
||||||
|
<img alt="steal" src="presets_cpu_seconds_steal.webp">
|
|
@ -18,6 +18,7 @@ Begin your VictoriaMetrics Anomaly Detection journey with ease using our guides
|
||||||
- **Quickstart**: Check out how to get `vmanomaly` up and running [here](/anomaly-detection/QuickStart.html).
|
- **Quickstart**: Check out how to get `vmanomaly` up and running [here](/anomaly-detection/QuickStart.html).
|
||||||
- **Overview**: Find out how `vmanomaly` service operates [here](/anomaly-detection/Overview.html)
|
- **Overview**: Find out how `vmanomaly` service operates [here](/anomaly-detection/Overview.html)
|
||||||
- **Integration**: Integrate anomaly detection into your observability ecosystem. Get started [here](/anomaly-detection/guides/guide-vmanomaly-vmalert.html).
|
- **Integration**: Integrate anomaly detection into your observability ecosystem. Get started [here](/anomaly-detection/guides/guide-vmanomaly-vmalert.html).
|
||||||
|
- **Anomaly Detection Presets**: Enable anomaly detection on a predefined set of indicators that would otherwise require frequently changing static thresholds for alerting. Find more information [here](/anomaly-detection/presets/).
|
||||||
|
|
||||||
- **Installation Options**: Select the method that aligns with your technical requirements:
|
- **Installation Options**: Select the method that aligns with your technical requirements:
|
||||||
- **Docker Installation**: Suitable for containerized environments. See [Docker guide](/anomaly-detection/Overview.html#run-vmanomaly-docker-container).
|
- **Docker Installation**: Suitable for containerized environments. See [Docker guide](/anomaly-detection/Overview.html#run-vmanomaly-docker-container).
|
||||||
|
|
BIN
docs/anomaly-detection/presets-localhost.webp
Normal file
BIN
docs/anomaly-detection/presets-localhost.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 150 KiB |
BIN
docs/anomaly-detection/presets_anomalies_by_node.webp
Normal file
BIN
docs/anomaly-detection/presets_anomalies_by_node.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 269 KiB |
BIN
docs/anomaly-detection/presets_cpu_seconds_softirq.webp
Normal file
BIN
docs/anomaly-detection/presets_cpu_seconds_softirq.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 247 KiB |
BIN
docs/anomaly-detection/presets_cpu_seconds_steal.webp
Normal file
BIN
docs/anomaly-detection/presets_cpu_seconds_steal.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 213 KiB |
BIN
docs/anomaly-detection/presets_global_percentage.webp
Normal file
BIN
docs/anomaly-detection/presets_global_percentage.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 260 KiB |
Loading…
Reference in a new issue