diff --git a/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/awesome_alerts.yml b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/awesome_alerts.yml new file mode 100644 index 0000000000..911b3b9d86 --- /dev/null +++ b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/awesome_alerts.yml @@ -0,0 +1,237 @@ +# This is a reduced version of awesome alerts 2023-12-01.1 +groups: +- name: AwesomeAlerts + rules: + - alert: HostOutOfMemory + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: (100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: node_filesystem_device_error == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: (node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file diff --git a/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/dashboard.json b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/dashboard.json new file mode 100644 index 0000000000..d2d48b047b --- /dev/null +++ b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/dashboard.json @@ -0,0 +1,1979 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "- if not filtered, shows global percentage of anomalies\n-if filtered by Node, shows % of anomalous indicators", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No anomalies found", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 145, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count((min(anomaly_score{preset=\"node-exporter\",instance=~\"$node\"}) by (for, instance))>1)/count((min(anomaly_score{preset=\"node-exporter\",instance=~\"$node\"}) by (for, instance)))", + "hide": false, + "legendFormat": "Percentage", + "range": true, + "refId": "A" + } + ], + "title": "Percentage of Anomalous Nodes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Count of anomaly scores greater than threshold", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "noValue": "No anomalies found", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 146, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={preset=\"node-exporter\"}) count(count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (for)) by (for)/count(count by (for) (anomaly_score{f})) by (for)", + "format": "time_series", + "hide": false, + "legendFormat": "{{for}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies per Indicator", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of anomalous indicators per node", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 224, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count((min(anomaly_score{preset=\"node-exporter\",instance=~\"$node\"}) without (model_alias,scheduler_alias))>=1.0) by (instance)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Numbers of Anomalous Indicators by Node", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 88, + "panels": [], + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 6, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={instance=~\"$node\",for=\"receive_bytes\",preset=\"node-exporter\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Network Receive Bytes", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 94, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_network_receive_bytes_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])*8", + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Network Traffic- Receive Bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 156, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={instance=~\"$node\",for=\"transmit_bytes\",preset=\"node-exporter\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Network Transmit Bytes", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 157, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_network_transmit_bytes_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])*8", + "hide": false, + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "B" + } + ], + "title": "Network Traffic - Transmit Bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 111, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# Title\n\nFor markdown syntax help: [commonmark.org/help](https://commonmark.org/help/)", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "title": "CPU seconds", + "type": "text" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 158, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden spike of Node CPU seconds", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 22, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={instance=~\"$node\",for=\"cpu_seconds_total\",preset=\"node-exporter\",mode=~\"$cpu_mode\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "hide": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "Anomalies: CPU Seconds: Mode $cpu_mode", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=~\"$job\", mode=~\"$cpu_mode\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])))", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Basic", + "type": "timeseries" + } + ], + "repeat": "cpu_mode", + "repeatDirection": "h", + "title": "CPU seconds. Mode: $cpu_mode", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 78, + "panels": [], + "title": "Storage disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 114, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# Title\n\nFor markdown syntax help: [commonmark.org/help](https://commonmark.org/help/)", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "title": "Latency", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden drop or spike of Read latency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 18, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={instance=~\"$node\",for=\"read_latency\",preset=\"node-exporter\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "hide": false, + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anoamlies: Read Latency", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 74, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])", + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Disk Average Wait Time: Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden drop or spike of Write latency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 20, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={instance=~\"$node\",for=\"write_latency\",preset=\"node-exporter\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Write Latency", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 61 + }, + "id": 180, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])", + "hide": false, + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "B" + } + ], + "title": "Disk Average Wait Time: Write", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 71 + }, + "id": 62, + "panels": [], + "title": "System Misc", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden spike of Context Switch", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 24, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={for=\"context_switch\",preset=\"node-exporter\",instance=~\"$node\",job=~\"$job\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Context Switch", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 66, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_context_switches_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Context Switches", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 82 + }, + "id": 40, + "panels": [], + "title": "Network Traffic", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden drop or spike of Host Network Transmit Errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 32, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={for=\"host_network_transmit_errors\",preset=\"node-exporter\",instance=~\"$node\",job=~\"$job\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "Value - {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Host Network Transmit Errors", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 52, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_network_receive_errs_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])", + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Network Receive Traffic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Sudden drop or spike of Host Network Receive Errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 93 + }, + "id": 30, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "WITH(f={for=\"host_network_receive_errors\",preset=\"node-exporter\",instance=~\"$node\",job=~\"$job\"}) count(min(anomaly_score{f}) without (model_alias,scheduler_alias)>=1.0) by (instance)", + "legendFormat": "Value - {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Anomalies: Host Network Receive Errors", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 93 + }, + "id": 202, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(node_network_transmit_errs_total{instance=~\"$node\",job=~\"$job\"}[$__rate_interval])", + "hide": false, + "legendFormat": "{{instance}},{{device}}", + "range": true, + "refId": "B" + } + ], + "title": "Network Transmit Traffic Errors", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "sandbox-monitoring", + "value": "PB894574A363DF0AF" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info{job=~\"$job\"}, instance)", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": true, + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=~\"$job\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info, job)", + "hide": 0, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "hide": 2, + "includeAll": false, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(anomaly_score{for=\"cpu_seconds_total\"},mode)", + "hide": 2, + "includeAll": true, + "label": "cpu_mode", + "multi": true, + "name": "cpu_mode", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(anomaly_score{for=\"cpu_seconds_total\"},mode)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node Exporter - vmanomaly", + "uid": "feceb53e-c252-44aa-ae7a-51c20f58cd241", + "version": 21, + "weekStart": "" +} \ No newline at end of file diff --git a/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/user_input_example.yml b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/user_input_example.yml new file mode 100644 index 0000000000..1a0a751ae6 --- /dev/null +++ b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/user_input_example.yml @@ -0,0 +1,41 @@ +# an example of what config subset user should input to make node_exporter preset fully functional +# commented fields are optional for the user, as they are already defined in the preset: +# ./presets/node_exporter/vmanomaly_config.yml +preset: "node-exporter" + +# schedulers: # section is already defined +# models: # section is already defined + +reader: + # queries: # already defined + # sampling_period: # already defined + + datasource_url: "http://victoriametrics:8428/" # your datasource url + # tenant_id: '0:0' # specify for cluster version + # add any other field - https://docs.victoriametrics.com/anomaly-detection/components/reader/#vm-reader + +writer: + # metric_format: # already defined + # __name__: "node_exporter_$VAR" + # for: "$QUERY_KEY" + # but you can override it or add other labels + + datasource_url: "http://victoriametrics:8428/" # your datasource url + # tenant_id: '0:0' # specify for cluster version + + # add any other field - https://docs.victoriametrics.com/anomaly-detection/components/writer/#vm-writer + +# monitoring: +# # pull section is already defined +# # pull: +# # addr: "0.0.0.0" +# # port: 8080 + +# push: # most of the fields are already defined +# # extra_labels: # some labels are already defined, but you can add more +# # job: "vmanomaly" +# # config: "node_exporter" +# url: "http://victoriametrics:8428/" # your datasource url to push self-monitoring metrics +# # tenant_id: '0:0' # specify for cluster version + +# # add any other field - https://docs.victoriametrics.com/anomaly-detection/components/monitoring \ No newline at end of file diff --git a/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/vmanomaly_alerts.yml b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/vmanomaly_alerts.yml new file mode 100644 index 0000000000..899790e178 --- /dev/null +++ b/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/vmanomaly_alerts.yml @@ -0,0 +1,74 @@ +groups: +- name: VMAnomaly + rules: + - alert: PageFaults + expr: min(anomaly_score{preset="node-exporter", for="page_faults"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal Page Faults (instance {{ $labels.instance }}). + + - alert: ReceiveBytes + expr: min(anomaly_score{preset="node-exporter", for="receive_bytes"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Received Bytes (instance {{ $labels.instance }}, device {{$labels.device}}). + + - alert: TransmitBytes + expr: min(anomaly_score{preset="node-exporter", for="transmit_bytes"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Transmit bytes (instance {{ $labels.instance }}, device {{$labels.device}}). + + - alert: ReadLatency + expr: min(anomaly_score{preset="node-exporter", for="read_latency"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal Read latency (instance {{ $labels.instance }}, device {{$labels.device}}). + + - alert: WriteLatency + expr: min(anomaly_score{preset="node-exporter", for="write_latency"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal Write latency (instance {{ $labels.instance }}, device {{$labels.device}}). + + - alert: CpuSecondsTotal + expr: min(anomaly_score{preset="node-exporter", for="cpu_seconds_total"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Node CPU seconds (instance {{ $labels.instance }}). + + - alert: ContextSwitch + expr: min(anomaly_score{preset="node-exporter", for="context_switch"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Context Switches (instance {{ $labels.instance }}). + + - alert: HostNetworkReceiveErrors + expr: min(anomaly_score{preset="node-exporter", for="host_network_receive_errors"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Host Network Receive Errors (instance {{ $labels.instance }}, device {{$labels.device}}). + + - alert: HostNetworkTransmitErrors + expr: min(anomaly_score{preset="node-exporter", for="host_network_transmit_errors"}) without (model_alias, scheduler_alias)>=1.0 + for: 2m + labels: + severity: warning + annotations: + summary: Abnormal amount of Host Network Transmit Errors (instance {{ $labels.instance }}, device {{$labels.device}}). \ No newline at end of file diff --git a/docs/anomaly-detection/Overview.md b/docs/anomaly-detection/Overview.md index d164fa4c8b..10a73e4bf1 100644 --- a/docs/anomaly-detection/Overview.md +++ b/docs/anomaly-detection/Overview.md @@ -35,6 +35,8 @@ In addition to that, setting up alerting rules manually has been proven to be te error-prone, while anomaly detection can be easier to set up, and use the same model for different metrics. +`vmanomaly` can be used as a helper to set up your own alerting. You can rely on the spikes you see in anomaly scores to form the metric queries for alerting rules. + > **Note: `vmanomaly` is a part of [enterprise package](https://docs.victoriametrics.com/enterprise/). You need to get a [free trial license](https://victoriametrics.com/products/enterprise/trial/) for evaluation.** ## How? diff --git a/docs/anomaly-detection/Presets.md b/docs/anomaly-detection/Presets.md new file mode 100644 index 0000000000..e6fbf879e1 --- /dev/null +++ b/docs/anomaly-detection/Presets.md @@ -0,0 +1,153 @@ +--- +sort: 3 +weight: 1 +title: Presets +menu: + docs: + parent: "anomaly-detection" + weight: 1 + title: Presets +--- +# Anomaly Detection Presets +> Please check the [Quick Start Guide](/anomaly-detection/quickstart/) to install and run `vmanomaly` + +> Presets are available from v1.13.0 + +Presets enable anomaly detection in indicators that are hard to monitor using alerts based on static thresholds. +So, the anomaly detection alerting rules based on the [`anomaly_scores`](https://docs.victoriametrics.com/anomaly-detection/faq/#what-is-anomaly-score) stay the same over time, and we generate the anomaly scores using predefined machine learning models. +Models are constantly retraining on different time frames which helps to keep alerts up to date and to consider underlying data patterns. + +You can set up the simplified configuration file for `vmanomaly` just specifying the type of preset and data sources in [`reader`](https://docs.victoriametrics.com/anomaly-detection/components/reader/) and [`writer`](https://docs.victoriametrics.com/anomaly-detection/components/writer/) sections of the config. +The rest of the parameters are already set up for you. + +Available presets: +- [Node-Exporter](#node-exporter) + +Here is an example config file to enable Node-Exporter preset: + +```yaml +preset: "node-exporter" +reader: + datasource_url: "http://victoriametrics:8428/" # your datasource url + # tenant_id: '0:0' # specify for cluster version +writer: + datasource_url: "http://victoriametrics:8428/" # your datasource url + # tenant_id: '0:0' # specify for cluster version +``` +Run a service using config file with one of the [available options](/anomaly-detection/quickstart/#how-to-install-and-run-vmanomaly). + +After you run `vmanomaly`, the available assets can be found here: `http://localhost:8490/presets/` + +<img alt="preset-localhost" src="presets-localhost.webp"> + +## Node-Exporter + +> **Note: Configurations for presets can be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-node-exporter-preset/)** + +For enabling Node-Exporter in config file use `preset` parameter: +```yaml +preset: "node-exporter" +``` + +### Generated Anomaly scores +Machine learning models will be fit for each timeseries, returned by underlying [MetricsQL](https://docs.victoriametrics.com/metricsql/). +Anomaly score metric labels will also contain underlying [model classes](/anomaly-detection/components/models/) and [schedulers](/anomaly-detection/components/scheduler/). +Here's an example of produced metrics: + +```shell +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="system", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.23451242720277776 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="user", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.2637952255694444 +anomaly_score{for="page_faults", instance="node-exporter:9100", job="node-exporter", preset="node-exporter", model_alias="holt-winters", scheduler_alias="1d_1m"} 0.00593712535 +anomaly_score{for="read_latency", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.27773362795333334 +anomaly_score{for="receive_bytes", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.037753486136666674 +anomaly_score{for="transmit_bytes", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.17633085235 +anomaly_score{for="write_latency", instance="node-exporter:9100", preset="node-exporter", model_alias="mad", scheduler_alias="1d_1m"} 0.019314370926666668 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="mad", scheduler_alias="1d_1m"} 4.2323617935 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="mad", scheduler_alias="2w_1m"} 1.5261359215 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="prophet", scheduler_alias="2w_1m"} 0.5850743651 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="z-score", scheduler_alias="1d_1m"} 1.6496064663 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="idle", model_alias="z-score", scheduler_alias="2w_1m"} 0.924392581 +anomaly_score{for="cpu_seconds_total", instance="node-exporter:9100", preset="node-exporter", mode="iowait", model_alias="mad", scheduler_alias="1d_1m"} 0.8571428657 +... +``` + +### Alerts +> We recommend to use [Awesome Prometheus alerts](https://github.com/samber/awesome-prometheus-alerts) for alerts not covered by presets. + +Provided alerts are set to fire every time all models vote that the datapoint is anomalous. + +You can find alerting rules here: +- `vmanomaly` [Anomaly Detection alerts](http://localhost:8490/presets/vmanomaly_alerts.yml): `http://localhost:8490/presets/vmanomaly_alerts.yml` +- [Modified Awesome Alerts](http://localhost:8490/presets/awesome_alerts.yml): `http://localhost:8490/presets/awesome_alerts.yml` + +#### Awesome Alerts replaced by Machine Learning alerts +- HostMemoryUnderMemoryPressure +- HostContextSwitching +- HostHighCpuLoad +- HostCpuIsUnderutilized +- HostCpuStealNoisyNeighbor +- HostCpuHighIowait +- HostNetworkReceiveErrors +- HostNetworkTransmitErrors +- HostUnusualNetworkThroughputIn +- HostUnusualNetworkThroughputOut + +### Grafana dashboard +Grafana dashboard `.json` file can be found [here](http://localhost:8490/presets/dashboard.json): `http://localhost:8490/presets/dashboard.json` + +### Indicators monitored by preset + +The produced anomaly scores will contain label `for` with the name of corresponding indicator. + +<table> + <thead> + <tr> + <th>Indicator</th> + <th>Based on metrics</th> + <th>Description</th> + </tr> + </thead> + <tbody> + <tr> + <td><code>page_faults</code></td> + <td><code>node_vmstat_pgmajfault</code></td> + <td>Number of major faults that have occurred since the last update. Major faults occur when a process tries to access a page in memory that is not currently mapped in the process's address space, and it requires loading data from the disk.</td> + </tr> + <tr> + <td><code>context_switch</code></td> + <td><code>node_context_switches_total</code></td> + <td>This metric represents the total number of context switches across all CPUs.</td> + </tr> + <tr> + <td><code>cpu_seconds_total</code></td> + <td><code>node_cpu_seconds_total</code></td> + <td>Total amount of CPU time consumed by the system in seconds by CPU processing mode (e.g., user, system, idle).</td> + </tr> + <tr> + <td><code>host_network_receive_errors</code> & <code>host_network_transmit_errors</code></td> + <td><code>node_network_receive_errs_total</code>, <code>node_network_receive_packets_total</code>, <code>node_network_transmit_errs_total</code>, <code>node_network_transmit_packets_total</code> + <td>Total number of errors encountered while receiving/transmitting packets on the network interfaces of a node.</td> + </tr> + <tr> + <td><code>receive_bytes</code> & <code>transmit_bytes</code></td> + <td><code>node_network_receive_bytes_total</code>, <code>node_network_transmit_bytes_total</code></td> + <td>Total number of bytes received/transmitted on network interfaces of a node.</td> + </tr> + <tr> + <td><code>read_latency</code> & <code>write_latency</code></td> + <td><code>node_disk_read_time_seconds_total</code>, <code>node_disk_reads_completed_total</code>, <code>node_disk_write_time_seconds_total</code>, <code>node_disk_writes_completed_total</code></td> + <td>Disk latency. The total read/write time spent in seconds. / The total number of reads/writes completed successfully.</td> + </tr> + </tbody> +</table> + +## Example +On the graph 'Percentage of Anomalies,' you can see a spike to 8.75% of anomalies at the timestamp '2024-06-03 10:35:00'. The graph 'Anomalies per Indicator' shows the indicators that were anomalous at the corresponding time. +<img alt="global" src="presets_global_percentage.webp"> +At this timestamp on the 'Number of Anomalous Indicators by Node,' we can identify the node that had the most anomalies: `10.142.0.27` +<img alt="by_node" src="presets_anomalies_by_node.webp"> +For this node from the timestamp `2024-06-03 10:35:00` CPU time spent handling software interrupts started to grow. +(`cpu_seconds_total{mode="softirq"}`) +<img alt="irq" src="presets_cpu_seconds_softirq.webp"> +At the same time `cpu_seconds_total` for `steal` mode started to grow as well. +<img alt="steal" src="presets_cpu_seconds_steal.webp"> diff --git a/docs/anomaly-detection/README.md b/docs/anomaly-detection/README.md index 682e69ddda..381fede0f3 100644 --- a/docs/anomaly-detection/README.md +++ b/docs/anomaly-detection/README.md @@ -18,6 +18,7 @@ Begin your VictoriaMetrics Anomaly Detection journey with ease using our guides - **Quickstart**: Check out how to get `vmanomaly` up and running [here](/anomaly-detection/QuickStart.html). - **Overview**: Find out how `vmanomaly` service operates [here](/anomaly-detection/Overview.html) - **Integration**: Integrate anomaly detection into your observability ecosystem. Get started [here](/anomaly-detection/guides/guide-vmanomaly-vmalert.html). +- **Anomaly Detection Presets**: Enable anomaly detection on predefined set of indicators, that require frequently changing static thresholds for alerting. Find more information [here](/anomaly-detection/presets/). - **Installation Options**: Select the method that aligns with your technical requirements: - **Docker Installation**: Suitable for containerized environments. See [Docker guide](/anomaly-detection/Overview.html#run-vmanomaly-docker-container). diff --git a/docs/anomaly-detection/presets-localhost.webp b/docs/anomaly-detection/presets-localhost.webp new file mode 100644 index 0000000000..d227093216 Binary files /dev/null and b/docs/anomaly-detection/presets-localhost.webp differ diff --git a/docs/anomaly-detection/presets_anomalies_by_node.webp b/docs/anomaly-detection/presets_anomalies_by_node.webp new file mode 100644 index 0000000000..12c83d5016 Binary files /dev/null and b/docs/anomaly-detection/presets_anomalies_by_node.webp differ diff --git a/docs/anomaly-detection/presets_cpu_seconds_softirq.webp b/docs/anomaly-detection/presets_cpu_seconds_softirq.webp new file mode 100644 index 0000000000..c0821e6c5c Binary files /dev/null and b/docs/anomaly-detection/presets_cpu_seconds_softirq.webp differ diff --git a/docs/anomaly-detection/presets_cpu_seconds_steal.webp b/docs/anomaly-detection/presets_cpu_seconds_steal.webp new file mode 100644 index 0000000000..18fad4fe75 Binary files /dev/null and b/docs/anomaly-detection/presets_cpu_seconds_steal.webp differ diff --git a/docs/anomaly-detection/presets_global_percentage.webp b/docs/anomaly-detection/presets_global_percentage.webp new file mode 100644 index 0000000000..b67acfc3e0 Binary files /dev/null and b/docs/anomaly-detection/presets_global_percentage.webp differ