Vmanomaly Guide - dashboard and query change (#5771)

* dashboard fix

* query fix

* changed screenshots

* minor fixes
This commit is contained in:
Daria Karavaieva 2024-02-06 22:40:40 +01:00 committed by GitHub
parent eaa2125f2c
commit 4a31bd9661
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 75 additions and 48 deletions

View file

@ -2,14 +2,14 @@
Please read the "vmanomaly integration" guide first - [https://docs.victoriametrics.com/anomaly-detection/guides/guide-vmanomaly-vmalert.html](https://docs.victoriametrics.com/anomaly-detection/guides/guide-vmanomaly-vmalert.html)
To make this Docker compose file work, you MUST replace the content of [vmanomaly_license.txt](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-vmalert-guide/vmanomaly_license.txt) with valid license.
To make this Docker compose file work, you MUST replace the content of [vmanomaly_license](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_license) with a valid license.
You can issue the [trial license here](https://victoriametrics.com/products/enterprise/trial/)
## How to run
1. Replace content of `vmanomaly_license.txt` with your license
1. Replace the content of `vmanomaly_license` with your license
1. Run
```sh

View file

@ -12,7 +12,7 @@ reader:
datasource_url: "http://victoriametrics:8428/"
sampling_period: "60s"
queries:
node_cpu_rate: "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))"
node_cpu_rate: "sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)"
writer:
datasource_url: "http://victoriametrics:8428/"

View file

@ -40,7 +40,7 @@
"showLineNumbers": false,
"showMiniMap": false
},
"content": "If you don't see any data, please wait a few minutes. \n\nYou will see a lot of false positive anomalies when you run the guide for the first time. \nThe prediction must be more accurate if you provide vmanomaly 2w of data.\n\nEvery row represents information for one specific mode. \nThe query for anomaly detection is `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))`\nThis is a median (or 50% quantile) of the `rate` function over `node_cpu_seconds_total`",
"content": "If you don't observe any data initially, please wait a few minutes for it to appear. \n\nWhen you run the guide for the first time (if there is not enough node_exporter monitoring data collected in your system), you may notice a significant number of false positive anomalies. The predictions will become more accurate once vmanomaly is provided with at least two weeks' worth of data (a full `fit_window`).\n\nEach row displays information for a distinct mode. The query used for anomaly detection is `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`.\n",
"mode": "markdown"
},
"pluginVersion": "10.2.1",
@ -67,7 +67,7 @@
"type": "prometheus",
"uid": "VictoriaMetrics"
},
"description": "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))",
"description": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)",
"fieldConfig": {
"defaults": {
"color": {
@ -90,12 +90,15 @@
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"pointSize": 1,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -106,6 +109,7 @@
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
@ -118,7 +122,8 @@
"value": 80
}
]
}
},
"unit": "none"
},
"overrides": [
{
@ -166,7 +171,7 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"mode": "multi",
"sort": "none"
}
},
@ -177,14 +182,14 @@
"uid": "VictoriaMetrics"
},
"editorMode": "code",
"expr": "quantile by (mode, instance,job) (0.5, rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m]))",
"expr": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)",
"instant": false,
"legendFormat": "Instance: {{instance}}, Job: {{job}}",
"range": true,
"refId": "A"
}
],
"title": "CPU median for $mode mode",
"title": "CPU rate sum for $mode mode",
"type": "timeseries"
},
{
@ -219,7 +224,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -236,15 +241,27 @@
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
"overrides": [
{
"matcher": {
"id": "byName",
"options": "threshold"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
@ -281,6 +298,19 @@
"legendFormat": "Instance: {{instance}}, Job: {{job}}",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "VictoriaMetrics"
},
"editorMode": "code",
"expr": "vector(1)",
"hide": false,
"instant": false,
"legendFormat": "threshold",
"range": true,
"refId": "B"
}
],
"title": "Anomaly Scores for $mode mode",
@ -318,7 +348,7 @@
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
@ -344,18 +374,6 @@
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Predicted Value: yhat"
},
"properties": [
{
"id": "custom.fillBelowTo",
"value": "yhat_lower"
}
]
},
{
"matcher": {
"id": "byName",
@ -364,7 +382,7 @@
"properties": [
{
"id": "custom.fillBelowTo",
"value": "yhat"
"value": "Predicted Lower Boundary"
}
]
}
@ -381,11 +399,11 @@
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"mode": "multi",
"sort": "none"
}
},
@ -427,6 +445,19 @@
"legendFormat": "Predicted Upper Boundary",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "VictoriaMetrics"
},
"editorMode": "code",
"expr": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)",
"hide": false,
"instant": false,
"legendFormat": "Value",
"range": true,
"refId": "D"
}
],
"title": "Predicted Value and Boundaries for $mode mode",
@ -440,11 +471,7 @@
"list": [
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"current": {},
"datasource": {
"type": "prometheus",
"uid": "VictoriaMetrics"
@ -464,19 +491,19 @@
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"sort": 2,
"type": "query"
}
]
},
"time": {
"from": "now-30m",
"from": "now-3h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Vmanomaly Guide",
"uid": "cfa61e6a-6074-4626-8e54-ea33e08746b9",
"version": 2,
"version": 3,
"weekStart": ""
}

View file

@ -101,7 +101,7 @@ node_cpu_seconds_total{cpu="1",mode="iowait"} 51.22
In this context, the metric `node_cpu_seconds_total` provides a comprehensive breakdown of the time each CPU core has spent in various operational modes. These modes include: _user_, _system_, _iowait_, _idle_, _irq&softirq_, _guest_, and _steal_. Each of these eight modes is mutually exclusive, offering distinct insights into CPU activity. For instance, a predominant _iowait_ suggests disk or network bottlenecks, while elevated levels in _user_ or _system_ indicate significant CPU utilization.
The `node_cpu_seconds_total` metric is classified as a [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) type. To analyze the duration each CPU core spends in these modes, it is necessary to compute the rate of change per second using the [rate function](https://docs.victoriametrics.com/MetricsQL.html#rate): `rate(node_cpu_seconds_total)`. For a more refined and smoother aggregation of data by mode, we apply the median function, or the 50% quantile. The resulting query is formulated as follows: `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))`.
The `node_cpu_seconds_total` metric is classified as a [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) type. To analyze the duration each CPU core spends in these modes, it is necessary to compute the rate of change per second using the [rate function](https://docs.victoriametrics.com/MetricsQL.html#rate): `rate(node_cpu_seconds_total)`. To aggregate the per-core rates into a single series for each mode, instance, and job, we apply the sum function. The resulting query is formulated as follows: `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`.
Below is an illustrative example of how this query might be visualized in Grafana:
<img alt="node_cpu_rate_graph" src="guide-vmanomaly-vmalert-query.webp">
@ -160,7 +160,7 @@ model:
reader:
datasource_url: "http://victoriametrics:8428/"
queries:
node_cpu_rate: "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m])"
node_cpu_rate: "sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)"
writer:
datasource_url: "http://victoriametrics:8428/"
@ -199,7 +199,7 @@ groups:
labels:
severity: warning
annotations:
summary: Anomaly Score exceeded 1.0. `rate(node_cpu_seconds_total)` is showing abnormal behavior.
summary: Anomaly Score exceeded 1.0. `sum(rate(node_cpu_seconds_total))` is showing abnormal behavior.
```
In the query expression `expr`, it's crucial to establish a criterion based on the generated anomaly scores. Typically, an [anomaly score](https://docs.victoriametrics.com/anomaly-detection/faq/#what-is-anomaly-score) ranging from 0.0 to 1.0 indicates that the analyzed value falls within normal behavior. Scores exceeding 1.0 signal increasing confidence from our model that the observed value is anomalous.
@ -470,13 +470,13 @@ To look at model results we need to go to grafana on the `localhost:3000`. Data
vmanomaly needs some time to generate more data to visualize.
Let's investigate model output visualization in Grafana.
On the Grafana Dashboard `Vmanomaly Guide` for each mode of CPU you can investigate:
* initial query result - `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))`
* initial query result - `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`
* `anomaly_score`
* `yhat` - Predicted value
* `yhat_lower` - Predicted lower boundary
* `yhat_upper` - Predicted upper boundary
Each of these metrics will contain same labels our query `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))` returns.
Each of these metrics will contain the same labels that our query `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)` returns.
### Anomaly scores for each metric with its according labels.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 133 KiB

After

Width:  |  Height:  |  Size: 104 KiB