diff --git a/deployment/docker/vmanomaly/vmanomaly-integration/README.md b/deployment/docker/vmanomaly/vmanomaly-integration/README.md index a5adeca2a..6cc29a28d 100644 --- a/deployment/docker/vmanomaly/vmanomaly-integration/README.md +++ b/deployment/docker/vmanomaly/vmanomaly-integration/README.md @@ -2,14 +2,14 @@ Please read the "vmanomaly integration" guide first - [https://docs.victoriametrics.com/anomaly-detection/guides/guide-vmanomaly-vmalert.html](https://docs.victoriametrics.com/anomaly-detection/guides/guide-vmanomaly-vmalert.html) -To make this Docker compose file work, you MUST replace the content of [vmanomaly_license.txt](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-vmalert-guide/vmanomaly_license.txt) with valid license. +To make this Docker compose file work, you MUST replace the content of [vmanomaly_license](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_license) with a valid license. You can issue the [trial license here](https://victoriametrics.com/products/enterprise/trial/) ## How to run -1. Replace content of `vmanomaly_license.txt` with your license +1. Replace the content of `vmanomaly_license` with your license 1. 
Run ```sh diff --git a/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_config.yml b/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_config.yml index a22cb1d57..9990779b1 100644 --- a/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_config.yml +++ b/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_config.yml @@ -12,7 +12,7 @@ reader: datasource_url: "http://victoriametrics:8428/" sampling_period: "60s" queries: - node_cpu_rate: "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))" + node_cpu_rate: "sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)" writer: datasource_url: "http://victoriametrics:8428/" diff --git a/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_guide_dashboard.json b/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_guide_dashboard.json index 4e8aee67d..a4c32e793 100644 --- a/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_guide_dashboard.json +++ b/deployment/docker/vmanomaly/vmanomaly-integration/vmanomaly_guide_dashboard.json @@ -1,5 +1,5 @@ { - "annotations": { + "annotations": { "list": [ { "builtIn": 1, @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "links": [], + "links": [], "liveNow": false, "panels": [ { @@ -40,7 +40,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "If you don't see any data, please wait a few minutes. \n\nYou will see a lot of false positive anomalies when you run the guide for the first time. \nThe prediction must be more accurate if you provide vmanomaly 2w of data.\n\nEvery row represents information for one specific mode. \nThe query for anomaly detection is `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))`\nThis is a median (or 50% quantileof `rate` function over `node_cpu_seconds_total`)", + "content": "If you don't observe any data initially, please wait a few minutes for it to appear. 
\n\nWhen you first run the guide (if not enough node_exporter monitoring data has been collected in your system yet), you may notice a significant number of false positive anomalies. The predictions will become more accurate with at least two weeks' (full `fit_window`) worth of data provided to vmanomaly.\n\nEach row displays information for a distinct mode. The query used for anomaly detection is `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`.\n", "mode": "markdown" }, "pluginVersion": "10.2.1", @@ -67,7 +67,7 @@ "type": "prometheus", "uid": "VictoriaMetrics" }, - "description": "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))", + "description": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)", "fieldConfig": { "defaults": { "color": { @@ -90,12 +90,15 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, - "pointSize": 5, + "pointSize": 1, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -106,6 +109,7 @@ } }, "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -118,7 +122,8 @@ "value": 80 } ] - } + }, + "unit": "none" }, "overrides": [ { @@ -166,7 +171,7 @@ "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, @@ -177,14 +182,14 @@ "uid": "VictoriaMetrics" }, "editorMode": "code", - "expr": "quantile by (mode, instance,job) (0.5, rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m]))", + "expr": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)", "instant": false, "legendFormat": "Instance: {{instance}}, Job {{job}}", "range": true, "refId": "A" } ], - "title": "CPU median for $mode mode", + "title": "CPU rate sum for $mode mode", "type": "timeseries" }, { @@ -219,7 +224,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + 
"showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -236,15 +241,27 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 1 } ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "threshold" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -281,6 +298,19 @@ "legendFormat": "Instance: {{instance}}, Job: {{job}}", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "VictoriaMetrics" + }, + "editorMode": "code", + "expr": "vector(1)", + "hide": false, + "instant": false, + "legendFormat": "threshold", + "range": true, + "refId": "B" } ], "title": "Anomaly Scores for $mode mode", @@ -318,7 +348,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -344,18 +374,6 @@ } }, "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Predicted Value: yhat" - }, - "properties": [ - { - "id": "custom.fillBelowTo", - "value": "yhat_lower" - } - ] - }, { "matcher": { "id": "byName", @@ -364,7 +382,7 @@ "properties": [ { "id": "custom.fillBelowTo", - "value": "yhat" + "value": "Predicted Lower Boundary" } ] } @@ -381,11 +399,11 @@ "legend": { "calcs": [], "displayMode": "table", - "placement": "right", + "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, @@ -427,6 +445,19 @@ "legendFormat": "Predicted Upper Boundary", "range": true, "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "VictoriaMetrics" + }, + "editorMode": "code", + "expr": "sum(rate(node_cpu_seconds_total{mode=~\"$mode\"}[5m])) by (mode, instance,job)", + "hide": false, + "instant": false, + "legendFormat": "Value", + "range": true, + "refId": "D" } ], "title": "Predicted Value and Boundaries for $mode mode", @@ 
-440,11 +471,7 @@ "list": [ { "allValue": ".*", - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, + "current": {}, "datasource": { "type": "prometheus", "uid": "VictoriaMetrics" @@ -464,19 +491,19 @@ "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 1, + "sort": 2, "type": "query" } ] }, "time": { - "from": "now-30m", + "from": "now-3h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Vmanomaly Guide", "uid": "cfa61e6a-6074-4626-8e54-ea33e08746b9", - "version": 2, + "version": 3, "weekStart": "" } \ No newline at end of file diff --git a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md index be309601e..53096e855 100644 --- a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md +++ b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md @@ -101,7 +101,7 @@ node_cpu_seconds_total{cpu="1",mode="iowait"} 51.22 In this context, the metric `node_cpu_seconds_total` provides a comprehensive breakdown of the time each CPU core has spent in various operational modes. These modes include: _user_, _system_, _iowait_, _idle_, _irq&softirq_, _guest_, and _steal_. Each of these eight modes is mutually exclusive, offering distinct insights into CPU activity. For instance, a predominant _iowait_ suggests disk or network bottlenecks, while elevated levels in _user_ or _system_ indicate significant CPU utilization. -The `node_cpu_seconds_total` metric is classified as a [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) type. To analyze the duration each CPU core spends in these modes, it is necessary to compute the rate of change per second using the [rate function](https://docs.victoriametrics.com/MetricsQL.html#rate): `rate(node_cpu_seconds_total)`. For a more refined and smoother aggregation of data by mode, we apply the median function, or the 50% quantile. 
The resulting query is formulated as follows: `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))`. +The `node_cpu_seconds_total` metric is classified as a [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) type. To analyze the duration each CPU core spends in these modes, it is necessary to compute the rate of change per second using the [rate function](https://docs.victoriametrics.com/MetricsQL.html#rate): `rate(node_cpu_seconds_total)`. To aggregate the per-core rates into a single series for each mode, instance, and job, we apply the sum function. The resulting query is formulated as follows: `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`. Below is an illustrative example of how this query might be visualized in Grafana: node_cpu_rate_graph @@ -160,7 +160,7 @@ model: reader: datasource_url: "http://victoriametrics:8428/" queries: - node_cpu_rate: "quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m])" + node_cpu_rate: "sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)" writer: datasource_url: "http://victoriametrics:8428/" @@ -199,7 +199,7 @@ groups: labels: severity: warning annotations: - summary: Anomaly Score exceeded 1.0. `rate(node_cpu_seconds_total)` is showing abnormal behavior. + summary: Anomaly Score exceeded 1.0. `sum(rate(node_cpu_seconds_total))` is showing abnormal behavior. ``` In the query expression `expr`, it's crucial to establish a criterion based on the generated anomaly scores. Typically, an [anomaly score](https://docs.victoriametrics.com/anomaly-detection/faq/#what-is-anomaly-score) ranging from 0.0 to 1.0 indicates that the analyzed value falls within normal behavior. Scores exceeding 1.0 signal increasing confidence from our model that the observed value is anomalous. @@ -470,13 +470,13 @@ To look at model results we need to go to grafana on the `localhost:3000`. Data vmanomaly need some time to generate more data to visualize. Let's investigate model output visualization in Grafana. 
On the Grafana Dashboard `Vmanomaly Guide` for each mode of CPU you can investigate: -* initial query result - `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))` +* initial query result - `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)` * `anomaly_score` * `yhat` - Predicted value * `yhat_lower` - Predicted lower boundary * `yhat_upper` - Predicted upper boundary -Each of these metrics will contain same labels our query `quantile by (mode) (0.5, rate(node_cpu_seconds_total[5m]))` returns. +Each of these metrics will contain the same labels that our query `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)` returns. ### Anomaly scores for each metric with its according labels. diff --git a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert/guide-vmanomaly-vmalert-query.webp b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert/guide-vmanomaly-vmalert-query.webp index a851793dd..a26894055 100644 Binary files a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert/guide-vmanomaly-vmalert-query.webp and b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert/guide-vmanomaly-vmalert-query.webp differ