From e29b2b84446b4636017216441c9971929e1e9d2e Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 15 Feb 2022 11:54:28 +0200 Subject: [PATCH] Monitoring single (#2190) * dashboards: plot cpu limits for vmagent, vmalert and vm-single dashboards Signed-off-by: hagen1778 * alerts: add `TooHighCPUUsage` alert for all VM components Signed-off-by: hagen1778 * dashboards: bump components version requirements Signed-off-by: hagen1778 --- dashboards/victoriametrics.json | 1317 ++++++++++++++++--------------- dashboards/vmagent.json | 101 ++- dashboards/vmalert.json | 410 ++++------ deployment/docker/alerts.yml | 11 + 4 files changed, 890 insertions(+), 949 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 081ace71d..82d99a89c 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -5,7 +5,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.3.2" + "version": "8.3.5" }, { "type": "panel", @@ -51,13 +51,13 @@ } ] }, - "description": "Overview for single node VictoriaMetrics v1.70.0 or higher", + "description": "Overview for single node VictoriaMetrics v1.73.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "gnetId": 10229, "graphTooltip": 0, "id": null, - "iteration": 1639989804164, + "iteration": 1644907807949, "links": [ { "icon": "doc", @@ -88,7 +88,7 @@ "liveNow": false, "panels": [ { - "collapsed": true, + "collapsed": false, "datasource": { "uid": "$ds" }, @@ -99,610 +99,609 @@ "y": 0 }, "id": 6, - "panels": [ - { - "datasource": { - "uid": "$ds" - }, - "description": "", - "gridPos": { - "h": 2, - "w": 4, - "x": 0, - "y": 1 - }, - "id": 85, - "options": { - "content": "
$version
", - "mode": "markdown" - }, - "pluginVersion": "8.3.2", - "title": "Version", - "type": "text" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "How many datapoints are in storage", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 4, - "y": 1 - }, - "id": 26, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Total datapoints", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "Total amount of used disk space", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 9, - "y": 1 - }, - "id": 81, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_data_size_bytes{job=~\"$job\", type!=\"indexdb\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Disk space usage", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "Average disk usage per datapoint.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 14, - "y": 1 - }, - "id": 82, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_data_size_bytes{job=~\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=~\"$job\", type!=\"indexdb\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Bytes per point", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "Total size of allowed memory via flag `-memory.allowedPercent`", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 19, - "y": 1 - }, - "id": 79, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_allowed_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Allowed memory", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "green", - "value": 1800 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 0, - "y": 3 - }, - "id": 87, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "vm_app_uptime_seconds{job=~\"$job\", instance=~\"$instance\"}", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Uptime", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 4, - "y": 3 - }, - "id": 38, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type=\"indexdb\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Index size", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "The minimum free disk space left", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 9, - "y": 3 - }, - "id": 80, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "min(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Min free disk space", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "Total number of available CPUs for VM process", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 14, - "y": 3 - }, - "id": 77, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Available CPU", - "type": "stat" - }, - { - "datasource": { - "uid": "$ds" - }, - "description": "Total size of available memory for VM process", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 5, - "x": 19, - "y": 3 - }, - "id": 78, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.3.2", - "targets": [ - { - "exemplar": true, - "expr": "sum(vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Available memory", - "type": "stat" - } - ], + "panels": [], "title": "Stats", "type": "row" }, + { + "datasource": { + "uid": "$ds" + }, + "description": "", + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 85, + "options": { + "content": "
$version
", + "mode": "markdown" + }, + "pluginVersion": "8.3.5", + "title": "Version", + "type": "text" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "How many datapoints are in storage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 4, + "y": 1 + }, + "id": 26, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Total datapoints", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "Total amount of used disk space", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 9, + "y": 1 + }, + "id": 81, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_data_size_bytes{job=~\"$job\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Disk space usage", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "Average disk usage per datapoint.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 14, + "y": 1 + }, + "id": 82, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_data_size_bytes{job=~\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=~\"$job\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Bytes per point", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "Total size of allowed memory via flag `-memory.allowedPercent`", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 79, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_allowed_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Allowed memory", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 3 + }, + "id": 87, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "vm_app_uptime_seconds{job=~\"$job\", instance=~\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 4, + "y": 3 + }, + "id": 38, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Index size", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "The minimum free disk space left", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 9, + "y": 3 + }, + "id": 80, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "min(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Min free disk space", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "Total number of available CPUs for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 14, + "y": 3 + }, + "id": 77, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Available CPU", + "type": "stat" + }, + { + "datasource": { + "uid": "$ds" + }, + "description": "Total size of available memory for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 19, + "y": 3 + }, + "id": 78, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.3.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Available memory", + "type": "stat" + }, { "collapsed": false, "datasource": { @@ -712,7 +711,7 @@ "h": 1, "w": 24, "x": 0, - "y": 1 + "y": 5 }, "id": 24, "panels": [], @@ -740,7 +739,7 @@ "h": 8, "w": 12, "x": 0, - "y": 2 + "y": 6 }, "hiddenSeries": false, "id": 12, @@ -764,7 +763,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -835,7 +834,7 @@ "h": 8, "w": 12, "x": 12, - "y": 2 + "y": 6 }, "hiddenSeries": false, "id": 22, @@ -859,7 +858,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -929,7 +928,7 @@ "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 14 }, "hiddenSeries": false, "id": 51, @@ -959,7 +958,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1029,7 +1028,7 @@ "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 14 }, "hiddenSeries": false, "id": 33, @@ -1053,7 +1052,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1137,7 +1136,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 22 }, "hiddenSeries": false, "id": 59, @@ -1163,7 +1162,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1248,7 +1247,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 22 }, "hiddenSeries": false, "id": 35, @@ -1272,7 +1271,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1332,7 +1331,7 @@ "h": 1, "w": 24, "x": 0, - "y": 26 + "y": 30 }, "id": 14, "panels": [ @@ -2432,7 +2431,7 @@ "h": 1, "w": 24, "x": 0, - "y": 27 + "y": 31 }, "id": 71, "panels": [ @@ -2883,7 +2882,7 @@ { "datasource": { "type": "prometheus", - "uid": "${ds}" + "uid": "${DS_VICTORIAMETRICS}" }, "exemplar": true, "expr": "sum(vm_merge_need_free_disk_space{job=~\"$job\", instance=~\"$instance\"}) by(type)", @@ -2935,7 +2934,7 @@ "dashes": false, "datasource": { "type": "prometheus", - "uid": "${ds}" + "uid": "${DS_VICTORIAMETRICS}" }, "description": "Shows the percentage of used cache size from the allowed size by type. \nValues close to 100% show the maximum potential utilization.\nValues close to 0% show that cache is underutilized.", "fill": 0, @@ -2976,7 +2975,7 @@ { "datasource": { "type": "prometheus", - "uid": "${ds}" + "uid": "${DS_VICTORIAMETRICS}" }, "exemplar": true, "expr": "vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"} / vm_cache_size_max_bytes{job=~\"$job\", instance=~\"$instance\"}", @@ -3030,7 +3029,7 @@ "h": 1, "w": 24, "x": 0, - "y": 28 + "y": 32 }, "id": 46, "panels": [ @@ -3055,7 +3054,7 @@ "h": 8, "w": 12, "x": 0, - "y": 29 + "y": 5 }, "hiddenSeries": false, "id": 44, @@ -3079,7 +3078,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3170,8 +3169,10 @@ "dashLength": 10, "dashes": false, "datasource": { + "type": "prometheus", "uid": "$ds" }, + "description": "", "fieldConfig": { "defaults": { "links": [] @@ -3184,7 +3185,7 @@ "h": 8, "w": 12, "x": 12, - "y": 29 + "y": 5 }, "hiddenSeries": false, "id": 57, @@ -3208,22 +3209,46 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "$$hashKey": "object:85", + "alias": "Limit", + "color": "#F2495C" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, "expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "CPU cores used", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Limit", + "refId": "B" } ], "thresholds": [], @@ -3279,7 +3304,7 @@ "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 13 }, "hiddenSeries": false, "id": 75, @@ -3303,7 +3328,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3388,7 +3413,7 @@ "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 13 }, "hiddenSeries": false, "id": 76, @@ -3412,7 +3437,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3496,7 +3521,7 @@ "h": 8, "w": 12, "x": 0, - "y": 45 + "y": 21 }, "hiddenSeries": false, "id": 47, @@ -3520,7 +3545,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3591,7 +3616,7 @@ "h": 8, "w": 12, "x": 12, - "y": 45 + "y": 21 }, "hiddenSeries": false, "id": 42, @@ -3615,7 +3640,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3684,7 +3709,7 @@ "h": 8, "w": 12, "x": 0, - "y": 53 + "y": 29 }, "hiddenSeries": false, "id": 48, @@ -3708,7 +3733,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3779,7 +3804,7 @@ "h": 8, "w": 12, "x": 12, - "y": 53 + "y": 29 }, "hiddenSeries": false, "id": 37, @@ -3803,7 +3828,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3874,7 +3899,7 @@ "h": 8, "w": 12, "x": 12, - "y": 61 + "y": 37 }, "hiddenSeries": false, "id": 49, @@ -3898,7 +3923,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3954,7 +3979,7 @@ } ], "refresh": "30s", - "schemaVersion": 33, + "schemaVersion": 34, "style": "dark", "tags": [ "victoriametrics", @@ -3965,8 +3990,8 @@ { "current": { "selected": false, - "text": "cloud-test-13", - "value": "cloud-test-13" + "text": "VictoriaMetrics", + "value": "VictoriaMetrics" }, "hide": 0, "includeAll": false, diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 7cfea626d..5465a4fc9 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -6,7 +6,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.3.2" + "version": "8.3.5" }, { "type": "panel", @@ -58,12 +58,12 @@ } ] }, - "description": "Overview for VictoriaMetrics vmagent v1.70.0 or higher", + "description": "Overview for VictoriaMetrics vmagent v1.73.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, - "iteration": 1639980687827, + "iteration": 1644908591152, "links": [ { "icon": "doc", @@ -151,7 +151,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -215,7 +215,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -282,7 +282,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -341,7 +341,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -487,7 +487,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -583,7 +583,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -687,7 +687,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -785,7 +785,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -906,7 +906,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -999,7 +999,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1098,7 +1098,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1196,7 +1196,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1295,7 +1295,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3613,6 +3613,7 @@ "dashLength": 10, "dashes": false, "datasource": { + "type": "prometheus", "uid": "$ds" }, "description": "Shows the CPU usage per vmagent instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.", @@ -3628,7 +3629,7 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 45 }, "hiddenSeries": false, "id": 35, @@ -3658,21 +3659,47 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "$$hashKey": "object:77", + "alias": "/Limit.*/", + "color": "#F2495C" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, "expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(instance)", "format": "time_series", + "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Limit ({{instance}})", + "refId": "B" } ], "thresholds": [], @@ -3727,7 +3754,7 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 45 }, "hiddenSeries": false, "id": 37, @@ -3757,7 +3784,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3834,7 +3861,7 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 53 }, "hiddenSeries": false, "id": 81, @@ -3858,7 +3885,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -3943,7 +3970,7 @@ "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 53 }, "hiddenSeries": false, "id": 7, @@ -3967,7 +3994,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -4045,7 +4072,7 @@ "h": 8, "w": 12, "x": 0, - "y": 30 + "y": 61 }, "hiddenSeries": false, "id": 83, @@ -4069,7 +4096,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -4153,7 +4180,7 @@ "h": 8, "w": 12, "x": 12, - "y": 30 + "y": 61 }, "hiddenSeries": false, "id": 39, @@ -4177,7 +4204,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -4247,7 +4274,7 @@ "h": 8, "w": 12, "x": 0, - "y": 38 + "y": 69 }, "hiddenSeries": false, "id": 43, @@ -4271,7 +4298,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -4339,7 +4366,7 @@ "h": 8, "w": 12, "x": 12, - "y": 38 + "y": 69 }, "hiddenSeries": false, "id": 41, @@ -4363,7 +4390,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.3.2", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -4418,7 +4445,7 @@ } ], "refresh": false, - "schemaVersion": 33, + "schemaVersion": 34, "style": "dark", "tags": [ "vmagent", @@ -4428,9 +4455,9 @@ "list": [ { "current": { - "selected": true, - "text": "dbaas-test-t3-medium-inst", - "value": "dbaas-test-t3-medium-inst" + "selected": false, + "text": "VictoriaMetrics", + "value": "VictoriaMetrics" }, "hide": 0, "includeAll": false, diff --git a/dashboards/vmalert.json b/dashboards/vmalert.json index e8bceca51..d4cf3ec11 100644 --- a/dashboards/vmalert.json +++ b/dashboards/vmalert.json @@ -5,7 +5,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.0.3" + "version": "8.3.5" }, { "type": "panel", @@ -57,12 +57,12 @@ } ] }, - "description": "Overview for VictoriaMetrics vmalert v1.65.0 or higher", + "description": "Overview for VictoriaMetrics vmalert v1.73.0 or higher", "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, - "iteration": 1630393200659, + "iteration": 1644909221704, "links": [ { "asDropdown": false, @@ -101,10 +101,10 @@ "url": " https://github.com/VictoriaMetrics/VictoriaMetrics/releases" } ], + "liveNow": false, "panels": [ { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -117,7 +117,9 @@ "type": "row" }, { - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows if the last configuration update was successful. \"Not Ok\" means there was an unsuccessful attempt to update the configuration due to some error. Check the log for details.", "fieldConfig": { "defaults": { @@ -179,7 +181,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "targets": [ { "exemplar": false, @@ -189,13 +191,13 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Config error", "type": "stat" }, { - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the total number of errors generated by recording/alerting rules for selected instances and groups.", "fieldConfig": { "defaults": { @@ -238,7 +240,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "targets": [ { "exemplar": false, @@ -248,13 +250,13 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Errors", "type": "stat" }, { - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the total number of loaded alerting rules across selected instances and groups.", "fieldConfig": { "defaults": { @@ -293,7 +295,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "targets": [ { "exemplar": false, @@ -303,13 +305,13 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Alerting rules", "type": "stat" }, { - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the total number of loaded recording rules across selected instances and groups.", "fieldConfig": { "defaults": { @@ -348,7 +350,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "targets": [ { "exemplar": false, @@ -358,14 +360,14 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Recording rules", "type": "stat" }, { "columns": [], - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "fontSize": "100%", "gridPos": { "h": 7, @@ -374,7 +376,6 @@ "y": 1 }, "id": 2, - "pageSize": null, "scroll": true, "showHeader": true, "sort": { @@ -405,7 +406,6 @@ { "alias": "", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -422,7 +422,6 @@ { "alias": "", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -449,8 +448,6 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Uptime", "transform": "table", "type": "table-old" @@ -460,7 +457,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "fieldConfig": { "defaults": { "links": [] @@ -487,7 +486,6 @@ "min": false, "rightSide": true, "show": true, - "sideWidth": null, "sort": "current", "sortDesc": false, "total": false, @@ -500,7 +498,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -520,9 +518,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Uptime", "tooltip": { "shared": true, @@ -531,9 +527,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -542,20 +536,15 @@ "$$hashKey": "object:170", "decimals": 0, "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:171", - "decimals": null, "format": "short", "label": "", "logBase": 1, - "max": null, - "min": null, "show": true } ], @@ -569,7 +558,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the number of fired alerts by instance.", "fill": 1, "fillGradient": 0, @@ -600,7 +591,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -618,9 +609,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Alerts fired total", "tooltip": { "shared": true, @@ -629,33 +618,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -663,7 +643,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Average evaluation duration by group. Basically means how long it takes to execute all the rules per each group.", "fieldConfig": { "defaults": { @@ -700,7 +682,7 @@ "alertThreshold": false }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -718,9 +700,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Groups avg evaluation duration ($group)", "tooltip": { "shared": true, @@ -729,33 +709,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -763,7 +734,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows how many requests (executions) per second vmalert sends to the configured datasource.", "fill": 0, "fillGradient": 0, @@ -794,7 +767,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -812,9 +785,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Rules execution rate ($instance)", "tooltip": { "shared": true, @@ -823,33 +794,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -857,7 +819,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the error rate while executing configured rules. Non-zero value means there are some issues with existing rules. Check the logs to get more details.", "fill": 1, "fillGradient": 0, @@ -888,7 +852,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -906,9 +870,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Rules execution errors ($instance)", "tooltip": { "shared": true, @@ -917,38 +879,28 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -962,7 +914,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the current active (firing) alerting rules per group.", "fill": 0, "fillGradient": 0, @@ -1011,9 +965,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Active ($group)", "tooltip": { "shared": true, @@ -1022,33 +974,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1056,7 +999,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the events when rule execution resulted into an error. Check the logs for more details.", "fill": 0, "fillGradient": 0, @@ -1105,9 +1050,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Errors ($group)", "tooltip": { "shared": true, @@ -1116,33 +1059,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1150,7 +1084,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the current pending alerting rules per group.\nBy pending means the rule which remains active less than configured `for` parameter.", "fill": 0, "fillGradient": 0, @@ -1199,9 +1135,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Pending ($group)", "tooltip": { "shared": true, @@ -1210,33 +1144,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1244,7 +1169,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows how many alerts are sent to Alertmanager per second. Only active alerts are sent.", "fill": 0, "fillGradient": 0, @@ -1293,9 +1220,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Requests rate to Alertmanager ($group)", "tooltip": { "shared": true, @@ -1304,9 +1229,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -1314,25 +1237,20 @@ { "$$hashKey": "object:229", "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:230", "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1340,7 +1258,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the error rate for the attempts to send alerts to Alertmanager. If not zero it means there issues on attempt to send notification to Alertmanager and some alerts may be not delivered properly. Check the logs for more details.", "fill": 0, "fillGradient": 0, @@ -1389,9 +1309,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Errors rate to Alertmanager ($group)", "tooltip": { "shared": true, @@ -1400,9 +1318,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -1410,25 +1326,20 @@ { "$$hashKey": "object:229", "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:230", "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], @@ -1437,7 +1348,6 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -1451,7 +1361,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the top 10 recording rules which generate the most of samples. Each generated sample is basically a time series which then ingested into configured remote storage. Rules with high numbers may cause the most pressure on the remote database and become a source of too high cardinality.", "fill": 0, "fillGradient": 0, @@ -1500,9 +1412,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Top 10 rules by produced samples ($group)", "tooltip": { "shared": true, @@ -1511,37 +1421,30 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Shows the rules which do not produce any samples during the evaluation. Usually it means that such rules are misconfigured, since they give no output during the evaluation.\nPlease check if rule's expression is correct and it is working as expected.", "fieldConfig": { "defaults": { @@ -1581,8 +1484,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1625,8 +1527,6 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Rules with 0 produced samples ($group)", "type": "timeseries" }, @@ -1635,7 +1535,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "fill": 0, "fillGradient": 0, "gridPos": { @@ -1683,9 +1585,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Errors ($group)", "tooltip": { "shared": true, @@ -1694,33 +1594,24 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], @@ -1729,7 +1620,6 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -1743,7 +1633,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, "description": "Shows the CPU usage per vmalert instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.", "fieldConfig": { "defaults": { @@ -1757,7 +1650,7 @@ "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 27 }, "hiddenSeries": false, "id": 35, @@ -1787,16 +1680,26 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "$$hashKey": "object:61", + "alias": "/Limit .*/", + "color": "#F2495C" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, "exemplar": false, "expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(instance)", "format": "time_series", @@ -1804,12 +1707,24 @@ "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Limit ({{instance}})", + "refId": "B" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "CPU ($instance)", "tooltip": { "shared": true, @@ -1818,33 +1733,25 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1852,7 +1759,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Amount of used memory\n\nResident memory shows share which can be freed by OS when needed.\n\nAnonymous shows share for memory allocated by the process itself. This share cannot be freed by the OS, so it must be taken into account by OOM killer.\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.", "fieldConfig": { "defaults": { @@ -1866,7 +1775,7 @@ "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 27 }, "hiddenSeries": false, "id": 37, @@ -1896,7 +1805,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -1922,9 +1831,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Memory usage ($instance)", "tooltip": { "shared": true, @@ -1933,33 +1840,25 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1967,7 +1866,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { @@ -1981,7 +1882,7 @@ "h": 8, "w": 12, "x": 0, - "y": 12 + "y": 35 }, "hiddenSeries": false, "id": 39, @@ -2005,7 +1906,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -2039,9 +1940,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Open FDs ($instance)", "tooltip": { "shared": true, @@ -2050,9 +1949,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -2060,24 +1957,19 @@ { "decimals": 0, "format": "short", - "label": null, "logBase": 2, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -2085,7 +1977,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "fieldConfig": { "defaults": { "links": [] @@ -2098,7 +1992,7 @@ "h": 8, "w": 12, "x": 12, - "y": 12 + "y": 35 }, "hiddenSeries": false, "id": 41, @@ -2122,7 +2016,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.3", + "pluginVersion": "8.3.5", "pointradius": 2, "points": false, "renderer": "flot", @@ -2141,9 +2035,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Goroutines ($instance)", "tooltip": { "shared": true, @@ -2152,9 +2044,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -2162,24 +2052,18 @@ { "decimals": 0, "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], @@ -2188,7 +2072,7 @@ } ], "refresh": false, - "schemaVersion": 30, + "schemaVersion": 34, "style": "dark", "tags": [ "victoriametrics", @@ -2199,14 +2083,11 @@ { "current": { "selected": false, - "text": "Prometheus", - "value": "Prometheus" + "text": "VictoriaMetrics", + "value": "VictoriaMetrics" }, - "description": null, - "error": null, "hide": 0, "includeAll": false, - "label": null, "multi": false, "name": "ds", "options": [], @@ -2218,15 +2099,13 @@ "type": "datasource" }, { - "allValue": null, "current": {}, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "definition": "label_values(vm_app_version{version=~\"^vmalert.*\"}, job)", - "description": null, - "error": null, "hide": 0, "includeAll": false, - "label": null, "multi": false, "name": "job", "options": [], @@ -2243,13 +2122,12 @@ { "allValue": ".*", "current": {}, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", - "description": null, - "error": null, "hide": 0, "includeAll": true, - "label": null, "multi": true, "name": "instance", "options": [], @@ -2266,13 +2144,12 @@ { "allValue": ".*", "current": {}, - "datasource": "$ds", + "datasource": { + "uid": "$ds" + }, "definition": "label_values(vmalert_iteration_duration_seconds{job=~\"$job\", instance=~\"$instance\"}, group)", - "description": null, - "error": null, "hide": 0, "includeAll": true, - "label": null, "multi": true, "name": "group", "options": [], @@ -2296,5 +2173,6 @@ "timezone": "", "title": "vmalert", "uid": "LzldHAVnz", - "version": 1 + "version": 1, + "weekStart": "" } \ No newline at end of file diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index e3fc1866f..5b5de4dfd 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -44,6 +44,17 @@ groups: description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. Consider to either increase available memory or decrease the load on the process." + - alert: TooHighCPUUsage + expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 + for: 5m + labels: + severity: critical + annotations: + summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" + description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. + Consider to either increase available CPU resources or decrease the load on the process." + + # Alerts group for VM single assumes that Grafana dashboard # https://grafana.com/grafana/dashboards/10229 is installed. # Pls update the `dashboard` annotation according to your setup.