Monitoring single (#2190)

* dashboards: plot cpu limits for vmagent, vmalert and vm-single dashboards

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* alerts: add `TooHighCPUUsage` alert for all VM components

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* dashboards: bump components version requirements

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2022-02-15 11:54:28 +02:00 committed by GitHub
parent 0d3e00e512
commit e29b2b8444
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 890 additions and 949 deletions

View file

@ -5,7 +5,7 @@
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "8.3.2"
"version": "8.3.5"
},
{
"type": "panel",
@ -51,13 +51,13 @@
}
]
},
"description": "Overview for single node VictoriaMetrics v1.70.0 or higher",
"description": "Overview for single node VictoriaMetrics v1.73.0 or higher",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 10229,
"graphTooltip": 0,
"id": null,
"iteration": 1639989804164,
"iteration": 1644907807949,
"links": [
{
"icon": "doc",
@ -88,7 +88,7 @@
"liveNow": false,
"panels": [
{
"collapsed": true,
"collapsed": false,
"datasource": {
"uid": "$ds"
},
@ -99,7 +99,10 @@
"y": 0
},
"id": 6,
"panels": [
"panels": [],
"title": "Stats",
"type": "row"
},
{
"datasource": {
"uid": "$ds"
@ -116,7 +119,7 @@
"content": "<div style=\"text-align: center;\">$version</div>",
"mode": "markdown"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"title": "Version",
"type": "text"
},
@ -168,7 +171,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -232,7 +235,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -296,7 +299,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -360,7 +363,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -425,7 +428,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -487,7 +490,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -551,7 +554,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -619,7 +622,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -683,7 +686,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"exemplar": true,
@ -698,10 +701,6 @@
],
"title": "Available memory",
"type": "stat"
}
],
"title": "Stats",
"type": "row"
},
{
"collapsed": false,
@ -712,7 +711,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 1
"y": 5
},
"id": 24,
"panels": [],
@ -740,7 +739,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 2
"y": 6
},
"hiddenSeries": false,
"id": 12,
@ -764,7 +763,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -835,7 +834,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 2
"y": 6
},
"hiddenSeries": false,
"id": 22,
@ -859,7 +858,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -929,7 +928,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 10
"y": 14
},
"hiddenSeries": false,
"id": 51,
@ -959,7 +958,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1029,7 +1028,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 10
"y": 14
},
"hiddenSeries": false,
"id": 33,
@ -1053,7 +1052,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1137,7 +1136,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 18
"y": 22
},
"hiddenSeries": false,
"id": 59,
@ -1163,7 +1162,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1248,7 +1247,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 18
"y": 22
},
"hiddenSeries": false,
"id": 35,
@ -1272,7 +1271,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1332,7 +1331,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 26
"y": 30
},
"id": 14,
"panels": [
@ -2432,7 +2431,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 27
"y": 31
},
"id": 71,
"panels": [
@ -2883,7 +2882,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "${ds}"
"uid": "${DS_VICTORIAMETRICS}"
},
"exemplar": true,
"expr": "sum(vm_merge_need_free_disk_space{job=~\"$job\", instance=~\"$instance\"}) by(type)",
@ -2935,7 +2934,7 @@
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "${ds}"
"uid": "${DS_VICTORIAMETRICS}"
},
"description": "Shows the percentage of used cache size from the allowed size by type. \nValues close to 100% show the maximum potential utilization.\nValues close to 0% show that cache is underutilized.",
"fill": 0,
@ -2976,7 +2975,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "${ds}"
"uid": "${DS_VICTORIAMETRICS}"
},
"exemplar": true,
"expr": "vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"} / vm_cache_size_max_bytes{job=~\"$job\", instance=~\"$instance\"}",
@ -3030,7 +3029,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 28
"y": 32
},
"id": 46,
"panels": [
@ -3055,7 +3054,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 29
"y": 5
},
"hiddenSeries": false,
"id": 44,
@ -3079,7 +3078,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3170,8 +3169,10 @@
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"description": "",
"fieldConfig": {
"defaults": {
"links": []
@ -3184,7 +3185,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 29
"y": 5
},
"hiddenSeries": false,
"id": 57,
@ -3208,22 +3209,46 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"seriesOverrides": [
{
"$$hashKey": "object:85",
"alias": "Limit",
"color": "#F2495C"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[5m])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "CPU cores used",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"exemplar": false,
"expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Limit",
"refId": "B"
}
],
"thresholds": [],
@ -3279,7 +3304,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 37
"y": 13
},
"hiddenSeries": false,
"id": 75,
@ -3303,7 +3328,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3388,7 +3413,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 37
"y": 13
},
"hiddenSeries": false,
"id": 76,
@ -3412,7 +3437,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3496,7 +3521,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 45
"y": 21
},
"hiddenSeries": false,
"id": 47,
@ -3520,7 +3545,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3591,7 +3616,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 45
"y": 21
},
"hiddenSeries": false,
"id": 42,
@ -3615,7 +3640,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3684,7 +3709,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 53
"y": 29
},
"hiddenSeries": false,
"id": 48,
@ -3708,7 +3733,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3779,7 +3804,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 53
"y": 29
},
"hiddenSeries": false,
"id": 37,
@ -3803,7 +3828,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3874,7 +3899,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 61
"y": 37
},
"hiddenSeries": false,
"id": 49,
@ -3898,7 +3923,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.0",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3954,7 +3979,7 @@
}
],
"refresh": "30s",
"schemaVersion": 33,
"schemaVersion": 34,
"style": "dark",
"tags": [
"victoriametrics",
@ -3965,8 +3990,8 @@
{
"current": {
"selected": false,
"text": "cloud-test-13",
"value": "cloud-test-13"
"text": "VictoriaMetrics",
"value": "VictoriaMetrics"
},
"hide": 0,
"includeAll": false,

View file

@ -6,7 +6,7 @@
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "8.3.2"
"version": "8.3.5"
},
{
"type": "panel",
@ -58,12 +58,12 @@
}
]
},
"description": "Overview for VictoriaMetrics vmagent v1.70.0 or higher",
"description": "Overview for VictoriaMetrics vmagent v1.73.0 or higher",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"iteration": 1639980687827,
"iteration": 1644908591152,
"links": [
{
"icon": "doc",
@ -151,7 +151,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})",
@ -215,7 +215,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})",
@ -282,7 +282,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))",
@ -341,7 +341,7 @@
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"targets": [
{
"expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})",
@ -487,7 +487,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -583,7 +583,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -687,7 +687,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -785,7 +785,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -906,7 +906,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -999,7 +999,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1098,7 +1098,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1196,7 +1196,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -1295,7 +1295,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3613,6 +3613,7 @@
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"description": "Shows the CPU usage per vmagent instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.",
@ -3628,7 +3629,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 14
"y": 45
},
"hiddenSeries": false,
"id": 35,
@ -3658,21 +3659,47 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"seriesOverrides": [
{
"$$hashKey": "object:77",
"alias": "/Limit.*/",
"color": "#F2495C"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"exemplar": false,
"expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(instance)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "$ds"
},
"exemplar": false,
"expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Limit ({{instance}})",
"refId": "B"
}
],
"thresholds": [],
@ -3727,7 +3754,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 14
"y": 45
},
"hiddenSeries": false,
"id": 37,
@ -3757,7 +3784,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3834,7 +3861,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 22
"y": 53
},
"hiddenSeries": false,
"id": 81,
@ -3858,7 +3885,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -3943,7 +3970,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 22
"y": 53
},
"hiddenSeries": false,
"id": 7,
@ -3967,7 +3994,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -4045,7 +4072,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 30
"y": 61
},
"hiddenSeries": false,
"id": 83,
@ -4069,7 +4096,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -4153,7 +4180,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 30
"y": 61
},
"hiddenSeries": false,
"id": 39,
@ -4177,7 +4204,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -4247,7 +4274,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 38
"y": 69
},
"hiddenSeries": false,
"id": 43,
@ -4271,7 +4298,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -4339,7 +4366,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 38
"y": 69
},
"hiddenSeries": false,
"id": 41,
@ -4363,7 +4390,7 @@
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.2",
"pluginVersion": "8.3.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -4418,7 +4445,7 @@
}
],
"refresh": false,
"schemaVersion": 33,
"schemaVersion": 34,
"style": "dark",
"tags": [
"vmagent",
@ -4428,9 +4455,9 @@
"list": [
{
"current": {
"selected": true,
"text": "dbaas-test-t3-medium-inst",
"value": "dbaas-test-t3-medium-inst"
"selected": false,
"text": "VictoriaMetrics",
"value": "VictoriaMetrics"
},
"hide": 0,
"includeAll": false,

File diff suppressed because it is too large Load diff

View file

@ -44,6 +44,17 @@ groups:
description: "Too high memory usage may result in multiple issues such as OOMs or degraded performance.
Consider either increasing available memory or decreasing the load on the process."
- alert: TooHighCPUUsage
expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
for: 5m
labels:
severity: critical
annotations:
summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
description: "Too high CPU usage may be a sign of insufficient resources and may make the process unstable.
Consider either increasing available CPU resources or decreasing the load on the process."
# Alerts group for VM single assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/10229 is installed.
# Please update the `dashboard` annotation according to your setup.