diff --git a/dashboards/Makefile b/dashboards/Makefile index 03c9abc1e..c07375d0d 100644 --- a/dashboards/Makefile +++ b/dashboards/Makefile @@ -15,3 +15,4 @@ dashboards-sync: SRC=vmagent.json D_UID=G7Z9GzMGz TITLE="VictoriaMetrics - vmagent" $(MAKE) dashboard-copy SRC=vmalert.json D_UID=LzldHAVnz TITLE="VictoriaMetrics - vmalert" $(MAKE) dashboard-copy SRC=vmauth.json D_UID=nbuo5Mr4k TITLE="VictoriaMetrics - vmauth" $(MAKE) dashboard-copy + SRC=operator.json D_UID=1H179hunk TITLE="VictoriaMetrics - operator" $(MAKE) dashboard-copy diff --git a/dashboards/operator.json b/dashboards/operator.json index 6f1716139..7f34a3fc4 100644 --- a/dashboards/operator.json +++ b/dashboards/operator.json @@ -1,38 +1,4 @@ { - "__inputs": [], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -67,7 +33,7 @@ "collapsed": false, "datasource": { "type": "prometheus", - "uid": "PB894574A363DF0AF" + "uid": "$ds" }, "gridPos": { "h": 1, @@ -81,7 +47,7 @@ { "datasource": { "type": "prometheus", - "uid": "PB894574A363DF0AF" + "uid": "$ds" }, "refId": "A" } @@ -207,12 +173,12 @@ "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, { - "color": "green", - "value": 1800 + "color": "red", + "value": 80 } ] }, @@ -269,6 +235,214 @@ "type": "prometheus", "uid": "$ds" }, + "description": " Shows per namespace watchers for Prometheus Operator objects (ServiceMonitors, PodMonitors, etc) ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 8 + }, + "id": 39, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(operator_prometheus_converter_active_watchers)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Prometheus Objects watchers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " Number of operator instances with obtained leader status. \n Value above 1 indicates that instances with the same job may behave incorrectly.\n It's recommend to check Operator logs. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 8 + }, + "id": 40, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(leader_election_master_status{job=~\"$job\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Electer Leaders", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " Shows number of active reconcile workers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 8 + }, + "id": 41, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(controller_runtime_active_workers{job=~\"$job\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active workers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " Shows amount of Prometheus Operator objects processed by Operator.", "fieldConfig": { "defaults": { "color": { @@ -296,6 +470,104 @@ "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(operator_prometheus_converter_watch_events_total{job=~\"$job\"}[$__interval])) by (event_type,object_type_name)", + "instant": false, + "legendFormat": "{{object_type_name}} {{event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Prometheus Converter Watch events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, "showPoints": "never", "spanNulls": false, "stacking": { @@ -328,7 +600,7 @@ "h": 13, "w": 12, "x": 0, - "y": 8 + "y": 13 }, "id": 12, "options": { @@ -343,7 +615,7 @@ }, "tooltip": { "mode": "multi", - "sort": "desc" + "sort": "none" } }, "pluginVersion": "10.4.0", @@ -354,7 +626,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=~\"requeue_after|requeue|success\"}[$__rate_interval])) by(controller) > 0", + "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=~\"requeue_after|requeue|success\"}[$__rate_interval])) by(controller)", "legendFormat": "{{controller}}", "range": true, "refId": "A" @@ -368,7 +640,7 @@ "type": "prometheus", "uid": "$ds" }, - "description": "", + "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.", "fieldConfig": { "defaults": { "color": { @@ -428,7 +700,7 @@ "h": 13, "w": 12, "x": 12, - "y": 8 + "y": 13 }, "id": 16, "options": { @@ -443,7 +715,7 @@ }, "tooltip": { "mode": "multi", - "sort": "desc" + "sort": "none" } }, "pluginVersion": "10.4.0", @@ -464,25 +736,883 @@ "type": "timeseries" }, { - "collapsed": true, + "collapsed": false, "datasource": { "type": "prometheus", - "uid": "PB894574A363DF0AF" + "uid": "$ds" }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 21 + "y": 26 }, "id": 6, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "refId": "A" + } + ], + "title": "Troubleshooting", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Non zero metrics indicates about error with CR object definition (typos or incorrect values) or errors with kubernetes API connection.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(controller_runtime_reconcile_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) by(controller) > 0 ", + "instant": false, + "legendFormat": "{{controller}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=\"error\"}[$__rate_interval])) by(controller) > 0", + "hide": false, + "legendFormat": "result errors: {{controller}}", + "range": true, + "refId": "B" + } + ], + "title": "reconcile errors by controller", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Operator limits number of reconcile configuration events to 5 events per 2 seconds by default.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(operator_reconcile_throttled_events_total[$__rate_interval])) by(controller)", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A" + } + ], + "title": "throttled reconcilation config events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Number of objects waiting in the queue for reconciliation. Non-zero values indicate that operator cannot process CR objects changes with the given resources.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(workqueue_depth{job=~\"$job\",instance=~\"$instance\"}) by (name)", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Wokring queue depth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " For controllers with StatefulSet it's ok to see latency greater then 3 seconds. It could be vmalertmanager,vmcluster or vmagent in statefulMode.\n\n For other controllers, latency greater then 2 second may indicate issues with kubernetes cluster or operator's performance.\n ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99,sum(rate(controller_runtime_reconcile_time_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by(le,controller) )", + "legendFormat": "q.99 {{controller}}", + "range": true, + "refId": "A" + } + ], + "title": "Reconcilation latency by controller", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Number of HTTP requests to the Kubernetes API server break down by code and method", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(rest_client_requests_total{job=~\"$job\"}[$__interval])) by (method,code)", + "instant": false, + "legendFormat": "{{method}} {{code}}", + "range": true, + "refId": "A" + } + ], + "title": "Rest client requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows how many ongoing reconcile events are taking place, where:\n* `max` - equal to the value of flag`-controller.maxConcurrentReconciles`;\n* `current` - current number of reconcile workers processing CRD objects.\n\nWhen `current` hits `max` constantly, it means operator cannot process events in time. It should be either increased value for flag `-controller.maxConcurrentReconciles` or allocated additional CPU resources to the operator.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Max" + }, + "properties": [ + { + "id": "custom.axisColorMode", + "value": "text" + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(max_over_time(controller_runtime_active_workers{job=~\"$job\"}[$__interval]))", + "instant": false, + "legendFormat": "Current", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "min(controller_runtime_max_concurrent_reconciles{job=~\"$job\"})", + "hide": false, + "instant": false, + "legendFormat": "Max", + "range": true, + "refId": "B" + } + ], + "title": "Concurrent reconcile ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the time goroutines have spent in runnable state before actually running. The lower is better.\n\nHigh values or values exceeding the threshold is usually a sign of insufficient CPU resources or CPU throttling. \n\nVerify that service has enough CPU resources. Otherwise, the service could work unreliably with delays in processing.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 54 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (job, instance, le))) by(job)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Go scheduling latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " Requests latency to the Kubernetes API server.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 54 + }, + "id": 35, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99,sum(rate(rest_client_request_duration_seconds_bucket{job=~\"$job\"})) by(le,method,api) )", + "instant": false, + "legendFormat": "{{method}} {{api}}", + "range": true, + "refId": "A" + } + ], + "title": "rest client latency", + "type": "timeseries" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 62 + }, + "id": 4, "panels": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, - "description": "Non zero metrics indicates about error with CR object definition (typos or incorrect values) or errors with kubernetes API connection.", "fieldConfig": { "defaults": { "color": { @@ -525,8 +1655,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -534,7 +1663,7 @@ } ] }, - "unit": "short" + "unit": "bytes" }, "overrides": [] }, @@ -542,12 +1671,13 @@ "h": 8, "w": 12, "x": 0, - "y": 2 + "y": 63 }, - "id": 10, + "id": 28, "options": { "legend": { "calcs": [ + "mean", "lastNotNull", "max" ], @@ -568,10 +1698,8 @@ "uid": "$ds" }, "editorMode": "code", - "exemplar": false, - "expr": "sum(rate(controller_runtime_reconcile_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) by(controller) > 0 ", - "instant": false, - "legendFormat": "{{controller}}", + "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"}) ", + "legendFormat": "requested from system", "range": true, "refId": "A" }, @@ -581,14 +1709,38 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=\"error\"}[$__rate_interval])) by(controller) > 0", + "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"}) ", "hide": false, - "legendFormat": "{{label_name}}", + "legendFormat": "heap inuse", "range": true, "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "hide": false, + "legendFormat": "stack inuse", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", + "hide": false, + "legendFormat": "resident", + "range": true, + "refId": "D" } ], - "title": "reconcile errors by controller", + "title": "Memory usage ($instance)", "type": "timeseries" }, { @@ -596,7 +1748,6 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Operator limits number of reconcilation events to 5 events per 2 seconds.\n For now, this limit is applied only for vmalert and vmagent controllers.\n It should reduce load at kubernetes cluster and increase operator performance.", "fieldConfig": { "defaults": { "color": { @@ -639,8 +1790,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -656,9 +1806,9 @@ "h": 8, "w": 12, "x": 12, - "y": 2 + "y": 63 }, - "id": 18, + "id": 30, "options": { "legend": { "calcs": [], @@ -679,13 +1829,13 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(rate(operator_reconcile_throttled_events_total[$__rate_interval])) by(controller)", - "legendFormat": "{{controller}}", + "expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) ", + "legendFormat": "CPU cores used", "range": true, "refId": "A" } ], - "title": "throttled reconcilation events", + "title": "CPU ($instance)", "type": "timeseries" }, { @@ -693,7 +1843,6 @@ "type": "prometheus", "uid": "$ds" }, - "description": "Number of objects waiting in the queue for reconciliation. Non-zero values indicate that operator cannot process CR objects changes with the given resources.", "fieldConfig": { "defaults": { "color": { @@ -736,8 +1885,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -750,25 +1898,22 @@ "overrides": [] }, "gridPos": { - "h": 11, + "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 71 }, - "id": 20, + "id": 32, "options": { "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", - "sort": "desc" + "sort": "none" } }, "pluginVersion": "10.4.0", @@ -779,13 +1924,13 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "max(workqueue_depth{job=~\"$job\",instance=~\"$instance\"}) by (name)", - "legendFormat": "{{label_name}}", + "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"})", + "legendFormat": "goroutines", "range": true, "refId": "A" } ], - "title": "Working queue depth", + "title": "Goroutines ($instance)", "type": "timeseries" }, { @@ -793,7 +1938,6 @@ "type": "prometheus", "uid": "$ds" }, - "description": " For controllers with StatefulSet it's ok to see latency greater then 3 seconds. It could be vmalertmanager,vmcluster or vmagent in statefulMode.\n\n For other controllers, latency greater then 1 second may indicate issues with kubernetes cluster or operator's performance.\n ", "fieldConfig": { "defaults": { "color": { @@ -836,8 +1980,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -850,12 +1993,12 @@ "overrides": [] }, "gridPos": { - "h": 11, + "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 71 }, - "id": 26, + "id": 34, "options": { "legend": { "calcs": [ @@ -868,7 +2011,7 @@ }, "tooltip": { "mode": "multi", - "sort": "desc" + "sort": "none" } }, "pluginVersion": "10.4.0", @@ -879,13 +2022,13 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(controller_runtime_reconcile_time_seconds_bucket[$__rate_interval])) by(le,controller))", - "legendFormat": "q.99 {{controller}}", + "expr": "sum(rate(go_gc_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n/\nsum(rate(go_gc_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "avg gc duration", "range": true, "refId": "A" } ], - "title": "Reconcilation latency by controller", + "title": "GC duration ($instance)", "type": "timeseries" } ], @@ -893,467 +2036,13 @@ { "datasource": { "type": "prometheus", - "uid": "PB894574A363DF0AF" - }, - "refId": "A" - } - ], - "title": "Troubleshooting", - "type": "row" - }, - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PB894574A363DF0AF" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 22 - }, - "id": 4, - "panels": [], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PB894574A363DF0AF" + "uid": "$ds" }, "refId": "A" } ], "title": "resources", "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 23 - }, - "id": 28, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "10.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"}) ", - "legendFormat": "requested from system", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"}) ", - "hide": false, - "legendFormat": "heap inuse", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", - "hide": false, - "legendFormat": "stack inuse", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", - "hide": false, - "legendFormat": "resident", - "range": true, - "refId": "D" - } - ], - "title": "Memory usage ($instance)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 23 - }, - "id": 30, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])", - "legendFormat": "CPU cores used", - "range": true, - "refId": "A" - } - ], - "title": "CPU ($instance)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 31 - }, - "id": 32, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"})", - "legendFormat": "goroutines", - "range": true, - "refId": "A" - } - ], - "title": "Goroutines ($instance)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 31 - }, - "id": 34, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "sum(rate(go_gc_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n/\nsum(rate(go_gc_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", - "legendFormat": "avg gc duration", - "range": true, - "refId": "A" - } - ], - "title": "GC duration ($instance)", - "type": "timeseries" } ], "refresh": "", @@ -1365,18 +2054,14 @@ "templating": { "list": [ { - "current": { - "selected": true, - "text": "VictoriaMetrics", - "value": "PF64AB64142051B50" - }, + "current": {}, "hide": 0, "includeAll": false, "multi": false, "name": "ds", "options": [], "query": "prometheus", - "queryValue": "", + "queryValue": "te", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -1432,16 +2117,15 @@ "type": "prometheus", "uid": "$ds" }, - "definition": "label_values(vm_app_version{job=\"$job\", instance=~\"$instance\"},version)", + "definition": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", "hide": 2, "includeAll": false, "multi": false, "name": "version", "options": [], "query": { - "qryType": 1, - "query": "label_values(vm_app_version{job=\"$job\", instance=~\"$instance\"},version)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", @@ -1459,6 +2143,6 @@ "timezone": "", "title": "VictoriaMetrics - operator", "uid": "1H179hunk", - "version": 1, + "version": 7, "weekStart": "" -} \ No newline at end of file +} diff --git a/dashboards/vm/operator.json b/dashboards/vm/operator.json new file mode 100644 index 000000000..e28377b9b --- /dev/null +++ b/dashboards/vm/operator.json @@ -0,0 +1,2149 @@ + +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Overview for operator VictoriaMetrics v0.25.0 or higher", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "$ds" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 8, + "panels": [], + "targets": [ + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "$ds" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "$ds" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 24, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "