{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "target": { "limit": 100, "matchAny": false, "tags": [], "type": "dashboard" }, "type": "dashboard" } ] }, "description": "Overview for operator VictoriaMetrics v0.25.0 or higher", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 38, "iteration": 1653261405647, "links": [], "liveNow": false, "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 8, "panels": [], "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, "gridPos": { "h": 3, "w": 4, "x": 0, "y": 1 }, "id": 24, "options": { "content": "\n$version\n", "mode": "markdown" }, "pluginVersion": "8.3.2", "title": "Version", "type": "text" }, 
{ "datasource": { "type": "prometheus", "uid": "$ds" }, "description": "Number of objects at kubernetes cluster per each controller", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] } }, "overrides": [] }, "gridPos": { "h": 7, "w": 20, "x": 4, "y": 1 }, "id": 14, "options": { "colorMode": "none", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "text": {}, "textMode": "auto" }, "pluginVersion": "8.3.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "max(operator_controller_objects_count{job=~\"$job\",instance=~\"$instance\"}) by (controller)", "legendFormat": "{{controller}}", "range": true, "refId": "A" } ], "title": "CRD Objects count by controller", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 4 }, "id": 22, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "pluginVersion": "8.3.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "exemplar": false, "expr": "vm_app_uptime_seconds{job=~\"$job\",instance=~\"$instance\"}", "format": "table", "instant": true, "interval": "", "legendFormat": "{{instance}}", "range": false, "refId": "A" } ], "title": "Uptime", "type": "stat" }, { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 13, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 12, "legend": { "alignAsTable": true, "avg": true, "current": false, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=~\"requeue_after|requeue|success\"}[$__rate_interval])) by(controller)", "legendFormat": "{{controller}}", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Reconciliation rate by controller", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "description": "", "fill": 1, "fillGradient": 0, "gridPos": { "h": 13, "w": 12, "x": 12, "y": 8 }, "hiddenSeries": false, "id": 16, "legend": { "alignAsTable": true, "avg": true, "current": false, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(rate(operator_log_messages_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) by (level)", "legendFormat": "{{level}}", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Log message rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, "id": 6, "panels": [], "title": "Troubleshooting", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "description": "Non-zero values indicate errors with CR object definition (typos or incorrect values) or errors with kubernetes API connection.", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, "hiddenSeries": false, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(controller_runtime_reconcile_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) by(controller) > 0 ", "instant": false, "legendFormat": "{{controller}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, 
"editorMode": "code", "expr": "sum(rate(controller_runtime_reconcile_total{job=~\"$job\",instance=~\"$instance\",result=\"error\"}[$__rate_interval])) by(controller) > 0", "hide": false, "legendFormat": "{{controller}}", "range": true, "refId": "B" } ], "thresholds": [], "timeRegions": [], "title": "reconcile errors by controller", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "description": "Operator limits number of reconciliation events to 5 events per 2 seconds.\n For now, this limit is applied only for vmalert and vmagent controllers.\n It should reduce load at kubernetes cluster and increase operator performance.", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, "hiddenSeries": false, "id": 18, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(rate(operator_reconcile_throttled_events_total[$__rate_interval])) by(controller)", "legendFormat": "{{controller}}", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "throttled reconciliation events", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { 
"format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "description": "Number of objects waiting in the queue for reconciliation. Non-zero values indicate that operator cannot process CR objects changes with the given resources.", "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 0, "y": 30 }, "hiddenSeries": false, "id": 20, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "max(workqueue_depth{job=~\"$job\",instance=~\"$instance\"}) by (name)", "legendFormat": "{{name}}", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Working queue depth", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, 
"description": " For controllers with StatefulSet it's ok to see latency greater than 3 seconds. It could be vmalertmanager,vmcluster or vmagent in statefulMode.\n\n For other controllers, latency greater than 1 second may indicate issues with kubernetes cluster or operator's performance.\n ", "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 11, "w": 12, "x": 12, "y": 30 }, "hiddenSeries": false, "id": 26, "legend": { "alignAsTable": true, "avg": true, "current": false, "max": true, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "histogram_quantile(0.99,sum(rate(controller_runtime_reconcile_time_seconds_bucket[$__rate_interval])) by(le,controller) )", "legendFormat": "q.99 {{controller}}", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Reconciliation latency by controller", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "s", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, "id": 4, "panels": [], "title": "resources", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 42 }, "hiddenSeries": false, "id": 28, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, 
"min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"}) ", "legendFormat": "requested from system", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"}) ", "hide": false, "legendFormat": "heap inuse", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", "hide": false, "legendFormat": "stack inuse", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", "hide": false, "legendFormat": "resident", "range": true, "refId": "D" } ], "thresholds": [], "timeRegions": [], "title": "Memory usage ($instance)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 42 }, "hiddenSeries": false, "id": 30, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": 
false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])", "legendFormat": "CPU cores used", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "CPU ($instance)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 50 }, "hiddenSeries": false, "id": 32, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"})", "legendFormat": "goroutines", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "Goroutines ($instance)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, 
"yaxes": [ { "format": "short", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": { "type": "prometheus", "uid": "$ds" }, "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 50 }, "hiddenSeries": false, "id": 34, "legend": { "alignAsTable": true, "avg": true, "current": false, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "8.3.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "datasource": { "type": "prometheus", "uid": "$ds" }, "editorMode": "code", "expr": "sum(rate(go_gc_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n/\nsum(rate(go_gc_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", "legendFormat": "avg gc duration", "range": true, "refId": "A" } ], "thresholds": [], "timeRegions": [], "title": "GC duration ($instance)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "show": true, "values": [] }, "yaxes": [ { "format": "s", "logBase": 1, "show": true }, { "format": "short", "logBase": 1, "show": true } ], "yaxis": { "align": false } } ], "refresh": "", "schemaVersion": 33, "style": "dark", "tags": [ "operator", "VictoriaMetrics" ], "templating": { "list": [ { "current": { "selected": false, "text": "cloud-test-13", "value": "cloud-test-13" }, "hide": 0, "includeAll": false, "multi": false, "name": "ds", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", "skipUrlSync": false, "type": "datasource" 
}, { "current": { "selected": false, "text": "vm-operator-victoria-metrics-operator", "value": "vm-operator-victoria-metrics-operator" }, "datasource": { "type": "prometheus", "uid": "$ds" }, "definition": "label_values(operator_log_messages_total,job)", "hide": 0, "includeAll": false, "multi": false, "name": "job", "options": [], "query": { "query": "label_values(operator_log_messages_total,job)", "refId": "StandardVariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, "type": "query" }, { "current": { "selected": false, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "$ds" }, "definition": "label_values(operator_log_messages_total{job=~\"$job\"},instance)", "hide": 0, "includeAll": true, "multi": false, "name": "instance", "options": [], "query": { "query": "label_values(operator_log_messages_total{job=~\"$job\"},instance)", "refId": "StandardVariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, "type": "query" }, { "current": { "isNone": true, "selected": false, "text": "None", "value": "" }, "datasource": { "type": "prometheus", "uid": "$ds" }, "definition": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", "hide": 2, "includeAll": false, "multi": false, "name": "version", "options": [], "query": { "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, "type": "query" } ] }, "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "VictoriaMetrics - operator", "uid": "1H179hunk", "version": 1, "weekStart": "" }