From bea2f86b7b018bbf4bfc33929fa822ff59c2930b Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 16 Feb 2021 23:33:59 +0200 Subject: [PATCH 01/21] docs/CHANGELOG.md: document new per-tenant metrics --- docs/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 82e2cb2b0..6473ddc29 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -11,6 +11,7 @@ * FEATURE: remove dependency on external programs such as `cat`, `grep` and `cut` when detecting cpu and memory limits inside Docker or LXC container. * FEATURE: vmagent: add `__meta_kubernetes_endpoints_label_*`, `__meta_kubernetes_endpoints_labelpresent_*`, `__meta_kubernetes_endpoints_annotation_*` and `__meta_kubernetes_endpoints_annotationpresent_*` labels for `role: endpoints` in Kubernetes service discovery. These labels were added in Prometheus 2.25. * FEATURE: reduce the minimum supported retention period for inverted index (aka `indexdb`) from one month to one day. This should reduce disk space usage for `<-storageDataPath>/indexdb` folder if `-retentionPeriod` is set to values smaller than one month. +* FEATURE: vmselect: export per-tenant metrics `vm_vmselect_http_requests_total` and `vm_vmselect_http_requests_duration_ms_total`. Other per-tenant metrics are available as a part of [enterprise package](https://victoriametrics.com/enterprise.html). See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/932 for details. * BUGFIX: properly convert regexp tag filters containing escaped dots to non-regexp tag filters. For example, `{foo=~"bar\.baz"}` should be converted to `{foo="bar.baz"}`. Previously it was incorrectly converted to `{foo="bar\.baz"}`, which could result in missing time series for this tag filter. * BUGFIX: do not spam error logs when discovering Docker Swarm targets without dedicated IP. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1028 . From 9ca7d76b25c63121694f4f8fa152a9abe554312a Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 16 Feb 2021 21:38:20 +0000 Subject: [PATCH 02/21] Add `Labels limit exceeded` panel to dashboard (#1072) The new panel is supposed to display events when VM drops extra labels on exceeding the `maxLabelsPerTimeseries` limit.
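For reference, the panel is driven by the `vm_metrics_with_dropped_labels_total` counter and uses the same expression that appears in the dashboard JSON below:

    sum(increase(vm_metrics_with_dropped_labels_total{job="$job", instance="$instance"}[5m]))

A non-zero value on this graph means that some ingested metrics had extra labels dropped because they exceeded the `-maxLabelsPerTimeseries` command-line flag.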
--- dashboards/victoriametrics.json | 2367 ++++++++++++++++--------------- 1 file changed, 1231 insertions(+), 1136 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 2aefcda75..484e972ab 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -56,7 +56,7 @@ "gnetId": 10229, "graphTooltip": 0, "id": null, - "iteration": 1603307754894, + "iteration": 1613320710757, "links": [ { "icon": "doc", @@ -1135,7 +1135,7 @@ } }, { - "collapsed": true, + "collapsed": false, "datasource": "$ds", "gridPos": { "h": 1, @@ -1144,1116 +1144,1115 @@ "y": 35 }, "id": 14, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many datapoints are inserted into storage per second", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ingestion rate ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 73, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": false, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Storage full ETA ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "bytes-per-datapoint", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "total datapoints", - "refId": "A" - }, - { - "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "bytes-per-datapoint", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 2, - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many datapoints are in RAM queue waiting to be written into storage. 
The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "pending index entries", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending datapoints", - "refId": "A" - }, - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending index entries", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Pending datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 3, - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "hiddenSeries": false, - "id": 53, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Used", - "refId": "A" - }, - { - "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Free", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": 
null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "LSM parts ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of on-going merges in storage nodes. 
It is expected to have high numbers for `storage/small` metric.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 27 - }, - "hiddenSeries": false, - "id": 62, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active merges ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by inverted index.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 27 - }, - "hiddenSeries": false, - "id": 55, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - index ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 35 - }, - "hiddenSeries": false, - "id": 
58, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{reason}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows ignored ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of rows merged per second by storage nodes.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 35 - }, - "hiddenSeries": false, - "id": 64, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Merge speed ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the rate of logging the messages by their level. 
Unexpected spike in rate is a good reason to check logs.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 43 - }, - "hiddenSeries": false, - "id": 67, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{level}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Logging rate ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], + "panels": [], "title": "Storage", "type": "row" }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "How many datapoints are inserted into storage per second", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ingestion rate ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the time needed to 
reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "hiddenSeries": false, + "id": 73, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Storage full ETA ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "bytes-per-datapoint", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "total datapoints", + "refId": "A" + }, + { + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", + "format": "time_series", + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "bytes-per-datapoint", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "decimals": 2, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "pending index entries", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "pending datapoints", + "refId": "A" + }, + { + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "pending index entries", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pending datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "decimals": 3, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + 
"nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + }, + { + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 52 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "LSM parts ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of on-going merges in storage nodes. 
It is expected to have high numbers for `storage/small` metric.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active merges ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by inverted index.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "hiddenSeries": false, + "id": 55, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - index ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 68 + }, + "hiddenSeries": false, + "id": 
58, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows ignored ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of rows merged per second by storage nodes.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 68 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Merge speed ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the rate of logging the messages by their level. 
Unexpected spike in rate is a good reason to check logs.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 76 + }, + "hiddenSeries": false, + "id": 67, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logging rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "collapsed": true, "datasource": "$ds", @@ -2261,7 +2260,7 @@ "h": 1, "w": 24, "x": 0, - "y": 36 + "y": 84 }, "id": 71, "panels": [ @@ -2274,7 +2273,8 @@ "description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "links": [] }, "overrides": [] }, @@ -2284,7 +2284,7 @@ "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 85 }, "hiddenSeries": false, "id": 66, @@ -2300,10 +2300,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -2368,7 +2366,8 @@ "description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "links": [] }, "overrides": [] }, @@ -2378,7 +2377,7 @@ "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 85 }, "hiddenSeries": false, "id": 60, @@ -2395,10 +2394,8 @@ "linewidth": 1, "links": [], "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -2467,17 +2464,18 @@ "description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. 
\n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 8, - "w": 24, + "h": 9, + "w": 12, "x": 0, - "y": 12 + "y": 93 }, "hiddenSeries": false, "id": 68, @@ -2494,10 +2492,8 @@ "linewidth": 1, "links": [], "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -2557,6 +2553,105 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n\nThis prevents from ingesting metrics with too many labels. The value of `maxLabelsPerTimeseries` must be adjusted for your workload.\n\nWhen limit is exceeded (graph is > 0) - extra labels are dropped, which could result in unexpected identical time series.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 93 + }, + "hiddenSeries": false, + "id": 74, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=\"$job\", instance=\"$instance\"}[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Labels limit exceeded ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Troubleshooting", @@ -2569,7 +2664,7 @@ "h": 1, "w": 24, "x": 0, - "y": 37 + "y": 85 }, "id": 46, "panels": [ @@ -2593,7 +2688,7 @@ "h": 8, "w": 12, "x": 0, - "y": 78 + "y": 102 }, "hiddenSeries": false, "id": 44, @@ -2705,7 +2800,7 @@ "h": 8, "w": 12, "x": 12, - "y": 78 + "y": 102 }, "hiddenSeries": false, "id": 57, @@ -2801,7 +2896,7 @@ "h": 8, "w": 12, "x": 0, - "y": 86 + "y": 110 }, "hiddenSeries": false, "id": 47, @@ -2897,7 +2992,7 @@ "h": 8, "w": 12, "x": 12, - "y": 86 + "y": 110 }, "hiddenSeries": false, "id": 42, @@ -2992,7 +3087,7 @@ "h": 8, "w": 12, 
"x": 0, - "y": 94 + "y": 118 }, "hiddenSeries": false, "id": 48, @@ -3089,7 +3184,7 @@ "h": 8, "w": 12, "x": 12, - "y": 94 + "y": 118 }, "hiddenSeries": false, "id": 37, @@ -3187,7 +3282,7 @@ "h": 8, "w": 12, "x": 0, - "y": 102 + "y": 126 }, "hiddenSeries": false, "id": 49, @@ -3395,4 +3490,4 @@ "title": "VictoriaMetrics", "uid": "wNf0q_kZk", "version": 1 -} +} \ No newline at end of file From 4a078200486572b1d9cb7cd71113ff63e79c73e6 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 14:59:04 +0200 Subject: [PATCH 03/21] lib/storage: make sure that nobody uses partitions when closing the table --- lib/storage/storage.go | 2 ++ lib/storage/storage_test.go | 3 ++- lib/storage/table.go | 32 ++++++++++++++++++-------------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 0cdfc22df..d1f8dd57c 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -588,6 +588,8 @@ func (s *Storage) mustRotateIndexDB() { } // MustClose closes the storage. +// +// It is expected that the s is no longer used during the close. func (s *Storage) MustClose() { close(s.stop) diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go index 9c8bbd60c..f2891841e 100644 --- a/lib/storage/storage_test.go +++ b/lib/storage/storage_test.go @@ -967,15 +967,16 @@ func testStorageAddRows(s *Storage) error { return fmt.Errorf("error when force merging partitions: %w", err) } ptws := s1.tb.GetPartitions(nil) - defer s1.tb.PutPartitions(ptws) for _, ptw := range ptws { pws := ptw.pt.GetParts(nil) numParts := len(pws) ptw.pt.PutParts(pws) if numParts != 1 { + s1.tb.PutPartitions(ptws) return fmt.Errorf("unexpected number of parts for partition %q after force merge; got %d; want 1", ptw.pt.name, numParts) } } + s1.tb.PutPartitions(ptws) s1.MustClose() diff --git a/lib/storage/table.go b/lib/storage/table.go index faea8c1a0..646d33074 100644 --- a/lib/storage/table.go +++ b/lib/storage/table.go @@ -189,6 +189,7 @@ func (tb *table) addPartitionNolock(pt *partition) { } // MustClose closes the table. +// It is expected that all the pending searches on the table are finished before calling MustClose. func (tb *table) MustClose() { close(tb.stop) tb.retentionWatcherWG.Wait() @@ -198,10 +199,11 @@ func (tb *table) MustClose() { tb.ptws = nil tb.ptwsLock.Unlock() - // Decrement references to partitions, so they may be eventually closed after - // pending searches are done. for _, ptw := range ptws { - ptw.decRef() + if n := atomic.LoadUint64(&ptw.refCount); n != 1 { + logger.Panicf("BUG: unexpected refCount=%d when closing the partition; probably there are pending searches", n) + } + ptw.pt.MustClose() } // Release exclusive lock on the table. @@ -271,10 +273,10 @@ func (tb *table) AddRows(rows []rawRow) error { ptwsX.a = tb.GetPartitions(ptwsX.a[:0]) ptws := ptwsX.a - for _, ptw := range ptws { + for i, ptw := range ptws { singlePt := true - for i := range rows { - if !ptw.pt.HasTimestamp(rows[i].Timestamp) { + for j := range rows { + if !ptw.pt.HasTimestamp(rows[j].Timestamp) { singlePt = false break } @@ -283,16 +285,18 @@ func (tb *table) AddRows(rows []rawRow) error { continue } - // Move the partition with the matching rows to the front of tb.ptws, - // so it will be detected faster next time. 
- tb.ptwsLock.Lock() - for i := range tb.ptws { - if ptw == tb.ptws[i] { - tb.ptws[0], tb.ptws[i] = tb.ptws[i], tb.ptws[0] - break + if i != 0 { + // Move the partition with the matching rows to the front of tb.ptws, + // so it will be detected faster next time. + tb.ptwsLock.Lock() + for j := range tb.ptws { + if ptw == tb.ptws[j] { + tb.ptws[0], tb.ptws[j] = tb.ptws[j], tb.ptws[0] + break + } } + tb.ptwsLock.Unlock() } - tb.ptwsLock.Unlock() // Fast path - add all the rows into the ptw. ptw.pt.AddRows(rows) From 442fcfec5a0d5e6a1d546d3e1d5871dd436125ff Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 15:00:08 +0200 Subject: [PATCH 04/21] lib/storage: tune the logic for sorting tag filters according to their execution times --- lib/storage/index_db.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index 51fde2806..835704649 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -2786,23 +2786,21 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter } tfws := make([]tagFilterWithWeight, len(tfs.tfs)) ct := fasttime.UnixTimestamp() + hasDurationReset := false for i := range tfs.tfs { tf := &tfs.tfs[i] durationSeconds, lastQueryTimestamp := is.getDurationAndTimestampForDateFilter(date, tf) - if ct > lastQueryTimestamp+60 { - // It is time to update filter duration stats. - if tf.isNegative || tf.isRegexp && len(tf.orSuffixes) == 0 { - // Negative and regexp filters usually take the most time, so move them to the end of filters - // in the hope they won't be executed at all. - if durationSeconds == 0 { - durationSeconds = 10 - } - } else { - // Reset duration stats for relatively fast {key="value"} and {key=~"foo|bar|baz"} filters, so it is re-populated below. - if durationSeconds < 0.5 { - durationSeconds = 0 - } - } + if durationSeconds == 0 && (tf.isNegative || tf.isRegexp && len(tf.orSuffixes) == 0) { + // Negative and regexp filters usually take the most time, so move them to the end of filters + // in the hope they won't be executed at all. + durationSeconds = 10 + is.storeDurationAndTimestampForDateFilter(date, tf, durationSeconds, 0) + } + if !hasDurationReset && ct > lastQueryTimestamp+10*60 && durationSeconds < 0.2 { + // Reset duration stats for relatively fast filters, so it is re-populated below. + // Reset duration for a single filter at a time in order to reduce negative impact on query time.
+ durationSeconds = 0 + hasDurationReset = true } tfws[i] = tagFilterWithWeight{ tf: tf, From 4edfe76befde8941170ed64b6fe0ddd89b2869fe Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 15:17:38 +0200 Subject: [PATCH 05/21] deployment/dm: update Go builder image from v1.15.8 to v1.16.0 See release notes for Go1.16 at https://golang.org/doc/go1.16 --- deployment/docker/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/docker/Makefile b/deployment/docker/Makefile index 57d724049..4b392de95 100644 --- a/deployment/docker/Makefile +++ b/deployment/docker/Makefile @@ -4,7 +4,7 @@ DOCKER_NAMESPACE := victoriametrics ROOT_IMAGE ?= alpine:3.13.1 CERTS_IMAGE := alpine:3.13.1 -GO_BUILDER_IMAGE := golang:1.15.8 +GO_BUILDER_IMAGE := golang:1.16.0 BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _) BASE_IMAGE := local/base:1.1.3-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _) From 74424b55ee00bdef71e5e477298281479541f295 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 17:28:15 +0200 Subject: [PATCH 06/21] lib/storage: further tune tag filters sorting --- lib/storage/index_db.go | 44 ++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index 835704649..3d772c3c6 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -25,6 +25,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache" "github.com/VictoriaMetrics/fastcache" xxhash "github.com/cespare/xxhash/v2" + "github.com/valyala/fastrand" ) const ( @@ -2310,7 +2311,7 @@ func (is *indexSearch) updateMetricIDsForTagFilters(metricIDs *uint64set.Set, tf // Slow path - try searching over the whole inverted index. // Sort tag filters for faster ts.Seek below. - sort.SliceStable(tfs.tfs, func(i, j int) bool { + sort.Slice(tfs.tfs, func(i, j int) bool { return tfs.tfs[i].Less(&tfs.tfs[j]) }) minTf, minMetricIDs, err := is.getTagFilterWithMinMetricIDsCountOptimized(tfs, tr, maxMetrics) @@ -2786,29 +2787,37 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter } tfws := make([]tagFilterWithWeight, len(tfs.tfs)) ct := fasttime.UnixTimestamp() - hasDurationReset := false + var tfwsEst []*tagFilterWithWeight for i := range tfs.tfs { tf := &tfs.tfs[i] durationSeconds, lastQueryTimestamp := is.getDurationAndTimestampForDateFilter(date, tf) - if durationSeconds == 0 && (tf.isNegative || tf.isRegexp && len(tf.orSuffixes) == 0) { - // Negative and regexp filters usually take the most time, so move them to the end of filters - // in the hope they won't be executed at all. - durationSeconds = 10 - is.storeDurationAndTimestampForDateFilter(date, tf, durationSeconds, 0) - } - if !hasDurationReset && ct > lastQueryTimestamp+10*60 && durationSeconds < 0.2 { - // Reset duration stats for relatively fast filters, so it is re-populated below. - // Reset duration for a single filter at a time in order to reduce negative impact on query time. - durationSeconds = 0 - hasDurationReset = true - } tfws[i] = tagFilterWithWeight{ tf: tf, durationSeconds: durationSeconds, lastQueryTimestamp: lastQueryTimestamp, } + if ct > lastQueryTimestamp+5*60+uint64(fastrand.Uint32n(5*60)) { + // Re-estimate the time required for tf execution. 
+ tfwsEst = append(tfwsEst, &tfws[i]) + } } - sort.SliceStable(tfws, func(i, j int) bool { + maxDateMetrics := maxMetrics * 50 + if len(tfwsEst) > 0 { + sort.Slice(tfwsEst, func(i, j int) bool { + return tfwsEst[i].tf.Less(tfwsEst[j].tf) + }) + // Allocate up to one second for the estimation of tf execution in order to reduce + // possible negative impact on query duration. + // It is OK if the deadline is exceeded during the estimation - the corresponding filters + // will be re-estimated after the lastQueryTimestamp expiration. + isEst := is.db.getIndexSearch(ct + 1) + for _, tfw := range tfwsEst { + _, _ = isEst.getMetricIDsForDateTagFilter(tfw.tf, 0, date, tfs.commonPrefix, maxDateMetrics) + tfw.durationSeconds, tfw.lastQueryTimestamp = is.getDurationAndTimestampForDateFilter(date, tfw.tf) + } + is.db.putIndexSearch(isEst) + } + sort.Slice(tfws, func(i, j int) bool { a, b := &tfws[i], &tfws[j] if a.durationSeconds != b.durationSeconds { return a.durationSeconds < b.durationSeconds @@ -2819,7 +2828,6 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter // Populate metricIDs for the first non-negative filter. var tfsPostponed []*tagFilter var metricIDs *uint64set.Set - maxDateMetrics := maxMetrics * 50 tfwsRemaining := tfws[:0] for i := range tfws { tfw := tfws[i] @@ -3090,6 +3098,10 @@ func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTime // Store the duration for tag filter execution in the cache in order to sort tag filters // in ascending durations on the next search. durationSeconds := time.Since(startTime).Seconds() + if err != nil && durationSeconds < 10 { + // Set high duration for failing filter, so it is moved to the end of filter list. + durationSeconds = 10 + } if metricIDs.Len() >= maxMetrics { // Increase the duration for tag filter matching too many metrics, // So next time it will be applied after filters matching lower number of metrics. From 63fc140624a912bb08aa1d7da6c4d2a16b5303a2 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 17:55:29 +0200 Subject: [PATCH 07/21] lib/storage: tag filters sorting... --- lib/storage/index_db.go | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index 3d772c3c6..4a1759408 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -25,7 +25,6 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache" "github.com/VictoriaMetrics/fastcache" xxhash "github.com/cespare/xxhash/v2" - "github.com/valyala/fastrand" ) const ( @@ -2786,36 +2785,18 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter lastQueryTimestamp uint64 } tfws := make([]tagFilterWithWeight, len(tfs.tfs)) - ct := fasttime.UnixTimestamp() - var tfwsEst []*tagFilterWithWeight for i := range tfs.tfs { tf := &tfs.tfs[i] durationSeconds, lastQueryTimestamp := is.getDurationAndTimestampForDateFilter(date, tf) + if durationSeconds == 0 { + // Assume that unknown tag filters can take quite big amounts of time. + durationSeconds = 1.0 + } tfws[i] = tagFilterWithWeight{ tf: tf, durationSeconds: durationSeconds, lastQueryTimestamp: lastQueryTimestamp, } - if ct > lastQueryTimestamp+5*60+uint64(fastrand.Uint32n(5*60)) { - // Re-estimate the time required for tf execution. 
- tfwsEst = append(tfwsEst, &tfws[i]) - } - } - maxDateMetrics := maxMetrics * 50 - if len(tfwsEst) > 0 { - sort.Slice(tfwsEst, func(i, j int) bool { - return tfwsEst[i].tf.Less(tfwsEst[j].tf) - }) - // Allocate up to one second for the estimation of tf execution in order to reduce - // possible negative impact on query duration. - // It is OK if the deadline is exceeded during the estimation - the corresponding filters - // will be re-estimated after the lastQueryTimestamp expiration. - isEst := is.db.getIndexSearch(ct + 1) - for _, tfw := range tfwsEst { - _, _ = isEst.getMetricIDsForDateTagFilter(tfw.tf, 0, date, tfs.commonPrefix, maxDateMetrics) - tfw.durationSeconds, tfw.lastQueryTimestamp = is.getDurationAndTimestampForDateFilter(date, tfw.tf) - } - is.db.putIndexSearch(isEst) } sort.Slice(tfws, func(i, j int) bool { a, b := &tfws[i], &tfws[j] @@ -2829,6 +2810,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter var tfsPostponed []*tagFilter var metricIDs *uint64set.Set tfwsRemaining := tfws[:0] + maxDateMetrics := maxMetrics * 50 for i := range tfws { tfw := tfws[i] tf := tfw.tf From 33806264ec5844173b99de9e2203ed696e3f2d82 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 18:54:59 +0200 Subject: [PATCH 08/21] lib/storage: revert ecf132933e52f4ecc25be9b5c6d61b8a0eb7458d, since negative filters require the same amount of work as non-negative filters --- lib/storage/tag_filters.go | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index d32c48ec9..5d62cbc04 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -243,15 +243,15 @@ func (tf *tagFilter) Less(other *tagFilter) bool { if tf.matchCost != other.matchCost { return tf.matchCost < other.matchCost } - if tf.isNegative != other.isNegative { - return !tf.isNegative - } if tf.isRegexp != other.isRegexp { return !tf.isRegexp } if len(tf.orSuffixes) != len(other.orSuffixes) { return len(tf.orSuffixes) < len(other.orSuffixes) } + if tf.isNegative != other.isNegative { + return !tf.isNegative + } return bytes.Compare(tf.prefix, other.prefix) < 0 } @@ -315,9 +315,6 @@ func (tf *tagFilter) InitFromGraphiteQuery(commonPrefix, query []byte, paths []s tf.prefix = marshalTagValueNoTrailingTagSeparator(tf.prefix, []byte(prefix)) tf.orSuffixes = append(tf.orSuffixes[:0], orSuffixes...) tf.reSuffixMatch, tf.matchCost = newMatchFuncForOrSuffixes(orSuffixes) - if isNegative { - tf.matchCost *= negativeMatchCostMultiplier - } } func getCommonPrefix(ss []string) (string, []string) { @@ -385,9 +382,6 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp tf.orSuffixes = append(tf.orSuffixes[:0], "") tf.isEmptyMatch = len(prefix) == 0 tf.matchCost = fullMatchCost - if isNegative { - tf.matchCost *= negativeMatchCostMultiplier - } return nil } rcv, err := getRegexpFromCache(expr) @@ -397,9 +391,6 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp tf.orSuffixes = append(tf.orSuffixes[:0], rcv.orValues...) tf.reSuffixMatch = rcv.reMatch tf.matchCost = rcv.reCost - if isNegative { - tf.matchCost *= negativeMatchCostMultiplier - } tf.isEmptyMatch = len(prefix) == 0 && tf.reSuffixMatch(nil) if !tf.isNegative && len(key) == 0 && strings.IndexByte(rcv.literalSuffix, '.') >= 0 { // Reverse suffix is needed only for non-negative regexp filters on __name__ that contains dots. 
@@ -571,8 +562,6 @@ const ( reMatchCost = 100 ) -const negativeMatchCostMultiplier = 1000 - func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) (func(b []byte) bool, string, uint64) { if isDotStar(sre) { // '.*' From d4849561efa00ed6c38aedfcd1cedc99999053d5 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 19:13:38 +0200 Subject: [PATCH 09/21] app/vmstorage: export vm_composite_filter_success_conversions_total and vm_composite_filter_missing_conversions_total metrics --- app/vmstorage/main.go | 6 ++++++ lib/storage/index_db.go | 6 +++++- lib/storage/tag_filters.go | 8 ++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index 3a5f4bfe1..4a96fd130 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -448,6 +448,12 @@ func registerStorageMetrics() { metrics.NewGauge(`vm_composite_index_min_timestamp`, func() float64 { return float64(idbm().MinTimestampForCompositeIndex) / 1e3 }) + metrics.NewGauge(`vm_composite_filter_success_conversions_total`, func() float64 { + return float64(idbm().CompositeFilterSuccessConversions) + }) + metrics.NewGauge(`vm_composite_filter_missing_conversions_total`, func() float64 { + return float64(idbm().CompositeFilterMissingConversions) + }) metrics.NewGauge(`vm_assisted_merges_total{type="storage/small"}`, func() float64 { return float64(tm().SmallAssistedMerges) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index 4a1759408..c6ade8c65 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -205,7 +205,9 @@ type IndexDBMetrics struct { IndexBlocksWithMetricIDsProcessed uint64 IndexBlocksWithMetricIDsIncorrectOrder uint64 - MinTimestampForCompositeIndex uint64 + MinTimestampForCompositeIndex uint64 + CompositeFilterSuccessConversions uint64 + CompositeFilterMissingConversions uint64 mergeset.TableMetrics } @@ -249,6 +251,8 @@ func (db *indexDB) UpdateMetrics(m *IndexDBMetrics) { m.IndexBlocksWithMetricIDsIncorrectOrder = atomic.LoadUint64(&indexBlocksWithMetricIDsIncorrectOrder) m.MinTimestampForCompositeIndex = uint64(db.minTimestampForCompositeIndex) + m.CompositeFilterSuccessConversions = atomic.LoadUint64(&compositeFilterSuccessConversions) + m.CompositeFilterMissingConversions = atomic.LoadUint64(&compositeFilterMissingConversions) db.tb.UpdateMetrics(&m.TableMetrics) db.doExtDB(func(extDB *indexDB) { diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index 5d62cbc04..67a373f4e 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -36,6 +36,7 @@ func convertToCompositeTagFilters(tfs *TagFilters) *TagFilters { } if len(name) == 0 { // There is no metric name filter, so composite filters cannot be created. + atomic.AddUint64(&compositeFilterMissingConversions, 1) return tfs } tfsNew := make([]tagFilter, 0, len(tfs.tfs)) @@ -61,13 +62,20 @@ func convertToCompositeTagFilters(tfs *TagFilters) *TagFilters { compositeFilters++ } if compositeFilters == 0 { + atomic.AddUint64(&compositeFilterMissingConversions, 1) return tfs } tfsCompiled := NewTagFilters() tfsCompiled.tfs = tfsNew + atomic.AddUint64(&compositeFilterSuccessConversions, 1) return tfsCompiled } +var ( + compositeFilterSuccessConversions uint64 + compositeFilterMissingConversions uint64 +) + // TagFilters represents filters used for filtering tags. 
type TagFilters struct { tfs []tagFilter From faad6f84a49027804c24b0d26deed4c020815d97 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 19:29:48 +0200 Subject: [PATCH 10/21] lib/storage: return back filter arg to getMetricIDsForTagFilter function The filter arg has been removed in the commit c7ee2fabb84c247f0ace0f0ac2f57bcae76efbd3 because it was preventing from caching the number of matching time series per each tf. Now the cache contains duration for tf execution, so the filter shouldn't break such caching. --- lib/storage/index_db.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index c6ade8c65..d3e77d90a 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -2047,7 +2047,7 @@ func (is *indexSearch) getTagFilterWithMinMetricIDsCount(tfs *TagFilters, maxMet continue } - metricIDs, err := is.getMetricIDsForTagFilter(tf, maxMetrics) + metricIDs, err := is.getMetricIDsForTagFilter(tf, nil, maxMetrics) if err != nil { if err == errFallbackToMetricNameMatch { // Skip tag filters requiring to scan for too many metrics. @@ -2369,7 +2369,7 @@ const ( var uselessTagFilterCacheValue = []byte("1") -func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, maxMetrics int) (*uint64set.Set, error) { +func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, filter *uint64set.Set, maxMetrics int) (*uint64set.Set, error) { if tf.isNegative { logger.Panicf("BUG: isNegative must be false") } @@ -2387,7 +2387,7 @@ func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, maxMetrics int) ( // Slow path - scan for all the rows with the given prefix. maxLoops := maxMetrics * maxIndexScanSlowLoopsPerMetric - if err := is.getMetricIDsForTagFilterSlow(tf, nil, maxLoops, metricIDs.Add); err != nil { + if err := is.getMetricIDsForTagFilterSlow(tf, filter, maxLoops, metricIDs.Add); err != nil { if err == errFallbackToMetricNameMatch { return nil, err } @@ -2822,7 +2822,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter tfwsRemaining = append(tfwsRemaining, tfw) continue } - m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, tfs.commonPrefix, maxDateMetrics) + m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, nil, tfs.commonPrefix, maxDateMetrics) if err != nil { return nil, err } @@ -2878,7 +2878,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter tfsPostponed = append(tfsPostponed, tf) continue } - m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, tfs.commonPrefix, maxDateMetrics) + m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, metricIDs, tfs.commonPrefix, maxDateMetrics) if err != nil { return nil, err } @@ -3060,7 +3060,8 @@ func (is *indexSearch) hasDateMetricID(date, metricID uint64) (bool, error) { return true, nil } -func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTimestamp, date uint64, commonPrefix []byte, maxMetrics int) (*uint64set.Set, error) { +func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTimestamp, date uint64, + filter *uint64set.Set, commonPrefix []byte, maxMetrics int) (*uint64set.Set, error) { // Augument tag filter prefix for per-date search instead of global search. 
if !bytes.HasPrefix(tf.prefix, commonPrefix) { logger.Panicf("BUG: unexpected tf.prefix %q; must start with commonPrefix %q", tf.prefix, commonPrefix) @@ -3073,7 +3074,7 @@ func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTime tfNew.isNegative = false // isNegative for the original tf is handled by the caller. tfNew.prefix = kb.B startTime := time.Now() - metricIDs, err := is.getMetricIDsForTagFilter(&tfNew, maxMetrics) + metricIDs, err := is.getMetricIDsForTagFilter(&tfNew, filter, maxMetrics) kbPool.Put(kb) currentTimestamp := fasttime.UnixTimestamp() if currentTimestamp > lastQueryTimestamp+5 { From 939d5ffc2bac1ad706857fdf86a3e07f6e4e3a95 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 20:22:30 +0200 Subject: [PATCH 11/21] lib/storage: move composite filters to the top during sorting --- lib/storage/tag_filters.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index 67a373f4e..c38747280 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -245,9 +245,19 @@ type tagFilter struct { graphiteReverseSuffix []byte } +func (tf *tagFilter) isComposite() bool { + k := tf.key + return len(k) > 0 && k[0] == compositeTagKeyPrefix +} + func (tf *tagFilter) Less(other *tagFilter) bool { - // Move regexp and negative filters to the end, since they require scanning - // all the entries for the given label. + // Move composite filters to the top, since they usually match lower number of time series. + // Move regexp filters to the bottom, since they require scanning all the entries for the given label. + isCompositeA := tf.isComposite() + isCompositeB := tf.isComposite() + if isCompositeA != isCompositeB { + return isCompositeA + } if tf.matchCost != other.matchCost { return tf.matchCost < other.matchCost } From ce99b48a9ad1c468066fd3be91c1388e0a0c92b1 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 20:40:22 +0200 Subject: [PATCH 12/21] Revert "lib/mergeset: tune lifetime for entries inside block caches" This reverts commit 458c89324de3f7ae1553fc0c05452d1a114d44ff. Production testing revealed zero improvements for memory usage with reduced lifetime for entries in block caches. --- lib/mergeset/part.go | 8 ++++---- lib/storage/part.go | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/mergeset/part.go b/lib/mergeset/part.go index 3bd439399..2b01d94be 100644 --- a/lib/mergeset/part.go +++ b/lib/mergeset/part.go @@ -221,8 +221,8 @@ func (idxbc *indexBlockCache) cleanByTimeout() { currentTime := fasttime.UnixTimestamp() idxbc.mu.Lock() for k, idxbe := range idxbc.m { - // Delete items accessed more than 90 seconds ago. - if currentTime-atomic.LoadUint64(&idxbe.lastAccessTime) > 90 { + // Delete items accessed more than two minutes ago. + if currentTime-atomic.LoadUint64(&idxbe.lastAccessTime) > 2*60 { // do not call putIndexBlock(ibxbc.m[k]), since it // may be used by concurrent goroutines. delete(idxbc.m, k) @@ -375,8 +375,8 @@ func (ibc *inmemoryBlockCache) cleanByTimeout() { currentTime := fasttime.UnixTimestamp() ibc.mu.Lock() for k, ibe := range ibc.m { - // Delete items accessed more than 90 seconds ago. - if currentTime-atomic.LoadUint64(&ibe.lastAccessTime) > 90 { + // Delete items accessed more than a two minutes ago. + if currentTime-atomic.LoadUint64(&ibe.lastAccessTime) > 2*60 { // do not call putInmemoryBlock(ibc.m[k]), since it // may be used by concurrent goroutines. 
delete(ibc.m, k) diff --git a/lib/storage/part.go b/lib/storage/part.go index e18fa0b97..52b0ec41a 100644 --- a/lib/storage/part.go +++ b/lib/storage/part.go @@ -225,8 +225,8 @@ func (ibc *indexBlockCache) cleanByTimeout() { currentTime := fasttime.UnixTimestamp() ibc.mu.Lock() for k, ibe := range ibc.m { - // Delete items accessed more than 90 seconds ago. - if currentTime-atomic.LoadUint64(&ibe.lastAccessTime) > 90 { + // Delete items accessed more than two minutes ago. + if currentTime-atomic.LoadUint64(&ibe.lastAccessTime) > 2*60 { delete(ibc.m, k) } } From efe1e0cff0f6f0634edaaf184d510f6306083c64 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 21:03:21 +0200 Subject: [PATCH 13/21] docs/CaseStudies.md: actualize Wix numbers --- docs/CaseStudies.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/CaseStudies.md b/docs/CaseStudies.md index 8d17fa847..99e22456b 100644 --- a/docs/CaseStudies.md +++ b/docs/CaseStudies.md @@ -105,13 +105,13 @@ See [Monitoring K8S with VictoriaMetrics](https://docs.google.com/presentation/d Numbers: -* The number of active time series per VictoriaMetrics instance is 40M. -* The total number of time series per VictoriaMetrics instance is 5000M+. -* Ingestion rate per VictoriaMetrics instance is 1M data points per second. +* The number of active time series per VictoriaMetrics instance is 50 millions. +* The total number of time series per VictoriaMetrics instance is 5000 millions. +* Ingestion rate per VictoriaMetrics instance is 1.1 millions data points per second. * The total number of datapoints per VictoriaMetrics instance is 8.5 trillions. -* The average time series churn rate is ~80M per day. -* The average query rate is ~100 per second (mostly alert queries). -* Query duration: median is ~20ms, 99th percentile is ~1.5sec. +* The average churn rate is 150 millions new time series per day. +* The average query rate is ~150 per second (mostly alert queries). +* Query duration: median is ~1ms, 99th percentile is ~1sec. * Retention: 3 months. > Alternatives that we’ve played with before choosing VictoriaMetrics are: federated Prometheus, Cortex, IronDB and Thanos. From 4b110fa21cc153ae45f184090315d555061c61d1 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 21:06:40 +0200 Subject: [PATCH 14/21] docs/CHANGELOG.md: mention that prod binaries are built now with Go1.16 This is a follow-up for 4edfe76befde8941170ed64b6fe0ddd89b2869fe --- docs/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 6473ddc29..25130331d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -4,6 +4,7 @@ * FEATURE: optimize searching for matching metrics for `metric{}` queries if `` contains at least a single filter. For example, the query `up{job="foobar"}` should find the matching time series much faster than previously. * FEATURE: reduce execution times for `q1 q2` queries by executing `q1` and `q2` in parallel. +* FEATURE: switch from Go1.15 to [Go1.16](https://golang.org/doc/go1.16) for building prod binaries. * FEATURE: single-node VictoriaMetrics now accepts requests to handlers with `/prometheus` and `/graphite` prefixes such as `/prometheus/api/v1/query`. This improves compatibility with [handlers from VictoriaMetrics cluster](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format). * FEATURE: expose `process_open_fds` and `process_max_fds` metrics. 
These metrics can be used for alerting when `process_open_fds` reaches `process_max_fds`. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/402 and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1037 * FEATURE: vmalert: add `-datasource.appendTypePrefix` command-line option for querying both Prometheus and Graphite datasource in cluster version of VictoriaMetrics. See [these docs](https://victoriametrics.github.io/vmalert.html#graphite) for details. From 1d1ba889fe61b2ce55216e616428839261e8d07c Mon Sep 17 00:00:00 2001 From: Nikolay Date: Wed, 17 Feb 2021 22:23:38 +0300 Subject: [PATCH 15/21] adds pushback for fastqueue, (#1075) during shutdown currently sending block was lost, now its pushed back to fast queue and will be flushed on disk, it may lead to data duplication. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1065 --- .../prometheus/with_request_extra_filter.json | 8 ++++++++ app/vmagent/remotewrite/client.go | 18 ++++++++++-------- app/vmagent/remotewrite/remotewrite.go | 4 ++-- 3 files changed, 20 insertions(+), 10 deletions(-) create mode 100644 app/victoria-metrics/testdata/prometheus/with_request_extra_filter.json diff --git a/app/victoria-metrics/testdata/prometheus/with_request_extra_filter.json b/app/victoria-metrics/testdata/prometheus/with_request_extra_filter.json new file mode 100644 index 000000000..5bfbece19 --- /dev/null +++ b/app/victoria-metrics/testdata/prometheus/with_request_extra_filter.json @@ -0,0 +1,8 @@ +{ + "name": "basic_select_with_extra_labels", + "data": ["[{\"labels\":[{\"name\":\"__name__\",\"value\":\"prometheus.tenant.limits\"},{\"name\":\"baz\",\"value\":\"qux\"},{\"name\":\"tenant\",\"value\":\"dev\"}],\"samples\":[{\"value\":100000,\"timestamp\":\"{TIME_MS}\"}]},{\"labels\":[{\"name\":\"__name__\",\"value\":\"prometheus.up\"},{\"name\":\"baz\",\"value\":\"qux\"}],\"samples\":[{\"value\":100000,\"timestamp\":\"{TIME_MS}\"}]}]"], + "query": ["/api/v1/export?match={__name__!=''}&extra_label=tenant=dev"], + "result_metrics": [ + {"metric":{"__name__":"prometheus.tenant.limits","baz":"qux","tenant": "dev"},"values":[100000], "timestamps": ["{TIME_MS}"]} + ] +} diff --git a/app/vmagent/remotewrite/client.go b/app/vmagent/remotewrite/client.go index 2b0332f05..d13c16e43 100644 --- a/app/vmagent/remotewrite/client.go +++ b/app/vmagent/remotewrite/client.go @@ -178,7 +178,9 @@ func (c *client) runWorker() { return } go func() { - c.sendBlock(block) + if delivered := c.sendBlockOk(block); !delivered { + return + } ch <- struct{}{} }() select { @@ -190,17 +192,17 @@ func (c *client) runWorker() { graceDuration := 5 * time.Second select { case <-ch: + logger.Infof("stop ok") // The block has been sent successfully. 
case <-time.After(graceDuration): - logger.Errorf("couldn't sent block with size %d bytes to %q in %.3f seconds during shutdown; dropping it", - len(block), c.sanitizedURL, graceDuration.Seconds()) + c.fq.MustWriteBlock(block) } return } } } -func (c *client) sendBlock(block []byte) { +func (c *client) sendBlockOk(block []byte) bool { c.rl.register(len(block), c.stopCh) retryDuration := time.Second retriesCount := 0 @@ -236,7 +238,7 @@ again: select { case <-c.stopCh: timerpool.Put(t) - return + return false case <-t.C: timerpool.Put(t) } @@ -247,7 +249,7 @@ again: if statusCode/100 == 2 { _ = resp.Body.Close() c.requestsOKCount.Inc() - return + return true } metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc() if statusCode == 409 { @@ -258,7 +260,7 @@ again: logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+ "response body=%q", len(block), c.sanitizedURL, statusCode, body) c.packetsDropped.Inc() - return + return true } // Unexpected status code returned @@ -279,7 +281,7 @@ again: select { case <-c.stopCh: timerpool.Put(t) - return + return false case <-t.C: timerpool.Put(t) } diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index ee3631519..337772560 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -227,10 +227,10 @@ func (rwctx *remoteWriteCtx) MustStop() { } rwctx.idx = 0 rwctx.pss = nil - rwctx.fq.MustClose() - rwctx.fq = nil rwctx.c.MustStop() rwctx.c = nil + rwctx.fq.MustClose() + rwctx.fq = nil rwctx.relabelMetricsDropped = nil } From edcdc39eb3813a274e0ae65dda717d6e0c1f4c1f Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 21:42:45 +0200 Subject: [PATCH 16/21] app/vmagent/remotewrite: cleanup after 1d1ba889fe61b2ce55216e616428839261e8d07c Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1065 --- app/vmagent/remotewrite/client.go | 31 +++++++++++++++++++------------ docs/CHANGELOG.md | 1 + 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/app/vmagent/remotewrite/client.go b/app/vmagent/remotewrite/client.go index d13c16e43..d8cbfd3c8 100644 --- a/app/vmagent/remotewrite/client.go +++ b/app/vmagent/remotewrite/client.go @@ -171,30 +171,35 @@ func getTLSConfig(argIdx int) (*tls.Config, error) { func (c *client) runWorker() { var ok bool var block []byte - ch := make(chan struct{}) + ch := make(chan bool, 1) for { block, ok = c.fq.MustReadBlock(block[:0]) if !ok { return } go func() { - if delivered := c.sendBlockOk(block); !delivered { - return - } - ch <- struct{}{} + ch <- c.sendBlock(block) }() select { - case <-ch: - // The block has been sent successfully - continue + case ok := <-ch: + if ok { + // The block has been sent successfully + continue + } + // Return unsent block to the queue. + c.fq.MustWriteBlock(block) + return case <-c.stopCh: // c must be stopped. Wait for a while in the hope the block will be sent. graceDuration := 5 * time.Second select { - case <-ch: - logger.Infof("stop ok") - // The block has been sent successfully. + case ok := <-ch: + if !ok { + // Return unsent block to the queue. + c.fq.MustWriteBlock(block) + } case <-time.After(graceDuration): + // Return unsent block to the queue. 
c.fq.MustWriteBlock(block) } return @@ -202,7 +207,9 @@ func (c *client) runWorker() { } } -func (c *client) sendBlockOk(block []byte) bool { +// sendBlock returns false only if c.stopCh is closed. +// Otherwise it tries sending the block to remote storage indefinitely. +func (c *client) sendBlock(block []byte) bool { c.rl.register(len(block), c.stopCh) retryDuration := time.Second retriesCount := 0 diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 25130331d..44e43252f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -18,6 +18,7 @@ * BUGFIX: do not spam error logs when discovering Docker Swarm targets without dedicated IP. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1028 . * BUGFIX: properly embed timezone data into VictoriaMetrics apps. This should fix `-loggerTimezone` usage inside Docker containers. * BUGFIX: properly build Docker images for non-amd64 architectures (arm, arm64, ppc64le, 386) on [Docker hub](https://hub.docker.com/u/victoriametrics/). Previously these images were incorrectly based on amd64 base image, so they didn't work. +* BUGFIX: vmagent: return back unsent block to the queue during graceful shutdown. Previously this block could be dropped if remote storage is unavailable during vmagent shutdown. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1065 . # [v1.53.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.53.1) From d7de4807e1eca2cfbc0633cb8dcc9d9e4723f815 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Feb 2021 21:52:43 +0200 Subject: [PATCH 17/21] app/victoria-metrics/testdata: add a test for {__graphite__="foo.*.bar"} selector --- .../testdata/graphite/graphite-selector.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 app/victoria-metrics/testdata/graphite/graphite-selector.json diff --git a/app/victoria-metrics/testdata/graphite/graphite-selector.json b/app/victoria-metrics/testdata/graphite/graphite-selector.json new file mode 100644 index 000000000..5ab1c3ce9 --- /dev/null +++ b/app/victoria-metrics/testdata/graphite/graphite-selector.json @@ -0,0 +1,17 @@ +{ + "name": "graphite-selector", + "issue": "", + "data": [ + "graphite-selector.bar.baz 1 {TIME_S-1m}", + "graphite-selector.xxx.yy 2 {TIME_S-1m}", + "graphite-selector.bb.cc 3 {TIME_S-1m}", + "graphite-selector.a.baz 4 {TIME_S-1m}"], + "query": ["/api/v1/query?query=sort({__graphite__='graphite-selector.*.baz'})&time={TIME_S-1m}"], + "result_query": { + "status":"success", + "data":{"resultType":"vector","result":[ + {"metric":{"__name__":"graphite-selector.bar.baz"},"value":["{TIME_S-1m}","1"]}, + {"metric":{"__name__":"graphite-selector.a.baz"},"value":["{TIME_S-1m}","4"]} + ]} + } +} From f95dd67a22aec17131d003a3fdeb0bb8bd1101f5 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 18 Feb 2021 01:25:41 +0200 Subject: [PATCH 18/21] docs/Articles.md: add a link to https://medium.com/alteos-tech-blog/observability-availability-and-doras-research-program-85deb6680e78 --- docs/Articles.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/Articles.md b/docs/Articles.md index 3a42d62fe..ed2a1f486 100644 --- a/docs/Articles.md +++ b/docs/Articles.md @@ -24,6 +24,7 @@ * [Monitoring Kubernetes with VictoriaMetrics+Prometheus](https://speakerdeck.com/bo0km4n/victoriametrics-plus-prometheusdegou-zhu-surufu-shu-kubernetesfalsejian-shi-ji-pan) * [High-performance Graphite storage solution on top of VictoriaMetrics](https://golangexample.com/a-high-performance-graphite-storage-solution/) 
* [Cloud Native Model Driven Telemetry Stack on OpenShift](https://cer6erus.medium.com/cloud-native-model-driven-telemetry-stack-on-openshift-80712621f5bc) +* [Observability, Availability & DORA’s Research Program](https://medium.com/alteos-tech-blog/observability-availability-and-doras-research-program-85deb6680e78) ## Our articles From 711f8a5b8debce5fcdf828ea82327694e5b9b4ae Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 18 Feb 2021 12:47:36 +0200 Subject: [PATCH 19/21] lib/storage: sort tag filters by the number of loops they need for the execution This metric should work better than the filter execution duration, since it cannot be distorted by concurrently running queries. --- lib/storage/index_db.go | 168 ++++++++++++++++++------------------- lib/storage/table.go | 2 +- lib/storage/tag_filters.go | 4 +- 3 files changed, 85 insertions(+), 89 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index d3e77d90a..aa7da3efb 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "io" - "math" "path/filepath" "sort" "sync" @@ -2047,7 +2046,7 @@ func (is *indexSearch) getTagFilterWithMinMetricIDsCount(tfs *TagFilters, maxMet continue } - metricIDs, err := is.getMetricIDsForTagFilter(tf, nil, maxMetrics) + metricIDs, _, err := is.getMetricIDsForTagFilter(tf, nil, maxMetrics) if err != nil { if err == errFallbackToMetricNameMatch { // Skip tag filters requiring to scan for too many metrics. @@ -2369,34 +2368,36 @@ const ( var uselessTagFilterCacheValue = []byte("1") -func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, filter *uint64set.Set, maxMetrics int) (*uint64set.Set, error) { +func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, filter *uint64set.Set, maxMetrics int) (*uint64set.Set, uint64, error) { if tf.isNegative { logger.Panicf("BUG: isNegative must be false") } metricIDs := &uint64set.Set{} if len(tf.orSuffixes) > 0 { // Fast path for orSuffixes - seek for rows for each value from orSuffixes. - if err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs); err != nil { + loopsCount, err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs) + if err != nil { if err == errFallbackToMetricNameMatch { - return nil, err + return nil, loopsCount, err } - return nil, fmt.Errorf("error when searching for metricIDs for tagFilter in fast path: %w; tagFilter=%s", err, tf) + return nil, loopsCount, fmt.Errorf("error when searching for metricIDs for tagFilter in fast path: %w; tagFilter=%s", err, tf) } - return metricIDs, nil + return metricIDs, loopsCount, nil } // Slow path - scan for all the rows with the given prefix. 
- maxLoops := maxMetrics * maxIndexScanSlowLoopsPerMetric - if err := is.getMetricIDsForTagFilterSlow(tf, filter, maxLoops, metricIDs.Add); err != nil { + maxLoopsCount := uint64(maxMetrics) * maxIndexScanSlowLoopsPerMetric + loopsCount, err := is.getMetricIDsForTagFilterSlow(tf, filter, maxLoopsCount, metricIDs.Add) + if err != nil { if err == errFallbackToMetricNameMatch { - return nil, err + return nil, loopsCount, err } - return nil, fmt.Errorf("error when searching for metricIDs for tagFilter in slow path: %w; tagFilter=%s", err, tf) + return nil, loopsCount, fmt.Errorf("error when searching for metricIDs for tagFilter in slow path: %w; tagFilter=%s", err, tf) } - return metricIDs, nil + return metricIDs, loopsCount, nil } -func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint64set.Set, maxLoops int, f func(metricID uint64)) error { +func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint64set.Set, maxLoopsCount uint64, f func(metricID uint64)) (uint64, error) { if len(tf.orSuffixes) > 0 { logger.Panicf("BUG: the getMetricIDsForTagFilterSlow must be called only for empty tf.orSuffixes; got %s", tf.orSuffixes) } @@ -2408,40 +2409,40 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint6 mp.Reset() var prevMatchingSuffix []byte var prevMatch bool - loops := 0 + var loopsCount uint64 loopsPaceLimiter := 0 prefix := tf.prefix ts.Seek(prefix) for ts.NextItem() { if loopsPaceLimiter&paceLimiterMediumIterationsMask == 0 { if err := checkSearchDeadlineAndPace(is.deadline); err != nil { - return err + return loopsCount, err } } loopsPaceLimiter++ item := ts.Item if !bytes.HasPrefix(item, prefix) { - return nil + return loopsCount, nil } tail := item[len(prefix):] n := bytes.IndexByte(tail, tagSeparatorChar) if n < 0 { - return fmt.Errorf("invalid tag->metricIDs line %q: cannot find tagSeparatorChar=%d", item, tagSeparatorChar) + return loopsCount, fmt.Errorf("invalid tag->metricIDs line %q: cannot find tagSeparatorChar=%d", item, tagSeparatorChar) } suffix := tail[:n+1] tail = tail[n+1:] if err := mp.InitOnlyTail(item, tail); err != nil { - return err + return loopsCount, err } mp.ParseMetricIDs() + loopsCount += uint64(mp.MetricIDsLen()) + if loopsCount > maxLoopsCount { + return loopsCount, errFallbackToMetricNameMatch + } if prevMatch && string(suffix) == string(prevMatchingSuffix) { // Fast path: the same tag value found. // There is no need in checking it again with potentially // slow tf.matchSuffix, which may call regexp. - loops += mp.MetricIDsLen() - if loops > maxLoops { - return errFallbackToMetricNameMatch - } for _, metricID := range mp.MetricIDs { if filter != nil && !filter.Has(metricID) { continue @@ -2455,11 +2456,11 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint6 // since the current row has no matching metricIDs. continue } - // Slow path: need tf.matchSuffix call. ok, err := tf.matchSuffix(suffix) + loopsCount += reMatchCost if err != nil { - return fmt.Errorf("error when matching %s against suffix %q: %w", tf, suffix, err) + return loopsCount, fmt.Errorf("error when matching %s against suffix %q: %w", tf, suffix, err) } if !ok { prevMatch = false @@ -2474,18 +2475,16 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint6 // The last char in kb.B must be tagSeparatorChar. Just increment it // in order to jump to the next tag value. 
if len(kb.B) == 0 || kb.B[len(kb.B)-1] != tagSeparatorChar || tagSeparatorChar >= 0xff { - return fmt.Errorf("data corruption: the last char in k=%X must be %X", kb.B, tagSeparatorChar) + return loopsCount, fmt.Errorf("data corruption: the last char in k=%X must be %X", kb.B, tagSeparatorChar) } kb.B[len(kb.B)-1]++ ts.Seek(kb.B) + // Assume that a seek cost is equivalent to 100 ordinary loops. + loopsCount += 100 continue } prevMatch = true prevMatchingSuffix = append(prevMatchingSuffix[:0], suffix...) - loops += mp.MetricIDsLen() - if loops > maxLoops { - return errFallbackToMetricNameMatch - } for _, metricID := range mp.MetricIDs { if filter != nil && !filter.Has(metricID) { continue @@ -2494,29 +2493,32 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, filter *uint6 } } if err := ts.Error(); err != nil { - return fmt.Errorf("error when searching for tag filter prefix %q: %w", prefix, err) + return loopsCount, fmt.Errorf("error when searching for tag filter prefix %q: %w", prefix, err) } - return nil + return loopsCount, nil } -func (is *indexSearch) updateMetricIDsForOrSuffixesNoFilter(tf *tagFilter, maxMetrics int, metricIDs *uint64set.Set) error { +func (is *indexSearch) updateMetricIDsForOrSuffixesNoFilter(tf *tagFilter, maxMetrics int, metricIDs *uint64set.Set) (uint64, error) { if tf.isNegative { logger.Panicf("BUG: isNegative must be false") } kb := kbPool.Get() defer kbPool.Put(kb) + var loopsCount uint64 for _, orSuffix := range tf.orSuffixes { kb.B = append(kb.B[:0], tf.prefix...) kb.B = append(kb.B, orSuffix...) kb.B = append(kb.B, tagSeparatorChar) - if err := is.updateMetricIDsForOrSuffixNoFilter(kb.B, maxMetrics, metricIDs); err != nil { - return err + lc, err := is.updateMetricIDsForOrSuffixNoFilter(kb.B, maxMetrics, metricIDs) + if err != nil { + return loopsCount, err } + loopsCount += lc if metricIDs.Len() >= maxMetrics { - return nil + return loopsCount, nil } } - return nil + return loopsCount, nil } func (is *indexSearch) updateMetricIDsForOrSuffixesWithFilter(tf *tagFilter, metricIDs, filter *uint64set.Set) error { @@ -2534,39 +2536,39 @@ func (is *indexSearch) updateMetricIDsForOrSuffixesWithFilter(tf *tagFilter, met return nil } -func (is *indexSearch) updateMetricIDsForOrSuffixNoFilter(prefix []byte, maxMetrics int, metricIDs *uint64set.Set) error { +func (is *indexSearch) updateMetricIDsForOrSuffixNoFilter(prefix []byte, maxMetrics int, metricIDs *uint64set.Set) (uint64, error) { ts := &is.ts mp := &is.mp mp.Reset() - maxLoops := maxMetrics * maxIndexScanLoopsPerMetric - loops := 0 + maxLoopsCount := uint64(maxMetrics) * maxIndexScanLoopsPerMetric + var loopsCount uint64 loopsPaceLimiter := 0 ts.Seek(prefix) for metricIDs.Len() < maxMetrics && ts.NextItem() { if loopsPaceLimiter&paceLimiterFastIterationsMask == 0 { if err := checkSearchDeadlineAndPace(is.deadline); err != nil { - return err + return loopsCount, err } } loopsPaceLimiter++ item := ts.Item if !bytes.HasPrefix(item, prefix) { - return nil + return loopsCount, nil } if err := mp.InitOnlyTail(item, item[len(prefix):]); err != nil { - return err + return loopsCount, err } - loops += mp.MetricIDsLen() - if loops > maxLoops { - return errFallbackToMetricNameMatch + loopsCount += uint64(mp.MetricIDsLen()) + if loopsCount > maxLoopsCount { + return loopsCount, errFallbackToMetricNameMatch } mp.ParseMetricIDs() metricIDs.AddMulti(mp.MetricIDs) } if err := ts.Error(); err != nil { - return fmt.Errorf("error when searching for tag filter prefix %q: %w", prefix, err) + return loopsCount, 
fmt.Errorf("error when searching for tag filter prefix %q: %w", prefix, err) } - return nil + return loopsCount, nil } func (is *indexSearch) updateMetricIDsForOrSuffixWithFilter(prefix []byte, metricIDs *uint64set.Set, sortedFilter []uint64, isNegative bool) error { @@ -2578,8 +2580,8 @@ func (is *indexSearch) updateMetricIDsForOrSuffixWithFilter(prefix []byte, metri ts := &is.ts mp := &is.mp mp.Reset() - maxLoops := len(sortedFilter) * maxIndexScanLoopsPerMetric - loops := 0 + maxLoopsCount := uint64(len(sortedFilter)) * maxIndexScanLoopsPerMetric + var loopsCount uint64 loopsPaceLimiter := 0 ts.Seek(prefix) var sf []uint64 @@ -2610,8 +2612,8 @@ func (is *indexSearch) updateMetricIDsForOrSuffixWithFilter(prefix []byte, metri return nil } sf = sortedFilter - loops += mp.MetricIDsLen() - if loops > maxLoops { + loopsCount += uint64(mp.MetricIDsLen()) + if loopsCount > maxLoopsCount { return errFallbackToMetricNameMatch } mp.ParseMetricIDs() @@ -2781,31 +2783,33 @@ func (is *indexSearch) tryUpdatingMetricIDsForDateRange(metricIDs *uint64set.Set } func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilters, maxMetrics int) (*uint64set.Set, error) { - // Sort tfs by the duration from previous queries. + // Sort tfs by loopsCount needed for performing each filter. + // This stats is usually collected from the previous queries. // This way we limit the amount of work below by applying fast filters at first. type tagFilterWithWeight struct { tf *tagFilter - durationSeconds float64 + loopsCount uint64 lastQueryTimestamp uint64 } tfws := make([]tagFilterWithWeight, len(tfs.tfs)) + currentTime := fasttime.UnixTimestamp() for i := range tfs.tfs { tf := &tfs.tfs[i] - durationSeconds, lastQueryTimestamp := is.getDurationAndTimestampForDateFilter(date, tf) - if durationSeconds == 0 { - // Assume that unknown tag filters can take quite big amounts of time. - durationSeconds = 1.0 + loopsCount, lastQueryTimestamp := is.getLoopsCountAndTimestampForDateFilter(date, tf) + if currentTime > lastQueryTimestamp+60*60 { + // Reset loopsCount to 0 every hour for collecting updated stats for the tf. + loopsCount = 0 } tfws[i] = tagFilterWithWeight{ tf: tf, - durationSeconds: durationSeconds, + loopsCount: loopsCount, lastQueryTimestamp: lastQueryTimestamp, } } sort.Slice(tfws, func(i, j int) bool { a, b := &tfws[i], &tfws[j] - if a.durationSeconds != b.durationSeconds { - return a.durationSeconds < b.durationSeconds + if a.loopsCount != b.loopsCount { + return a.loopsCount < b.loopsCount } return a.tf.Less(b.tf) }) @@ -2872,7 +2876,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter // Short circuit - there is no need in applying the remaining filters to an empty set. break } - if float64(metricIDsLen)/metricNameMatchesPerSecond < tfw.durationSeconds { + if uint64(metricIDsLen)*maxIndexScanLoopsPerMetric < tfw.loopsCount { // It should be faster performing metricName match on the remaining filters // instead of scanning big number of entries in the inverted index for these filters. tfsPostponed = append(tfsPostponed, tf) @@ -2908,12 +2912,6 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter return metricIDs, nil } -// The estimated number of per-second loops inside updateMetricIDsByMetricNameMatch -// -// This value is used for determining when matching by metric name must be perfromed instead of matching -// by the remaining tag filters. 
-const metricNameMatchesPerSecond = 50000 - func (is *indexSearch) storeDateMetricID(date, metricID uint64) error { ii := getIndexItems() defer putIndexItems(ii) @@ -3073,8 +3071,7 @@ func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTime tfNew := *tf tfNew.isNegative = false // isNegative for the original tf is handled by the caller. tfNew.prefix = kb.B - startTime := time.Now() - metricIDs, err := is.getMetricIDsForTagFilter(&tfNew, filter, maxMetrics) + metricIDs, loopsCount, err := is.getMetricIDsForTagFilter(&tfNew, filter, maxMetrics) kbPool.Put(kb) currentTimestamp := fasttime.UnixTimestamp() if currentTimestamp > lastQueryTimestamp+5 { @@ -3082,23 +3079,22 @@ func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTime // Do not update it too frequently. return metricIDs, err } - // Store the duration for tag filter execution in the cache in order to sort tag filters + // Store the loopsCount for tag filter in the cache in order to sort tag filters // in ascending durations on the next search. - durationSeconds := time.Since(startTime).Seconds() - if err != nil && durationSeconds < 10 { - // Set high duration for failing filter, so it is moved to the end of filter list. - durationSeconds = 10 + if err != nil { + // Set high loopsCount for failing filter, so it is moved to the end of filter list. + loopsCount = 1 << 30 } if metricIDs.Len() >= maxMetrics { - // Increase the duration for tag filter matching too many metrics, - // So next time it will be applied after filters matching lower number of metrics. - durationSeconds *= 2 + // Increase loopsCount for tag filter matching too many metrics, + // So next time it is moved to the end of filter list. + loopsCount *= 2 } - is.storeDurationAndTimestampForDateFilter(date, tf, durationSeconds, currentTimestamp) + is.storeLoopsCountAndTimestampForDateFilter(date, tf, loopsCount, currentTimestamp) return metricIDs, err } -func (is *indexSearch) getDurationAndTimestampForDateFilter(date uint64, tf *tagFilter) (float64, uint64) { +func (is *indexSearch) getLoopsCountAndTimestampForDateFilter(date uint64, tf *tagFilter) (uint64, uint64) { is.kb.B = appendDateTagFilterCacheKey(is.kb.B[:0], date, tf) kb := kbPool.Get() defer kbPool.Put(kb) @@ -3106,17 +3102,15 @@ func (is *indexSearch) getDurationAndTimestampForDateFilter(date uint64, tf *tag if len(kb.B) != 16 { return 0, 0 } - n := encoding.UnmarshalUint64(kb.B) - durationSeconds := math.Float64frombits(n) + loopsCount := encoding.UnmarshalUint64(kb.B) timestamp := encoding.UnmarshalUint64(kb.B[8:]) - return durationSeconds, timestamp + return loopsCount, timestamp } -func (is *indexSearch) storeDurationAndTimestampForDateFilter(date uint64, tf *tagFilter, durationSeconds float64, timestamp uint64) { +func (is *indexSearch) storeLoopsCountAndTimestampForDateFilter(date uint64, tf *tagFilter, loopsCount, timestamp uint64) { is.kb.B = appendDateTagFilterCacheKey(is.kb.B[:0], date, tf) - n := math.Float64bits(durationSeconds) kb := kbPool.Get() - kb.B = encoding.MarshalUint64(kb.B[:0], n) + kb.B = encoding.MarshalUint64(kb.B[:0], loopsCount) kb.B = encoding.MarshalUint64(kb.B, timestamp) is.db.durationsPerDateTagFilterCache.Set(is.kb.B, kb.B) kbPool.Put(kb) @@ -3243,8 +3237,8 @@ func (is *indexSearch) intersectMetricIDsWithTagFilterNocache(tf *tagFilter, fil } // Slow path - scan for all the rows with the given prefix. 
- maxLoops := filter.Len() * maxIndexScanSlowLoopsPerMetric - err := is.getMetricIDsForTagFilterSlow(tf, filter, maxLoops, func(metricID uint64) { + maxLoopsCount := uint64(filter.Len()) * maxIndexScanSlowLoopsPerMetric + _, err := is.getMetricIDsForTagFilterSlow(tf, filter, maxLoopsCount, func(metricID uint64) { if tf.isNegative { // filter must be equal to metricIDs metricIDs.Del(metricID) diff --git a/lib/storage/table.go b/lib/storage/table.go index 646d33074..5049a18e0 100644 --- a/lib/storage/table.go +++ b/lib/storage/table.go @@ -203,7 +203,7 @@ func (tb *table) MustClose() { if n := atomic.LoadUint64(&ptw.refCount); n != 1 { logger.Panicf("BUG: unexpected refCount=%d when closing the partition; probably there are pending searches", n) } - ptw.pt.MustClose() + ptw.decRef() } // Release exclusive lock on the table. diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index c38747280..f4e3a11c0 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -221,7 +221,9 @@ type tagFilter struct { value []byte isNegative bool isRegexp bool - matchCost uint64 + + // matchCost is a cost for matching a filter against a single string. + matchCost uint64 // Prefix always contains {nsPrefixTagToMetricIDs, key}. // Additionally it contains: From e540c020144bceb4d9f87bb7a0decaadebc270e0 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 18 Feb 2021 13:56:50 +0200 Subject: [PATCH 20/21] lib/storage: prevent from running identical heavy tag filters in concurrent queries when measuring the number of loops for such tag filter. This should reduce CPU usage spikes when measuring the number of loops needed for heavy tag filters --- lib/storage/index_db.go | 44 +++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index aa7da3efb..e01efa14c 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -2796,10 +2796,20 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter for i := range tfs.tfs { tf := &tfs.tfs[i] loopsCount, lastQueryTimestamp := is.getLoopsCountAndTimestampForDateFilter(date, tf) + origLoopsCount := loopsCount if currentTime > lastQueryTimestamp+60*60 { // Reset loopsCount to 0 every hour for collecting updated stats for the tf. loopsCount = 0 } + if loopsCount == 0 { + // Prevent from possible thundering herd issue when heavy tf is executed from multiple concurrent queries + // by temporary persisting its position in the tag filters list. 
+ if origLoopsCount == 0 { + origLoopsCount = 10e6 + } + lastQueryTimestamp = 0 + is.storeLoopsCountForDateFilter(date, tf, origLoopsCount, lastQueryTimestamp) + } tfws[i] = tagFilterWithWeight{ tf: tf, loopsCount: loopsCount, @@ -2826,7 +2836,8 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter tfwsRemaining = append(tfwsRemaining, tfw) continue } - m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, nil, tfs.commonPrefix, maxDateMetrics) + m, loopsCount, err := is.getMetricIDsForDateTagFilter(tf, date, nil, tfs.commonPrefix, maxDateMetrics) + is.storeLoopsCountForDateFilter(date, tf, loopsCount, tfw.lastQueryTimestamp) if err != nil { return nil, err } @@ -2880,9 +2891,12 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter // It should be faster performing metricName match on the remaining filters // instead of scanning big number of entries in the inverted index for these filters. tfsPostponed = append(tfsPostponed, tf) + // Store stats for non-executed tf, since it could be updated during protection from thundered herd. + is.storeLoopsCountForDateFilter(date, tf, tfw.loopsCount, tfw.lastQueryTimestamp) continue } - m, err := is.getMetricIDsForDateTagFilter(tf, tfw.lastQueryTimestamp, date, metricIDs, tfs.commonPrefix, maxDateMetrics) + m, loopsCount, err := is.getMetricIDsForDateTagFilter(tf, date, metricIDs, tfs.commonPrefix, maxDateMetrics) + is.storeLoopsCountForDateFilter(date, tf, loopsCount, tfw.lastQueryTimestamp) if err != nil { return nil, err } @@ -3058,8 +3072,7 @@ func (is *indexSearch) hasDateMetricID(date, metricID uint64) (bool, error) { return true, nil } -func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTimestamp, date uint64, - filter *uint64set.Set, commonPrefix []byte, maxMetrics int) (*uint64set.Set, error) { +func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, date uint64, filter *uint64set.Set, commonPrefix []byte, maxMetrics int) (*uint64set.Set, uint64, error) { // Augument tag filter prefix for per-date search instead of global search. if !bytes.HasPrefix(tf.prefix, commonPrefix) { logger.Panicf("BUG: unexpected tf.prefix %q; must start with commonPrefix %q", tf.prefix, commonPrefix) @@ -3073,25 +3086,16 @@ func (is *indexSearch) getMetricIDsForDateTagFilter(tf *tagFilter, lastQueryTime tfNew.prefix = kb.B metricIDs, loopsCount, err := is.getMetricIDsForTagFilter(&tfNew, filter, maxMetrics) kbPool.Put(kb) - currentTimestamp := fasttime.UnixTimestamp() - if currentTimestamp > lastQueryTimestamp+5 { - // The cache already contains quite fresh entry for the current (date, tf). - // Do not update it too frequently. - return metricIDs, err - } - // Store the loopsCount for tag filter in the cache in order to sort tag filters - // in ascending durations on the next search. if err != nil { // Set high loopsCount for failing filter, so it is moved to the end of filter list. - loopsCount = 1 << 30 + loopsCount = 1e9 } if metricIDs.Len() >= maxMetrics { // Increase loopsCount for tag filter matching too many metrics, // So next time it is moved to the end of filter list. 
loopsCount *= 2 } - is.storeLoopsCountAndTimestampForDateFilter(date, tf, loopsCount, currentTimestamp) - return metricIDs, err + return metricIDs, loopsCount, err } func (is *indexSearch) getLoopsCountAndTimestampForDateFilter(date uint64, tf *tagFilter) (uint64, uint64) { @@ -3107,11 +3111,17 @@ func (is *indexSearch) getLoopsCountAndTimestampForDateFilter(date uint64, tf *t return loopsCount, timestamp } -func (is *indexSearch) storeLoopsCountAndTimestampForDateFilter(date uint64, tf *tagFilter, loopsCount, timestamp uint64) { +func (is *indexSearch) storeLoopsCountForDateFilter(date uint64, tf *tagFilter, loopsCount, prevTimestamp uint64) { + currentTimestamp := fasttime.UnixTimestamp() + if currentTimestamp < prevTimestamp+5 { + // The cache already contains quite fresh entry for the current (date, tf). + // Do not update it too frequently. + return + } is.kb.B = appendDateTagFilterCacheKey(is.kb.B[:0], date, tf) kb := kbPool.Get() kb.B = encoding.MarshalUint64(kb.B[:0], loopsCount) - kb.B = encoding.MarshalUint64(kb.B, timestamp) + kb.B = encoding.MarshalUint64(kb.B, currentTimestamp) is.db.durationsPerDateTagFilterCache.Set(is.kb.B, kb.B) kbPool.Put(kb) } From a7697cc88b3556cc19cb9463938b9c6bc82e6f52 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 18 Feb 2021 14:52:38 +0200 Subject: [PATCH 21/21] docs/CHANGELOG.md: cut v1.54.0 --- docs/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 44e43252f..3688633f1 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -2,6 +2,9 @@ # tip + +# [v1.54.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.54.0) + * FEATURE: optimize searching for matching metrics for `metric{}` queries if `` contains at least a single filter. For example, the query `up{job="foobar"}` should find the matching time series much faster than previously. * FEATURE: reduce execution times for `q1 q2` queries by executing `q1` and `q2` in parallel. * FEATURE: switch from Go1.15 to [Go1.16](https://golang.org/doc/go1.16) for building prod binaries.
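
The following standalone Go sketch illustrates the worker pattern that the vmagent changes in patches 15 and 16 converge on: a block read from the persistent queue is either delivered by `sendBlock` or written back to the queue, so it is never silently dropped during graceful shutdown. The `fastQueue` type, its methods and the toy `main` below are simplified stand-ins (an in-memory channel instead of the real disk-backed queue), not the actual vmagent implementation.

```go
package main

import "time"

// fastQueue is a simplified stand-in for vmagent's persistent fast queue;
// the real implementation spills blocks to disk when the in-memory part is full.
type fastQueue struct {
	blocks chan []byte
}

// mustReadBlock returns the next block, or ok=false once the queue is closed.
func (fq *fastQueue) mustReadBlock() (block []byte, ok bool) {
	block, ok = <-fq.blocks
	return block, ok
}

// mustWriteBlock puts a block (back) into the queue so it is not lost.
func (fq *fastQueue) mustWriteBlock(block []byte) {
	fq.blocks <- block
}

// runWorker mirrors the control flow after patch 16: the result of sendBlock
// is reported over a buffered channel, and any block that was not delivered
// is written back to the queue instead of being dropped.
func runWorker(fq *fastQueue, stopCh <-chan struct{}, sendBlock func([]byte) bool) {
	ch := make(chan bool, 1)
	for {
		block, ok := fq.mustReadBlock()
		if !ok {
			return
		}
		go func() {
			ch <- sendBlock(block)
		}()
		select {
		case delivered := <-ch:
			if delivered {
				continue
			}
			// sendBlock gave up (e.g. shutdown); keep the block for the next run.
			fq.mustWriteBlock(block)
			return
		case <-stopCh:
			// Shutdown requested: give the in-flight send a short grace period.
			select {
			case delivered := <-ch:
				if !delivered {
					fq.mustWriteBlock(block)
				}
			case <-time.After(5 * time.Second):
				fq.mustWriteBlock(block)
			}
			return
		}
	}
}

func main() {
	fq := &fastQueue{blocks: make(chan []byte, 4)}
	fq.mustWriteBlock([]byte("example block"))
	close(fq.blocks) // toy stand-in for "the queue has been stopped"
	stopCh := make(chan struct{})
	runWorker(fq, stopCh, func(block []byte) bool {
		return true // pretend the block reached remote storage
	})
}
```

The buffered result channel mirrors the real client: even if the grace period expires and the worker returns, the in-flight sender can still report its result without blocking.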