From 6d5a8c28cdd09cd63ca6ed621a15a0b36d02a584 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Fri, 11 Jun 2021 13:25:53 +0300
Subject: [PATCH 01/26] Vmalert docs (#1372)

* vmalert: mention what happens if `for` is set to 0 or omitted

* vmalert: add more context to docs
---
 app/vmalert/README.md | 82 ++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 29 deletions(-)

diff --git a/app/vmalert/README.md b/app/vmalert/README.md
index f79b7d766..4eb8d4aeb 100644
--- a/app/vmalert/README.md
+++ b/app/vmalert/README.md
@@ -1,8 +1,9 @@
# vmalert

-`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
+`vmalert` executes a list of the given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
-rules against configured address.
+rules against the configured address. It is heavily inspired by the [Prometheus](https://prometheus.io/docs/alerting/latest/overview/)
+implementation and aims to be compatible with its syntax.

## Features
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
@@ -40,21 +41,23 @@ To start using `vmalert` you will need the following things:
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
-* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
-compatible storage address for storing recording rules results and alerts state in form of timeseries. This is optional.
+* remote write address [optional] - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
+compatible storage address for storing recording rules results and alerts state in form of timeseries.

Then configure `vmalert` accordingly:
```
-./bin/vmalert -rule=alert.rules \
+./bin/vmalert -rule=alert.rules \            # Path to the file with rules configuration. Supports wildcard
    -datasource.url=http://localhost:8428 \  # PromQL compatible datasource
    -notifier.url=http://localhost:9093 \    # AlertManager URL
    -notifier.url=http://127.0.0.1:9093 \    # AlertManager replica URL
-    -remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-    -remoteRead.url=http://localhost:8428 \  # PromQL compatible datasource to restore alerts state from
+    -remoteWrite.url=http://localhost:8428 \ # Remote write compatible storage to persist rules
+    -remoteRead.url=http://localhost:8428 \  # MetricsQL compatible datasource to restore alerts state from
    -external.label=cluster=east-1 \         # External label to be applied for each rule
    -external.label=replica=a                # Multiple external labels may be set
```

+See the full list of configuration flags in the [configuration](#configuration) section.
+
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.

@@ -62,7 +65,7 @@ Configuration for [recording](https://prometheus.io/docs/prometheus/latest/confi
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
rules is very similar to Prometheus rules and configured using YAML.
Configuration examples may be found in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
-Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
+Every `rule` belongs to a `group` and every configuration file may contain an arbitrary number of groups:
```yaml
groups:
  [ - <rule_group> ]
```

### Groups

-Each group has following attributes:
+Each group has the following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>

# How often rules in the group are evaluated.
-[ interval: <duration> | default = global.evaluation_interval ]
+[ interval: <duration> | default = -evaluationInterval flag ]

-# How many rules execute at once. Increasing concurrency may speed
+# How many rules execute at once within a group. Increasing concurrency may speed
# up rule execution.
[ concurrency: <integer> | default = 1 ]

rules:
  [ - <rule> ... ]
```

### Rules

+Every rule contains the `expr` field with a [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
+or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expression. `vmalert` will execute the configured
+expression and then act according to the Rule type.
+
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
-Alerting rules allows to define alert conditions via [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html)
-and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
+Alerting rules allow defining alert conditions via the `expr` field and sending notifications
+to [Alertmanager](https://github.com/prometheus/alertmanager) if the execution result is not empty.
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
-Recording rules allow you to precompute frequently needed or computationally expensive expressions
-and save their result as a new set of time series.
+Recording rules allow defining `expr` whose result will then be backfilled to the configured
+`-remoteWrite.url`. Recording rules are used to precompute frequently needed or computationally
+expensive expressions and save their result as a new set of time series.

`vmalert` forbids defining duplicates - rules with the same combination of name, expression and labels
within one group.

#### Alerting rules

-The syntax for alerting rule is following:
+The syntax for an alerting rule is the following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>

[ type: <string> ]

# The expression to evaluate. The expression language depends on the type value.
-# By default MetricsQL expression is used. If type="graphite", then the expression
+# By default PromQL/MetricsQL expression is used. If type="graphite", then the expression
# must contain valid Graphite expression.
expr: <string>

# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
+# If param is omitted or set to 0 then alerts will be immediately considered
+# as firing once they return.
[ for: <duration> | default = 0s ]

# Labels to add or overwrite for each alert.
labels:
  [ <labelname>: <tmpl_string> ]
```

-For recording rules to work `-remoteWrite.url` must specified.
+For recording rules to work `-remoteWrite.url` must be specified.

### Alerts state on restarts

-`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after reloading of `vmalert`
+`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after restart of `vmalert`
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
The state is stored to the configured address on every rule evaluation.
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or vmselect (Cluster). `vmalert` will try to restore alerts state
from configured address by querying time series with name `ALERTS_FOR_STATE`.

Both flags are required for the proper state restoring. Restore process may fail if time series are missing
-in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
-rules configuration.
+in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
+or received state doesn't match current `vmalert` rules configuration.

### Multitenancy

-There are the following approaches for alerting and recording rules across [multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) exist:
+There are the following approaches for alerting and recording rules across
+[multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy):

-* To run a separate `vmalert` instance per each tenant. The corresponding tenant must be specified in `-datasource.url` command-line flag according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format). For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus` would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line flag must contain the url for the specific tenant as well. For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording rules to `AccountID=123`.
+* To run a separate `vmalert` instance per each tenant.
+  The corresponding tenant must be specified in `-datasource.url` command-line flag
+  according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format).
+  For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus`
+  would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line
+  flag must contain the url for the specific tenant as well.
+  For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording
+  rules to `AccountID=123`.

-* To specify `tenant` parameter per each alerting and recording group if [enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used with `-clusterMode` command-line flag. For example:
+* To specify `tenant` parameter per each alerting and recording group if
+  [enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used
+  with `-clusterMode` command-line flag. For example:

```yaml
groups:
- name: rules_for_tenant_123
  tenant: "123"
  rules:
    # Rules for accountID=123

- name: rules_for_tenant_456:789
  tenant: "456:789"
  rules:
    # Rules for accountID=456, projectID=789
```

-If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481` . `vmselect` automatically adds the specified tenant to urls per each recording rule in this case.
+If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must
+contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481`.
+`vmalert` automatically adds the specified tenant to urls per each recording rule in this case.

-The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise` tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
+The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
+at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
+tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).

### WEB

@@ -318,6 +342,9 @@ See full description for these flags in `./vmalert --help`.

## Configuration

+Pass `-help` to `vmalert` in order to see the full list of supported
+command-line flags with their descriptions.
+
The shortlist of configuration flags is the following:
```
-datasource.appendTypePrefix
@@ -510,9 +537,6 @@ The shortlist of configuration flags is the following:
  -version
    Show VictoriaMetrics version
```

-Pass `-help` to `vmalert` in order to see the full list of supported
-command-line flags with their descriptions.
-
`vmalert` supports "hot" config reload via the following methods:
* send SIGHUP signal to `vmalert` process;
* send GET request to `/-/reload` endpoint;

From 8a519f151822a84b938d7fd2bd13d07529e8b449 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Mon, 14 Jun 2021 11:37:26 +0300
Subject: [PATCH 02/26] docs/vmalert.md: follow-up after 6d5a8c28cdd09cd63ca6ed621a15a0b36d02a584

---
 docs/vmalert.md | 82 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 29 deletions(-)

diff --git a/docs/vmalert.md b/docs/vmalert.md
index f9b0b7e86..602d989e6 100644
--- a/docs/vmalert.md
+++ b/docs/vmalert.md
@@ -4,9 +4,10 @@ sort: 4

# vmalert

-`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
+`vmalert` executes a list of the given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
-rules against configured address.
+rules against the configured address. It is heavily inspired by the [Prometheus](https://prometheus.io/docs/alerting/latest/overview/)
+implementation and aims to be compatible with its syntax.

## Features
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
@@ -44,21 +45,23 @@ To start using `vmalert` you will need the following things:
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
-* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
-compatible storage address for storing recording rules results and alerts state in form of timeseries. This is optional.
+* remote write address [optional] - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
+compatible storage address for storing recording rules results and alerts state in form of timeseries.
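+For example, a minimal rules file passed via the `-rule` flag may look like the
+following sketch (the group name, alert name and expression here are illustrative):
+
+```yaml
+groups:
+  - name: example                 # illustrative group name
+    rules:
+      - alert: ServiceDown        # illustrative alert name
+        expr: up == 0             # PromQL/MetricsQL expression to evaluate
+        for: 5m                   # alert stays pending for 5m before firing
+```
+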
Then configure `vmalert` accordingly:
```
-./bin/vmalert -rule=alert.rules \
+./bin/vmalert -rule=alert.rules \            # Path to the file with rules configuration. Supports wildcard
    -datasource.url=http://localhost:8428 \  # PromQL compatible datasource
    -notifier.url=http://localhost:9093 \    # AlertManager URL
    -notifier.url=http://127.0.0.1:9093 \    # AlertManager replica URL
-    -remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-    -remoteRead.url=http://localhost:8428 \  # PromQL compatible datasource to restore alerts state from
+    -remoteWrite.url=http://localhost:8428 \ # Remote write compatible storage to persist rules
+    -remoteRead.url=http://localhost:8428 \  # MetricsQL compatible datasource to restore alerts state from
    -external.label=cluster=east-1 \         # External label to be applied for each rule
    -external.label=replica=a                # Multiple external labels may be set
```

+See the full list of configuration flags in the [configuration](#configuration) section.
+
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.

Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
rules is very similar to Prometheus rules and configured using YAML.
Configuration examples may be found in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
-Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
+Every `rule` belongs to a `group` and every configuration file may contain an arbitrary number of groups:
```yaml
groups:
  [ - <rule_group> ]
```

### Groups

-Each group has following attributes:
+Each group has the following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>

# How often rules in the group are evaluated.
-[ interval: <duration> | default = global.evaluation_interval ]
+[ interval: <duration> | default = -evaluationInterval flag ]

-# How many rules execute at once. Increasing concurrency may speed
+# How many rules execute at once within a group. Increasing concurrency may speed
# up rule execution.
[ concurrency: <integer> | default = 1 ]

rules:
  [ - <rule> ... ]
```

### Rules

+Every rule contains the `expr` field with a [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/)
+or [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expression. `vmalert` will execute the configured
+expression and then act according to the Rule type.
+
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
-Alerting rules allows to define alert conditions via [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html)
-and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
+Alerting rules allow defining alert conditions via the `expr` field and sending notifications
+to [Alertmanager](https://github.com/prometheus/alertmanager) if the execution result is not empty.
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
-Recording rules allow you to precompute frequently needed or computationally expensive expressions
-and save their result as a new set of time series.
+Recording rules allow defining `expr` whose result will then be backfilled to the configured
+`-remoteWrite.url`. Recording rules are used to precompute frequently needed or computationally
+expensive expressions and save their result as a new set of time series.

`vmalert` forbids defining duplicates - rules with the same combination of name, expression and labels
within one group.

#### Alerting rules

-The syntax for alerting rule is following:
+The syntax for an alerting rule is the following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>

[ type: <string> ]

# The expression to evaluate. The expression language depends on the type value.
-# By default MetricsQL expression is used. If type="graphite", then the expression
+# By default PromQL/MetricsQL expression is used. If type="graphite", then the expression
# must contain valid Graphite expression.
expr: <string>

# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
+# If param is omitted or set to 0 then alerts will be immediately considered
+# as firing once they return.
[ for: <duration> | default = 0s ]

# Labels to add or overwrite for each alert.
labels:
  [ <labelname>: <tmpl_string> ]
```

-For recording rules to work `-remoteWrite.url` must specified.
+For recording rules to work `-remoteWrite.url` must be specified.

### Alerts state on restarts

-`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after reloading of `vmalert`
+`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after restart of `vmalert`
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or vminsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
The state is stored to the configured address on every rule evaluation.
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or vmselect (Cluster). `vmalert` will try to restore alerts state
from configured address by querying time series with name `ALERTS_FOR_STATE`.

Both flags are required for the proper state restoring. Restore process may fail if time series are missing
-in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
-rules configuration.
+in configured `-remoteRead.url`, weren't updated in the last `1h` (controlled by `-remoteRead.lookback`)
+or received state doesn't match current `vmalert` rules configuration.

### Multitenancy

-There are the following approaches for alerting and recording rules across [multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) exist:
+There are the following approaches for alerting and recording rules across
+[multiple tenants](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy):

-* To run a separate `vmalert` instance per each tenant. The corresponding tenant must be specified in `-datasource.url` command-line flag according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format). For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus` would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line flag must contain the url for the specific tenant as well. For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording rules to `AccountID=123`.
+* To run a separate `vmalert` instance per each tenant.
+  The corresponding tenant must be specified in `-datasource.url` command-line flag
+  according to [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format).
+  For example, `/path/to/vmalert -datasource.url=http://vmselect:8481/select/123/prometheus`
+  would run alerts against `AccountID=123`. For recording rules the `-remoteWrite.url` command-line
+  flag must contain the url for the specific tenant as well.
+  For example, `-remoteWrite.url=http://vminsert:8480/insert/123/prometheus` would write recording
+  rules to `AccountID=123`.

-* To specify `tenant` parameter per each alerting and recording group if [enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used with `-clusterMode` command-line flag. For example:
+* To specify `tenant` parameter per each alerting and recording group if
+  [enterprise version of vmalert](https://victoriametrics.com/enterprise.html) is used
+  with `-clusterMode` command-line flag. For example:

```yaml
groups:
- name: rules_for_tenant_123
  tenant: "123"
  rules:
    # Rules for accountID=123

- name: rules_for_tenant_456:789
  tenant: "456:789"
  rules:
    # Rules for accountID=456, projectID=789
```

-If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481` . `vmselect` automatically adds the specified tenant to urls per each recording rule in this case.
+If `-clusterMode` is enabled, then `-datasource.url`, `-remoteRead.url` and `-remoteWrite.url` must
+contain only the hostname without tenant id. For example: `-datasource.url=http://vmselect:8481`.
+`vmalert` automatically adds the specified tenant to urls per each recording rule in this case.

-The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise` tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).
+The enterprise version of vmalert is available in `vmutils-*-enterprise.tar.gz` files
+at [release page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) and in `*-enterprise`
+tags at [Docker Hub](https://hub.docker.com/r/victoriametrics/vmalert/tags).

### WEB

@@ -322,6 +346,9 @@ See full description for these flags in `./vmalert --help`.

## Configuration

+Pass `-help` to `vmalert` in order to see the full list of supported
+command-line flags with their descriptions.
+
The shortlist of configuration flags is the following:
```
-datasource.appendTypePrefix
@@ -514,9 +541,6 @@ The shortlist of configuration flags is the following:
  -version
    Show VictoriaMetrics version
```

-Pass `-help` to `vmalert` in order to see the full list of supported
-command-line flags with their descriptions.
- `vmalert` supports "hot" config reload via the following methods: * send SIGHUP signal to `vmalert` process; * send GET request to `/-/reload` endpoint; From 3c4366806c1b90fa04ef8dc06bf809c439112e45 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 12:15:30 +0300 Subject: [PATCH 03/26] lib/protoparser/common: log the duration for reading a block of data in ReadLinesBlockExt on error This may help debugging issues like https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365 --- lib/protoparser/common/lines_reader.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/protoparser/common/lines_reader.go b/lib/protoparser/common/lines_reader.go index 170bb317a..87d8228ef 100644 --- a/lib/protoparser/common/lines_reader.go +++ b/lib/protoparser/common/lines_reader.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" ) @@ -41,6 +42,7 @@ func ReadLinesBlockExt(r io.Reader, dstBuf, tailBuf []byte, maxLineLen int) ([]b dstBuf = append(dstBuf[:0], tailBuf...) tailBuf = tailBuf[:0] again: + startTime := time.Now() n, err := r.Read(dstBuf[len(dstBuf):cap(dstBuf)]) // Check for error only if zero bytes read from r, i.e. no forward progress made. // Otherwise process the read data. @@ -55,6 +57,9 @@ again: // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/60 . return dstBuf, tailBuf, nil } + if err != io.EOF { + err = fmt.Errorf("cannot read data in %.3fs: %w", time.Since(startTime).Seconds(), err) + } return dstBuf, tailBuf, err } dstBuf = dstBuf[:len(dstBuf)+n] From 3e5b6bae66574100cc4cdc4f62fe8c48a1a53b1c Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 12:21:12 +0300 Subject: [PATCH 04/26] docs/Cluster-VictoriaMetrics.md: add lists for command-line flags for cluster components --- docs/Cluster-VictoriaMetrics.md | 336 ++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 30728c421..7c7531ffc 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -452,6 +452,342 @@ Due to `KISS`, cluster version of VictoriaMetrics has no the following "features Report bugs and propose new features [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues). +## List of command-line flags + +* [List of command-line flags for vminsert](#list-of-command-line-flags-for-vminsert) +* [List of command-line flags for vmselect](#list-of-command-line-flags-for-vmselect) +* [List of command-line flags for vmstorage](#list-of-command-line-flags-for-vmstorage) + + +### List of command-line flags for vminsert + +Below is the output for `/path/to/vminsert -help`: + +``` + -clusternativeListenAddr string + TCP address to listen for data from other vminsert nodes in multi-level cluster setup. See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multi-level-cluster-setup . Usually :8400 must be set. Doesn't work if empty + -csvTrimTimestamp duration + Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) + -disableRerouting + Whether to disable re-routing when some of vmstorage nodes accept incoming data at slower speed compared to other storage nodes. By default the re-routing is enabled. Disabled re-routing limits the ingestion rate by the slowest vmstorage node. 
On the other side, disabled re-routing minimizes the number of active time series in the cluster + -enableTCP6 + Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used + -envflag.enable + Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set + -envflag.prefix string + Prefix for environment variables if -envflag.enable is set + -fs.disableMmap + Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() + -graphiteListenAddr string + TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty + -graphiteTrimTimestamp duration + Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s) + -http.connTimeout duration + Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s) + -http.disableResponseCompression + Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth + -http.idleConnTimeout duration + Timeout for incoming idle http connections (default 1m0s) + -http.maxGracefulShutdownDuration duration + The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s) + -http.pathPrefix string + An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus + -http.shutdownDelay duration + Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers + -httpListenAddr string + Address to listen for http connections (default ":8480") + -import.maxLineLen size + The maximum length in bytes of a single line accepted by /api/v1/import; the line length can be limited with 'max_rows_per_line' query arg passed to /api/v1/export + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 104857600) + -influx.databaseNames array + Comma-separated list of database names to return from /query and /influx/query API. This can be needed for accepting data from Telegraf plugins such as https://github.com/fangli/fluent-plugin-influxdb + Supports an array of values separated by comma or specified via multiple flags. + -influx.maxLineSize size + The maximum size in bytes for a single Influx line during parsing + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 262144) + -influxListenAddr string + TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. 
This flag isn't needed when ingesting data over HTTP - just send it to http://:8480/insert//influx/write + -influxMeasurementFieldSeparator string + Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol (default "_") + -influxSkipMeasurement + Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator' + -influxSkipSingleField + Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metic name if Influx line contains only a single field + -influxTrimTimestamp duration + Trim timestamps for Influx line protocol data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) + -insert.maxQueueDuration duration + The maximum duration for waiting in the queue for insert requests due to -maxConcurrentInserts (default 1m0s) + -loggerDisableTimestamps + Whether to disable writing timestamps in logs + -loggerErrorsPerSecondLimit int + Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit + -loggerFormat string + Format for logs. Possible values: default, json (default "default") + -loggerLevel string + Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO") + -loggerOutput string + Output for the logs. Supported values: stderr, stdout (default "stderr") + -loggerTimezone string + Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC") + -loggerWarnsPerSecondLimit int + Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit + -maxConcurrentInserts int + The maximum number of concurrent inserts. Default value should work for most cases, since it minimizes the overhead for concurrent inserts. This option is tigthly coupled with -insert.maxQueueDuration (default 16) + -maxInsertRequestSize size + The maximum size in bytes of a single Prometheus remote_write API request + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432) + -maxLabelsPerTimeseries int + The maximum number of labels accepted per time series. Superfluous labels are dropped (default 30) + -memory.allowedBytes size + Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0) + -memory.allowedPercent float + Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60) + -opentsdbHTTPListenAddr string + TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. 
Doesn't work if empty + -opentsdbListenAddr string + TCP and UDP address to listen for OpentTSDB metrics. Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. Usually :4242 must be set. Doesn't work if empty + -opentsdbTrimTimestamp duration + Trim timestamps for OpenTSDB 'telnet put' data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s) + -opentsdbhttp.maxInsertRequestSize size + The maximum size of OpenTSDB HTTP put request + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 33554432) + -opentsdbhttpTrimTimestamp duration + Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) + -relabelConfig string + Optional path to a file with relabeling rules, which are applied to all the ingested metrics. See https://docs.victoriametrics.com/#relabeling for details + -relabelDebug + Whether to log metrics before and after relabeling with -relabelConfig. If the -relabelDebug is enabled, then the metrics aren't sent to storage. This is useful for debugging the relabeling configs + -replicationFactor int + Replication factor for the ingested data, i.e. how many copies to make among distinct -storageNode instances. Note that vmselect must run with -dedup.minScrapeInterval=1ms for data de-duplication when replicationFactor is greater than 1. Higher values for -dedup.minScrapeInterval at vmselect is OK (default 1) + -rpc.disableCompression + Whether to disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage + -sortLabels + Whether to sort labels for incoming samples before writing them to storage. This may be needed for reducing memory usage at storage when the order of labels in incoming samples is random. For example, if m{k1="v1",k2="v2"} may be sent as m{k2="v2",k1="v1"}. Enabled sorting for labels can slow down ingestion performance a bit + -storageNode array + Address of vmstorage nodes; usage: -storageNode=vmstorage-host1:8400 -storageNode=vmstorage-host2:8400 + Supports an array of values separated by comma or specified via multiple flags. + -tls + Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set + -tlsCertFile string + Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower + -tlsKeyFile string + Path to file with TLS key. Used only if -tls is set + -version + Show VictoriaMetrics version +``` + +### List of command-line flags for vmselect + +Below is the output for `/path/to/vmselect -help`: + +``` + -cacheDataPath string + Path to directory for cache files. Cache isn't saved if empty + -dedup.minScrapeInterval duration + Remove superflouos samples from time series if they are located closer to each other than this duration. This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. Deduplication is disabled if the -dedup.minScrapeInterval is 0 + -enableTCP6 + Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used + -envflag.enable + Whether to enable reading flags from environment variables additionally to command line. 
Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set + -envflag.prefix string + Prefix for environment variables if -envflag.enable is set + -fs.disableMmap + Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() + -graphiteTrimTimestamp duration + Trim timestamps for Graphite data to this duration. Minimum practical duration is 1s. Higher duration (i.e. 1m) may be used for reducing disk space usage for timestamp data (default 1s) + -http.connTimeout duration + Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s) + -http.disableResponseCompression + Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth + -http.idleConnTimeout duration + Timeout for incoming idle http connections (default 1m0s) + -http.maxGracefulShutdownDuration duration + The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s) + -http.pathPrefix string + An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus + -http.shutdownDelay duration + Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers + -httpListenAddr string + Address to listen for http connections (default ":8481") + -loggerDisableTimestamps + Whether to disable writing timestamps in logs + -loggerErrorsPerSecondLimit int + Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit + -loggerFormat string + Format for logs. Possible values: default, json (default "default") + -loggerLevel string + Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO") + -loggerOutput string + Output for the logs. Supported values: stderr, stdout (default "stderr") + -loggerTimezone string + Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC") + -loggerWarnsPerSecondLimit int + Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit + -memory.allowedBytes size + Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. 
Too high a value may evict too much data from OS page cache resulting in higher disk IO usage + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0) + -memory.allowedPercent float + Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60) + -replicationFactor int + How many copies of every time series is available on vmstorage nodes. See -replicationFactor command-line flag for vminsert nodes (default 1) + -search.cacheTimestampOffset duration + The maximum duration since the current time for response data, which is always queried from the original raw data, without using the response cache. Increase this value if you see gaps in responses due to time synchronization issues between VictoriaMetrics and data sources (default 5m0s) + -search.denyPartialResponse + Whether to deny partial responses if a part of -storageNode instances fail to perform queries; this trades availability over consistency; see also -search.maxQueryDuration + -search.disableCache + Whether to disable response caching. This may be useful during data backfilling + -search.latencyOffset duration + The time when data points become visible in query results after the collection. Too small value can result in incomplete last points for query results (default 30s) + -search.logSlowQueryDuration duration + Log queries with execution time exceeding this value. Zero disables slow query logging (default 5s) + -search.maxConcurrentRequests int + The maximum number of concurrent search requests. It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration (default 8) + -search.maxExportDuration duration + The maximum duration for /api/v1/export call (default 720h0m0s) + -search.maxLookback duration + Synonym to -search.lookback-delta from Prometheus. The value is dynamically detected from interval between time series datapoints if not set. It can be overridden on per-query basis via max_lookback arg. See also '-search.maxStalenessInterval' flag, which has the same meaining due to historical reasons + -search.maxPointsPerTimeseries int + The maximum points per a single timeseries returned from /api/v1/query_range. This option doesn't limit the number of scanned raw samples in the database. The main purpose of this option is to limit the number of per-series points returned to graphing UI such as Grafana. There is no sense in setting this limit to values bigger than the horizontal resolution of the graph (default 30000) + -search.maxQueryDuration duration + The maximum duration for query execution (default 30s) + -search.maxQueryLen size + The maximum search query length in bytes + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384) + -search.maxQueueDuration duration + The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s) + -search.maxStalenessInterval duration + The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. 
See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons + -search.maxStatusRequestDuration duration + The maximum duration for /api/v1/status/* requests (default 5m0s) + -search.maxStepForPointsAdjustment duration + The maximum step when /api/v1/query_range handler adjusts points with timestamps closer than -search.latencyOffset to the current time. The adjustment is needed because such points may contain incomplete data (default 1m0s) + -search.minStalenessInterval duration + The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval' + -search.queryStats.lastQueriesCount int + Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000) + -search.queryStats.minQueryDuration duration + The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats + -search.resetCacheAuthKey string + Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call + -search.treatDotsAsIsInRegexps + Whether to treat dots as is in regexp label filters used in queries. For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter + -selectNode array + Addresses of vmselect nodes; usage: -selectNode=vmselect-host1:8481 -selectNode=vmselect-host2:8481 + Supports an array of values separated by comma or specified via multiple flags. + -storageNode array + Addresses of vmstorage nodes; usage: -storageNode=vmstorage-host1:8401 -storageNode=vmstorage-host2:8401 + Supports an array of values separated by comma or specified via multiple flags. + -tls + Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set + -tlsCertFile string + Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower + -tlsKeyFile string + Path to file with TLS key. Used only if -tls is set + -version + Show VictoriaMetrics version +``` + +### List of command-line flags for vmstorage + +Below is the output for `/path/to/vmstorage -help`: + +``` + -bigMergeConcurrency int + The maximum number of CPU cores to use for big merges. Default value is used if set to 0 + -dedup.minScrapeInterval duration + Remove superflouos samples from time series if they are located closer to each other than this duration. This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. Deduplication is disabled if the -dedup.minScrapeInterval is 0 + -denyQueriesOutsideRetention + Whether to deny queries outside of the configured -retentionPeriod. When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. 
This may be useful when multiple data sources with distinct retentions are hidden behind query-tee + -enableTCP6 + Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP and UDP is used + -envflag.enable + Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set + -envflag.prefix string + Prefix for environment variables if -envflag.enable is set + -finalMergeDelay duration + The delay before starting final merge for per-month partition after no new data is ingested into it. Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. Zero value disables final merge + -forceFlushAuthKey string + authKey, which must be passed in query string to /internal/force_flush pages + -forceMergeAuthKey string + authKey, which must be passed in query string to /internal/force_merge pages + -fs.disableMmap + Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() + -http.connTimeout duration + Incoming http connections are closed after the configured timeout. This may help to spread the incoming load among a cluster of services behind a load balancer. Please note that the real timeout may be bigger by up to 10% as a protection against the thundering herd problem (default 2m0s) + -http.disableResponseCompression + Disable compression of HTTP responses to save CPU resources. By default compression is enabled to save network bandwidth + -http.idleConnTimeout duration + Timeout for incoming idle http connections (default 1m0s) + -http.maxGracefulShutdownDuration duration + The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s) + -http.pathPrefix string + An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus + -http.shutdownDelay duration + Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers + -httpListenAddr string + Address to listen for http connections (default ":8482") + -logNewSeries + Whether to log new series. This option is for debug purposes only. It can lead to performance issues when big number of new series are ingested into VictoriaMetrics + -loggerDisableTimestamps + Whether to disable writing timestamps in logs + -loggerErrorsPerSecondLimit int + Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit + -loggerFormat string + Format for logs. Possible values: default, json (default "default") + -loggerLevel string + Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO") + -loggerOutput string + Output for the logs. 
Supported values: stderr, stdout (default "stderr") + -loggerTimezone string + Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC") + -loggerWarnsPerSecondLimit int + Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. Zero values disable the rate limit + -memory.allowedBytes size + Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache resulting in higher disk IO usage + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0) + -memory.allowedPercent float + Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from OS page cache which will result in higher disk IO usage (default 60) + -precisionBits int + The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64) + -retentionPeriod value + Data with timestamps outside the retentionPeriod is automatically deleted + The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 1) + -rpc.disableCompression + Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage + -search.maxTagKeys int + The maximum number of tag keys returned per search (default 100000) + -search.maxTagValueSuffixesPerSearch int + The maximum number of tag value suffixes returned from /metrics/find (default 100000) + -search.maxTagValues int + The maximum number of tag values returned per search (default 100000) + -search.maxUniqueTimeseries int + The maximum number of unique time series each search can scan (default 300000) + -smallMergeConcurrency int + The maximum number of CPU cores to use for small merges. Default value is used if set to 0 + -snapshotAuthKey string + authKey, which must be passed in query string to /snapshot* pages + -storage.maxDailySeries int + The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries + -storage.maxHourlySeries int + The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries + -storageDataPath string + Path to storage data (default "vmstorage-data") + -tls + Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set + -tlsCertFile string + Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower + -tlsKeyFile string + Path to file with TLS key. 
Used only if -tls is set + -version + Show VictoriaMetrics version + -vminsertAddr string + TCP address to accept connections from vminsert services (default ":8400") + -vmselectAddr string + TCP address to accept connections from vmselect services (default ":8401") +``` + + ## VictoriaMetrics Logo [Zip](VM_logo.zip) contains three folders with different image orientation (main color and inverted version). From 48210130aca4eb4c537e775c5295f7a60b7607bc Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 12:25:43 +0300 Subject: [PATCH 05/26] lib/protoparser: measure the duration for reading the whole block of data instead of a single read operation Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365 --- lib/protoparser/common/lines_reader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/protoparser/common/lines_reader.go b/lib/protoparser/common/lines_reader.go index 87d8228ef..19395eac4 100644 --- a/lib/protoparser/common/lines_reader.go +++ b/lib/protoparser/common/lines_reader.go @@ -36,13 +36,13 @@ func ReadLinesBlock(r io.Reader, dstBuf, tailBuf []byte) ([]byte, []byte, error) // // It is expected that read timeout on r exceeds 1 second. func ReadLinesBlockExt(r io.Reader, dstBuf, tailBuf []byte, maxLineLen int) ([]byte, []byte, error) { + startTime := time.Now() if cap(dstBuf) < defaultBlockSize { dstBuf = bytesutil.Resize(dstBuf, defaultBlockSize) } dstBuf = append(dstBuf[:0], tailBuf...) tailBuf = tailBuf[:0] again: - startTime := time.Now() n, err := r.Read(dstBuf[len(dstBuf):cap(dstBuf)]) // Check for error only if zero bytes read from r, i.e. no forward progress made. // Otherwise process the read data. @@ -58,7 +58,7 @@ again: return dstBuf, tailBuf, nil } if err != io.EOF { - err = fmt.Errorf("cannot read data in %.3fs: %w", time.Since(startTime).Seconds(), err) + err = fmt.Errorf("cannot read a block of data in %.3fs: %w", time.Since(startTime).Seconds(), err) } return dstBuf, tailBuf, err } From 06b8e7d148de04c1a3d13f275187deab4666b254 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 12:28:07 +0300 Subject: [PATCH 06/26] lib/promscrape: increase the duration for reading the full response in stream parsing mode Increase the duration from 10x to 30x of the configured `scrape_interval'. This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1365 --- lib/promscrape/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go index 55e856cea..6d12be760 100644 --- a/lib/promscrape/client.go +++ b/lib/promscrape/client.go @@ -128,10 +128,10 @@ func newClient(sw *ScrapeWork) *client { ResponseHeaderTimeout: sw.ScrapeTimeout, }, - // Set 10x bigger timeout than the sw.ScrapeTimeout, since the duration for reading the full response + // Set 30x bigger timeout than the sw.ScrapeTimeout, since the duration for reading the full response // can be much bigger because of stream parsing. 
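	// E.g. with a 10s scrape timeout this allows up to 5 minutes (30 * 10s)
	// for reading and parsing the whole response.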
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1017#issuecomment-767235047 - Timeout: 10 * sw.ScrapeTimeout, + Timeout: 30 * sw.ScrapeTimeout, } if sw.DenyRedirects { sc.CheckRedirect = func(req *http.Request, via []*http.Request) error { From b8526e88d3682ddcae810dda69be9503604c7b32 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 14 Jun 2021 13:03:23 +0300 Subject: [PATCH 07/26] Dashboard single (#1374) * dashboard: update single version dash The update contains the following changes: * display anonymous memory usage metric. This metric suppose to reflect memory usage of the process which can't be freed by OS; * add legends to all panels. This is important for cases when users share the screenshots; * modify panels for Grafana v8.0.0 * dashboard: update single version dash tags * dashboard: update vmagent dash The update contains the following changes: * display anonymous memory usage metric. This metric suppose to reflect memory usage of the process which can't be freed by OS; * add legends to all panels. This is important for cases when users share the screenshots; * modify panels for Grafana v8.0.0 --- dashboards/victoriametrics.json | 1784 +++++++++++++++---------------- dashboards/vmagent.json | 674 +++++++----- 2 files changed, 1308 insertions(+), 1150 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index bfaea3df7..1741e6bd5 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -5,12 +5,12 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "7.1.1" + "version": "8.0.0" }, { "type": "panel", "id": "graph", - "name": "Graph", + "name": "Graph (old)", "version": "" }, { @@ -21,15 +21,15 @@ }, { "type": "panel", - "id": "singlestat", - "name": "Singlestat", + "id": "stat", + "name": "Stat", "version": "" }, { "type": "panel", "id": "text", "name": "Text", - "version": "7.1.0" + "version": "" } ], "annotations": { @@ -50,7 +50,7 @@ "gnetId": 10229, "graphTooltip": 0, "id": null, - "iteration": 1616956884194, + "iteration": 1623413472435, "links": [ { "icon": "doc", @@ -80,8 +80,12 @@ ], "panels": [ { - "collapsed": false, + "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -89,689 +93,501 @@ "y": 0 }, "id": 6, - "panels": [], + "panels": [ + { + "datasource": "$ds", + "description": "", + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "links": [ + { + "targetBlank": true, + "title": "VictoriaMetrics releases", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases" + } + ], + "options": { + "content": "
$version
", + "mode": "html" + }, + "pluginVersion": "8.0.0", + "timeFrom": null, + "timeShift": null, + "title": "Version", + "type": "text" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "How many datapoints are in storage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 26, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total datapoints", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "The size of the free disk space left", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 80, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\", path=\"/storage\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Free disk space", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total size of available memory for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 78, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Available memory", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], 
+ "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 3 + }, + "id": 8, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 3 + }, + "id": 38, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Index size", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total number of available CPUs for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 3 + }, + "id": 77, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Available CPU", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total size of allowed memory via flag `-memory.allowedPercent`", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 3 + }, + "id": 79, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Allowed memory", + "type": "stat" + } + ], "title": "Configuration", "type": "row" }, - { - "content": "
$version
", - "datasource": "$ds", - "description": "", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 0, - "y": 1 - }, - "id": 2, - "links": [ - { - "targetBlank": true, - "title": "VictoriaMetrics releases", - "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases" - } - ], - "mode": "html", - "options": { - "content": "
$version
", - "mode": "html" - }, - "pluginVersion": "7.1.0", - "timeFrom": null, - "timeShift": null, - "title": "Version", - "type": "text" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "How many datapoints are in storage", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 6, - "y": 1 - }, - "id": 26, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Total datapoints", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "The size of the free disk space left", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 12, - "y": 1 - }, - "id": 80, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\", path=\"/storage\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Free disk space", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "Total size of available memory for VM process", - "fieldConfig": { 
- "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 78, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Available memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 0, - "y": 3 - }, - "id": 8, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}", - "targets": [ - { - "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "How many entries inverted index contains. 
This value is proportional to the number of unique timeseries in storage(cardinality).", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 6, - "y": 3 - }, - "id": 38, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Index size", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "Total number of available CPUs for VM process", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 12, - "y": 3 - }, - "id": 77, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Available CPU", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "Total size of allowed memory via flag `-memory.allowedPercent`", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 18, - "y": 3 - }, - "id": 79, - 
"interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Allowed memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, { "collapsed": false, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 5 + "y": 1 }, "id": 24, "panels": [], @@ -787,7 +603,6 @@ "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -798,7 +613,7 @@ "h": 8, "w": 12, "x": 0, - "y": 6 + "y": 2 }, "hiddenSeries": false, "id": 12, @@ -806,7 +621,7 @@ "alignAsTable": true, "avg": true, "current": true, - "max": false, + "max": true, "min": false, "show": true, "sort": "current", @@ -818,8 +633,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -887,7 +705,6 @@ "description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -898,7 +715,7 @@ "h": 8, "w": 12, "x": 12, - "y": 6 + "y": 2 }, "hiddenSeries": false, "id": 22, @@ -906,7 +723,7 @@ "alignAsTable": true, "avg": true, "current": true, - "max": false, + "max": true, "min": false, "show": true, "sort": "current", @@ -918,8 +735,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -986,7 +806,6 @@ "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. 
\n\nSee following link for details:", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -997,18 +816,21 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 10 }, "hiddenSeries": false, "id": 51, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1020,8 +842,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1088,7 +913,6 @@ "description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. Line `max allowed` shows max allowed memory size for cache.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1099,25 +923,31 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 10 }, "hiddenSeries": false, "id": 33, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1198,7 +1028,6 @@ "description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results into flush on disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and always equal to number of CPUs. 
\n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1209,7 +1038,7 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 18 }, "hiddenSeries": false, "id": 59, @@ -1231,8 +1060,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1314,7 +1146,6 @@ "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1325,7 +1156,7 @@ "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 18 }, "hiddenSeries": false, "id": 35, @@ -1345,8 +1176,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1356,8 +1190,10 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vm_http_request_errors_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (path) > 0", "format": "time_series", + "interval": "", "intervalFactor": 1, "legendFormat": "{{path}}", "refId": "A" @@ -1407,11 +1243,15 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 30 + "y": 26 }, "id": 14, "panels": [ @@ -1424,7 +1264,6 @@ "description": "How many datapoints are inserted into storage per second", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1435,7 +1274,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 31 }, "hiddenSeries": false, "id": 10, @@ -1447,6 +1286,8 @@ "max": false, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, @@ -1454,8 +1295,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1524,7 +1368,6 @@ "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1535,7 +1378,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 31 }, "hiddenSeries": false, "id": 73, @@ -1545,8 +1388,10 @@ "current": true, "hideZero": true, "max": false, - "min": false, - "show": false, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, @@ -1554,8 +1399,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1625,7 +1473,6 @@ "description": "Shows how many datapoints are in 
the storage and what is average disk usage per datapoint.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1636,25 +1483,31 @@ "h": 8, "w": 12, "x": 0, - "y": 48 + "y": 39 }, "hiddenSeries": false, "id": 30, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1736,7 +1589,6 @@ "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1747,25 +1599,31 @@ "h": 8, "w": 12, "x": 12, - "y": 48 + "y": 39 }, "hiddenSeries": false, "id": 34, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1847,7 +1705,6 @@ "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1858,18 +1715,20 @@ "h": 8, "w": 12, "x": 0, - "y": 56 + "y": 47 }, "hiddenSeries": false, "id": 53, "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": true, - "max": false, + "max": true, "min": false, "rightSide": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, @@ -1877,8 +1736,11 @@ "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1954,7 +1816,6 @@ "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1965,25 +1826,31 @@ "h": 8, "w": 12, "x": 12, - "y": 56 + "y": 47 }, "hiddenSeries": false, "id": 36, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", 
"pointradius": 2, "points": false, "renderer": "flot", @@ -2050,7 +1917,6 @@ "description": "Shows amount of on-disk space occupied by inverted index.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2061,25 +1927,31 @@ "h": 8, "w": 12, "x": 0, - "y": 64 + "y": 55 }, "hiddenSeries": false, "id": 55, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2089,9 +1961,12 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", "format": "time_series", + "interval": "", "intervalFactor": 1, + "legendFormat": "disk space used", "refId": "A" } ], @@ -2145,7 +2020,6 @@ "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2156,24 +2030,30 @@ "h": 8, "w": 12, "x": 12, - "y": 64 + "y": 55 }, "hiddenSeries": false, "id": 62, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2239,7 +2119,6 @@ "description": "Shows the number of bytes read/write from the storage layer.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2250,25 +2129,31 @@ "h": 8, "w": 12, "x": 0, - "y": 72 + "y": 63 }, "hiddenSeries": false, "id": 76, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2352,7 +2237,6 @@ "description": "The number of rows merged per second by storage nodes.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2363,24 +2247,30 @@ "h": 8, "w": 12, "x": 12, - "y": 72 + "y": 63 }, "hiddenSeries": false, "id": 64, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": 
"flot", @@ -2415,6 +2305,7 @@ }, "yaxes": [ { + "$$hashKey": "object:867", "decimals": 0, "format": "short", "label": null, @@ -2424,6 +2315,7 @@ "show": true }, { + "$$hashKey": "object:868", "format": "short", "label": null, "logBase": 1, @@ -2446,7 +2338,6 @@ "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2457,25 +2348,31 @@ "h": 8, "w": 12, "x": 0, - "y": 80 + "y": 71 }, "hiddenSeries": false, "id": 58, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2485,9 +2382,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", + "exemplar": true, + "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason)", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{reason}}", "refId": "A" @@ -2544,7 +2443,6 @@ "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2555,25 +2453,31 @@ "h": 8, "w": 12, "x": 12, - "y": 80 + "y": 71 }, "hiddenSeries": false, "id": 67, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2640,11 +2544,15 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 31 + "y": 27 }, "id": 71, "panels": [ @@ -2657,7 +2565,6 @@ "description": "Shows the rate and total number of new series created over last 24h.\n\nHigh churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nThe higher churn rate is, the more resources required to handle it. 
Consider to keep the churn rate as low as possible.\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2673,19 +2580,25 @@ "hiddenSeries": false, "id": 66, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2762,7 +2675,6 @@ "description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2778,20 +2690,26 @@ "hiddenSeries": false, "id": 60, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2860,7 +2778,6 @@ "description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2876,20 +2793,26 @@ "hiddenSeries": false, "id": 68, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2959,7 +2882,6 @@ "description": "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n\nThis prevents from ingesting metrics with too many labels. 
The value of `maxLabelsPerTimeseries` must be adjusted for your workload.\n\nWhen limit is exceeded (graph is > 0) - extra labels are dropped, which could result in unexpected identical time series.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2975,20 +2897,24 @@ "hiddenSeries": false, "id": 74, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": false, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2998,12 +2924,13 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=\"$job\", instance=\"$instance\"}[5m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "limit exceeded", "refId": "A" } ], @@ -3056,11 +2983,15 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 32 + "y": 28 }, "id": 46, "panels": [ @@ -3073,7 +3004,6 @@ "description": "", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3084,25 +3014,31 @@ "h": 8, "w": 12, "x": 0, - "y": 103 + "y": 33 }, "hiddenSeries": false, "id": 44, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3143,6 +3079,16 @@ "intervalFactor": 1, "legendFormat": "resident", "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(process_resident_memory_anon_bytes{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "resident anonymous", + "refId": "E" } ], "thresholds": [], @@ -3194,7 +3140,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3205,25 +3150,31 @@ "h": 8, "w": 12, "x": 12, - "y": 103 + "y": 33 }, "hiddenSeries": false, "id": 57, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3291,7 +3242,6 @@ "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { - "custom": 
{}, "links": [] }, "overrides": [] @@ -3302,25 +3252,31 @@ "h": 8, "w": 12, "x": 0, - "y": 111 + "y": 41 }, "hiddenSeries": false, "id": 75, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3402,7 +3358,6 @@ "description": "Shows avg GC duration", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3413,25 +3368,31 @@ "h": 8, "w": 12, "x": 12, - "y": 111 + "y": 41 }, "hiddenSeries": false, "id": 42, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3497,7 +3458,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3508,25 +3468,31 @@ "h": 8, "w": 12, "x": 0, - "y": 119 + "y": 49 }, "hiddenSeries": false, "id": 47, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3594,7 +3560,6 @@ "description": "", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3605,25 +3570,31 @@ "h": 8, "w": 12, "x": 12, - "y": 119 + "y": 49 }, "hiddenSeries": false, "id": 37, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3691,7 +3662,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3702,25 +3672,31 @@ "h": 8, "w": 12, "x": 0, - "y": 127 + "y": 57 }, "hiddenSeries": false, "id": 48, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": 
true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3788,7 +3764,6 @@ "description": "", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3799,25 +3774,31 @@ "h": 8, "w": 12, "x": 12, - "y": 127 + "y": 57 }, "hiddenSeries": false, "id": 49, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3883,9 +3864,12 @@ } ], "refresh": "30s", - "schemaVersion": 26, + "schemaVersion": 30, "style": "dark", - "tags": [], + "tags": [ + "victoriametrics", + "vmsingle" + ], "templating": { "list": [ { @@ -3894,6 +3878,8 @@ "text": "VictoriaMetrics", "value": "VictoriaMetrics" }, + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, @@ -3912,19 +3898,23 @@ "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "job", "options": [], - "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "query": { + "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "refId": "VictoriaMetrics-job-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3934,19 +3924,23 @@ "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "description": null, + "error": null, "hide": 2, "includeAll": false, "label": null, "multi": false, "name": "version", "options": [], - "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "query": { + "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "refId": "VictoriaMetrics-version-Variable-Query" + }, "refresh": 1, "regex": "/.*-tags-(v\\d+\\.\\d+\\.\\d+)/", "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3956,19 +3950,23 @@ "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "instance", "options": [], - "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": "VictoriaMetrics-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -4007,4 +4005,4 @@ "title": "VictoriaMetrics", "uid": "wNf0q_kZk", "version": 1 -} +} \ No newline at end of file diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 0082e7e93..67ef5ce01 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -5,12 +5,12 @@ 
"type": "grafana", "id": "grafana", "name": "Grafana", - "version": "7.1.1" + "version": "8.0.0" }, { "type": "panel", "id": "graph", - "name": "Graph", + "name": "Graph (old)", "version": "" }, { @@ -56,7 +56,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1616957263139, + "iteration": 1623414948941, "links": [ { "icon": "doc", @@ -88,6 +88,10 @@ { "collapsed": false, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -104,7 +108,6 @@ "description": "Shows total number of all configured scrape targets in state \"up\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. \n", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -137,9 +140,10 @@ "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -158,7 +162,6 @@ "description": "Shows total number of all configured scrape targets in state \"down\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. \n", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -201,9 +204,10 @@ "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -222,7 +226,6 @@ "description": "Shows number of generated error messages in logs over last 30m. Non-zero value may be a sign of connectivity or missconfiguration errors.", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "min": 0, "thresholds": { @@ -268,9 +271,10 @@ "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -289,7 +293,6 @@ "description": "Persistent queue size shows size of pending samples in bytes which hasn't been flushed to remote storage yet. \nIncreasing of value might be a sign of connectivity issues. 
In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -327,9 +330,10 @@ "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -346,12 +350,6 @@ { "columns": [], "datasource": "$ds", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "fontSize": "100%", "gridPos": { "h": 7, @@ -364,7 +362,7 @@ "scroll": true, "showHeader": true, "sort": { - "col": null, + "col": 3, "desc": false }, "styles": [ @@ -448,7 +446,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -482,8 +479,11 @@ "lines": true, "linewidth": 1, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -553,7 +553,6 @@ "description": "Shows in/out samples rate including push and pull models. \n\nThe out-rate could be different to in-rate because of replication or additional timeseries added by vmagent for every scraped target.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -569,19 +568,25 @@ "hiddenSeries": false, "id": 5, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -659,7 +664,6 @@ "description": "Shows the rate of requests served by vmagent HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -675,20 +679,26 @@ "hiddenSeries": false, "id": 15, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -754,7 +764,6 @@ "description": "Network usage shows the bytes rate for data accepted by vmagent and pushed via remotewrite protocol.\nDiscrepancies are possible because of different protocols used for ingesting, scraping and writing data.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -770,19 +779,26 @@ "hiddenSeries": false, "id": 7, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -859,7 +875,6 @@ "description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -875,13 +890,16 @@ "hiddenSeries": false, "id": 69, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -893,8 +911,11 @@ } ], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -984,7 +1005,6 @@ "description": "Shows rate of dropped samples from persistent queue. VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1000,13 +1020,16 @@ "hiddenSeries": false, "id": 49, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1018,8 +1041,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1085,7 +1111,6 @@ "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1101,13 +1126,16 @@ "hiddenSeries": false, "id": 17, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1118,8 +1146,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1185,7 +1216,6 @@ "description": "Shows the rate of dropped samples due to relabeling. 
\nMetric tracks drops for `-remoteWrite.relabelConfig` configuration only.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1201,13 +1231,16 @@ "hiddenSeries": false, "id": 18, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1219,8 +1252,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1292,7 +1328,6 @@ "description": "Shows the rate of dropped data blocks in cases when remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses.\n\nSee https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1308,20 +1343,26 @@ "hiddenSeries": false, "id": 79, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1331,9 +1372,10 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", "interval": "", - "legendFormat": "", + "legendFormat": "dropped", "refId": "A" } ], @@ -1381,6 +1423,10 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -1397,7 +1443,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1413,19 +1458,25 @@ "hiddenSeries": false, "id": 48, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1435,7 +1486,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type)", + "exemplar": true, + "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type) > 0", "format": "time_series", "interval": "", "legendFormat": "{{type}}", @@ -1452,7 +1504,6 @@ "sort": 2, "value_type": "individual" }, - "transparent": true, "type": "graph", "xaxis": { "buckets": null, @@ -1492,7 +1543,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1508,19 +1558,25 @@ "hiddenSeries": false, "id": 76, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, 
+ "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1530,6 +1586,7 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"}) by(type) > 0", "format": "time_series", "interval": "", @@ -1547,7 +1604,6 @@ "sort": 2, "value_type": "individual" }, - "transparent": true, "type": "graph", "xaxis": { "buckets": null, @@ -1587,7 +1643,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1603,19 +1658,25 @@ "hiddenSeries": false, "id": 20, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1691,7 +1752,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1707,19 +1767,25 @@ "hiddenSeries": false, "id": 31, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1802,7 +1868,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1818,19 +1883,25 @@ "hiddenSeries": false, "id": 46, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1909,12 +1980,6 @@ "dataFormat": "tsbuckets", "datasource": "$ds", "description": "works in vm only disclaimer", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -1972,6 +2037,10 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -1989,7 +2058,6 @@ "description": "Shows the rate of write requests served by ingestserver (UDP, TCP connections) and HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2005,20 +2073,26 @@ "hiddenSeries": false, "id": 73, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": 
true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2028,13 +2102,15 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)", + "exemplar": true, + "expr": "sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0", "interval": "", "legendFormat": "{{ type }} ({{net}})", "refId": "A" }, { - "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)", + "exemplar": true, + "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0", "interval": "", "legendFormat": "{{ protocol }} (http)", "refId": "B" @@ -2090,7 +2166,6 @@ "description": "Shows the rate of write errors in ingestserver (UDP, TCP connections) and HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2106,20 +2181,26 @@ "hiddenSeries": false, "id": 77, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2129,13 +2210,15 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)", + "exemplar": true, + "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0", "interval": "", "legendFormat": "{{ type }} ({{net}})", "refId": "A" }, { - "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)", + "exemplar": true, + "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0", "interval": "", "legendFormat": "{{ protocol }} (http)", "refId": "B" @@ -2191,7 +2274,6 @@ "description": "Shows the rate of parsed rows from write or scrape requests.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2207,20 +2289,26 @@ "hiddenSeries": false, "id": 78, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, 
- "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2230,9 +2318,10 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vm_protoparser_rows_read_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)", "interval": "", - "legendFormat": "{{ type }} ({{net}})", + "legendFormat": "{{ type }}", "refId": "A" } ], @@ -2286,7 +2375,6 @@ "description": "Tracks the rate of dropped invalid rows because of errors while unmarshaling write requests. The exact errors messages will be printed in logs.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2302,19 +2390,25 @@ "hiddenSeries": false, "id": 50, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2324,7 +2418,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)", + "exemplar": true, + "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type) > 0", "interval": "", "legendFormat": "{{type}}", "refId": "A" @@ -2378,6 +2473,10 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -2395,7 +2494,6 @@ "description": "Shows the rate of requests to configured remote write endpoints by url and status code.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2411,19 +2509,25 @@ "hiddenSeries": false, "id": 60, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2490,7 +2594,6 @@ "description": "Shows the global rate for number of written bytes via remote write connections.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2506,19 +2609,25 @@ "hiddenSeries": false, "id": 66, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2584,7 +2693,6 @@ "description": "Shows requests retry rate by url. 
Number of retries is unlimited but protected with delays up to 1m between attempts.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2600,19 +2708,25 @@ "hiddenSeries": false, "id": 61, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2622,9 +2736,10 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url)", "interval": "", - "legendFormat": "{{ url }}", + "legendFormat": "", "refId": "A" } ], @@ -2678,7 +2793,6 @@ "description": "Shows current number of established connections to remote write endpoints.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2694,19 +2808,25 @@ "hiddenSeries": false, "id": 65, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2778,12 +2898,6 @@ "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request block size distribution in rows.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -2849,12 +2963,6 @@ "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request block size distribution in bytes.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -2920,12 +3028,6 @@ "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request duration distribution in seconds. Value depends on block size, network quality and remote storage performance.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 24, @@ -2983,6 +3085,10 @@ { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -3000,7 +3106,6 @@ "description": "Shows the CPU usage per vmagent instance. 
\nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3016,13 +3121,16 @@ "hiddenSeries": false, "id": 35, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -3034,8 +3142,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3099,10 +3210,9 @@ "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Amount of used memory (resident)\n\nIf you think that usage is abnormal or unexpected pls file an issue and attach memory profile if possible.", + "description": "Amount of used memory\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3118,13 +3228,16 @@ "hiddenSeries": false, "id": 37, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -3136,8 +3249,11 @@ } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3147,10 +3263,19 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)", "interval": "", - "legendFormat": "{{instance}}", + "legendFormat": "resident {{instance}}", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "anonymous {{instance}}", + "refId": "B" } ], "thresholds": [], @@ -3203,7 +3328,6 @@ "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3219,20 +3343,26 @@ "hiddenSeries": false, "id": 83, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3313,7 +3443,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3329,20 +3458,26 @@ "hiddenSeries": false, "id": 39, "legend": { - "avg": false, - 
"current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3411,7 +3546,6 @@ "description": "Shows the number of bytes read/write from the storage layer when vmagent has to buffer data on disk or read already buffered data.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3427,20 +3561,26 @@ "hiddenSeries": false, "id": 81, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3523,7 +3663,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3539,20 +3678,26 @@ "hiddenSeries": false, "id": 41, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3619,7 +3764,6 @@ "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3635,20 +3779,26 @@ "hiddenSeries": false, "id": 43, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3712,7 +3862,7 @@ } ], "refresh": false, - "schemaVersion": 26, + "schemaVersion": 30, "style": "dark", "tags": [ "vmagent", @@ -3726,6 +3876,8 @@ "text": "VictoriaMetrics", "value": "VictoriaMetrics" }, + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, @@ -3744,19 +3896,23 @@ "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": true, "name": "job", "options": [], - "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "query": { + "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "refId": "VictoriaMetrics-job-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": 
false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3766,19 +3922,23 @@ "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "description": null, + "error": null, "hide": 0, "includeAll": true, "label": null, "multi": true, "name": "instance", "options": [], - "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": "VictoriaMetrics-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3806,4 +3966,4 @@ "title": "vmagent", "uid": "G7Z9GzMGz", "version": 1 -} +} \ No newline at end of file From 729c4eeb9cf2e54c64fcf6d3480011eec6d22a2c Mon Sep 17 00:00:00 2001 From: Nikolay Date: Mon, 14 Jun 2021 13:15:04 +0300 Subject: [PATCH 08/26] adds digital ocean sd (#1376) * adds digital ocean sd config * adds digital ocean sd https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367 * typo fix --- README.md | 3 + app/vmagent/README.md | 4 + docs/vmagent.md | 4 + lib/promscrape/config.go | 50 ++- lib/promscrape/discovery/digitalocean/api.go | 92 +++++ .../discovery/digitalocean/api_test.go | 349 ++++++++++++++++++ .../discovery/digitalocean/digitalocean.go | 148 ++++++++ lib/promscrape/scraper.go | 4 + 8 files changed, 644 insertions(+), 10 deletions(-) create mode 100644 lib/promscrape/discovery/digitalocean/api.go create mode 100644 lib/promscrape/discovery/digitalocean/api_test.go create mode 100644 lib/promscrape/discovery/digitalocean/digitalocean.go diff --git a/README.md b/README.md index 8f8b1642d..94e86d5bb 100644 --- a/README.md +++ b/README.md @@ -343,6 +343,7 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la * [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) * [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) * [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) +* [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) Other `*_sd_config` types will be supported in the future. @@ -1721,6 +1722,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li Wait time used by Consul service discovery. Default value is used if not set -promscrape.consulSDCheckInterval duration Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s) + -promscrape.digitaloceanSDCheckInterval duration + Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s) -promscrape.disableCompression Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. 
It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
  -promscrape.disableKeepAlive
diff --git a/app/vmagent/README.md b/app/vmagent/README.md
index 6bac2bf56..901309994 100644
--- a/app/vmagent/README.md
+++ b/app/vmagent/README.md
@@ -177,6 +177,8 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
   See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
 * `eureka_sd_configs` - is for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
   See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
+* `digitalocean_sd_configs` - is for scraping targets registered in [DigitalOcean](https://www.digitalocean.com/).
+  See [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) for details.
 
 Please file feature requests to [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
 
@@ -627,6 +629,8 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
        Wait time used by Consul service discovery. Default value is used if not set
   -promscrape.consulSDCheckInterval duration
        Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s)
+  -promscrape.digitaloceanSDCheckInterval duration
+       Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s)
   -promscrape.disableCompression
        Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control
   -promscrape.disableKeepAlive
diff --git a/docs/vmagent.md b/docs/vmagent.md
index 165793b76..d257d34e2 100644
--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@@ -181,6 +181,8 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
   See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
 * `eureka_sd_configs` - is for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
   See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
+* `digitalocean_sd_configs` - is for scraping targets registered in [DigitalOcean](https://www.digitalocean.com/).
+  See [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) for details.
 
 Please file feature requests to [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
 
@@ -631,6 +633,8 @@ See the docs at https://docs.victoriametrics.com/vmagent.html .
Wait time used by Consul service discovery. Default value is used if not set -promscrape.consulSDCheckInterval duration Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s) + -promscrape.digitaloceanSDCheckInterval duration + Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s) -promscrape.disableCompression Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control -promscrape.disableKeepAlive diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go index e5a088ebf..6316971d1 100644 --- a/lib/promscrape/config.go +++ b/lib/promscrape/config.go @@ -19,6 +19,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/digitalocean" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/dns" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/dockerswarm" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/ec2" @@ -106,16 +107,17 @@ type ScrapeConfig struct { MetricRelabelConfigs []promrelabel.RelabelConfig `yaml:"metric_relabel_configs,omitempty"` SampleLimit int `yaml:"sample_limit,omitempty"` - StaticConfigs []StaticConfig `yaml:"static_configs,omitempty"` - FileSDConfigs []FileSDConfig `yaml:"file_sd_configs,omitempty"` - KubernetesSDConfigs []kubernetes.SDConfig `yaml:"kubernetes_sd_configs,omitempty"` - OpenStackSDConfigs []openstack.SDConfig `yaml:"openstack_sd_configs,omitempty"` - ConsulSDConfigs []consul.SDConfig `yaml:"consul_sd_configs,omitempty"` - EurekaSDConfigs []eureka.SDConfig `yaml:"eureka_sd_configs,omitempty"` - DockerSwarmSDConfigs []dockerswarm.SDConfig `yaml:"dockerswarm_sd_configs,omitempty"` - DNSSDConfigs []dns.SDConfig `yaml:"dns_sd_configs,omitempty"` - EC2SDConfigs []ec2.SDConfig `yaml:"ec2_sd_configs,omitempty"` - GCESDConfigs []gce.SDConfig `yaml:"gce_sd_configs,omitempty"` + StaticConfigs []StaticConfig `yaml:"static_configs,omitempty"` + FileSDConfigs []FileSDConfig `yaml:"file_sd_configs,omitempty"` + KubernetesSDConfigs []kubernetes.SDConfig `yaml:"kubernetes_sd_configs,omitempty"` + OpenStackSDConfigs []openstack.SDConfig `yaml:"openstack_sd_configs,omitempty"` + ConsulSDConfigs []consul.SDConfig `yaml:"consul_sd_configs,omitempty"` + EurekaSDConfigs []eureka.SDConfig `yaml:"eureka_sd_configs,omitempty"` + DockerSwarmSDConfigs []dockerswarm.SDConfig `yaml:"dockerswarm_sd_configs,omitempty"` + DNSSDConfigs []dns.SDConfig `yaml:"dns_sd_configs,omitempty"` + EC2SDConfigs []ec2.SDConfig `yaml:"ec2_sd_configs,omitempty"` + GCESDConfigs []gce.SDConfig `yaml:"gce_sd_configs,omitempty"` + DigitaloceanSDConfigs []digitalocean.SDConfig `yaml:"digitalocean_sd_configs,omitempty"` // These options are supported only by 
lib/promscrape. RelabelDebug bool `yaml:"relabel_debug,omitempty"` @@ -488,6 +490,34 @@ func (cfg *Config) getGCESDScrapeWork(prev []*ScrapeWork) []*ScrapeWork { return dst } +// getDigitalOceanDScrapeWork returns `digitalocean_sd_configs` ScrapeWork from cfg. +func (cfg *Config) getDigitalOceanDScrapeWork(prev []*ScrapeWork) []*ScrapeWork { + swsPrevByJob := getSWSByJob(prev) + dst := make([]*ScrapeWork, 0, len(prev)) + for i := range cfg.ScrapeConfigs { + sc := &cfg.ScrapeConfigs[i] + dstLen := len(dst) + ok := true + for j := range sc.DigitaloceanSDConfigs { + sdc := &sc.DigitaloceanSDConfigs[j] + var okLocal bool + dst, okLocal = appendSDScrapeWork(dst, sdc, cfg.baseDir, sc.swc, "digitalocean_sd_config") + if ok { + ok = okLocal + } + } + if ok { + continue + } + swsPrev := swsPrevByJob[sc.swc.jobName] + if len(swsPrev) > 0 { + logger.Errorf("there were errors when discovering digitalocean targets for job %q, so preserving the previous targets", sc.swc.jobName) + dst = append(dst[:dstLen], swsPrev...) + } + } + return dst +} + // getFileSDScrapeWork returns `file_sd_configs` ScrapeWork from cfg. func (cfg *Config) getFileSDScrapeWork(prev []*ScrapeWork) []*ScrapeWork { // Create a map for the previous scrape work. diff --git a/lib/promscrape/discovery/digitalocean/api.go b/lib/promscrape/discovery/digitalocean/api.go new file mode 100644 index 000000000..b42cc9840 --- /dev/null +++ b/lib/promscrape/discovery/digitalocean/api.go @@ -0,0 +1,92 @@ +package digitalocean + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils" +) + +var configMap = discoveryutils.NewConfigMap() + +type apiConfig struct { + client *discoveryutils.Client + port int +} + +func newAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { + ac, err := sdc.HTTPClientConfig.NewConfig(baseDir) + if err != nil { + return nil, fmt.Errorf("cannot parse auth config: %w", err) + } + + apiServer := sdc.Server + if apiServer == "" { + apiServer = "https://api.digitalocean.com" + } + if !strings.Contains(apiServer, "://") { + scheme := "http" + if sdc.HTTPClientConfig.TLSConfig != nil { + scheme = "https" + } + apiServer = scheme + "://" + apiServer + } + proxyAC, err := sdc.ProxyClientConfig.NewConfig(baseDir) + if err != nil { + return nil, fmt.Errorf("cannot parse proxy auth config: %w", err) + } + client, err := discoveryutils.NewClient(apiServer, ac, sdc.ProxyURL, proxyAC) + if err != nil { + return nil, fmt.Errorf("cannot create HTTP client for %q: %w", apiServer, err) + } + cfg := &apiConfig{ + client: client, + port: sdc.Port, + } + if cfg.port == 0 { + cfg.port = 80 + } + return cfg, nil +} + +func getAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { + v, err := configMap.Get(sdc, func() (interface{}, error) { return newAPIConfig(sdc, baseDir) }) + if err != nil { + return nil, err + } + return v.(*apiConfig), nil + +} + +const dropletsAPIPath = "/v2/droplets" + +func getDroplets(getAPIResponse func(string) ([]byte, error)) ([]droplet, error) { + var droplets []droplet + + nextAPIURL := dropletsAPIPath + for nextAPIURL != "" { + data, err := getAPIResponse(nextAPIURL) + if err != nil { + return nil, fmt.Errorf("cannot fetch data from digitalocean list api: %w", err) + } + apiResp, err := parseAPIResponse(data) + if err != nil { + return nil, err + } + droplets = append(droplets, apiResp.Droplets...) 
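+		// The droplets listing is paginated: keep requesting the relative
+		// "links.pages.next" path from each response until the API reports
+		// that no further page is left.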
+ nextAPIURL, err = apiResp.nextURLPath() + if err != nil { + return nil, err + } + } + return droplets, nil +} + +func parseAPIResponse(data []byte) (*listDropletResponse, error) { + var dps listDropletResponse + if err := json.Unmarshal(data, &dps); err != nil { + return nil, fmt.Errorf("failed parse digitalocean api response: %q, err: %w", data, err) + } + return &dps, nil +} diff --git a/lib/promscrape/discovery/digitalocean/api_test.go b/lib/promscrape/discovery/digitalocean/api_test.go new file mode 100644 index 000000000..f9833e4f6 --- /dev/null +++ b/lib/promscrape/discovery/digitalocean/api_test.go @@ -0,0 +1,349 @@ +package digitalocean + +import ( + "reflect" + "testing" +) + +func Test_parseAPIResponse(t *testing.T) { + type args struct { + data []byte + } + tests := []struct { + name string + args args + want *listDropletResponse + wantErr bool + }{ + + { + name: "simple parse", + args: args{data: []byte(`{ + "droplets": [ + { + "id": 3164444, + "name": "example.com", + "memory": 1024, + "vcpus": 1, + "status": "active", + "kernel": { + "id": 2233, + "name": "Ubuntu 14.04 x64 vmlinuz-3.13.0-37-generic", + "version": "3.13.0-37-generic" + }, + "features": [ + "backups", + "ipv6", + "virtio" + ], + "snapshot_ids": [], + "image": { + "id": 6918990, + "name": "14.04 x64", + "distribution": "Ubuntu", + "slug": "ubuntu-16-04-x64", + "public": true, + "regions": [ + "nyc1" + ] + }, + "size_slug": "s-1vcpu-1gb", + "networks": { + "v4": [ + { + "ip_address": "104.236.32.182", + "netmask": "255.255.192.0", + "gateway": "104.236.0.1", + "type": "public" + } + ], + "v6": [ + { + "ip_address": "2604:A880:0800:0010:0000:0000:02DD:4001", + "netmask": 64, + "gateway": "2604:A880:0800:0010:0000:0000:0000:0001", + "type": "public" + } + ] + }, + "region": { + "name": "New York 3", + "slug": "nyc3", + "features": [ + "private_networking", + "backups", + "ipv6" + ] + }, + "tags": [ + "tag1", + "tag2" + ], + "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662" + } + ], + "links": { + "pages": { + "last": "https://api.digitalocean.com/v2/droplets?page=3&per_page=1", + "next": "https://api.digitalocean.com/v2/droplets?page=2&per_page=1" + } + } +}`)}, + want: &listDropletResponse{ + Droplets: []droplet{ + { + Image: struct { + Name string `json:"name"` + Slug string `json:"slug"` + }(struct { + Name string + Slug string + }{Name: "14.04 x64", Slug: "ubuntu-16-04-x64"}), + Region: struct { + Slug string `json:"slug"` + }(struct{ Slug string }{Slug: "nyc3"}), + Networks: networks{ + V6: []network{ + { + IPAddress: "2604:A880:0800:0010:0000:0000:02DD:4001", + Type: "public", + }, + }, + V4: []network{ + { + IPAddress: "104.236.32.182", + Type: "public", + }, + }, + }, + SizeSlug: "s-1vcpu-1gb", + Features: []string{"backups", "ipv6", "virtio"}, + Tags: []string{"tag1", "tag2"}, + Status: "active", + Name: "example.com", + ID: 3164444, + VpcUUID: "f9b0769c-e118-42fb-a0c4-fed15ef69662", + }, + }, + Links: links{ + Pages: struct { + Last string `json:"last,omitempty"` + Next string `json:"next,omitempty"` + }(struct { + Last string + Next string + }{Last: "https://api.digitalocean.com/v2/droplets?page=3&per_page=1", Next: "https://api.digitalocean.com/v2/droplets?page=2&per_page=1"}), + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseAPIResponse(tt.args.data) + if (err != nil) != tt.wantErr { + t.Errorf("parseAPIResponse() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("parseAPIResponse() got = 
\n%v\n, \nwant \n%v\n", got, tt.want)
+			}
+		})
+	}
+}
+
+func Test_getDroplets(t *testing.T) {
+	type args struct {
+		getAPIResponse func(string) ([]byte, error)
+	}
+	tests := []struct {
+		name             string
+		args             args
+		wantDropletCount int
+		wantErr          bool
+	}{
+		{
+			name: "get 5 droplets",
+			args: args{
+				func(s string) ([]byte, error) {
+					var resp []byte
+					switch s {
+					case dropletsAPIPath:
+						// return next
+						resp = []byte(`{ "droplets": [
+    {
+      "id": 3164444,
+      "name": "example.com",
+      "status": "active",
+      "image": {
+        "id": 6918990,
+        "name": "14.04 x64",
+        "distribution": "Ubuntu",
+        "slug": "ubuntu-16-04-x64",
+        "public": true,
+        "regions": [
+          "nyc1"
+        ]
+      },
+      "size_slug": "s-1vcpu-1gb",
+      "networks": {
+        "v4": [
+          {
+            "ip_address": "104.236.32.182",
+            "netmask": "255.255.192.0",
+            "gateway": "104.236.0.1",
+            "type": "public"
+          }
+        ]
+      },
+      "region": {
+        "name": "New York 3",
+        "slug": "nyc3"
+      },
+      "tags": [
+        "tag1",
+        "tag2"
+      ],
+      "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662"
+    },
+    {
+      "id": 3164444,
+      "name": "example.com",
+      "status": "active",
+      "image": {
+        "id": 6918990,
+        "name": "14.04 x64",
+        "distribution": "Ubuntu",
+        "slug": "ubuntu-16-04-x64"
+      },
+      "size_slug": "s-1vcpu-1gb",
+      "networks": {
+        "v4": [
+          {
+            "ip_address": "104.236.32.183",
+            "netmask": "255.255.192.0",
+            "gateway": "104.236.0.1",
+            "type": "public"
+          }
+        ]
+      },
+      "region": {
+        "name": "New York 3",
+        "slug": "nyc3"
+      },
+      "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662"
+    },
+    {
+      "id": 3164444,
+      "name": "example.com",
+      "status": "active",
+      "image": {
+        "id": 6918990,
+        "name": "14.04 x64",
+        "distribution": "Ubuntu",
+        "slug": "ubuntu-16-04-x64"
+      },
+      "size_slug": "s-1vcpu-1gb",
+      "networks": {
+        "v4": [
+          {
+            "ip_address": "104.236.32.183",
+            "netmask": "255.255.192.0",
+            "gateway": "104.236.0.1",
+            "type": "public"
+          }
+        ]
+      },
+      "region": {
+        "name": "New York 3",
+        "slug": "nyc3"
+      },
+      "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662"
+    }
+  ],
+  "links": {
+    "pages": {
+      "last": "https://api.digitalocean.com/v2/droplets?page=3&per_page=1",
+      "next": "https://api.digitalocean.com/v2/droplets?page=2&per_page=1"
+    }
+  }
+}`)
+					default:
+						// return with empty next
+						resp = []byte(`{ "droplets": [
+    {
+      "id": 3164444,
+      "name": "example.com",
+      "status": "active",
+      "image": {
+        "id": 6918990,
+        "name": "14.04 x64",
+        "distribution": "Ubuntu",
+        "slug": "ubuntu-16-04-x64"
+      },
+      "size_slug": "s-1vcpu-1gb",
+      "networks": {
+        "v4": [
+          {
+            "ip_address": "104.236.32.183",
+            "netmask": "255.255.192.0",
+            "gateway": "104.236.0.1",
+            "type": "public"
+          }
+        ]
+      },
+      "region": {
+        "name": "New York 3",
+        "slug": "nyc3"
+      },
+      "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662"
+    },
+    {
+      "id": 3164444,
+      "name": "example.com",
+      "status": "active",
+      "image": {
+        "id": 6918990,
+        "name": "14.04 x64",
+        "distribution": "Ubuntu",
+        "slug": "ubuntu-16-04-x64"
+      },
+      "size_slug": "s-1vcpu-1gb",
+      "networks": {
+        "v4": [
+          {
+            "ip_address": "104.236.32.183",
+            "netmask": "255.255.192.0",
+            "gateway": "104.236.0.1",
+            "type": "public"
+          }
+        ]
+      },
+      "region": {
+        "name": "New York 3",
+        "slug": "nyc3"
+      },
+      "vpc_uuid": "f9b0769c-e118-42fb-a0c4-fed15ef69662"
+    }
+  ]
+}`)
+					}
+					return resp, nil
+				},
+			},
+			wantDropletCount: 5,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := getDroplets(tt.args.getAPIResponse)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("getDroplets() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if len(got) != tt.wantDropletCount {
t.Fatalf("unexpected droplets count: %d, want: %d, \n droplets: %v\n", len(got), tt.wantDropletCount, got) + } + + }) + } +} diff --git a/lib/promscrape/discovery/digitalocean/digitalocean.go b/lib/promscrape/discovery/digitalocean/digitalocean.go new file mode 100644 index 000000000..e498f201a --- /dev/null +++ b/lib/promscrape/discovery/digitalocean/digitalocean.go @@ -0,0 +1,148 @@ +package digitalocean + +import ( + "fmt" + "net/url" + "strings" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy" +) + +// SDConfig represents service discovery config for digital ocean. +// +// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config +type SDConfig struct { + Server string `yaml:"server,omitempty"` + HTTPClientConfig promauth.HTTPClientConfig `yaml:",inline"` + ProxyURL proxy.URL `yaml:"proxy_url,omitempty"` + ProxyClientConfig promauth.ProxyClientConfig `yaml:",inline"` + Port int `yaml:"port,omitempty"` +} + +// GetLabels returns Digital Ocean droplet labels according to sdc. +func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) { + cfg, err := getAPIConfig(sdc, baseDir) + if err != nil { + return nil, fmt.Errorf("cannot get API config: %w", err) + } + droplets, err := getDroplets(cfg.client.GetAPIResponse) + if err != nil { + return nil, err + } + + return addDropletLabels(droplets, cfg.port), nil +} + +// https://developers.digitalocean.com/documentation/v2/#retrieve-an-existing-droplet-by-id +type droplet struct { + ID int `json:"id"` + Name string `json:"name"` + Status string `json:"status"` + + Features []string `json:"features"` + Image struct { + Name string `json:"name"` + Slug string `json:"slug"` + } `json:"image"` + SizeSlug string `json:"size_slug"` + Networks networks `json:"networks"` + Region struct { + Slug string `json:"slug"` + } `json:"region"` + Tags []string `json:"tags"` + VpcUUID string `json:"vpc_uuid"` +} + +func (d *droplet) getIPByNet(netVersion, netType string) string { + var dropletNetworks []network + switch netVersion { + case "v4": + dropletNetworks = d.Networks.V4 + case "v6": + dropletNetworks = d.Networks.V6 + default: + logger.Fatalf("BUG, unexpected network type: %s, want v4 or v6", netVersion) + } + for _, net := range dropletNetworks { + if net.Type == netType { + return net.IPAddress + } + } + return "" +} + +type networks struct { + V4 []network `json:"v4"` + V6 []network `json:"v6"` +} +type network struct { + IPAddress string `json:"ip_address"` + // private | public. 
+	Type string `json:"type"`
+}
+
+// https://developers.digitalocean.com/documentation/v2/#list-all-droplets
+type listDropletResponse struct {
+	Droplets []droplet `json:"droplets,omitempty"`
+	Links    links     `json:"links,omitempty"`
+}
+
+type links struct {
+	Pages struct {
+		Last string `json:"last,omitempty"`
+		Next string `json:"next,omitempty"`
+	} `json:"pages,omitempty"`
+}
+
+func (r *listDropletResponse) nextURLPath() (string, error) {
+	if r.Links.Pages.Next == "" {
+		return "", nil
+	}
+	u, err := url.Parse(r.Links.Pages.Next)
+	if err != nil {
+		return "", fmt.Errorf("cannot parse digital ocean next url: %s, err: %w", r.Links.Pages.Next, err)
+	}
+	return u.RequestURI(), nil
+}
+
+func addDropletLabels(droplets []droplet, defaultPort int) []map[string]string {
+	var ms []map[string]string
+	for _, droplet := range droplets {
+		if len(droplet.Networks.V4) == 0 {
+			continue
+		}
+
+		privateIPv4 := droplet.getIPByNet("v4", "private")
+		publicIPv4 := droplet.getIPByNet("v4", "public")
+		publicIPv6 := droplet.getIPByNet("v6", "public")
+
+		addr := discoveryutils.JoinHostPort(publicIPv4, defaultPort)
+		m := map[string]string{
+			"__address__":                      addr,
+			"__meta_digitalocean_droplet_id":   fmt.Sprintf("%d", droplet.ID),
+			"__meta_digitalocean_droplet_name": droplet.Name,
+			"__meta_digitalocean_image":        droplet.Image.Slug,
+			"__meta_digitalocean_image_name":   droplet.Image.Name,
+			"__meta_digitalocean_private_ipv4": privateIPv4,
+			"__meta_digitalocean_public_ipv4":  publicIPv4,
+			"__meta_digitalocean_public_ipv6":  publicIPv6,
+			"__meta_digitalocean_region":       droplet.Region.Slug,
+			"__meta_digitalocean_size":         droplet.SizeSlug,
+			"__meta_digitalocean_status":       droplet.Status,
+			"__meta_digitalocean_vpc":          droplet.VpcUUID,
+		}
+		if len(droplet.Features) > 0 {
+			// Join with leading and trailing commas so relabeling regexps
+			// can anchor on exact items, e.g. ".*,backups,.*".
+			features := fmt.Sprintf(",%s,", strings.Join(droplet.Features, ","))
+			m["__meta_digitalocean_features"] = features
+		}
+		if len(droplet.Tags) > 0 {
+			tags := fmt.Sprintf(",%s,", strings.Join(droplet.Tags, ","))
+			m["__meta_digitalocean_tags"] = tags
+		}
+		ms = append(ms, m)
+	}
+	return ms
+}
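For context, a minimal `-promscrape.config` entry exercising the discovery code above could look like the following sketch. All values are illustrative: the job name and token are placeholders, `port: 9100` merely assumes node_exporter runs on each droplet (when `port` is omitted, `newAPIConfig` falls back to 80), and `bearer_token` is assumed to be accepted by the inlined HTTP client config, as in Prometheus:

```yaml
scrape_configs:
  - job_name: digitalocean            # hypothetical job name
    digitalocean_sd_configs:
      - bearer_token: "<DO_API_TOKEN>"  # placeholder DigitalOcean API token
        port: 9100                      # assumes node_exporter on every droplet
    relabel_configs:
      # addDropletLabels joins tags as ",tag1,tag2,", so anchor the regex on commas.
      - source_labels: [__meta_digitalocean_tags]
        regex: ".*,monitoring,.*"
        action: keep
```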
"+ "See https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter for details") suppressDuplicateScrapeTargetErrors = flag.Bool("promscrape.suppressDuplicateScrapeTargetErrors", false, "Whether to suppress 'duplicate scrape target' errors; "+ @@ -111,6 +114,7 @@ func runScraper(configFile string, pushData func(wr *prompbmarshal.WriteRequest) scs.add("ec2_sd_configs", *ec2SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getEC2SDScrapeWork(swsPrev) }) scs.add("gce_sd_configs", *gceSDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getGCESDScrapeWork(swsPrev) }) scs.add("dockerswarm_sd_configs", *dockerswarmSDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getDockerSwarmSDScrapeWork(swsPrev) }) + scs.add("digitalocean_sd_configs", *digitaloceanSDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getDigitalOceanDScrapeWork(swsPrev) }) var tickerCh <-chan time.Time if *configCheckInterval > 0 { From 05bc9667c1db22f380fd07aad4d5582c25bb2d10 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 13:18:13 +0300 Subject: [PATCH 09/26] docs/CHANGELOG.md: document the addition of DigitalOcean service discovery Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367 --- docs/CHANGELOG.md | 2 ++ docs/Single-server-VictoriaMetrics.md | 3 +++ 2 files changed, 5 insertions(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 3dee1bf8a..0abaae99f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -6,6 +6,8 @@ sort: 15 ## tip +* FEATURE: vmagent: add service discovery for DigitalOcean (aka [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367). + ## [v1.61.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.61.1) diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index ad9d01f43..012c133de 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -347,6 +347,7 @@ Currently the following [scrape_config](https://prometheus.io/docs/prometheus/la * [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) * [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) * [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) +* [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config) Other `*_sd_config` types will be supported in the future. @@ -1725,6 +1726,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li Wait time used by Consul service discovery. Default value is used if not set -promscrape.consulSDCheckInterval duration Interval for checking for changes in Consul. This works only if consul_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config for details (default 30s) + -promscrape.digitaloceanSDCheckInterval duration + Interval for checking for changes in digital ocean. This works only if digitalocean_sd_configs is configured in '-promscrape.config' file. 
See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config for details (default 1m0s) -promscrape.disableCompression Whether to disable sending 'Accept-Encoding: gzip' request headers to all the scrape targets. This may reduce CPU usage on scrape targets at the cost of higher network bandwidth utilization. It is possible to set 'disable_compression: true' individually per each 'scrape_config' section in '-promscrape.config' for fine grained control -promscrape.disableKeepAlive From a90012ef26a9f4724291bb01b21b4cab1c25c952 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 14 Jun 2021 13:31:59 +0300 Subject: [PATCH 10/26] dashboard: bump version requirements (#1378) --- dashboards/victoriametrics.json | 2 +- dashboards/vmagent.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 1741e6bd5..889522dce 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -45,7 +45,7 @@ } ] }, - "description": "Overview for single node VictoriaMetrics v1.56.0 or higher", + "description": "Overview for single node VictoriaMetrics v1.57.0 or higher", "editable": true, "gnetId": 10229, "graphTooltip": 0, diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 67ef5ce01..7e211cf2d 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -51,7 +51,7 @@ } ] }, - "description": "Overview for VictoriaMetrics vmagent v1.56.0 or higher", + "description": "Overview for VictoriaMetrics vmagent v1.57.0 or higher", "editable": true, "gnetId": null, "graphTooltip": 1, From 7b283ee91cfcdbf73f64feb5b17cebd4ea7eefa5 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 13:41:28 +0300 Subject: [PATCH 11/26] vendor: update github.com/klauspost/compress from v1.13.0 to v1.13.1 --- go.mod | 2 +- go.sum | 4 +-- .../klauspost/compress/zstd/blockdec.go | 4 +-- .../compress/zstd/decoder_options.go | 25 +++++++++++++-- .../klauspost/compress/zstd/framedec.go | 32 ++++++++++--------- vendor/modules.txt | 2 +- 6 files changed, 45 insertions(+), 24 deletions(-) diff --git a/go.mod b/go.mod index 23e1399eb..6a9963ace 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.3 github.com/influxdata/influxdb v1.9.2 - github.com/klauspost/compress v1.13.0 + github.com/klauspost/compress v1.13.1 github.com/mattn/go-isatty v0.0.13 // indirect github.com/mattn/go-runewidth v0.0.13 // indirect github.com/prometheus/common v0.28.0 // indirect diff --git a/go.sum b/go.sum index e8b57b041..8b2371376 100644 --- a/go.sum +++ b/go.sum @@ -588,8 +588,8 @@ github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0 github.com/klauspost/compress v1.10.7/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/compress v1.11.0/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= -github.com/klauspost/compress v1.13.0 h1:2T7tUoQrQT+fQWdaY5rjWztFGAFwbGD04iPJg90ZiOs= -github.com/klauspost/compress v1.13.0/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= +github.com/klauspost/compress v1.13.1 h1:wXr2uRxZTJXHLly6qhJabee5JqIhTRoLBhDOA74hDEQ= +github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/cpuid 
v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg= github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index e30af505c..8a98c4562 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -168,10 +168,10 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { // Read block data. if cap(b.dataStorage) < cSize { - if b.lowMem { + if b.lowMem || cSize > maxCompressedBlockSize { b.dataStorage = make([]byte, 0, cSize) } else { - b.dataStorage = make([]byte, 0, maxBlockSize) + b.dataStorage = make([]byte, 0, maxCompressedBlockSize) } } if cap(b.dst) <= maxSize { diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go index c0fd058c2..95cc9b8b8 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go @@ -17,14 +17,16 @@ type decoderOptions struct { lowMem bool concurrent int maxDecodedSize uint64 + maxWindowSize uint64 dicts []dict } func (o *decoderOptions) setDefault() { *o = decoderOptions{ // use less ram: true for now, but may change. - lowMem: true, - concurrent: runtime.GOMAXPROCS(0), + lowMem: true, + concurrent: runtime.GOMAXPROCS(0), + maxWindowSize: MaxWindowSize, } o.maxDecodedSize = 1 << 63 } @@ -52,7 +54,6 @@ func WithDecoderConcurrency(n int) DOption { // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory // non-streaming operations or maximum window size for streaming operations. // This can be used to control memory usage of potentially hostile content. -// For streaming operations, the maximum window size is capped at 1<<30 bytes. // Maximum and default is 1 << 63 bytes. func WithDecoderMaxMemory(n uint64) DOption { return func(o *decoderOptions) error { @@ -81,3 +82,21 @@ func WithDecoderDicts(dicts ...[]byte) DOption { return nil } } + +// WithDecoderMaxWindow allows to set a maximum window size for decodes. +// This allows rejecting packets that will cause big memory usage. +// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting. +// If WithDecoderMaxMemory is set to a lower value, that will be used. +// Default is 512MB, Maximum is ~3.75 TB as per zstandard spec. +func WithDecoderMaxWindow(size uint64) DOption { + return func(o *decoderOptions) error { + if size < MinWindowSize { + return errors.New("WithMaxWindowSize must be at least 1KB, 1024 bytes") + } + if size > (1<<41)+7*(1<<38) { + return errors.New("WithMaxWindowSize must be less than (1<<41) + 7*(1<<38) ~ 3.75TB") + } + o.maxWindowSize = size + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index e8cc9a2c2..989c79f8c 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -22,10 +22,6 @@ type frameDec struct { WindowSize uint64 - // maxWindowSize is the maximum windows size to support. - // should never be bigger than max-int. - maxWindowSize uint64 - // In order queue of blocks being decoded. 
decoding chan *blockDec @@ -50,8 +46,11 @@ type frameDec struct { } const ( - // The minimum Window_Size is 1 KB. + // MinWindowSize is the minimum Window Size, which is 1 KB. MinWindowSize = 1 << 10 + + // MaxWindowSize is the maximum encoder window size + // and the default decoder maximum window size. MaxWindowSize = 1 << 29 ) @@ -61,12 +60,11 @@ var ( ) func newFrameDec(o decoderOptions) *frameDec { - d := frameDec{ - o: o, - maxWindowSize: MaxWindowSize, + if o.maxWindowSize > o.maxDecodedSize { + o.maxWindowSize = o.maxDecodedSize } - if d.maxWindowSize > o.maxDecodedSize { - d.maxWindowSize = o.maxDecodedSize + d := frameDec{ + o: o, } return &d } @@ -251,13 +249,17 @@ func (d *frameDec) reset(br byteBuffer) error { } } - if d.WindowSize > d.maxWindowSize { - printf("window size %d > max %d\n", d.WindowSize, d.maxWindowSize) + if d.WindowSize > uint64(d.o.maxWindowSize) { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } return ErrWindowSizeExceeded } // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { - println("got window size: ", d.WindowSize) + if debugDecoder { + println("got window size: ", d.WindowSize) + } return ErrWindowSizeTooSmall } d.history.windowSize = int(d.WindowSize) @@ -352,8 +354,8 @@ func (d *frameDec) checkCRC() error { func (d *frameDec) initAsync() { if !d.o.lowMem && !d.SingleSegment { - // set max extra size history to 10MB. - d.history.maxSize = d.history.windowSize + maxBlockSize*5 + // set max extra size history to 2MB. + d.history.maxSize = d.history.windowSize + maxBlockSize } // re-alloc if more than one extra block size. if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize { diff --git a/vendor/modules.txt b/vendor/modules.txt index e8002b783..034d92db9 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -128,7 +128,7 @@ github.com/jmespath/go-jmespath github.com/jstemmer/go-junit-report github.com/jstemmer/go-junit-report/formatter github.com/jstemmer/go-junit-report/parser -# github.com/klauspost/compress v1.13.0 +# github.com/klauspost/compress v1.13.1 ## explicit github.com/klauspost/compress/flate github.com/klauspost/compress/fse From 36d55bff6638795466a2c4f716754b7b355f5b58 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 14:01:13 +0300 Subject: [PATCH 12/26] lib/promscrape: show the number of samples collected during the last scrape at /targets and /api/v1/targets pages Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377 --- docs/CHANGELOG.md | 1 + lib/promscrape/scrapework.go | 4 +- lib/promscrape/targets_response.qtpl | 7 +- lib/promscrape/targets_response.qtpl.go | 280 ++++++++++++------------ lib/promscrape/targetstatus.go | 11 +- 5 files changed, 160 insertions(+), 143 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 0abaae99f..212840203 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,7 @@ sort: 15 ## tip * FEATURE: vmagent: add service discovery for DigitalOcean (aka [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367). +* FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too big or too low a number of samples. 
See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377). ## [v1.61.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.61.1) diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go index 792234429..281a0e57e 100644 --- a/lib/promscrape/scrapework.go +++ b/lib/promscrape/scrapework.go @@ -324,7 +324,7 @@ func (sw *scrapeWork) scrapeInternal(scrapeTimestamp, realTimestamp int64) error // body must be released only after wc is released, since wc refers to body. sw.prevBodyLen = len(body.B) leveledbytebufferpool.Put(body) - tsmGlobal.Update(sw.Config, sw.ScrapeGroup, up == 1, realTimestamp, int64(duration*1000), err) + tsmGlobal.Update(sw.Config, sw.ScrapeGroup, up == 1, realTimestamp, int64(duration*1000), samplesScraped, err) return err } @@ -391,7 +391,7 @@ func (sw *scrapeWork) scrapeStream(scrapeTimestamp, realTimestamp int64) error { sw.prevLabelsLen = len(wc.labels) wc.reset() writeRequestCtxPool.Put(wc) - tsmGlobal.Update(sw.Config, sw.ScrapeGroup, up == 1, realTimestamp, int64(duration*1000), err) + tsmGlobal.Update(sw.Config, sw.ScrapeGroup, up == 1, realTimestamp, int64(duration*1000), samplesScraped, err) return err } diff --git a/lib/promscrape/targets_response.qtpl b/lib/promscrape/targets_response.qtpl index fabd7b103..6120e3678 100644 --- a/lib/promscrape/targets_response.qtpl +++ b/lib/promscrape/targets_response.qtpl @@ -19,7 +19,8 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) {% if showOriginLabels %}, originalLabels={%s= ol %}{% endif %}, last_scrape={%f.3 ts.lastScrapeTime.Seconds() %}s ago, scrape_duration={%f.3 ts.scrapeDuration.Seconds() %}s, - error={%q= ts.error %} + samples_scraped={%d ts.samplesScraped %}, + error={%q= ts.errMsg %} {% newline %} {% endfor %} {% endfor %} @@ -62,6 +63,7 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) Labels Last Scrape Scrape Duration + Samples Scraped Error @@ -76,7 +78,8 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) {%f.3 ts.lastScrapeTime.Seconds() %}s ago {%f.3 ts.scrapeDuration.Seconds() %}s - {%s ts.error %} + {%d ts.samplesScraped %} + {%s ts.errMsg %} {% endfor %} diff --git a/lib/promscrape/targets_response.qtpl.go b/lib/promscrape/targets_response.qtpl.go index 0c9d9699a..bce59cdcd 100644 --- a/lib/promscrape/targets_response.qtpl.go +++ b/lib/promscrape/targets_response.qtpl.go @@ -94,280 +94,288 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatu //line lib/promscrape/targets_response.qtpl:21 qw422016.N().FPrec(ts.scrapeDuration.Seconds(), 3) //line lib/promscrape/targets_response.qtpl:21 - qw422016.N().S(`s, error=`) + qw422016.N().S(`s, samples_scraped=`) //line lib/promscrape/targets_response.qtpl:22 - qw422016.N().Q(ts.error) + qw422016.N().D(ts.samplesScraped) //line lib/promscrape/targets_response.qtpl:22 - qw422016.N().S(` `) + qw422016.N().S(`, error=`) //line lib/promscrape/targets_response.qtpl:23 + qw422016.N().Q(ts.errMsg) +//line lib/promscrape/targets_response.qtpl:23 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:24 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:23 +//line lib/promscrape/targets_response.qtpl:24 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:24 +//line lib/promscrape/targets_response.qtpl:25 } -//line lib/promscrape/targets_response.qtpl:24 +//line lib/promscrape/targets_response.qtpl:25 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:25 - } -//line 
lib/promscrape/targets_response.qtpl:25 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:26 + } +//line lib/promscrape/targets_response.qtpl:26 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:27 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:26 +//line lib/promscrape/targets_response.qtpl:27 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 } -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 func WriteTargetsResponsePlain(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, showOriginLabels bool) { -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 StreamTargetsResponsePlain(qw422016, jts, showOriginLabels) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 } -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 func TargetsResponsePlain(jts []jobTargetsStatuses, showOriginLabels bool) string { -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 WriteTargetsResponsePlain(qb422016, jts, showOriginLabels) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 return qs422016 -//line lib/promscrape/targets_response.qtpl:28 +//line lib/promscrape/targets_response.qtpl:29 } -//line lib/promscrape/targets_response.qtpl:30 +//line lib/promscrape/targets_response.qtpl:31 func StreamTargetsResponseHTML(qw422016 *qt422016.Writer, jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) { -//line lib/promscrape/targets_response.qtpl:30 +//line lib/promscrape/targets_response.qtpl:31 qw422016.N().S(` Scrape targets

Scrape targets

`) -//line lib/promscrape/targets_response.qtpl:51 +//line lib/promscrape/targets_response.qtpl:52 for _, js := range jts { -//line lib/promscrape/targets_response.qtpl:51 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:52 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:53 if onlyUnhealthy && js.upCount == js.targetsTotal { -//line lib/promscrape/targets_response.qtpl:52 +//line lib/promscrape/targets_response.qtpl:53 continue -//line lib/promscrape/targets_response.qtpl:52 +//line lib/promscrape/targets_response.qtpl:53 } -//line lib/promscrape/targets_response.qtpl:52 +//line lib/promscrape/targets_response.qtpl:53 qw422016.N().S(`

`) -//line lib/promscrape/targets_response.qtpl:55 +//line lib/promscrape/targets_response.qtpl:56 qw422016.E().S(js.job) -//line lib/promscrape/targets_response.qtpl:55 +//line lib/promscrape/targets_response.qtpl:56 qw422016.N().S(` (`) -//line lib/promscrape/targets_response.qtpl:55 +//line lib/promscrape/targets_response.qtpl:56 qw422016.N().D(js.upCount) -//line lib/promscrape/targets_response.qtpl:55 +//line lib/promscrape/targets_response.qtpl:56 qw422016.N().S(`/`) -//line lib/promscrape/targets_response.qtpl:55 +//line lib/promscrape/targets_response.qtpl:56 qw422016.N().D(js.targetsTotal) -//line lib/promscrape/targets_response.qtpl:55 - qw422016.N().S(` up)

`) -//line lib/promscrape/targets_response.qtpl:69 +//line lib/promscrape/targets_response.qtpl:56 + qw422016.N().S(` up)
Endpoint State Labels Last Scrape Scrape Duration Error
`) +//line lib/promscrape/targets_response.qtpl:71 for _, ts := range js.targetsStatus { -//line lib/promscrape/targets_response.qtpl:69 +//line lib/promscrape/targets_response.qtpl:71 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:70 +//line lib/promscrape/targets_response.qtpl:72 if onlyUnhealthy && ts.up { -//line lib/promscrape/targets_response.qtpl:70 +//line lib/promscrape/targets_response.qtpl:72 continue -//line lib/promscrape/targets_response.qtpl:70 +//line lib/promscrape/targets_response.qtpl:72 } -//line lib/promscrape/targets_response.qtpl:70 +//line lib/promscrape/targets_response.qtpl:72 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:81 +//line lib/promscrape/targets_response.qtpl:84 } -//line lib/promscrape/targets_response.qtpl:81 +//line lib/promscrape/targets_response.qtpl:84 qw422016.N().S(`
Endpoint State Labels Last Scrape Scrape Duration Samples Scraped Error
`) -//line lib/promscrape/targets_response.qtpl:72 +//line lib/promscrape/targets_response.qtpl:74 qw422016.E().S(ts.endpoint) -//line lib/promscrape/targets_response.qtpl:72 +//line lib/promscrape/targets_response.qtpl:74 qw422016.N().S(`
`) -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 if ts.up { -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 qw422016.N().S(`UP`) -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 } else { -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 qw422016.N().S(`DOWN`) -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 } -//line lib/promscrape/targets_response.qtpl:73 +//line lib/promscrape/targets_response.qtpl:75 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:77 streamformatLabel(qw422016, ts.labels) -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:77 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:77 +//line lib/promscrape/targets_response.qtpl:79 qw422016.N().FPrec(ts.lastScrapeTime.Seconds(), 3) -//line lib/promscrape/targets_response.qtpl:77 +//line lib/promscrape/targets_response.qtpl:79 qw422016.N().S(`s ago `) -//line lib/promscrape/targets_response.qtpl:78 +//line lib/promscrape/targets_response.qtpl:80 qw422016.N().FPrec(ts.scrapeDuration.Seconds(), 3) -//line lib/promscrape/targets_response.qtpl:78 +//line lib/promscrape/targets_response.qtpl:80 qw422016.N().S(`s `) -//line lib/promscrape/targets_response.qtpl:79 - qw422016.E().S(ts.error) -//line lib/promscrape/targets_response.qtpl:79 +//line lib/promscrape/targets_response.qtpl:81 + qw422016.N().D(ts.samplesScraped) +//line lib/promscrape/targets_response.qtpl:81 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:82 + qw422016.E().S(ts.errMsg) +//line lib/promscrape/targets_response.qtpl:82 qw422016.N().S(`
`) -//line lib/promscrape/targets_response.qtpl:85 +//line lib/promscrape/targets_response.qtpl:88 } -//line lib/promscrape/targets_response.qtpl:85 +//line lib/promscrape/targets_response.qtpl:88 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 } -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 func WriteTargetsResponseHTML(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) { -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 StreamTargetsResponseHTML(qw422016, jts, redirectPath, onlyUnhealthy) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 } -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 func TargetsResponseHTML(jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) string { -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 WriteTargetsResponseHTML(qb422016, jts, redirectPath, onlyUnhealthy) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 return qs422016 -//line lib/promscrape/targets_response.qtpl:88 +//line lib/promscrape/targets_response.qtpl:91 } -//line lib/promscrape/targets_response.qtpl:90 +//line lib/promscrape/targets_response.qtpl:93 func streamformatLabel(qw422016 *qt422016.Writer, labels []prompbmarshal.Label) { -//line lib/promscrape/targets_response.qtpl:90 +//line lib/promscrape/targets_response.qtpl:93 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:94 for _, label := range labels { -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:94 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.E().S(label.Name) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.N().S(`=`) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.E().Q(label.Value) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:92 +//line lib/promscrape/targets_response.qtpl:95 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:93 +//line lib/promscrape/targets_response.qtpl:96 } -//line lib/promscrape/targets_response.qtpl:93 +//line lib/promscrape/targets_response.qtpl:96 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:94 
+//line lib/promscrape/targets_response.qtpl:97 } -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 func writeformatLabel(qq422016 qtio422016.Writer, labels []prompbmarshal.Label) { -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 streamformatLabel(qw422016, labels) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 } -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 func formatLabel(labels []prompbmarshal.Label) string { -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 writeformatLabel(qb422016, labels) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 return qs422016 -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:97 } diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go index 21998f47b..eee6a548a 100644 --- a/lib/promscrape/targetstatus.go +++ b/lib/promscrape/targetstatus.go @@ -88,7 +88,7 @@ func (tsm *targetStatusMap) Unregister(sw *ScrapeWork) { tsm.mu.Unlock() } -func (tsm *targetStatusMap) Update(sw *ScrapeWork, group string, up bool, scrapeTime, scrapeDuration int64, err error) { +func (tsm *targetStatusMap) Update(sw *ScrapeWork, group string, up bool, scrapeTime, scrapeDuration int64, samplesScraped int, err error) { tsm.mu.Lock() ts := tsm.m[sw] if ts == nil { @@ -101,6 +101,7 @@ func (tsm *targetStatusMap) Update(sw *ScrapeWork, group string, up bool, scrape ts.scrapeGroup = group ts.scrapeTime = scrapeTime ts.scrapeDuration = scrapeDuration + ts.samplesScraped = samplesScraped ts.err = err tsm.mu.Unlock() } @@ -156,6 +157,7 @@ func (tsm *targetStatusMap) WriteActiveTargetsJSON(w io.Writer) { fmt.Fprintf(w, `,"lastError":%q`, errMsg) fmt.Fprintf(w, `,"lastScrape":%q`, time.Unix(st.scrapeTime/1000, (st.scrapeTime%1000)*1e6).Format(time.RFC3339Nano)) fmt.Fprintf(w, `,"lastScrapeDuration":%g`, (time.Millisecond * time.Duration(st.scrapeDuration)).Seconds()) + fmt.Fprintf(w, `,"lastSamplesScraped":%d`, st.samplesScraped) state := "up" if !st.up { state = "down" @@ -185,6 +187,7 @@ type targetStatus struct { scrapeGroup string scrapeTime int64 scrapeDuration int64 + samplesScraped int err error } @@ -270,7 +273,8 @@ type jobTargetStatus struct { originalLabels []prompbmarshal.Label lastScrapeTime time.Duration scrapeDuration time.Duration - error string + samplesScraped int + errMsg string } type jobTargetsStatuses struct { @@ -313,7 +317,8 @@ func (tsm *targetStatusMap) getTargetsStatusByJob() []jobTargetsStatuses { originalLabels: st.sw.OriginalLabels, lastScrapeTime: st.getDurationFromLastScrape(), scrapeDuration: time.Duration(st.scrapeDuration) * time.Millisecond, - error: errMsg, + 
samplesScraped: st.samplesScraped, + errMsg: errMsg, }) } jts = append(jts, jobTargetsStatuses{ From af90c3c43b69a7cda3a9cfa6697cc90afbf9936d Mon Sep 17 00:00:00 2001 From: faceair Date: Mon, 14 Jun 2021 20:10:58 +0800 Subject: [PATCH 13/26] lib/protoparser: stop read when callback error (#1380) --- lib/protoparser/prometheus/streamparser.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/protoparser/prometheus/streamparser.go b/lib/protoparser/prometheus/streamparser.go index 91009692f..abc937d5c 100644 --- a/lib/protoparser/prometheus/streamparser.go +++ b/lib/protoparser/prometheus/streamparser.go @@ -56,7 +56,7 @@ func ParseStream(r io.Reader, defaultTimestamp int64, isGzipped bool, callback f func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.CallbackError() != nil { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf) @@ -77,7 +77,7 @@ type streamContext struct { err error wg sync.WaitGroup - callbackErrLock sync.Mutex + callbackErrLock sync.RWMutex callbackErr error } @@ -88,6 +88,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) CallbackError() error { + ctx.callbackErrLock.RLock() + callbackErr := ctx.callbackErr + ctx.callbackErrLock.RUnlock() + return callbackErr +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] From e028ad241ac972ee1c5223a6249398e9357fe237 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 14 Jun 2021 15:18:46 +0300 Subject: [PATCH 14/26] lib/protoparser: stop reading the input stream as soon as the callback provided by the caller returns error This is a follow-up for af90c3c43b69a7cda3a9cfa6697cc90afbf9936d --- lib/protoparser/csvimport/streamparser.go | 9 ++++++++- lib/protoparser/graphite/streamparser.go | 9 ++++++++- lib/protoparser/influx/streamparser.go | 9 ++++++++- lib/protoparser/opentsdb/streamparser.go | 9 ++++++++- lib/protoparser/prometheus/streamparser.go | 14 +++++++------- lib/protoparser/vmimport/streamparser.go | 9 ++++++++- 6 files changed, 47 insertions(+), 12 deletions(-) diff --git a/lib/protoparser/csvimport/streamparser.go b/lib/protoparser/csvimport/streamparser.go index 918f40baf..7f5d5ed19 100644 --- a/lib/protoparser/csvimport/streamparser.go +++ b/lib/protoparser/csvimport/streamparser.go @@ -69,7 +69,7 @@ func ParseStream(req *http.Request, callback func(rows []Row) error) error { func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf) @@ -107,6 +107,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] diff --git a/lib/protoparser/graphite/streamparser.go b/lib/protoparser/graphite/streamparser.go index 93ac45394..010771c4d 100644 --- a/lib/protoparser/graphite/streamparser.go +++ b/lib/protoparser/graphite/streamparser.go @@ -54,7 +54,7 @@ func ParseStream(r io.Reader, callback func(rows []Row) error) error { func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = 
common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf) @@ -86,6 +86,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] diff --git a/lib/protoparser/influx/streamparser.go b/lib/protoparser/influx/streamparser.go index 3da3858b8..c8479cbbc 100644 --- a/lib/protoparser/influx/streamparser.go +++ b/lib/protoparser/influx/streamparser.go @@ -82,7 +82,7 @@ func ParseStream(r io.Reader, isGzipped bool, precision, db string, callback fun func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlockExt(ctx.br, ctx.reqBuf, ctx.tailBuf, maxLineSize.N) @@ -120,6 +120,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] diff --git a/lib/protoparser/opentsdb/streamparser.go b/lib/protoparser/opentsdb/streamparser.go index 14a476e22..c66ad5fca 100644 --- a/lib/protoparser/opentsdb/streamparser.go +++ b/lib/protoparser/opentsdb/streamparser.go @@ -53,7 +53,7 @@ func ParseStream(r io.Reader, callback func(rows []Row) error) error { func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf) @@ -85,6 +85,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] diff --git a/lib/protoparser/prometheus/streamparser.go b/lib/protoparser/prometheus/streamparser.go index abc937d5c..ef7781950 100644 --- a/lib/protoparser/prometheus/streamparser.go +++ b/lib/protoparser/prometheus/streamparser.go @@ -56,7 +56,7 @@ func ParseStream(r io.Reader, defaultTimestamp int64, isGzipped bool, callback f func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil || ctx.CallbackError() != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf) @@ -77,7 +77,7 @@ type streamContext struct { err error wg sync.WaitGroup - callbackErrLock sync.RWMutex + callbackErrLock sync.Mutex callbackErr error } @@ -88,11 +88,11 @@ func (ctx *streamContext) Error() error { return ctx.err } -func (ctx *streamContext) CallbackError() error { - ctx.callbackErrLock.RLock() - callbackErr := ctx.callbackErr - ctx.callbackErrLock.RUnlock() - return callbackErr +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok } func (ctx *streamContext) reset() { diff --git a/lib/protoparser/vmimport/streamparser.go b/lib/protoparser/vmimport/streamparser.go index 7646d6cee..f1ad59709 100644 --- a/lib/protoparser/vmimport/streamparser.go +++ b/lib/protoparser/vmimport/streamparser.go @@ -59,7 
+59,7 @@ func ParseStream(req *http.Request, callback func(rows []Row) error) error { func (ctx *streamContext) Read() bool { readCalls.Inc() - if ctx.err != nil { + if ctx.err != nil || ctx.hasCallbackError() { return false } ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlockExt(ctx.br, ctx.reqBuf, ctx.tailBuf, maxLineLen.N) @@ -97,6 +97,13 @@ func (ctx *streamContext) Error() error { return ctx.err } +func (ctx *streamContext) hasCallbackError() bool { + ctx.callbackErrLock.Lock() + ok := ctx.callbackErr != nil + ctx.callbackErrLock.Unlock() + return ok +} + func (ctx *streamContext) reset() { ctx.br.Reset(nil) ctx.reqBuf = ctx.reqBuf[:0] From 84fb59b0baa2bfe0ebc9495d220b5658f3b0c799 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 15 Jun 2021 14:56:51 +0300 Subject: [PATCH 15/26] lib/storage: move deletedMetricIDs set from indexDB to Storage This makes the list of deleted metricIDs consistent when it is used from both the current indexDB and the previous indexDB (aka extDB). This should fix the issue, which could lead to storing new samples under deleted metricIDs after indexDB rotation. See more details at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136 . Thanks to @tangqipengleoo for the initial analysis and the pull request - https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1383 . This commit resolves the issue in a more generic way compared to https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1383 . The downside of the commit is that the deletedMetricIDs set isn't cleaned of metricIDs outside the retention until the app is restarted. This should be OK in most cases. --- docs/CHANGELOG.md | 2 ++ lib/storage/index_db.go | 64 ++++++++++-------------------- lib/storage/index_db_test.go | 4 ++- lib/storage/storage.go | 40 +++++++++++++++++--- 4 files changed, 58 insertions(+), 52 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 212840203..2b0d1848a 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -9,6 +9,8 @@ sort: 15 * FEATURE: vmagent: add service discovery for DigitalOcean (aka [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367). * FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too big or too low a number of samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377). +* BUGFIX: prevent adding new samples to deleted time series after the rotation of the inverted index (the rotation is performed once per `-retentionPeriod`). See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136) for details. + ## [v1.61.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.61.1) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index f2244438a..5123f98c3 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -103,15 +103,6 @@ type indexDB struct { loopsPerDateTagFilterCache *workingsetcache.Cache indexSearchPool sync.Pool - - // An inmemory set of deleted metricIDs. - // - // The set holds deleted metricIDs for the current db and for the extDB. 
- // - // It is safe to keep the set in memory even for big number of deleted - // metricIDs, since it usually requires 1 bit per deleted metricID. - deletedMetricIDs atomic.Value - deletedMetricIDsUpdateLock sync.Mutex } // openIndexDB opens index db from the given path with the given caches. @@ -140,14 +131,6 @@ func openIndexDB(path string, s *Storage) (*indexDB, error) { uselessTagFiltersCache: workingsetcache.New(mem/128, time.Hour), loopsPerDateTagFilterCache: workingsetcache.New(mem/128, time.Hour), } - - is := db.getIndexSearch(noDeadline) - dmis, err := is.loadDeletedMetricIDs() - db.putIndexSearch(is) - if err != nil { - return nil, fmt.Errorf("cannot load deleted metricIDs: %w", err) - } - db.setDeletedMetricIDs(dmis) return db, nil } @@ -214,7 +197,7 @@ func (db *indexDB) UpdateMetrics(m *IndexDBMetrics) { m.UselessTagFiltersCacheRequests += cs.GetCalls m.UselessTagFiltersCacheMisses += cs.Misses - m.DeletedMetricsCount += uint64(db.getDeletedMetricIDs().Len()) + m.DeletedMetricsCount += uint64(db.s.getDeletedMetricIDs().Len()) m.IndexDBRefCount += atomic.LoadUint64(&db.refCount) m.NewTimeseriesCreated += atomic.LoadUint64(&db.newTimeseriesCreated) @@ -260,12 +243,6 @@ func (db *indexDB) doExtDB(f func(extDB *indexDB)) bool { // // It decrements refCount for the previous extDB. func (db *indexDB) SetExtDB(extDB *indexDB) { - // Add deleted metricIDs from extDB to db. - if extDB != nil { - dmisExt := extDB.getDeletedMetricIDs() - db.updateDeletedMetricIDs(dmisExt) - } - db.extDBLock.Lock() prevExtDB := db.extDB db.extDB = extDB @@ -737,7 +714,7 @@ func (is *indexSearch) searchTagKeysOnDate(tks map[string]struct{}, date uint64, kb := &is.kb mp := &is.mp mp.Reset() - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() loopsPaceLimiter := 0 kb.B = is.marshalCommonPrefix(kb.B[:0], nsPrefixDateTagToMetricIDs) kb.B = encoding.MarshalUint64(kb.B, date) @@ -817,7 +794,7 @@ func (is *indexSearch) searchTagKeys(tks map[string]struct{}, maxTagKeys int) er kb := &is.kb mp := &is.mp mp.Reset() - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() loopsPaceLimiter := 0 kb.B = is.marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs) prefix := kb.B @@ -935,7 +912,7 @@ func (is *indexSearch) searchTagValuesOnDate(tvs map[string]struct{}, tagKey []b kb := &is.kb mp := &is.mp mp.Reset() - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() loopsPaceLimiter := 0 kb.B = is.marshalCommonPrefix(kb.B[:0], nsPrefixDateTagToMetricIDs) kb.B = encoding.MarshalUint64(kb.B, date) @@ -1021,7 +998,7 @@ func (is *indexSearch) searchTagValues(tvs map[string]struct{}, tagKey []byte, m kb := &is.kb mp := &is.mp mp.Reset() - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() loopsPaceLimiter := 0 kb.B = is.marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs) kb.B = marshalTagValue(kb.B, tagKey) @@ -1175,7 +1152,7 @@ func (is *indexSearch) searchTagValueSuffixesForPrefix(tvss map[string]struct{}, ts := &is.ts mp := &is.mp mp.Reset() - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() loopsPaceLimiter := 0 ts.Seek(prefix) for len(tvss) < maxTagValueSuffixes && ts.NextItem() { @@ -1616,7 +1593,7 @@ func (db *indexDB) deleteMetricIDs(metricIDs []uint64) error { // atomically add deleted metricIDs to an inmemory map. 
dmis := &uint64set.Set{} dmis.AddMulti(metricIDs) - db.updateDeletedMetricIDs(dmis) + db.s.updateDeletedMetricIDs(dmis) // Reset TagFilters -> TSIDS cache, since it may contain deleted TSIDs. invalidateTagCache() @@ -1643,21 +1620,14 @@ func (db *indexDB) deleteMetricIDs(metricIDs []uint64) error { return err } -func (db *indexDB) getDeletedMetricIDs() *uint64set.Set { - return db.deletedMetricIDs.Load().(*uint64set.Set) -} - -func (db *indexDB) setDeletedMetricIDs(dmis *uint64set.Set) { - db.deletedMetricIDs.Store(dmis) -} - -func (db *indexDB) updateDeletedMetricIDs(metricIDs *uint64set.Set) { - db.deletedMetricIDsUpdateLock.Lock() - dmisOld := db.getDeletedMetricIDs() - dmisNew := dmisOld.Clone() - dmisNew.Union(metricIDs) - db.setDeletedMetricIDs(dmisNew) - db.deletedMetricIDsUpdateLock.Unlock() +func (db *indexDB) loadDeletedMetricIDs() (*uint64set.Set, error) { + is := db.getIndexSearch(noDeadline) + dmis, err := is.loadDeletedMetricIDs() + db.putIndexSearch(is) + if err != nil { + return nil, err + } + return dmis, nil } func (is *indexSearch) loadDeletedMetricIDs() (*uint64set.Set, error) { @@ -1751,7 +1721,7 @@ func (db *indexDB) searchTSIDs(tfss []*TagFilters, tr TimeRange, maxMetrics int, var tagFiltersKeyBufPool bytesutil.ByteBufferPool func (is *indexSearch) getTSIDByMetricName(dst *TSID, metricName []byte) error { - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() ts := &is.ts kb := &is.kb kb.B = append(kb.B[:0], nsPrefixMetricNameToTSID) @@ -2315,7 +2285,7 @@ func (is *indexSearch) searchMetricIDs(tfss []*TagFilters, tr TimeRange, maxMetr sortedMetricIDs := metricIDs.AppendTo(nil) // Filter out deleted metricIDs. - dmis := is.db.getDeletedMetricIDs() + dmis := is.db.s.getDeletedMetricIDs() if dmis.Len() > 0 { metricIDsFiltered := sortedMetricIDs[:0] for _, metricID := range sortedMetricIDs { diff --git a/lib/storage/index_db_test.go b/lib/storage/index_db_test.go index 681dd8525..6cdeac28a 100644 --- a/lib/storage/index_db_test.go +++ b/lib/storage/index_db_test.go @@ -1711,13 +1711,15 @@ func toTFPointers(tfs []tagFilter) []*tagFilter { } func newTestStorage() *Storage { - return &Storage{ + s := &Storage{ cachePath: "test-storage-cache", metricIDCache: workingsetcache.New(1234, time.Hour), metricNameCache: workingsetcache.New(1234, time.Hour), tsidCache: workingsetcache.New(1234, time.Hour), } + s.setDeletedMetricIDs(&uint64set.Set{}) + return s } func stopTestStorage(s *Storage) { diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 239dce203..bcf129207 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -120,6 +120,13 @@ type Storage struct { // The minimum timestamp when composite index search can be used. minTimestampForCompositeIndex int64 + + // An inmemory set of deleted metricIDs. + // + // It is safe to keep the set in memory even for big number of deleted + // metricIDs, since it usually requires 1 bit per deleted metricID. + deletedMetricIDs atomic.Value + deletedMetricIDsUpdateLock sync.Mutex } // OpenStorage opens storage on the given path with the given retentionMsecs. 
@@ -208,6 +215,18 @@ func OpenStorage(path string, retentionMsecs int64, maxHourlySeries, maxDailySer idbCurr.SetExtDB(idbPrev) s.idbCurr.Store(idbCurr) + // Load deleted metricIDs from idbCurr and idbPrev + dmisCurr, err := idbCurr.loadDeletedMetricIDs() + if err != nil { + return nil, fmt.Errorf("cannot load deleted metricIDs for the current indexDB: %w", err) + } + dmisPrev, err := idbPrev.loadDeletedMetricIDs() + if err != nil { + return nil, fmt.Errorf("cannot load deleted metricIDs for the previous indexDB: %w", err) + } + s.setDeletedMetricIDs(dmisCurr) + s.updateDeletedMetricIDs(dmisPrev) + // Load data tablePath := path + "/data" tb, err := openTable(tablePath, s.getDeletedMetricIDs, retentionMsecs) @@ -224,16 +243,29 @@ func OpenStorage(path string, retentionMsecs int64, maxHourlySeries, maxDailySer return s, nil } +func (s *Storage) getDeletedMetricIDs() *uint64set.Set { + return s.deletedMetricIDs.Load().(*uint64set.Set) +} + +func (s *Storage) setDeletedMetricIDs(dmis *uint64set.Set) { + s.deletedMetricIDs.Store(dmis) +} + +func (s *Storage) updateDeletedMetricIDs(metricIDs *uint64set.Set) { + s.deletedMetricIDsUpdateLock.Lock() + dmisOld := s.getDeletedMetricIDs() + dmisNew := dmisOld.Clone() + dmisNew.Union(metricIDs) + s.setDeletedMetricIDs(dmisNew) + s.deletedMetricIDsUpdateLock.Unlock() +} + // DebugFlush flushes recently added storage data, so it becomes visible to search. func (s *Storage) DebugFlush() { s.tb.flushRawRows() s.idb().tb.DebugFlush() } -func (s *Storage) getDeletedMetricIDs() *uint64set.Set { - return s.idb().getDeletedMetricIDs() -} - // CreateSnapshot creates snapshot for s and returns the snapshot name. func (s *Storage) CreateSnapshot() (string, error) { logger.Infof("creating Storage snapshot for %q...", s.path) From 6a8369f0fcf1aaf74851705505096e4d3ed48c72 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 15 Jun 2021 17:32:52 +0300 Subject: [PATCH 16/26] docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics works great with APM workloads (aka Application Performance Monitoring) --- README.md | 2 +- docs/Single-server-VictoriaMetrics.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 94e86d5bb..c3ed564a8 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ Alphabetically sorted links to case studies: * [Arbitrary CSV data](#how-to-import-csv-data). * Supports metrics' relabeling. See [these docs](#relabeling) for details. * Can deal with high cardinality and high churn rate issues using [series limiter](#cardinality-limiter). -* Ideally works with big amounts of time series data from Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads. +* Ideally works with big amounts of time series data from APM, Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads. * Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster). * See also technical [Articles about VictoriaMetrics](https://docs.victoriametrics.com/Articles.html). diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index 012c133de..234a49807 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -98,7 +98,7 @@ Alphabetically sorted links to case studies: * [Arbitrary CSV data](#how-to-import-csv-data). * Supports metrics' relabeling. See [these docs](#relabeling) for details. 
* Can deal with high cardinality and high churn rate issues using [series limiter](#cardinality-limiter). -* Ideally works with big amounts of time series data from Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads. +* Ideally works with big amounts of time series data from APM, Kubernetes, IoT sensors, connected cars, industrial telemetry, financial data and various Enterprise workloads. * Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster). * See also technical [Articles about VictoriaMetrics](https://docs.victoriametrics.com/Articles.html). From fb4f758715c60909e8e10048f22ae9b63e673999 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 15 Jun 2021 17:37:19 +0300 Subject: [PATCH 17/26] promql: fix `increase_pure` calculation for cases with stale series (#1381) Due to staleness handling, increase_pure was using an incorrect previous value during calculation in cases where a series disappears for a period longer than the staleness period and then returns. The fix accounts for the real data point value observed before staleness takes place. The fix should remove unexpected spikes when using `increase_pure` with stale series. --- app/vmselect/promql/rollup.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/vmselect/promql/rollup.go b/app/vmselect/promql/rollup.go index 4bdb73fca..4e9eb763f 100644 --- a/app/vmselect/promql/rollup.go +++ b/app/vmselect/promql/rollup.go @@ -1332,7 +1332,8 @@ func rollupIncreasePure(rfa *rollupFuncArg) float64 { // There is no need in handling NaNs here, since they must be cleaned up // before calling rollup funcs. values := rfa.values - prevValue := rfa.prevValue + // Use the real previous value, since prevValue may be reset by staleness handling. + prevValue := rfa.realPrevValue if math.IsNaN(prevValue) { if len(values) == 0 { return nan From 0a796f7c3a81852e9e7610ff34d365f1c0c284d9 Mon Sep 17 00:00:00 2001 From: Zongyang <8848479+xiaozongyang@users.noreply.github.com> Date: Wed, 16 Jun 2021 17:16:44 +0800 Subject: [PATCH 18/26] Change default value of '-remoteWrite.queues' to cgroup.AvailableCPUs * 2 (#1385) * Change default value of '-remoteWrite.queues' to cgroup.AvailableCPUs() * 2 to reduce scrape duration The default value of the vmagent option '-remoteWrite.queues' is 4, while the default number of vmagent unmarshal workers equals the number of available CPUs. When the number of available CPUs is much greater than 4, e.g. 32, the workers compete for the push queues, which increases scrape duration and may cause scrape timeouts. * Update README and flag description Co-authored-by: xiaozy --- app/vmagent/README.md | 2 +- app/vmagent/remotewrite/remotewrite.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/vmagent/README.md b/app/vmagent/README.md index 901309994..641a7ab9a 100644 --- a/app/vmagent/README.md +++ b/app/vmagent/README.md @@ -719,7 +719,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234 Supports an array of values separated by comma or specified via multiple flags. -remoteWrite.queues int - The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 4) + The number of concurrent queues to each -remoteWrite.url. 
Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 2 * numberOfAvailableCPUs) -remoteWrite.rateLimit array Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage Supports array of values separated by comma or specified via multiple flags. diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index 1b5b9b3f5..c187515b1 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -28,8 +28,8 @@ var ( "Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems") tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+ "See also -remoteWrite.maxDiskUsagePerURL") - queues = flag.Int("remoteWrite.queues", 4, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+ - "isn't enough for sending high volume of collected data to remote storage") + queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs() * 2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+ + "isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs") showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+ "It is hidden by default, since it can contain sensitive info such as auth key") maxPendingBytesPerURL = flagutil.NewBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+ From 6d17a4e12d7f50a21bfb5a044ef1ba22d3c63c79 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 16 Jun 2021 12:35:39 +0300 Subject: [PATCH 19/26] docs/CHANGELOG.md: document the changed `-remoteWrite.queues` value This is a follow-up for 0a796f7c3a81852e9e7610ff34d365f1c0c284d9 See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1385 --- docs/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2b0d1848a..8e4ea7b22 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -8,6 +8,7 @@ sort: 15 * FEATURE: vmagent: add service discovery for DigitalOcean (aka [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367). * FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too big or too low a number of samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377). +* FEATURE: vmagent: change the default value for `-remoteWrite.queues` from 4 to `2 * numCPUs`. This should reduce scrape duration for highly loaded vmagent, which scrapes tens of thousands of targets. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1385). 
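The CHANGELOG entry above documents the pattern introduced by the previous commit: deriving a flag default from the detected CPU count, so that per-CPU workers don't contend for a small fixed number of queues. A minimal self-contained sketch of that pattern follows; `runtime.NumCPU()` stands in here for the container-aware `cgroup.AvailableCPUs()` helper used in the actual diff, so it is an approximation on hosts with cgroup CPU limits:

```go
package main

import (
	"flag"
	"fmt"
	"runtime"
)

// The default scales with the machine: roughly one unmarshal worker per CPU
// feeds the queues, so 2*CPUs queues keep workers from piling up on a few locks.
var queues = flag.Int("remoteWrite.queues", runtime.NumCPU()*2,
	"The number of concurrent queues to each -remoteWrite.url")

func main() {
	flag.Parse()
	fmt.Printf("pushing via %d concurrent queues\n", *queues)
}
```

Doubling the CPU count leaves headroom, so a temporarily slow queue does not stall the workers feeding it.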
* BUGFIX: prevent adding new samples to deleted time series after the rotation of the inverted index (the rotation is performed once per `-retentionPeriod`). See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136) for details. ## [v1.61.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.61.1) From 9eb3fc346f9ca2dd735823d3c5dcb11f1b8a35ec Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 16 Jun 2021 12:36:49 +0300 Subject: [PATCH 20/26] docs/vmagent.md: sync with app/vmagent/README.md via `make docs-sync` --- docs/vmagent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vmagent.md b/docs/vmagent.md index d257d34e2..1f1a29463 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -723,7 +723,7 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234 Supports an array of values separated by comma or specified via multiple flags. -remoteWrite.queues int - The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 4) + The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage (default 2 * numberOfAvailableCPUs) -remoteWrite.rateLimit array Optional rate limit in bytes per second for data sent to -remoteWrite.url. By default the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data is sent after temporary unavailability of the remote storage Supports array of values separated by comma or specified via multiple flags. From 12a83d25bf35744718d7cd958c776dc20fd4e95d Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 17 Jun 2021 13:26:35 +0300 Subject: [PATCH 21/26] app/vmagent/remotewrite: `go fmt` after 0a796f7c3a81852e9e7610ff34d365f1c0c284d9 --- app/vmagent/remotewrite/remotewrite.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index c187515b1..29a5ef1ff 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -28,7 +28,7 @@ var ( "Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems") tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored. "+ "See also -remoteWrite.maxDiskUsagePerURL") - queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs() * 2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+ + queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs()*2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+ "isn't enough for sending high volume of collected data to remote storage. Default value is 2 * numberOfAvailableCPUs") showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. 
"+ "It is hidden by default, since it can contain sensitive info such as auth key") From aa9b56a046b6ae8083fa659df35dd5e994bf9115 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 17 Jun 2021 13:42:32 +0300 Subject: [PATCH 22/26] lib/{mergeset,storage}: reduce the number of fsync calls on data ingestion path on systems with many cpu cores VictoriaMetrics maintains a buffer per CPU core for the ingested data. These buffers are flushed to disk every second. These buffers are flushed to disk in parallel starting from the commit 56b6b893ce821eaa14ce6a6730b26e0f342b3670 . This resulted in increased write disk IO usage on systems with many cpu cores as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338#issuecomment-863046999 . This commit merges the per-CPU buffers into bigger in-memory buffers before flushing them to disk. This should reduce the rate of fsync syscalls and, consequently, the write disk IO on systems with many CPU cores. This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338 See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1244 --- lib/mergeset/table.go | 88 ++++++++++++++++++------------------ lib/storage/partition.go | 96 ++++++++++++---------------------------- 2 files changed, 75 insertions(+), 109 deletions(-) diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index ba12ada94..00d585cda 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -177,7 +177,7 @@ func (ris *rawItemsShard) Len() int { func (ris *rawItemsShard) addItems(tb *Table, items [][]byte) error { var err error - var blocksToMerge []*inmemoryBlock + var blocksToFlush []*inmemoryBlock ris.mu.Lock() ibs := ris.ibs @@ -200,19 +200,16 @@ func (ris *rawItemsShard) addItems(tb *Table, items [][]byte) error { } } if len(ibs) >= maxBlocksPerShard { - blocksToMerge = ibs - ris.ibs = make([]*inmemoryBlock, 0, maxBlocksPerShard) + blocksToFlush = append(blocksToFlush, ibs...) + for i := range ibs { + ibs[i] = nil + } + ris.ibs = ibs[:0] ris.lastFlushTime = fasttime.UnixTimestamp() } ris.mu.Unlock() - if blocksToMerge == nil { - // Fast path. - return err - } - - // Slow path: merge blocksToMerge. - tb.mergeRawItemsBlocks(blocksToMerge) + tb.mergeRawItemsBlocks(blocksToFlush) return err } @@ -586,58 +583,65 @@ func (riss *rawItemsShards) flush(tb *Table, isFinal bool) { tb.rawItemsPendingFlushesWG.Add(1) defer tb.rawItemsPendingFlushesWG.Done() - var wg sync.WaitGroup - wg.Add(len(riss.shards)) + var blocksToFlush []*inmemoryBlock for i := range riss.shards { - go func(ris *rawItemsShard) { - ris.flush(tb, isFinal) - wg.Done() - }(&riss.shards[i]) + blocksToFlush = riss.shards[i].appendBlocksToFlush(blocksToFlush, tb, isFinal) } - wg.Wait() + tb.mergeRawItemsBlocks(blocksToFlush) } -func (ris *rawItemsShard) flush(tb *Table, isFinal bool) { - mustFlush := false +func (ris *rawItemsShard) appendBlocksToFlush(dst []*inmemoryBlock, tb *Table, isFinal bool) []*inmemoryBlock { currentTime := fasttime.UnixTimestamp() flushSeconds := int64(rawItemsFlushInterval.Seconds()) if flushSeconds <= 0 { flushSeconds = 1 } - var blocksToMerge []*inmemoryBlock ris.mu.Lock() if isFinal || currentTime-ris.lastFlushTime > uint64(flushSeconds) { - mustFlush = true - blocksToMerge = ris.ibs - ris.ibs = make([]*inmemoryBlock, 0, maxBlocksPerShard) + ibs := ris.ibs + dst = append(dst, ibs...) 
+ for i := range ibs { + ibs[i] = nil + } + ris.ibs = ibs[:0] ris.lastFlushTime = currentTime } ris.mu.Unlock() - if mustFlush { - tb.mergeRawItemsBlocks(blocksToMerge) - } + return dst } -func (tb *Table) mergeRawItemsBlocks(blocksToMerge []*inmemoryBlock) { +func (tb *Table) mergeRawItemsBlocks(ibs []*inmemoryBlock) { + if len(ibs) == 0 { + return + } tb.partMergersWG.Add(1) defer tb.partMergersWG.Done() - pws := make([]*partWrapper, 0, (len(blocksToMerge)+defaultPartsToMerge-1)/defaultPartsToMerge) - for len(blocksToMerge) > 0 { + pws := make([]*partWrapper, 0, (len(ibs)+defaultPartsToMerge-1)/defaultPartsToMerge) + var pwsLock sync.Mutex + var wg sync.WaitGroup + for len(ibs) > 0 { n := defaultPartsToMerge - if n > len(blocksToMerge) { - n = len(blocksToMerge) + if n > len(ibs) { + n = len(ibs) } - pw := tb.mergeInmemoryBlocks(blocksToMerge[:n]) - blocksToMerge = blocksToMerge[n:] - if pw == nil { - continue - } - pw.isInMerge = true - pws = append(pws, pw) + wg.Add(1) + go func(ibsPart []*inmemoryBlock) { + defer wg.Done() + pw := tb.mergeInmemoryBlocks(ibsPart) + if pw == nil { + return + } + pw.isInMerge = true + pwsLock.Lock() + pws = append(pws, pw) + pwsLock.Unlock() + }(ibs[:n]) + ibs = ibs[n:] } + wg.Wait() if len(pws) > 0 { if err := tb.mergeParts(pws, nil, true); err != nil { logger.Panicf("FATAL: cannot merge raw parts: %s", err) @@ -672,10 +676,10 @@ func (tb *Table) mergeRawItemsBlocks(blocksToMerge []*inmemoryBlock) { } } -func (tb *Table) mergeInmemoryBlocks(blocksToMerge []*inmemoryBlock) *partWrapper { - // Convert blocksToMerge into inmemoryPart's - mps := make([]*inmemoryPart, 0, len(blocksToMerge)) - for _, ib := range blocksToMerge { +func (tb *Table) mergeInmemoryBlocks(ibs []*inmemoryBlock) *partWrapper { + // Convert ibs into inmemoryPart's + mps := make([]*inmemoryPart, 0, len(ibs)) + for _, ib := range ibs { if len(ib.items) == 0 { continue } diff --git a/lib/storage/partition.go b/lib/storage/partition.go index 3288112f2..11d62c980 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -4,7 +4,6 @@ import ( "errors" "fmt" "io/ioutil" - "math/bits" "os" "path/filepath" "sort" @@ -478,11 +477,12 @@ func (rrs *rawRowsShard) Len() int { } func (rrs *rawRowsShard) addRows(pt *partition, rows []rawRow) { - var rrss []*rawRows + var rowsToFlush []rawRow rrs.mu.Lock() if cap(rrs.rows) == 0 { - rrs.rows = getRawRowsMaxSize().rows + n := getMaxRawRowsPerShard() + rrs.rows = make([]rawRow, 0, n) } maxRowsCount := cap(rrs.rows) for { @@ -494,65 +494,35 @@ func (rrs *rawRowsShard) addRows(pt *partition, rows []rawRow) { } // Slow path - rows don't fit capacity. - // Fill rawRows to capacity and convert it to a part. - rrs.rows = append(rrs.rows, rows[:capacity]...) - rows = rows[capacity:] - rr := getRawRowsMaxSize() - rrs.rows, rr.rows = rr.rows, rrs.rows - rrss = append(rrss, rr) + // Put rrs.rows and rows to rowsToFlush and convert it to a part. + rowsToFlush = append(rowsToFlush, rrs.rows...) + rowsToFlush = append(rowsToFlush, rows...) 
+ rrs.rows = rrs.rows[:0] rrs.lastFlushTime = fasttime.UnixTimestamp() } rrs.mu.Unlock() - for _, rr := range rrss { - pt.addRowsPart(rr.rows) - putRawRows(rr) - } + pt.flushRowsToParts(rowsToFlush) } -type rawRows struct { - rows []rawRow -} - -func getRawRowsMaxSize() *rawRows { - size := getMaxRawRowsPerShard() - return getRawRowsWithSize(size) -} - -func getRawRowsWithSize(size int) *rawRows { - p, sizeRounded := getRawRowsPool(size) - v := p.Get() - if v == nil { - return &rawRows{ - rows: make([]rawRow, 0, sizeRounded), +func (pt *partition) flushRowsToParts(rows []rawRow) { + maxRows := getMaxRawRowsPerShard() + var wg sync.WaitGroup + for len(rows) > 0 { + n := maxRows + if n > len(rows) { + n = len(rows) } + wg.Add(1) + go func(rowsPart []rawRow) { + defer wg.Done() + pt.addRowsPart(rowsPart) + }(rows[:n]) + rows = rows[n:] } - return v.(*rawRows) + wg.Wait() } -func putRawRows(rr *rawRows) { - rr.rows = rr.rows[:0] - size := cap(rr.rows) - p, _ := getRawRowsPool(size) - p.Put(rr) -} - -func getRawRowsPool(size int) (*sync.Pool, int) { - size-- - if size < 0 { - size = 0 - } - bucketIdx := 64 - bits.LeadingZeros64(uint64(size)) - if bucketIdx >= len(rawRowsPools) { - bucketIdx = len(rawRowsPools) - 1 - } - p := &rawRowsPools[bucketIdx] - sizeRounded := 1 << uint(bucketIdx) - return p, sizeRounded -} - -var rawRowsPools [19]sync.Pool - func (pt *partition) addRowsPart(rows []rawRow) { if len(rows) == 0 { return @@ -749,19 +719,14 @@ func (pt *partition) flushRawRows(isFinal bool) { } func (rrss *rawRowsShards) flush(pt *partition, isFinal bool) { - var wg sync.WaitGroup - wg.Add(len(rrss.shards)) + var rowsToFlush []rawRow for i := range rrss.shards { - go func(rrs *rawRowsShard) { - rrs.flush(pt, isFinal) - wg.Done() - }(&rrss.shards[i]) + rowsToFlush = rrss.shards[i].appendRawRowsToFlush(rowsToFlush, pt, isFinal) } - wg.Wait() + pt.flushRowsToParts(rowsToFlush) } -func (rrs *rawRowsShard) flush(pt *partition, isFinal bool) { - var rr *rawRows +func (rrs *rawRowsShard) appendRawRowsToFlush(dst []rawRow, pt *partition, isFinal bool) []rawRow { currentTime := fasttime.UnixTimestamp() flushSeconds := int64(rawRowsFlushInterval.Seconds()) if flushSeconds <= 0 { @@ -770,15 +735,12 @@ func (rrs *rawRowsShard) flush(pt *partition, isFinal bool) { rrs.mu.Lock() if isFinal || currentTime-rrs.lastFlushTime > uint64(flushSeconds) { - rr = getRawRowsMaxSize() - rrs.rows, rr.rows = rr.rows, rrs.rows + dst = append(dst, rrs.rows...) + rrs.rows = rrs.rows[:0] } rrs.mu.Unlock() - if rr != nil { - pt.addRowsPart(rr.rows) - putRawRows(rr) - } + return dst } func (pt *partition) startInmemoryPartsFlusher() { From dcbc22552f0dcfa3075982eabeda44b0a28fd428 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 17 Jun 2021 14:27:14 +0300 Subject: [PATCH 23/26] lib/storage: fix infinite loop introduced in aa9b56a046b6ae8083fa659df35dd5e994bf9115 --- lib/storage/partition.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/lib/storage/partition.go b/lib/storage/partition.go index 11d62c980..3b6213d9c 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -485,14 +485,11 @@ func (rrs *rawRowsShard) addRows(pt *partition, rows []rawRow) { rrs.rows = make([]rawRow, 0, n) } maxRowsCount := cap(rrs.rows) - for { - capacity := maxRowsCount - len(rrs.rows) - if capacity >= len(rows) { - // Fast path - rows fit capacity. - rrs.rows = append(rrs.rows, rows...) 
- break - } - + capacity := maxRowsCount - len(rrs.rows) + if capacity >= len(rows) { + // Fast path - rows fit capacity. + rrs.rows = append(rrs.rows, rows...) + } else { // Slow path - rows don't fit capacity. // Put rrs.rows and rows to rowsToFlush and convert it to a part. rowsToFlush = append(rowsToFlush, rrs.rows...) From 6c434b260e608129827b78a74f0949121f2dbbfa Mon Sep 17 00:00:00 2001 From: Nikolay Date: Thu, 17 Jun 2021 15:12:20 +0300 Subject: [PATCH 24/26] fixes DO service discovery labels (#1389) adds test for digitalocean sd --- .../discovery/digitalocean/digitalocean.go | 4 +- .../digitalocean/digitalocean_test.go | 98 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 lib/promscrape/discovery/digitalocean/digitalocean_test.go diff --git a/lib/promscrape/discovery/digitalocean/digitalocean.go b/lib/promscrape/discovery/digitalocean/digitalocean.go index e498f201a..3c68c2b72 100644 --- a/lib/promscrape/discovery/digitalocean/digitalocean.go +++ b/lib/promscrape/discovery/digitalocean/digitalocean.go @@ -136,10 +136,10 @@ func addDropletLabels(droplets []droplet, defaultPort int) []map[string]string { } if len(droplet.Features) > 0 { features := fmt.Sprintf(",%s,", strings.Join(droplet.Features, ",")) - m["__meta_digitalocean_vpc"] = features + m["__meta_digitalocean_features"] = features } if len(droplet.Tags) > 0 { - tags := fmt.Sprintf(",%s,", strings.Join(droplet.Features, ",")) + tags := fmt.Sprintf(",%s,", strings.Join(droplet.Tags, ",")) m["__meta_digitalocean_tags"] = tags } ms = append(ms, m) diff --git a/lib/promscrape/discovery/digitalocean/digitalocean_test.go b/lib/promscrape/discovery/digitalocean/digitalocean_test.go new file mode 100644 index 000000000..ecb177fbf --- /dev/null +++ b/lib/promscrape/discovery/digitalocean/digitalocean_test.go @@ -0,0 +1,98 @@ +package digitalocean + +import ( + "reflect" + "testing" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils" +) + +func Test_addDropletLabels(t *testing.T) { + type args struct { + droplets []droplet + defaultPort int + } + tests := []struct { + name string + args args + want [][]prompbmarshal.Label + }{ + { + name: "base labels add test", + args: args{ + droplets: []droplet{ + { + ID: 15, + Tags: []string{"private", "test"}, + Status: "active", + Name: "ubuntu-1", + Region: struct { + Slug string `json:"slug"` + }(struct{ Slug string }{Slug: "do"}), + Features: []string{"feature-1", "feature-2"}, + SizeSlug: "base-1", + VpcUUID: "vpc-1", + Image: struct { + Name string `json:"name"` + Slug string `json:"slug"` + }(struct { + Name string + Slug string + }{Name: "ubuntu", Slug: "18"}), + Networks: networks{ + V4: []network{ + { + Type: "public", + IPAddress: "100.100.100.100", + }, + { + Type: "private", + IPAddress: "10.10.10.10", + }, + }, + V6: []network{ + { + Type: "public", + IPAddress: "::1", + }, + }, + }, + }, + }, + defaultPort: 9100, + }, + want: [][]prompbmarshal.Label{ + discoveryutils.GetSortedLabels(map[string]string{ + "__address__": "100.100.100.100:9100", + "__meta_digitalocean_droplet_id": "15", + "__meta_digitalocean_droplet_name": "ubuntu-1", + "__meta_digitalocean_features": ",feature-1,feature-2,", + "__meta_digitalocean_image": "18", + "__meta_digitalocean_image_name": "ubuntu", + "__meta_digitalocean_private_ipv4": "10.10.10.10", + "__meta_digitalocean_public_ipv4": "100.100.100.100", + "__meta_digitalocean_public_ipv6": "::1", + 
"__meta_digitalocean_region": "do", + "__meta_digitalocean_size": "base-1", + "__meta_digitalocean_status": "active", + "__meta_digitalocean_tags": ",private,test,", + "__meta_digitalocean_vpc": "vpc-1", + }), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := addDropletLabels(tt.args.droplets, tt.args.defaultPort) + var sortedLabelss [][]prompbmarshal.Label + for _, labels := range got { + sortedLabelss = append(sortedLabelss, discoveryutils.GetSortedLabels(labels)) + } + if !reflect.DeepEqual(sortedLabelss, tt.want) { + t.Errorf("addTasksLabels() \ngot \n%v\n, \nwant \n%v\n", sortedLabelss, tt.want) + } + + }) + } +} From d8ab40941850ec5d492b931fa531839f0fd9ed2a Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 17 Jun 2021 17:18:47 +0300 Subject: [PATCH 25/26] docs/{vmgateway,vmbackupmanager}: explicitly mention that these components are a part of an enterprise package --- app/vmbackupmanager/README.md | 6 +++--- app/vmgateway/README.md | 2 ++ docs/vmbackupmanager.md | 6 +++--- docs/vmgateway.md | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/app/vmbackupmanager/README.md b/app/vmbackupmanager/README.md index c11b32fef..9087e8a89 100644 --- a/app/vmbackupmanager/README.md +++ b/app/vmbackupmanager/README.md @@ -1,8 +1,8 @@ ## vmbackupmanager -VictoriaMetrics backup manager +***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/enterprise.html)*** -This service automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed. +The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed. The backup service makes a backup every hour and puts it to the latest folder and then copies data to the folders which represent the backup intervals (hourly, daily, weekly and monthly) The required flags for running the service are as follows: @@ -49,7 +49,7 @@ There are two flags which could help with performance tuning: * -concurrency - The number of concurrent workers. Higher concurrency may improve upload speed (default 10) -### Example of Usage +## Example of Usage GCS and cluster version. 
You need to have a credentials file in json format with following structure diff --git a/app/vmgateway/README.md b/app/vmgateway/README.md index 31cfedd7a..dfe7fe81b 100644 --- a/app/vmgateway/README.md +++ b/app/vmgateway/README.md @@ -1,5 +1,7 @@ # vmgateway +***vmgateway is a part of [enterprise package](https://victoriametrics.com/enterprise.html)*** + vmgateway diff --git a/docs/vmbackupmanager.md b/docs/vmbackupmanager.md index 7be66246b..bdda531fe 100644 --- a/docs/vmbackupmanager.md +++ b/docs/vmbackupmanager.md @@ -4,9 +4,9 @@ sort: 10 ## vmbackupmanager -VictoriaMetrics backup manager +***vmbackupmanager is a part of [enterprise package](https://victoriametrics.com/enterprise.html)*** -This service automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed. +The VictoriaMetrics backup manager automates regular backup procedures. It supports the following backup intervals: **hourly**, **daily**, **weekly** and **monthly**. Multiple backup intervals may be configured simultaneously. I.e. the backup manager creates hourly backups every hour, while it creates daily backups every day, etc. Backup manager must have read access to the storage data, so best practice is to install it on the same machine (or as a sidecar) where the storage node is installed. The backup service makes a backup every hour and puts it to the latest folder and then copies data to the folders which represent the backup intervals (hourly, daily, weekly and monthly) The required flags for running the service are as follows: @@ -53,7 +53,7 @@ There are two flags which could help with performance tuning: * -concurrency - The number of concurrent workers. Higher concurrency may improve upload speed (default 10) -### Example of Usage +## Example of Usage GCS and cluster version. You need to have a credentials file in json format with following structure diff --git a/docs/vmgateway.md b/docs/vmgateway.md index 2aacb16d4..d39b7fcfc 100644 --- a/docs/vmgateway.md +++ b/docs/vmgateway.md @@ -4,6 +4,8 @@ sort: 9 # vmgateway +***vmgateway is a part of [enterprise package](https://victoriametrics.com/enterprise.html)*** + vmgateway From fb72a2133f1d14a4c106bf426ae80e038404f341 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Fri, 18 Jun 2021 10:53:10 +0300 Subject: [PATCH 26/26] lib/promscrape: show jobs with empty scrape targets on /targets page --- docs/CHANGELOG.md | 1 + lib/promscrape/config.go | 11 + lib/promscrape/targets_response.qtpl | 47 ++- lib/promscrape/targets_response.qtpl.go | 418 ++++++++++++------------ lib/promscrape/targetstatus.go | 41 ++- 5 files changed, 294 insertions(+), 224 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 8e4ea7b22..f23b9ad17 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -9,6 +9,7 @@ sort: 15 * FEATURE: vmagent: add service discovery for DigitalOcean (aka [digitalocean_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#digitalocean_sd_config)). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1367). 
* FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too large or too small a number of samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377).
 * FEATURE: vmagent: change the default value for `-remoteWrite.queues` from 4 to `2 * numCPUs`. This should reduce scrape duration for highly loaded vmagent, which scrapes tens of thousands of targets. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1385).
+* FEATURE: vmagent: show jobs with zero discovered targets on `/targets` page. This should help debug improperly configured scrape configs.
 * BUGFIX: prevent adding new samples to deleted time series after the rotation of the inverted index (the rotation is performed once per `-retentionPeriod`). See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136) for details.

diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go
index 6316971d1..02af9abc8 100644
--- a/lib/promscrape/config.go
+++ b/lib/promscrape/config.go
@@ -67,6 +67,8 @@ func (cfg *Config) mustStart() {
 	for i := range cfg.ScrapeConfigs {
 		cfg.ScrapeConfigs[i].mustStart(cfg.baseDir)
 	}
+	jobNames := cfg.getJobNames()
+	tsmGlobal.registerJobNames(jobNames)
 	logger.Infof("started service discovery routines in %.3f seconds", time.Since(startTime).Seconds())
 }

@@ -79,6 +81,15 @@ func (cfg *Config) mustStop() {
 	logger.Infof("stopped service discovery routines in %.3f seconds", time.Since(startTime).Seconds())
 }

+// getJobNames returns all the scrape job names from the cfg.
+func (cfg *Config) getJobNames() []string {
+	a := make([]string, 0, len(cfg.ScrapeConfigs))
+	for i := range cfg.ScrapeConfigs {
+		a = append(a, cfg.ScrapeConfigs[i].JobName)
+	}
+	return a
+}
+
 // GlobalConfig represents essential parts for `global` section of Prometheus config.
// // See https://prometheus.io/docs/prometheus/latest/configuration/configuration/ diff --git a/lib/promscrape/targets_response.qtpl b/lib/promscrape/targets_response.qtpl index 6120e3678..9583130e9 100644 --- a/lib/promscrape/targets_response.qtpl +++ b/lib/promscrape/targets_response.qtpl @@ -1,9 +1,9 @@ {% import "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" %} -{% collapsespace %} +{% stripspace %} -{% func TargetsResponsePlain (jts []jobTargetsStatuses, showOriginLabels bool) -%} +{% func TargetsResponsePlain(jts []jobTargetsStatuses, emptyJobs []string, showOriginLabels bool) %} {% for _, js := range jts %} job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) @@ -13,22 +13,26 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) labels := promLabelsString(ts.labels) ol := promLabelsString(ts.originalLabels) %} -{%s= "\t" %}state={% if ts.up %}up{% else %}down{% endif %}, - endpoint={%s= ts.endpoint %}, +{%s= "\t" %}state={% if ts.up %}up{% else %}down{% endif %},{% space %} + endpoint={%s= ts.endpoint %},{ %space %} labels={%s= labels %} - {% if showOriginLabels %}, originalLabels={%s= ol %}{% endif %}, - last_scrape={%f.3 ts.lastScrapeTime.Seconds() %}s ago, - scrape_duration={%f.3 ts.scrapeDuration.Seconds() %}s, - samples_scraped={%d ts.samplesScraped %}, + {% if showOriginLabels %}, originalLabels={%s= ol %}{% endif %},{% space %} + last_scrape={%f.3 ts.lastScrapeTime.Seconds() %}s ago,{% space %} + scrape_duration={%f.3 ts.scrapeDuration.Seconds() %}s,{% space %} + samples_scraped={%d ts.samplesScraped %},{% space %} error={%q= ts.errMsg %} {% newline %} {% endfor %} {% endfor %} + +{% for _, jobName := range emptyJobs %} +job={%q= jobName %} (0/0 up) {% newline %} +{% endfor %} {% endfunc %} -{% func TargetsResponseHTML(jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) %} +{% func TargetsResponseHTML(jts []jobTargetsStatuses, emptyJobs []string, redirectPath string, onlyUnhealthy bool) %} @@ -49,7 +53,7 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) Unhealthy - {% for _,js :=range jts %} + {% for _, js := range jts %} {% if onlyUnhealthy && js.upCount == js.targetsTotal %}{% continue %}{% endif %}

@@ -86,6 +90,27 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up)

{% endfor %} + + {% for _, jobName := range emptyJobs %} +
+

+ {%s jobName %} (0/0 up) +

+ + + + + + + + + + + + +
EndpointStateLabelsLast ScrapeScrape DurationSamples ScrapedError
+
+ {% endfor %} {% endfunc %} @@ -96,4 +121,4 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %} up) {% endfor %} {% endfunc %} -{% endcollapsespace %} +{% endstripspace %} diff --git a/lib/promscrape/targets_response.qtpl.go b/lib/promscrape/targets_response.qtpl.go index bce59cdcd..1292c9faf 100644 --- a/lib/promscrape/targets_response.qtpl.go +++ b/lib/promscrape/targets_response.qtpl.go @@ -21,15 +21,15 @@ var ( ) //line lib/promscrape/targets_response.qtpl:6 -func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatuses, showOriginLabels bool) { +func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatuses, emptyJobs []string, showOriginLabels bool) { //line lib/promscrape/targets_response.qtpl:8 for _, js := range jts { //line lib/promscrape/targets_response.qtpl:8 - qw422016.N().S(` job=`) + qw422016.N().S(`job=`) //line lib/promscrape/targets_response.qtpl:9 qw422016.N().Q(js.job) //line lib/promscrape/targets_response.qtpl:9 - qw422016.N().S(` (`) + qw422016.N().S(`(`) //line lib/promscrape/targets_response.qtpl:9 qw422016.N().D(js.upCount) //line lib/promscrape/targets_response.qtpl:9 @@ -37,22 +37,16 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatu //line lib/promscrape/targets_response.qtpl:9 qw422016.N().D(js.targetsTotal) //line lib/promscrape/targets_response.qtpl:9 - qw422016.N().S(` up) `) + qw422016.N().S(`up)`) //line lib/promscrape/targets_response.qtpl:10 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:10 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:11 for _, ts := range js.targetsStatus { -//line lib/promscrape/targets_response.qtpl:11 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:13 labels := promLabelsString(ts.labels) ol := promLabelsString(ts.originalLabels) -//line lib/promscrape/targets_response.qtpl:15 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:16 qw422016.N().S("\t") //line lib/promscrape/targets_response.qtpl:16 @@ -68,15 +62,17 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatu //line lib/promscrape/targets_response.qtpl:16 } //line lib/promscrape/targets_response.qtpl:16 - qw422016.N().S(`, endpoint=`) + qw422016.N().S(`,`) +//line lib/promscrape/targets_response.qtpl:16 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:16 + qw422016.N().S(`endpoint=`) //line lib/promscrape/targets_response.qtpl:17 qw422016.N().S(ts.endpoint) //line lib/promscrape/targets_response.qtpl:17 - qw422016.N().S(`, labels=`) + qw422016.N().S(`,{ %space %}labels=`) //line lib/promscrape/targets_response.qtpl:18 qw422016.N().S(labels) -//line lib/promscrape/targets_response.qtpl:18 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:19 if showOriginLabels { //line lib/promscrape/targets_response.qtpl:19 @@ -86,296 +82,308 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, jts []jobTargetsStatu //line lib/promscrape/targets_response.qtpl:19 } //line lib/promscrape/targets_response.qtpl:19 - qw422016.N().S(`, last_scrape=`) + qw422016.N().S(`,`) +//line lib/promscrape/targets_response.qtpl:19 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:19 + qw422016.N().S(`last_scrape=`) //line lib/promscrape/targets_response.qtpl:20 qw422016.N().FPrec(ts.lastScrapeTime.Seconds(), 3) //line lib/promscrape/targets_response.qtpl:20 - qw422016.N().S(`s ago, scrape_duration=`) + qw422016.N().S(`s ago,`) +//line 
lib/promscrape/targets_response.qtpl:20 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:20 + qw422016.N().S(`scrape_duration=`) //line lib/promscrape/targets_response.qtpl:21 qw422016.N().FPrec(ts.scrapeDuration.Seconds(), 3) //line lib/promscrape/targets_response.qtpl:21 - qw422016.N().S(`s, samples_scraped=`) + qw422016.N().S(`s,`) +//line lib/promscrape/targets_response.qtpl:21 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:21 + qw422016.N().S(`samples_scraped=`) //line lib/promscrape/targets_response.qtpl:22 qw422016.N().D(ts.samplesScraped) //line lib/promscrape/targets_response.qtpl:22 - qw422016.N().S(`, error=`) + qw422016.N().S(`,`) +//line lib/promscrape/targets_response.qtpl:22 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:22 + qw422016.N().S(`error=`) //line lib/promscrape/targets_response.qtpl:23 qw422016.N().Q(ts.errMsg) -//line lib/promscrape/targets_response.qtpl:23 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:24 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:24 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:25 } -//line lib/promscrape/targets_response.qtpl:25 - qw422016.N().S(` `) //line lib/promscrape/targets_response.qtpl:26 } -//line lib/promscrape/targets_response.qtpl:26 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:27 - qw422016.N().S(` +//line lib/promscrape/targets_response.qtpl:28 + for _, jobName := range emptyJobs { +//line lib/promscrape/targets_response.qtpl:28 + qw422016.N().S(`job=`) +//line lib/promscrape/targets_response.qtpl:29 + qw422016.N().Q(jobName) +//line lib/promscrape/targets_response.qtpl:29 + qw422016.N().S(`(0/0 up)`) +//line lib/promscrape/targets_response.qtpl:30 + qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:27 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:29 -} - -//line lib/promscrape/targets_response.qtpl:29 -func WriteTargetsResponsePlain(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, showOriginLabels bool) { -//line lib/promscrape/targets_response.qtpl:29 - qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:29 - StreamTargetsResponsePlain(qw422016, jts, showOriginLabels) -//line lib/promscrape/targets_response.qtpl:29 - qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:29 -} - -//line lib/promscrape/targets_response.qtpl:29 -func TargetsResponsePlain(jts []jobTargetsStatuses, showOriginLabels bool) string { -//line lib/promscrape/targets_response.qtpl:29 - qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:29 - WriteTargetsResponsePlain(qb422016, jts, showOriginLabels) -//line lib/promscrape/targets_response.qtpl:29 - qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:29 - qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:29 - return qs422016 -//line lib/promscrape/targets_response.qtpl:29 -} - //line lib/promscrape/targets_response.qtpl:31 -func StreamTargetsResponseHTML(qw422016 *qt422016.Writer, jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) { -//line lib/promscrape/targets_response.qtpl:31 - qw422016.N().S(` Scrape targets

Scrape targets

`) //line lib/promscrape/targets_response.qtpl:52 + } +//line lib/promscrape/targets_response.qtpl:52 + qw422016.N().S(`>Unhealthy`) +//line lib/promscrape/targets_response.qtpl:56 for _, js := range jts { -//line lib/promscrape/targets_response.qtpl:52 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:53 +//line lib/promscrape/targets_response.qtpl:57 if onlyUnhealthy && js.upCount == js.targetsTotal { -//line lib/promscrape/targets_response.qtpl:53 +//line lib/promscrape/targets_response.qtpl:57 continue -//line lib/promscrape/targets_response.qtpl:53 +//line lib/promscrape/targets_response.qtpl:57 } -//line lib/promscrape/targets_response.qtpl:53 - qw422016.N().S(`

`) -//line lib/promscrape/targets_response.qtpl:56 +//line lib/promscrape/targets_response.qtpl:57 + qw422016.N().S(`

`) +//line lib/promscrape/targets_response.qtpl:60 qw422016.E().S(js.job) -//line lib/promscrape/targets_response.qtpl:56 - qw422016.N().S(` (`) -//line lib/promscrape/targets_response.qtpl:56 +//line lib/promscrape/targets_response.qtpl:60 + qw422016.N().S(`(`) +//line lib/promscrape/targets_response.qtpl:60 qw422016.N().D(js.upCount) -//line lib/promscrape/targets_response.qtpl:56 +//line lib/promscrape/targets_response.qtpl:60 qw422016.N().S(`/`) -//line lib/promscrape/targets_response.qtpl:56 +//line lib/promscrape/targets_response.qtpl:60 qw422016.N().D(js.targetsTotal) -//line lib/promscrape/targets_response.qtpl:56 - qw422016.N().S(` up)

`) -//line lib/promscrape/targets_response.qtpl:71 +//line lib/promscrape/targets_response.qtpl:60 + qw422016.N().S(`up)
Endpoint State Labels Last Scrape Scrape Duration Samples Scraped Error
`) +//line lib/promscrape/targets_response.qtpl:75 for _, ts := range js.targetsStatus { -//line lib/promscrape/targets_response.qtpl:71 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:72 +//line lib/promscrape/targets_response.qtpl:76 if onlyUnhealthy && ts.up { -//line lib/promscrape/targets_response.qtpl:72 +//line lib/promscrape/targets_response.qtpl:76 continue -//line lib/promscrape/targets_response.qtpl:72 +//line lib/promscrape/targets_response.qtpl:76 } -//line lib/promscrape/targets_response.qtpl:72 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:84 +//line lib/promscrape/targets_response.qtpl:86 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:88 } -//line lib/promscrape/targets_response.qtpl:84 - qw422016.N().S(`
EndpointStateLabelsLast ScrapeScrape DurationSamples ScrapedError
`) -//line lib/promscrape/targets_response.qtpl:74 +//line lib/promscrape/targets_response.qtpl:78 qw422016.E().S(ts.endpoint) -//line lib/promscrape/targets_response.qtpl:74 - qw422016.N().S(`
`) -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:78 + qw422016.N().S(`
`) +//line lib/promscrape/targets_response.qtpl:79 if ts.up { -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:79 qw422016.N().S(`UP`) -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:79 } else { -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:79 qw422016.N().S(`DOWN`) -//line lib/promscrape/targets_response.qtpl:75 +//line lib/promscrape/targets_response.qtpl:79 } -//line lib/promscrape/targets_response.qtpl:75 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:77 +//line lib/promscrape/targets_response.qtpl:80 + qw422016.N().S(`">`) +//line lib/promscrape/targets_response.qtpl:81 streamformatLabel(qw422016, ts.labels) -//line lib/promscrape/targets_response.qtpl:77 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:79 +//line lib/promscrape/targets_response.qtpl:81 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:83 qw422016.N().FPrec(ts.lastScrapeTime.Seconds(), 3) -//line lib/promscrape/targets_response.qtpl:79 - qw422016.N().S(`s ago `) -//line lib/promscrape/targets_response.qtpl:80 +//line lib/promscrape/targets_response.qtpl:83 + qw422016.N().S(`s ago`) +//line lib/promscrape/targets_response.qtpl:84 qw422016.N().FPrec(ts.scrapeDuration.Seconds(), 3) -//line lib/promscrape/targets_response.qtpl:80 - qw422016.N().S(`s `) -//line lib/promscrape/targets_response.qtpl:81 +//line lib/promscrape/targets_response.qtpl:84 + qw422016.N().S(`s`) +//line lib/promscrape/targets_response.qtpl:85 qw422016.N().D(ts.samplesScraped) -//line lib/promscrape/targets_response.qtpl:81 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:82 +//line lib/promscrape/targets_response.qtpl:85 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:86 qw422016.E().S(ts.errMsg) -//line lib/promscrape/targets_response.qtpl:82 - qw422016.N().S(`
`) //line lib/promscrape/targets_response.qtpl:88 + qw422016.N().S(`

`) +//line lib/promscrape/targets_response.qtpl:92 } -//line lib/promscrape/targets_response.qtpl:88 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:94 + for _, jobName := range emptyJobs { +//line lib/promscrape/targets_response.qtpl:94 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:113 + } +//line lib/promscrape/targets_response.qtpl:113 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:116 } -//line lib/promscrape/targets_response.qtpl:91 -func WriteTargetsResponseHTML(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) { -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 +func WriteTargetsResponseHTML(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, emptyJobs []string, redirectPath string, onlyUnhealthy bool) { +//line lib/promscrape/targets_response.qtpl:116 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:91 - StreamTargetsResponseHTML(qw422016, jts, redirectPath, onlyUnhealthy) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 + StreamTargetsResponseHTML(qw422016, jts, emptyJobs, redirectPath, onlyUnhealthy) +//line lib/promscrape/targets_response.qtpl:116 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 } -//line lib/promscrape/targets_response.qtpl:91 -func TargetsResponseHTML(jts []jobTargetsStatuses, redirectPath string, onlyUnhealthy bool) string { -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 +func TargetsResponseHTML(jts []jobTargetsStatuses, emptyJobs []string, redirectPath string, onlyUnhealthy bool) string { +//line lib/promscrape/targets_response.qtpl:116 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:91 - WriteTargetsResponseHTML(qb422016, jts, redirectPath, onlyUnhealthy) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 + WriteTargetsResponseHTML(qb422016, jts, emptyJobs, redirectPath, onlyUnhealthy) +//line lib/promscrape/targets_response.qtpl:116 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 return qs422016 -//line lib/promscrape/targets_response.qtpl:91 +//line lib/promscrape/targets_response.qtpl:116 } -//line lib/promscrape/targets_response.qtpl:93 +//line lib/promscrape/targets_response.qtpl:118 func streamformatLabel(qw422016 *qt422016.Writer, labels []prompbmarshal.Label) { -//line lib/promscrape/targets_response.qtpl:93 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:94 +//line lib/promscrape/targets_response.qtpl:119 for _, label := range labels { -//line lib/promscrape/targets_response.qtpl:94 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:95 +//line lib/promscrape/targets_response.qtpl:120 qw422016.E().S(label.Name) -//line lib/promscrape/targets_response.qtpl:95 +//line lib/promscrape/targets_response.qtpl:120 qw422016.N().S(`=`) -//line lib/promscrape/targets_response.qtpl:95 +//line lib/promscrape/targets_response.qtpl:120 qw422016.E().Q(label.Value) -//line lib/promscrape/targets_response.qtpl:95 +//line 
lib/promscrape/targets_response.qtpl:120 qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:95 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:95 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:96 +//line lib/promscrape/targets_response.qtpl:121 } -//line lib/promscrape/targets_response.qtpl:96 - qw422016.N().S(` `) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 } -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 func writeformatLabel(qq422016 qtio422016.Writer, labels []prompbmarshal.Label) { -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 streamformatLabel(qw422016, labels) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 } -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 func formatLabel(labels []prompbmarshal.Label) string { -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 writeformatLabel(qb422016, labels) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 return qs422016 -//line lib/promscrape/targets_response.qtpl:97 +//line lib/promscrape/targets_response.qtpl:122 } diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go index eee6a548a..2bb4c5e33 100644 --- a/lib/promscrape/targetstatus.go +++ b/lib/promscrape/targetstatus.go @@ -58,8 +58,9 @@ func WriteAPIV1Targets(w io.Writer, state string) { } type targetStatusMap struct { - mu sync.Mutex - m map[*ScrapeWork]*targetStatus + mu sync.Mutex + m map[*ScrapeWork]*targetStatus + jobNames []string } func newTargetStatusMap() *targetStatusMap { @@ -74,6 +75,12 @@ func (tsm *targetStatusMap) Reset() { tsm.mu.Unlock() } +func (tsm *targetStatusMap) registerJobNames(jobNames []string) { + tsm.mu.Lock() + tsm.jobNames = append(tsm.jobNames[:0], jobNames...) + tsm.mu.Unlock() +} + func (tsm *targetStatusMap) Register(sw *ScrapeWork) { tsm.mu.Lock() tsm.m[sw] = &targetStatus{ @@ -284,13 +291,14 @@ type jobTargetsStatuses struct { targetsStatus []jobTargetStatus } -func (tsm *targetStatusMap) getTargetsStatusByJob() []jobTargetsStatuses { +func (tsm *targetStatusMap) getTargetsStatusByJob() ([]jobTargetsStatuses, []string) { byJob := make(map[string][]targetStatus) tsm.mu.Lock() for _, st := range tsm.m { job := st.sw.Job() byJob[job] = append(byJob[job], *st) } + jobNames := append([]string{}, tsm.jobNames...) 
tsm.mu.Unlock() var jts []jobTargetsStatuses @@ -331,20 +339,37 @@ func (tsm *targetStatusMap) getTargetsStatusByJob() []jobTargetsStatuses { sort.Slice(jts, func(i, j int) bool { return jts[i].job < jts[j].job }) - return jts + emptyJobs := getEmptyJobs(jts, jobNames) + return jts, emptyJobs +} + +func getEmptyJobs(jts []jobTargetsStatuses, jobNames []string) []string { + jobNamesMap := make(map[string]struct{}, len(jobNames)) + for _, jobName := range jobNames { + jobNamesMap[jobName] = struct{}{} + } + for i := range jts { + delete(jobNamesMap, jts[i].job) + } + emptyJobs := make([]string, 0, len(jobNamesMap)) + for k := range jobNamesMap { + emptyJobs = append(emptyJobs, k) + } + sort.Strings(emptyJobs) + return emptyJobs } // WriteTargetsHTML writes targets status grouped by job into writer w in html table, // accepts filter to show only unhealthy targets. func (tsm *targetStatusMap) WriteTargetsHTML(w io.Writer, showOnlyUnhealthy bool) { - jss := tsm.getTargetsStatusByJob() + jss, emptyJobs := tsm.getTargetsStatusByJob() targetsPath := path.Join(httpserver.GetPathPrefix(), "/targets") - WriteTargetsResponseHTML(w, jss, targetsPath, showOnlyUnhealthy) + WriteTargetsResponseHTML(w, jss, emptyJobs, targetsPath, showOnlyUnhealthy) } // WriteTargetsPlain writes targets grouped by job into writer w in plain text, // accept filter to show original labels. func (tsm *targetStatusMap) WriteTargetsPlain(w io.Writer, showOriginalLabels bool) { - jss := tsm.getTargetsStatusByJob() - WriteTargetsResponsePlain(w, jss, showOriginalLabels) + jss, emptyJobs := tsm.getTargetsStatusByJob() + WriteTargetsResponsePlain(w, jss, emptyJobs, showOriginalLabels) }
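For illustration, a hypothetical in-package test (not part of this patch series) for the set-difference logic in `getEmptyJobs` above could look as follows, assuming the standard `reflect` and `testing` imports. It checks that registered jobs which never produced a target status are returned in sorted order:

```go
func TestGetEmptyJobs(t *testing.T) {
	// Two jobs discovered at least one target...
	jts := []jobTargetsStatuses{{job: "node"}, {job: "vmagent"}}
	// ...out of four scrape jobs registered via registerJobNames.
	jobNames := []string{"node", "kubernetes-pods", "vmagent", "blackbox"}
	got := getEmptyJobs(jts, jobNames)
	// getEmptyJobs sorts its result, so the expected order is alphabetical.
	want := []string{"blackbox", "kubernetes-pods"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("unexpected empty jobs; got %v, want %v", got, want)
	}
}
```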