app/vmstorage: expose vm_{hourly,daily}_series_limit_{max,current}_series metrics if -storage.max{Hourly,Daily}Series limits are set

These metrics allow alerting when the number of unique series approach the limit.
For example, the following query alerts when the number of series reaches 90% of the configured limit:

    vm_hourly_series_limit_current_series / vm_hourly_series_limit_max_series > 0.9
This commit is contained in:
Aliaksandr Valialkin 2022-08-24 13:41:53 +03:00
parent b1e1c50627
commit 796aa310c2
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
7 changed files with 125 additions and 23 deletions

View file

@ -1547,8 +1547,27 @@ Both limits can be set simultaneously. If any of these limits is reached, then i
The exceeded limits can be [monitored](#monitoring) with the following metrics:
* `vm_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
* `vm_hourly_series_limit_max_series` - the hourly series limit set via `-storage.maxHourlySeries` command-line flag.
* `vm_hourly_series_limit_current_series` - the current number of unique series during the last hour.
The following query can be useful for alerting when the number of unique series during the last hour exceeds 90% of the `-storage.maxHourlySeries`:
```metricsql
vm_hourly_series_limit_current_series / vm_hourly_series_limit_max_series > 0.9
```
* `vm_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
* `vm_daily_series_limit_max_series` - the daily series limit set via `-storage.maxDailySeries` command-line flag.
* `vm_daily_series_limit_current_series` - the current number of unique series during the last day.
The following query can be useful for alerting when the number of unique series during the last day exceeds 90% of the `-storage.maxDailySeries`:
```metricsql
vm_daily_series_limit_current_series / vm_daily_series_limit_max_series > 0.9
```
These limits are approximate, so VictoriaMetrics can underflow/overflow the limit by a small percentage (usually less than 1%).
See also more advanced [cardinality limiter in vmagent](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter).
@ -2225,9 +2244,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Overrides max size for storage/tsid cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-storage.maxDailySeries int
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxHourlySeries
-storage.maxHourlySeries int
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxDailySeries
-storage.minFreeDiskSpaceBytes size
The minimum free disk space at -storageDataPath after which the storage stops accepting new data
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 10000000)

View file

@ -49,9 +49,11 @@ var (
"When set, then /api/v1/query_range would return '503 Service Unavailable' error for queries with 'from' value outside -retentionPeriod. "+
"This may be useful when multiple data sources with distinct retentions are hidden behind query-tee")
maxHourlySeries = flag.Int("storage.maxHourlySeries", 0, "The maximum number of unique series can be added to the storage during the last hour. "+
"Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries")
"Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/#cardinality-limiter . "+
"See also -storage.maxDailySeries")
maxDailySeries = flag.Int("storage.maxDailySeries", 0, "The maximum number of unique series can be added to the storage during the last 24 hours. "+
"Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries")
"Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/#cardinality-limiter . "+
"See also -storage.maxHourlySeries")
minFreeDiskSpaceBytes = flagutil.NewBytes("storage.minFreeDiskSpaceBytes", 10e6, "The minimum free disk space at -storageDataPath after which the storage stops accepting new data")
@ -626,12 +628,29 @@ func registerStorageMetrics(strg *storage.Storage) {
return float64(m().SlowMetricNameLoads)
})
metrics.NewGauge(`vm_hourly_series_limit_rows_dropped_total`, func() float64 {
return float64(m().HourlySeriesLimitRowsDropped)
})
metrics.NewGauge(`vm_daily_series_limit_rows_dropped_total`, func() float64 {
return float64(m().DailySeriesLimitRowsDropped)
})
if *maxHourlySeries > 0 {
metrics.NewGauge(`vm_hourly_series_limit_current_series`, func() float64 {
return float64(m().HourlySeriesLimitCurrentSeries)
})
metrics.NewGauge(`vm_hourly_series_limit_max_series`, func() float64 {
return float64(m().HourlySeriesLimitMaxSeries)
})
metrics.NewGauge(`vm_hourly_series_limit_rows_dropped_total`, func() float64 {
return float64(m().HourlySeriesLimitRowsDropped)
})
}
if *maxDailySeries > 0 {
metrics.NewGauge(`vm_daily_series_limit_current_series`, func() float64 {
return float64(m().DailySeriesLimitCurrentSeries)
})
metrics.NewGauge(`vm_daily_series_limit_max_series`, func() float64 {
return float64(m().DailySeriesLimitMaxSeries)
})
metrics.NewGauge(`vm_daily_series_limit_rows_dropped_total`, func() float64 {
return float64(m().DailySeriesLimitRowsDropped)
})
}
metrics.NewGauge(`vm_timestamps_blocks_merged_total`, func() float64 {
return float64(m().TimestampsBlocksMerged)

View file

@ -22,6 +22,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* SECURITY: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not expose `-remoteWrite.url`, `-remoteRead.url` and `-datasource.url` command-line flag values in logs and at `http://vmalert:8880/flags` page by default, since they may contain sensitive data such as auth keys. This aligns `vmalert` behaviour with [vmagent](https://docs.victoriametrics.com/vmagent.html), which doesn't expose `-remoteWrite.url` command-line flag value in logs and at `http://vmagent:8429/flags` page by default. Specify `-remoteWrite.showURL`, `-remoteRead.showURL` and `-datasource.showURL` command-line flags for showing values for the corresponding `-*.url` flags in logs. Thanks to @mble for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2965).
* FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs.
* FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details.
* FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`.
* FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance for heavy queries on systems with many CPU cores.
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to use {% raw %}`{{label_name}}`{% endraw %} placeholders in the `replacement` option of relabeling rules. This simplifies constructing label values from multiple existing label values. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
@ -33,7 +34,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add a legend in the top right corner for shortcut keys. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2813).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `toTime()` template function in the same way as Prometheus 2.38 [does](https://github.com/prometheus/prometheus/pull/10993). See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/template_reference/#numbers).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `$alertID` and `$groupID` template variables. These variables may be used for templating annotations or `-external.alert.source` command-line flag. See the full list of supported variables [here](https://docs.victoriametrics.com/vmalert.html#templating).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `$activeAt` template variable. See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2999). See the full list of supported variables [here](https://docs.victoriametrics.com/vmalert.html#templating). Thanks to @laixintao.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `$activeAt` template variable. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2999). See the full list of supported variables [here](https://docs.victoriametrics.com/vmalert.html#templating). Thanks to @laixintao for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3000).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): point alert source to [vmalert's UI](https://docs.victoriametrics.com/vmalert.html#web) at `/vmalert/alert?...` instead of JSON handler at `/vmalert/api/v1/alert?...`. This improves user experience. The old behavior can be achieved by setting {% raw %}`-external.alert.source=vmalert/api/v1/alert?group_id={{.GroupID}}&alert_id={{.AlertID}}`{% endraw %} command-line flag.
* BUGFIX: prevent from excess CPU usage when the storage enters [read-only mode](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#readonly-mode).

View file

@ -193,6 +193,17 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
It is recommended setting up alerts in [vmalert](https://docs.victoriametrics.com/vmalert.html) or in Prometheus from [this config](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/alerts.yml).
## Cardinality limiter
`vmstorage` nodes can be configured with limits on the number of unique time series across all the tenants with the following command-line flags:
- `-storage.maxHourlySeries` is the limit on the number of [active time series](https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series) during the last hour.
- `-storage.maxDailySeries` is the limit on the number of unique time series during the day. This limit can be used for limiting daily [time series churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate).
Note that these limits are set and applied individually per each `vmstorage` node in the cluster. So, if the cluster has `N` `vmstorage` nodes, then the cluster-level limits will be `N` times bigger than the per-`vmstorage` limits.
See more details about cardinality limiter in [these docs](https://docs.victoriametrics.com/#cardinality-limiter).
## Troubleshooting
See [trobuleshooting docs](https://docs.victoriametrics.com/Troubleshooting.html).
@ -432,8 +443,8 @@ By default cluster components of VictoriaMetrics are tuned for an optimal resour
- `-search.maxSeries` at `vmselect` limits the number of time series, which may be returned from [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). This endpoint is used mostly by Grafana for auto-completion of metric names, label names and label values. Queries to this endpoint may take big amounts of CPU time and memory at `vmstorage` and `vmselect` when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxSeries` to quite low value in order limit CPU and memory usage.
- `-search.maxTagKeys` at `vmstorage` limits the number of items, which may be returned from [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names). This endpoint is used mostly by Grafana for auto-completion of label names. Queries to this endpoint may take big amounts of CPU time and memory at `vmstorage` and `vmselect` when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagKeys` to quite low value in order to limit CPU and memory usage.
- `-search.maxTagValues` at `vmstorage` limits the number of items, which may be returned from [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values). This endpoint is used mostly by Grafana for auto-completion of label values. Queries to this endpoint may take big amounts of CPU time and memory at `vmstorage` and `vmselect` when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagValues` to quite low value in order to limit CPU and memory usage.
- `-storage.maxDailySeries` at `vmstorage` can be used for limiting the number of time series seen per day.
- `-storage.maxHourlySeries` at `vmstorage` can be used for limiting the number of [active time series](https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series).
- `-storage.maxDailySeries` at `vmstorage` can be used for limiting the number of time series seen per day aka [time series churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). See [cardinality limiter docs](#cardinality-limiter).
- `-storage.maxHourlySeries` at `vmstorage` can be used for limiting the number of [active time series](https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series). See [cardinality limiter docs](#cardinality-limiter).
See also [capacity planning docs](#capacity-planning) and [cardinality limiter in vmagent](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter).
@ -1099,9 +1110,9 @@ Below is the output for `/path/to/vmstorage -help`:
Overrides max size for storage/tsid cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-storage.maxDailySeries int
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxHourlySeries
-storage.maxHourlySeries int
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxDailySeries
-storage.minFreeDiskSpaceBytes size
The minimum free disk space at -storageDataPath after which the storage stops accepting new data
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 10000000)

View file

@ -1547,8 +1547,27 @@ Both limits can be set simultaneously. If any of these limits is reached, then i
The exceeded limits can be [monitored](#monitoring) with the following metrics:
* `vm_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
* `vm_hourly_series_limit_max_series` - the hourly series limit set via `-storage.maxHourlySeries` command-line flag.
* `vm_hourly_series_limit_current_series` - the current number of unique series during the last hour.
The following query can be useful for alerting when the number of unique series during the last hour exceeds 90% of the `-storage.maxHourlySeries`:
```metricsql
vm_hourly_series_limit_current_series / vm_hourly_series_limit_max_series > 0.9
```
* `vm_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
* `vm_daily_series_limit_max_series` - the daily series limit set via `-storage.maxDailySeries` command-line flag.
* `vm_daily_series_limit_current_series` - the current number of unique series during the last day.
The following query can be useful for alerting when the number of unique series during the last day exceeds 90% of the `-storage.maxDailySeries`:
```metricsql
vm_daily_series_limit_current_series / vm_daily_series_limit_max_series > 0.9
```
These limits are approximate, so VictoriaMetrics can underflow/overflow the limit by a small percentage (usually less than 1%).
See also more advanced [cardinality limiter in vmagent](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter).
@ -2225,9 +2244,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Overrides max size for storage/tsid cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-storage.maxDailySeries int
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxHourlySeries
-storage.maxHourlySeries int
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxDailySeries
-storage.minFreeDiskSpaceBytes size
The minimum free disk space at -storageDataPath after which the storage stops accepting new data
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 10000000)

View file

@ -1551,8 +1551,27 @@ Both limits can be set simultaneously. If any of these limits is reached, then i
The exceeded limits can be [monitored](#monitoring) with the following metrics:
* `vm_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series.
* `vm_hourly_series_limit_max_series` - the hourly series limit set via `-storage.maxHourlySeries` command-line flag.
* `vm_hourly_series_limit_current_series` - the current number of unique series during the last hour.
The following query can be useful for alerting when the number of unique series during the last hour exceeds 90% of the `-storage.maxHourlySeries`:
```metricsql
vm_hourly_series_limit_current_series / vm_hourly_series_limit_max_series > 0.9
```
* `vm_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series.
* `vm_daily_series_limit_max_series` - the daily series limit set via `-storage.maxDailySeries` command-line flag.
* `vm_daily_series_limit_current_series` - the current number of unique series during the last day.
The following query can be useful for alerting when the number of unique series during the last day exceeds 90% of the `-storage.maxDailySeries`:
```metricsql
vm_daily_series_limit_current_series / vm_daily_series_limit_max_series > 0.9
```
These limits are approximate, so VictoriaMetrics can underflow/overflow the limit by a small percentage (usually less than 1%).
See also more advanced [cardinality limiter in vmagent](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter).
@ -2229,9 +2248,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
Overrides max size for storage/tsid cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 0)
-storage.maxDailySeries int
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries
The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxHourlySeries
-storage.maxHourlySeries int
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries
The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. See https://docs.victoriametrics.com/#cardinality-limiter . See also -storage.maxDailySeries
-storage.minFreeDiskSpaceBytes size
The minimum free disk space at -storageDataPath after which the storage stops accepting new data
Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 10000000)

View file

@ -469,8 +469,13 @@ type Metrics struct {
SlowPerDayIndexInserts uint64
SlowMetricNameLoads uint64
HourlySeriesLimitRowsDropped uint64
DailySeriesLimitRowsDropped uint64
HourlySeriesLimitRowsDropped uint64
HourlySeriesLimitMaxSeries uint64
HourlySeriesLimitCurrentSeries uint64
DailySeriesLimitRowsDropped uint64
DailySeriesLimitMaxSeries uint64
DailySeriesLimitCurrentSeries uint64
TimestampsBlocksMerged uint64
TimestampsBytesSaved uint64
@ -546,8 +551,17 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
m.HourlySeriesLimitRowsDropped += atomic.LoadUint64(&s.hourlySeriesLimitRowsDropped)
m.DailySeriesLimitRowsDropped += atomic.LoadUint64(&s.dailySeriesLimitRowsDropped)
if sl := s.hourlySeriesLimiter; sl != nil {
m.HourlySeriesLimitRowsDropped += atomic.LoadUint64(&s.hourlySeriesLimitRowsDropped)
m.HourlySeriesLimitMaxSeries += uint64(sl.MaxItems())
m.HourlySeriesLimitCurrentSeries += uint64(sl.CurrentItems())
}
if sl := s.dailySeriesLimiter; sl != nil {
m.DailySeriesLimitRowsDropped += atomic.LoadUint64(&s.dailySeriesLimitRowsDropped)
m.DailySeriesLimitMaxSeries += uint64(sl.MaxItems())
m.DailySeriesLimitCurrentSeries += uint64(sl.CurrentItems())
}
m.TimestampsBlocksMerged = atomic.LoadUint64(&timestampsBlocksMerged)
m.TimestampsBytesSaved = atomic.LoadUint64(&timestampsBytesSaved)