vmstorage: auto calculate maxUniqueTimeseries based on resources (#6961)

### Describe Your Changes

Add support for
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6930

Calculate `-search.maxUniqueTimeseries` from
`-search.maxConcurrentRequests` and the remaining memory if it's **not set**
or **less than or equal to 0**.

The remaining memory is affected by `-memory.allowedPercent`,
`-memory.allowedBytes` and cgroup memory limit.
### Checklist

The following checks are **mandatory**:

- [x] My change adheres to [VictoriaMetrics contributing
guidelines](https://docs.victoriametrics.com/contributing/).

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
This commit is contained in:
Zhu Jiekun 2024-10-18 19:41:43 +08:00 committed by GitHub
parent 1d352b92c7
commit 85f60237e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 93 additions and 12 deletions

View file

@ -53,7 +53,8 @@ var (
"points with timestamps closer than -search.latencyOffset to the current time. The adjustment is needed because such points may contain incomplete data")
selectNodes = flagutil.NewArrayString("selectNode", "Comma-separated addresses of vmselect nodes; usage: -selectNode=vmselect-host1,...,vmselect-hostN")
maxUniqueTimeseries = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage")
maxUniqueTimeseries = flag.Int("search.maxUniqueTimeseries", 0, "The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage. "+
"The limit can't exceed the corresponding -search.maxUniqueTimeseries limit on vmstorage, it can be only set to lower values.")
maxFederateSeries = flag.Int("search.maxFederateSeries", 1e6, "The maximum number of time series, which can be returned from /federate. This option allows limiting memory usage")
maxExportSeries = flag.Int("search.maxExportSeries", 10e6, "The maximum number of time series, which can be returned from /api/v1/export* APIs. This option allows limiting memory usage")
maxTSDBStatusSeries = flag.Int("search.maxTSDBStatusSeries", 10e6, "The maximum number of time series, which can be processed during the call to /api/v1/status/tsdb. This option allows limiting memory usage")

View file

@ -122,6 +122,8 @@ func main() {
metrics.RegisterSet(storageMetrics)
common.StartUnmarshalWorkers()
servers.GetMaxUniqueTimeSeries() // for init and logging only.
vminsertSrv, err := servers.NewVMInsertServer(*vminsertAddr, strg)
if err != nil {
logger.Fatalf("cannot create a server with -vminsertAddr=%s: %s", *vminsertAddr, err)
@ -565,6 +567,8 @@ func writeStorageMetrics(w io.Writer, strg *storage.Storage) {
metrics.WriteGaugeUint64(w, `vm_downsampling_partitions_scheduled`, tm.ScheduledDownsamplingPartitions)
metrics.WriteGaugeUint64(w, `vm_downsampling_partitions_scheduled_size_bytes`, tm.ScheduledDownsamplingPartitionsSize)
metrics.WriteGaugeUint64(w, `vm_search_max_unique_timeseries`, uint64(servers.GetMaxUniqueTimeSeries()))
}
func jsonResponseError(w http.ResponseWriter, err error) {

View file

@ -10,14 +10,17 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/querytracer"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/vmselectapi"
)
var (
maxUniqueTimeseries = flag.Int("search.maxUniqueTimeseries", 0, "The maximum number of unique time series, which can be scanned during every query. This allows protecting against heavy queries, which select unexpectedly high number of series. Zero means 'no limit'. See also -search.max* command-line flags at vmselect")
maxTagKeys = flag.Int("search.maxTagKeys", 100e3, "The maximum number of tag keys returned per search. "+
maxUniqueTimeseries = flag.Int("search.maxUniqueTimeseries", 0, "The maximum number of unique time series, which can be scanned during every query. "+
"This allows protecting against heavy queries, which select unexpectedly high number of series. When set to zero, the limit is automatically calculated based on -search.maxConcurrentRequests (inversely proportional) and memory available to the process (proportional). See also -search.max* command-line flags at vmselect")
maxTagKeys = flag.Int("search.maxTagKeys", 100e3, "The maximum number of tag keys returned per search. "+
"See also -search.maxLabelsAPISeries and -search.maxLabelsAPIDuration")
maxTagValues = flag.Int("search.maxTagValues", 100e3, "The maximum number of tag values returned per search. "+
"See also -search.maxLabelsAPISeries and -search.maxLabelsAPIDuration")
@ -35,6 +38,11 @@ var (
"This may be useful when multiple data sources with distinct retentions are hidden behind query-tee")
)
var (
// maxUniqueTimeseriesValue is the limit returned by GetMaxUniqueTimeSeries.
// It is assigned inside maxUniqueTimeseriesValueOnce.Do.
maxUniqueTimeseriesValue int
// maxUniqueTimeseriesValueOnce guards the one-time assignment of maxUniqueTimeseriesValue.
maxUniqueTimeseriesValueOnce sync.Once
)
// NewVMSelectServer starts new server at the given addr, which serves vmselect requests from the given s.
func NewVMSelectServer(addr string, s *storage.Storage) (*vmselectapi.Server, error) {
api := &vmstorageAPI{
@ -249,10 +257,38 @@ func getMaxMetrics(sq *storage.SearchQuery) int {
maxMetrics := sq.MaxMetrics
maxMetricsLimit := *maxUniqueTimeseries
if maxMetricsLimit <= 0 {
maxMetricsLimit = 2e9
maxMetricsLimit = GetMaxUniqueTimeSeries()
}
if maxMetrics <= 0 || maxMetrics > maxMetricsLimit {
maxMetrics = maxMetricsLimit
}
return maxMetrics
}
// GetMaxUniqueTimeSeries returns the limit on the number of unique time series a single query may scan.
//
// A positive -search.maxUniqueTimeseries value is returned as is. Otherwise the limit is derived
// from -search.maxConcurrentRequests and memory.Remaining() via calculateMaxUniqueTimeSeriesForResource,
// which is kept as a separate function so it can be unit-tested.
//
// The value is resolved on the first call and reused by all the subsequent calls.
func GetMaxUniqueTimeSeries() int {
	maxUniqueTimeseriesValueOnce.Do(func() {
		if limit := *maxUniqueTimeseries; limit > 0 {
			// The limit is explicitly set by the user.
			maxUniqueTimeseriesValue = limit
			return
		}
		maxUniqueTimeseriesValue = calculateMaxUniqueTimeSeriesForResource(*maxConcurrentRequests, memory.Remaining())
	})
	return maxUniqueTimeseriesValue
}
// calculateMaxUniqueTimeSeriesForResource calculates the per-query limit on the number of unique time series
// from the given maxConcurrentRequests and remainingMemory (in bytes).
//
// The returned limit equals remainingMemory / 200 / maxConcurrentRequests, so it is proportional
// to remainingMemory and inversely proportional to maxConcurrentRequests.
// If maxConcurrentRequests isn't positive, then 2e9 is returned, which effectively means 'no limit'.
func calculateMaxUniqueTimeSeriesForResource(maxConcurrentRequests, remainingMemory int) int {
	// noLimit is explicitly typed as int, so it is logged as 2000000000.
	// An untyped 2e9 passed to a printf-style function becomes float64 and is printed as 2e+09.
	const noLimit int = 2e9
	// bytesPerSeries is the approximate size a single unique series could occupy in the vmstorage.
	const bytesPerSeries = 200

	if maxConcurrentRequests <= 0 {
		// This line should NOT be reached unless the user has set an incorrect `search.maxConcurrentRequests`.
		// In such cases, fallback to unlimited.
		logger.Warnf("limiting -search.maxUniqueTimeseries to %d because -search.maxConcurrentRequests=%d.", noLimit, maxConcurrentRequests)
		return noLimit
	}

	// Calculate the max metrics limit for a single request in the worst-case concurrent scenario,
	// e.g. when maxConcurrentRequests queries are executed simultaneously.
	mts := remainingMemory / bytesPerSeries / maxConcurrentRequests
	logger.Infof("limiting -search.maxUniqueTimeseries to %d according to -search.maxConcurrentRequests=%d and remaining memory=%d bytes. To increase the limit, reduce -search.maxConcurrentRequests or increase memory available to the process.", mts, maxConcurrentRequests, remainingMemory)
	return mts
}

View file

@ -0,0 +1,33 @@
package servers
import (
"math"
"runtime"
"testing"
)
// TestCalculateMaxMetricsLimitByResource verifies the limits returned by
// calculateMaxUniqueTimeSeriesForResource for typical machine sizes and for edge cases.
func TestCalculateMaxMetricsLimitByResource(t *testing.T) {
	check := func(concurrency, memoryBytes, want int) {
		t.Helper()
		if got := calculateMaxUniqueTimeSeriesForResource(concurrency, memoryBytes); got != want {
			t.Fatalf("unexpected max metrics limit: got %d, want %d", got, want)
		}
	}

	// 40% of 32 GiB exceeds the range of a 32-bit int, so these cases are skipped on GOARCH=386.
	if runtime.GOARCH != "386" {
		// 8 CPU cores and 32 GiB of RAM
		check(16, int(math.Round(32*1024*1024*1024*0.4)), 4294967)

		// 4 CPU cores and 32 GiB of RAM
		check(8, int(math.Round(32*1024*1024*1024*0.4)), 8589934)
	}

	// 2 CPU cores and 4 GiB of RAM
	check(4, int(math.Round(4*1024*1024*1024*0.4)), 2147483)

	// Edge case: zero concurrency falls back to the 2e9 limit.
	check(0, int(math.Round(4*1024*1024*1024*0.4)), 2e9)

	// Edge case: no remaining memory results in zero limit.
	check(4, 0, 0)
}

View file

@ -711,10 +711,12 @@ Some workloads may need fine-grained resource usage limits. In these cases the f
Queries, which need more memory, are rejected. Heavy queries, which select big number of time series,
may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated
as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`.
- `-search.maxUniqueTimeseries` at `vmselect` component limits the number of unique time series a single query can find and process.
`vmselect` passes the limit to `vmstorage` component, which keeps in memory some metainformation about the time series located
by each query and spends some CPU time for processing the found time series. This means that the maximum memory usage and CPU usage
a single query can use at `vmstorage` is proportional to `-search.maxUniqueTimeseries`.
- `-search.maxUniqueTimeseries` at `vmstorage` component limits the number of unique time series a single query can find and process.
This means that the maximum memory usage and CPU usage a single query can use at `vmstorage` is proportional to `-search.maxUniqueTimeseries`.
By default, `vmstorage` calculates this limit automatically based on the available memory and the maximum number of concurrent read requests (see `-search.maxConcurrentRequests`).
The calculated limit will be printed during process start-up logs and exposed as `vm_search_max_unique_timeseries` metric.
- `-search.maxUniqueTimeseries` at `vmselect` adjusts the limit with the same name at `vmstorage`. The limit can be adjusted
  only to a **lower value** and can't exceed the `vmstorage` limit. By default, vmselect doesn't apply limit adjustments.
- `-search.maxQueryDuration` at `vmselect` limits the duration of a single query. If the query takes longer than the given duration, then it is canceled.
This allows saving CPU and RAM at `vmselect` and `vmstorage` when executing unexpectedly heavy queries.
The limit can be altered for each query by passing `timeout` GET parameter, but can't exceed the limit specified via `-search.maxQueryDuration` command-line flag.
@ -1620,7 +1622,7 @@ Below is the output for `/path/to/vmselect -help`:
-search.maxTagValueSuffixesPerSearch int
The maximum number of tag value suffixes returned from /metrics/find (default 100000)
-search.maxUniqueTimeseries int
The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage (default 300000)
The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage. The limit can't exceed the corresponding -search.maxUniqueTimeseries limit on vmstorage, it can be only set to lower values. (default 0)
-search.maxWorkersPerQuery int
The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 16)
-search.minStalenessInterval duration
@ -1894,7 +1896,7 @@ Below is the output for `/path/to/vmstorage -help`:
-search.maxTagValues int
The maximum number of tag values returned per search. See also -search.maxLabelsAPISeries and -search.maxLabelsAPIDuration (default 100000)
-search.maxUniqueTimeseries int
The maximum number of unique time series, which can be scanned during every query. This allows protecting against heavy queries, which select unexpectedly high number of series. Zero means 'no limit'. See also -search.max* command-line flags at vmselect
The maximum number of unique time series, which can be scanned during every query. This allows protecting against heavy queries, which select unexpectedly high number of series. When set to zero, the limit is automatically calculated based on -search.maxConcurrentRequests (inversely proportional) and memory available to the process (proportional). See also -search.max* command-line flags at vmselect.
-smallMergeConcurrency int
Deprecated: this flag does nothing
-snapshotAuthKey value

View file

@ -1704,7 +1704,8 @@ By default, VictoriaMetrics is tuned for an optimal resource usage under typical
- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query. Queries, which need more memory, are rejected.
Heavy queries, which select big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit
for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`.
- `-search.maxUniqueTimeseries` limits the number of unique time series a single query can find and process. VictoriaMetrics keeps in memory
- `-search.maxUniqueTimeseries` limits the number of unique time series a single query can find and process. By default, VictoriaMetrics calculates the limit automatically
based on the available memory and the maximum number of concurrent requests it can process (see `-search.maxConcurrentRequests`). VictoriaMetrics keeps in memory
some metainformation about the time series located by each query and spends some CPU time for processing the found time series.
This means that the maximum memory usage and CPU usage a single query can use is proportional to `-search.maxUniqueTimeseries`.
- `-search.maxQueryDuration` limits the duration of a single query. If the query takes longer than the given duration, then it is canceled.

View file

@ -18,6 +18,8 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
## tip
**Update note 1: `-search.maxUniqueTimeseries` limit on `vmselect` can no longer exceed `-search.maxUniqueTimeseries` limit on `vmstorage`. If you don't set this flag at `vmstorage`, then it will be automatically calculated based on available resources. This can result in rejecting expensive read queries if they exceed the auto-calculated limit. The limit can be overridden by manually setting `-search.maxUniqueTimeseries` at vmstorage, but for better reliability we recommend sticking to default values. Refer to the CHANGELOG below and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6930).**
* FEATURE: add Darwin binaries for [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/) to the release flow. The binaries will be available in the new release.
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): allow using HTTP/2 client for Kubernetes service discovery if `-promscrape.kubernetes.useHTTP2Client` cmd-line flag is set. This could help to reduce the amount of opened connections to the Kubernetes API server. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5971) for the details.
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): `-rule` cmd-line flag now supports multi-document YAML files. This could be useful when rules are retrieved via HTTP URL where multiple rule files were merged together in one response. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6753). Thanks to @Irene-123 for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6995).
@ -25,6 +27,8 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/): disable stream processing mode for data [ingested via InfluxDB](https://docs.victoriametrics.com/#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) HTTP endpoints by default. With this change, the data is processed in batches (see `-influx.maxRequestSize`) and user will get parsing errors immediately as they happen. This also improves users' experience and resiliency against thundering herd problems caused by clients without backoff policies like telegraf. To enable stream mode back, pass HTTP header `Stream-Mode: "1"` with each request. For data sent via TCP and UDP (see `-influxListenAddr`) protocols streaming processing remains enabled.
* FEATURE: [vmgateway](https://docs.victoriametrics.com/vmgateway/): allow parsing `scope` claim in array format. This is useful for cases when the identity provider encodes claims in array format.
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to cancel running queries. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7097).
* FEATURE: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): automatically set `-search.maxUniqueTimeseries` limit based on available memory and `-search.maxConcurrentRequests`. The more memory is available to the process and the lower `-search.maxConcurrentRequests` is, the higher the `-search.maxUniqueTimeseries` limit will be. This should protect vmstorage from expensive queries without the need to manually set `-search.maxUniqueTimeseries`. The calculated limit will be printed to the logs at process start-up and exposed as `vm_search_max_unique_timeseries` metric. Set `-search.maxUniqueTimeseries` manually to override auto calculation. Please note, `-search.maxUniqueTimeseries` on vmselect can't exceed the limit with the same name on vmstorage, it can only be set to lower values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6930).
* FEATURE: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): set default value for `-search.maxUniqueTimeseries` to `0`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6930).
* BUGFIX: [vmgateway](https://docs.victoriametrics.com/vmgateway/): fix possible panic during parsing of a token without `vm_access` claim. This issue was introduced in v1.104.0.
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix error messages rendering from overflowing the screen with long messages. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7207).

View file

@ -2426,7 +2426,7 @@ func (is *indexSearch) searchMetricIDs(qt *querytracer.Tracer, tfss []*TagFilter
// errTooManyTimeseries returns an error reporting that the number of matching time series exceeds maxMetrics.
//
// The error message names -search.maxUniqueTimeseries as the most likely limit to adjust
// and links to the resource usage limits docs.
//
// NOTE(review): the two consecutive "either narrow down the search ..." lines below look like
// the removed and the added side of a diff rendered together - confirm which one is live.
func errTooManyTimeseries(maxMetrics int) error {
return fmt.Errorf("the number of matching timeseries exceeds %d; "+
"either narrow down the search or increase -search.max* command-line flag values at vmselect "+
"either narrow down the search or increase -search.max* command-line flag values "+
"(the most likely limit is -search.maxUniqueTimeseries); "+
"see https://docs.victoriametrics.com/#resource-usage-limits", maxMetrics)
}