app/vmselect: expose vm_memory_intensive_queries_total counter metric (#5208)

The new metric is incremented each time a query exceeds the `-search.logQueryMemoryUsage` memory limit. This metric should help to identify expensive and heavy queries without inspecting the logs.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2025-03-21 15:45:01 +00:00 · 2023-10-31 13:31:09 +01:00 · 2023-10-31 13:31:09 +01:00 · a950873fff
commit a950873fff
parent a8051d48c4
5 changed files with 8 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -2792,7 +2792,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
  -search.latencyOffset duration
     The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
  -search.logQueryMemoryUsage size
-     Log queries, which require more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
+     Log query and increment vm_memory_intensive_queries_total metric each time when the query requires more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
     Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
  -search.logSlowQueryDuration duration
     Log queries with execution time exceeding this value. Zero disables slow query logging. See also -search.logQueryMemoryUsage (default 5s)
--- a/app/vmselect/promql/eval.go
+++ b/app/vmselect/promql/eval.go
@@ -33,7 +33,7 @@ var (
 		"Queries requiring more memory are rejected. The total memory limit for concurrently executed queries can be estimated "+
 		"as -search.maxMemoryPerQuery multiplied by -search.maxConcurrentRequests . "+
 		"See also -search.logQueryMemoryUsage")
-	logQueryMemoryUsage = flagutil.NewBytes("search.logQueryMemoryUsage", 0, "Log queries, which require more memory than specified by this flag. "+
+	logQueryMemoryUsage = flagutil.NewBytes("search.logQueryMemoryUsage", 0, "Log query and increment vm_memory_intensive_queries_total metric each time when the query requires more memory than specified by this flag. "+
 		"This may help detecting and optimizing heavy queries. Query logging is disabled by default. "+
 		"See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery")
 	noStaleMarkers = flag.Bool("search.noStaleMarkers", false, "Set this flag to true if the database doesn't contain Prometheus stale markers, "+
@@ -1042,6 +1042,8 @@ var (
 	rollupResultCacheFullHits    = metrics.NewCounter(`vm_rollup_result_cache_full_hits_total`)
 	rollupResultCachePartialHits = metrics.NewCounter(`vm_rollup_result_cache_partial_hits_total`)
 	rollupResultCacheMiss        = metrics.NewCounter(`vm_rollup_result_cache_miss_total`)
+
+	memoryIntensiveQueries = metrics.NewCounter(`vm_memory_intensive_queries_total`)
 )

 func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcName string, rf rollupFunc,
@@ -1134,6 +1136,7 @@ func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcNa
 	rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs)))
 	rollupMemorySize = sumNoOverflow(mulNoOverflow(int64(rssLen), 1000), mulNoOverflow(rollupPoints, 16))
 	if maxMemory := int64(logQueryMemoryUsage.N); maxMemory > 0 && rollupMemorySize > maxMemory {
+		memoryIntensiveQueries.Inc()
 		requestURI := ec.GetRequestURI()
 		logger.Warnf("remoteAddr=%s, requestURI=%s: the %s requires %d bytes of memory for processing; "+
 			"logging this query, since it exceeds the -search.logQueryMemoryUsage=%d; "+
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -35,6 +35,7 @@ The sandbox cluster installation is running under the constant load generated by
 * SECURITY: upgrade Go builder from Go1.21.1 to Go1.21.3. See [the list of issues addressed in Go1.21.2](https://github.com/golang/go/issues?q=milestone%3AGo1.21.2+label%3ACherryPickApproved) and [the list of issues addressed in Go1.21.3](https://github.com/golang/go/issues?q=milestone%3AGo1.21.3+label%3ACherryPickApproved).

 * FEATURE: `vmselect`: improve query performance on systems with big number of CPU cores (`>=32`). Add `-search.maxWorkersPerQuery` command-line flag, which can be used for fine-tuning query performance on systems with big number of CPU cores. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5195).
+* FEATURE: `vmselect`: expose `vm_memory_intensive_queries_total` counter metric, which is incremented each time a query exceeds the `-search.logQueryMemoryUsage` memory limit. This metric helps to detect the presence of expensive and heavy queries without inspecting the logs.
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [drop_empty_series()](https://docs.victoriametrics.com/MetricsQL.html#drop_empty_series) function, which can be used for filtering out empty series before performing additional calculations as shown in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071).
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [labels_equal()](https://docs.victoriametrics.com/MetricsQL.html#labels_equal) function, which can be used for searching series with identical values for the given labels. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5148).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `eval_alignment` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups), it will align group query requests timestamp with interval like `datasource.queryTimeAlignment` did.
--- a/docs/README.md
+++ b/docs/README.md
@@ -2795,7 +2795,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
  -search.latencyOffset duration
     The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
  -search.logQueryMemoryUsage size
-     Log queries, which require more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
+     Log query and increment vm_memory_intensive_queries_total metric each time when the query requires more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
     Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
  -search.logSlowQueryDuration duration
     Log queries with execution time exceeding this value. Zero disables slow query logging. See also -search.logQueryMemoryUsage (default 5s)
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -2803,7 +2803,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
  -search.latencyOffset duration
     The time when data points become visible in query results after the collection. It can be overridden on per-query basis via latency_offset arg. Too small value can result in incomplete last points for query results (default 30s)
  -search.logQueryMemoryUsage size
-     Log queries, which require more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
+     Log query and increment vm_memory_intensive_queries_total metric each time when the query requires more memory than specified by this flag. This may help detecting and optimizing heavy queries. Query logging is disabled by default. See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery
     Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0)
  -search.logSlowQueryDuration duration
     Log queries with execution time exceeding this value. Zero disables slow query logging. See also -search.logQueryMemoryUsage (default 5s)