From 7fdea4b31c20028f18bef58b17d61db51dd84cf8 Mon Sep 17 00:00:00 2001
From: Hui Wang <haley@victoriametrics.com>
Date: Thu, 25 Apr 2024 18:54:42 +0800
Subject: [PATCH] app/vmselect: implement cmd-line flags
 `-search.disableImplicitConversions` and `-search.logImplicitConversions`
 (#6180)

address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4338
support disable or log [implicit
conversions](https://docs.victoriametrics.com/metricsql/#implicit-query-conversions)
for subquery with cmd-line flags `-search.disableImplicitConversion` and
`-search.logImplicitConversion`

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit dd0d2c77c8a3c0c22b1e5e14960f9f3048e85513)
---
 app/vmselect/promql/exec.go                   |  69 ++++++++++++
 app/vmselect/promql/exec_test.go              | 101 ++++++++++++++++++
 .../packages/vmui/src/assets/MetricsQL.md     |   8 +-
 docs/CHANGELOG.md                             |   1 +
 docs/MetricsQL.md                             |   4 +-
 5 files changed, 179 insertions(+), 4 deletions(-)

diff --git a/app/vmselect/promql/exec.go b/app/vmselect/promql/exec.go
index 9a5149fbda..b2b05d5d9e 100644
--- a/app/vmselect/promql/exec.go
+++ b/app/vmselect/promql/exec.go
@@ -13,6 +13,7 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/querystats"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/querytracer"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
 	"github.com/VictoriaMetrics/metrics"
@@ -26,6 +27,12 @@ var (
 		`For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped `+
 		`in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. `+
 		`This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter`)
+	disableImplicitConversion = flag.Bool("search.disableImplicitConversion", false, "Whether to return an error for queries that rely on implicit subquery conversions, "+
+		"see https://docs.victoriametrics.com/metricsql/#subqueries for details. "+
+		"See also -search.logImplicitConversion.")
+	logImplicitConversion = flag.Bool("search.logImplicitConversion", false, "Whether to log queries with implicit subquery conversions, "+
+		"see https://docs.victoriametrics.com/metricsql/#subqueries for details. "+
+		"Such conversion can be disabled using -search.disableImplicitConversion.")
 )
 
 // UserReadableError is a type of error which supposed to be returned to the user without additional context.
@@ -64,6 +71,16 @@ func Exec(qt *querytracer.Tracer, ec *EvalConfig, q string, isFirstPointOnly boo
 		return nil, err
 	}
 
+	if *disableImplicitConversion || *logImplicitConversion {
+		complete := isSubQueryComplete(e, false)
+		if !complete && *disableImplicitConversion {
+			return nil, fmt.Errorf("query contains subquery that requires implicit conversion and is rejected according to `-search.disableImplicitConversion=true` setting. See https://docs.victoriametrics.com/metricsql/#subqueries for details")
+		}
+		if !complete && *logImplicitConversion {
+			logger.Warnf("query=%q contains subquery that requires implicit conversion, see https://docs.victoriametrics.com/metricsql/#subqueries for details", e.AppendString(nil))
+		}
+	}
+
 	qid := activeQueriesV.Add(ec, q)
 	rv, err := evalExpr(qt, ec, e)
 	activeQueriesV.Remove(qid)
@@ -405,3 +422,55 @@ func (pc *parseCache) Put(q string, pcv *parseCacheValue) {
 	pc.m[q] = pcv
 	pc.mu.Unlock()
 }
+
+// isSubQueryComplete checks if expr contains incomplete subquery
+func isSubQueryComplete(e metricsql.Expr, isSubExpr bool) bool {
+	switch exp := e.(type) {
+	case *metricsql.FuncExpr:
+		if isSubExpr {
+			return false
+		}
+		fe := e.(*metricsql.FuncExpr)
+		for _, arg := range exp.Args {
+			if getRollupFunc(fe.Name) != nil {
+				isSubExpr = true
+			}
+			if !isSubQueryComplete(arg, isSubExpr) {
+				return false
+			}
+		}
+	case *metricsql.RollupExpr:
+		if _, ok := exp.Expr.(*metricsql.MetricExpr); ok {
+			return true
+		}
+		// exp.Step is optional in subqueries
+		if exp.Window == nil {
+			return false
+		}
+		return isSubQueryComplete(exp.Expr, false)
+	case *metricsql.AggrFuncExpr:
+		if isSubExpr {
+			return false
+		}
+		for _, arg := range exp.Args {
+			if !isSubQueryComplete(arg, false) {
+				return false
+			}
+		}
+	case *metricsql.BinaryOpExpr:
+		if isSubExpr {
+			return false
+		}
+		if !isSubQueryComplete(exp.Left, false) {
+			return false
+		}
+		if !isSubQueryComplete(exp.Right, false) {
+			return false
+		}
+	case *metricsql.MetricExpr:
+		return true
+	default:
+		return true
+	}
+	return true
+}
diff --git a/app/vmselect/promql/exec_test.go b/app/vmselect/promql/exec_test.go
index 940aca306d..08dadc0130 100644
--- a/app/vmselect/promql/exec_test.go
+++ b/app/vmselect/promql/exec_test.go
@@ -9452,3 +9452,104 @@ func testAddLabels(t *testing.T, mn *storage.MetricName, labels ...string) {
 		})
 	}
 }
+
+func TestIsSubQueryCompleteTrue(t *testing.T) {
+	f := func(q string) {
+		t.Helper()
+		e, err := metricsql.Parse(q)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if !isSubQueryComplete(e, false) {
+			t.Fatalf("query should be complete: %s", e.AppendString(nil))
+		}
+	}
+
+	f("rate(http_total)")
+	f("sum(http_total)")
+	f("absent(http_total)")
+	f("rate(http_total[1m])")
+	f("avg_over_time(up[1m])")
+	f("sum(http_total[1m])")
+	f("sum(rate(http_total))")
+	f("sum(sum(http_total))")
+	f(`sum(sum_over_time(http_total[1m] )) by (instance)`)
+	f("sum(up{cluster='a'}[1m] or up{cluster='b'}[1m])")
+	f("(avg_over_time(alarm_test1[1m]) - avg_over_time(alarm_test1[1m] offset 5m)) > 0.1")
+	f("http_total[1m] offset 1m")
+
+	// subquery
+	f("rate(http_total)[5m:1m]")
+	f("rate(sum(http_total)[5m:1m])")
+	f("rate(rate(http_total)[5m:1m])")
+	f("sum(rate(http_total[1m]))")
+	f("sum(rate(sum(http_total)[5m:1m]))")
+	f("rate(sum(rate(http_total))[5m:1m])")
+	f("rate(sum(sum(http_total))[5m:1m])")
+	f("rate(sum(rate(http_total))[5m:1m])")
+	f("rate(sum(sum(http_total))[5m:1m])")
+	f("avg_over_time(rate(http_total[5m])[5m:1m])")
+	f("delta(avg_over_time(up[1m])[5m:1m]) > 0.1")
+	f("avg_over_time(avg by (site) (metric)[2m:1m])")
+
+	f("sum(http_total)[5m:1m] offset 1m")
+	f("round(sum(sum_over_time(http_total[1m])) by (instance)) [5m:1m] offset 1m")
+
+	f("rate(sum(http_total)[5m:1m]) - rate(sum(http_total)[5m:1m])")
+	f("avg_over_time((rate(http_total)-rate(http_total))[5m:1m])")
+
+	f("sum_over_time((up{cluster='a'} or up{cluster='b'})[5m:1m])")
+	f("sum_over_time((up{cluster='a'} or up{cluster='b'})[5m:1m])")
+	f("sum(sum_over_time((up{cluster='a'} or up{cluster='b'})[5m:1m])) by (instance)")
+
+	// step (or resolution) is optional in subqueries
+	f("max_over_time(rate(my_counter_total[5m])[1h:])")
+	f("max_over_time(rate(my_counter_total[5m])[1h:1m])[5m:1m]")
+	f("max_over_time(rate(my_counter_total[5m])[1h:])[5m:]")
+
+	f(`
+WITH (
+    cpuSeconds = node_cpu_seconds_total{instance=~"$node:$port",job=~"$job"},
+    cpuIdle = rate(cpuSeconds{mode='idle'}[5m])
+)
+max_over_time(cpuIdle[1h:])`)
+}
+
+func TestIsSubQueryCompleteFalse(t *testing.T) {
+	f := func(q string) {
+		t.Helper()
+		e, err := metricsql.Parse(q)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if isSubQueryComplete(e, false) {
+			t.Fatalf("expect to detect incomplete subquery: %s", e.AppendString(nil))
+		}
+	}
+
+	f("rate(sum(http_total))")
+	f("rate(rate(http_total))")
+	f("sum(rate(sum(http_total)))")
+	f("rate(sum(rate(http_total)))")
+	f("rate(sum(sum(http_total)))")
+	f("avg_over_time(rate(http_total[5m]))")
+
+	// https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3974
+	f("sum(http_total) offset 1m")
+	f(`round(sum(sum_over_time(http_total[1m])) by (instance)) offset 1m`)
+
+	f("rate(sum(http_total)) - rate(sum(http_total))")
+	f("avg_over_time(rate(http_total)-rate(http_total))")
+
+	// https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3996
+	f("sum_over_time(up{cluster='a'} or up{cluster='b'})")
+	f("sum_over_time(up{cluster='a'}[1m] or up{cluster='b'}[1m])")
+	f("sum(sum_over_time(up{cluster='a'}[1m] or up{cluster='b'}[1m])) by (instance)")
+
+	f(`
+WITH (
+    cpuSeconds = node_cpu_seconds_total{instance=~"$node:$port",job=~"$job"},
+    cpuIdle = rate(cpuSeconds{mode='idle'}[5m])
+)
+max_over_time(cpuIdle)`)
+}
diff --git a/app/vmui/packages/vmui/src/assets/MetricsQL.md b/app/vmui/packages/vmui/src/assets/MetricsQL.md
index 7f3ccc498b..5c517e4ed7 100644
--- a/app/vmui/packages/vmui/src/assets/MetricsQL.md
+++ b/app/vmui/packages/vmui/src/assets/MetricsQL.md
@@ -2211,7 +2211,8 @@ MetricsQL supports and extends PromQL subqueries. See [this article](https://val
 Any [rollup function](#rollup-functions) for something other than [series selector](https://docs.victoriametrics.com/keyconcepts/#filtering) form a subquery.
 Nested rollup functions can be implicit thanks to the [implicit query conversions](#implicit-query-conversions).
 For example, `delta(sum(m))` is implicitly converted to `delta(sum(default_rollup(m))[1i:1i])`, so it becomes a subquery,
-since it contains [default_rollup](#default_rollup) nested into [delta](#delta).
+since it contains [default_rollup](#default_rollup) nested into [delta](#delta).  
+This behavior can be disabled or logged via cmd-line flags `-search.disableImplicitConversion` and `-search.logImplicitConversion` since v1.101.0.
 
 VictoriaMetrics performs subqueries in the following way:
 
@@ -2241,7 +2242,8 @@ VictoriaMetrics performs the following implicit conversions for incoming queries
   * `abs(temperature)` is transformed to `abs(default_rollup(temperature))`, because [abs](#abs) isn't a [rollup function](#rollup-functions) -
     it is [transform function](#transform-functions)
 * If `step` in square brackets is missing inside [subquery](#subqueries), then `1i` step is automatically added there.
-  For example, `avg_over_time(rate(http_requests_total[5m])[1h])` is automatically converted to `avg_over_time(rate(http_requests_total[5m])[1h:1i])`.
+  For example, `avg_over_time(rate(http_requests_total[5m])[1h])` is automatically converted to `avg_over_time(rate(http_requests_total[5m])[1h:1i])`.  
 * If something other than [series selector](https://docs.victoriametrics.com/keyconcepts/#filtering)
   is passed to [rollup function](#rollup-functions), then a [subquery](#subqueries) with `1i` lookbehind window and `1i` step is automatically formed.
-  For example, `rate(sum(up))` is automatically converted to `rate((sum(default_rollup(up)))[1i:1i])`.
+  For example, `rate(sum(up))` is automatically converted to `rate((sum(default_rollup(up)))[1i:1i])`.  
+  This behavior can be disabled or logged via cmd-line flags `-search.disableImplicitConversion` and `-search.logImplicitConversion` since v1.101.0.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 03d0410864..51d5a671c0 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -33,6 +33,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/metricsql/): support filtering by multiple numeric constants via `q == (C1, ..., CN)` and `q != (C1, ..., CN)` syntax. For example, `status_code == (200, 201, 300)` returns `status_code` metrics with any of `200`, `201` or `300` values, while `status_code != (400, 404, 500)` returns `status_code` metrics with all the values except of `400`, `404` and `500`.
 * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): add support for fault domain awareness to `vmselect`. It can be configured to return full responses if up to `-globalReplicationFactor - 1` fault domains (aka `vmstorage` groups) are unavailable. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6054) and [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect).
 * FEATURE: all VictoriaMetrics [enterprise](https://docs.victoriametrics.com/enterprise/) components: add support for automatic issuing of TLS certificates for HTTPS server at `-httpListenAddr` via [Let's Encrypt service](https://letsencrypt.org/). See [these docs](https://docs.victoriametrics.com/#automatic-issuing-of-tls-certificates) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5949).
+* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): support disable or log [implicit conversions](https://docs.victoriametrics.com/metricsql/#implicit-query-conversions) for subquery with cmd-line flags `-search.disableImplicitConversion` and `-search.logImplicitConversion`. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4338).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support data replication additionally to sharding among remote storage systems if `-remoteWrite.shardByURLReplicas=N` command-line flag is set additionally to `-remoteWrite.shardByURL` command-line flag, where `N` is desired replication factor. This allows setting up data replication among failure domains when the replication factor is smaller than the number of failure domains. See [these docs](https://docs.victoriametrics.com/vmagent/#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6054).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): reduce CPU usage when [sharding among remote storage systems](https://docs.victoriametrics.com/vmagent/#sharding-among-remote-storages) is enabled.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) addresses in `-remoteWrite.url` command-line option and in scrape target urls. For example, `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends the collected metrics to these TCP addresses. See [these docs](https://docs.victoriametrics.com/vmagent/#srv-urls) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
diff --git a/docs/MetricsQL.md b/docs/MetricsQL.md
index cdfa3039fc..fec10e5dc0 100644
--- a/docs/MetricsQL.md
+++ b/docs/MetricsQL.md
@@ -2229,6 +2229,7 @@ Any [rollup function](#rollup-functions) for something other than [series select
 Nested rollup functions can be implicit thanks to the [implicit query conversions](#implicit-query-conversions).
 For example, `delta(sum(m))` is implicitly converted to `delta(sum(default_rollup(m))[1i:1i])`, so it becomes a subquery,
 since it contains [default_rollup](#default_rollup) nested into [delta](#delta).
+This behavior can be disabled or logged via cmd-line flags `-search.disableImplicitConversion` and `-search.logImplicitConversion` since v1.101.0.
 
 VictoriaMetrics performs subqueries in the following way:
 
@@ -2261,4 +2262,5 @@ VictoriaMetrics performs the following implicit conversions for incoming queries
   For example, `avg_over_time(rate(http_requests_total[5m])[1h])` is automatically converted to `avg_over_time(rate(http_requests_total[5m])[1h:1i])`.
 * If something other than [series selector](https://docs.victoriametrics.com/keyconcepts/#filtering)
   is passed to [rollup function](#rollup-functions), then a [subquery](#subqueries) with `1i` lookbehind window and `1i` step is automatically formed.
-  For example, `rate(sum(up))` is automatically converted to `rate((sum(default_rollup(up)))[1i:1i])`.
+  For example, `rate(sum(up))` is automatically converted to `rate((sum(default_rollup(up)))[1i:1i])`. 
+  This behavior can be disabled or logged via cmd-line flags `-search.disableImplicitConversion` and `-search.logImplicitConversion` since v1.101.0.