From 015c0a4d1afe5dab2bebe3db6f8201c4f1a55758 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Tue, 16 Jan 2024 01:30:07 +0200
Subject: [PATCH] app/vmselect/promql: consistently sort results of `a or b`
 query

Previously the order of results returned from `a or b` query could change with each request
because the sorting for such query has been disabled in order to satisfy
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4763 .

This commit executes `a or b` query as `sortByMetricName(a) or sortByMetricName(b)`.
This makes the order of returned time series consistent across requests,
while maintaining the requirement from https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4763 ,
e.g. `b` results are consistently put after `a` results.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393
---
 app/vmselect/promql/binary_op.go | 10 +++++++
 app/vmselect/promql/exec.go      | 18 ++++++++-----
 app/vmselect/promql/exec_test.go | 45 ++++++++++++++++++++++++++++++++
 docs/CHANGELOG.md                |  1 +
 4 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/app/vmselect/promql/binary_op.go b/app/vmselect/promql/binary_op.go
index d020825569..3bf8ff5648 100644
--- a/app/vmselect/promql/binary_op.go
+++ b/app/vmselect/promql/binary_op.go
@@ -404,9 +404,15 @@ func binaryOpDefault(bfa *binaryOpFuncArg) ([]*timeseries, error) {
 func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
 	mLeft, mRight := createTimeseriesMapByTagSet(bfa.be, bfa.left, bfa.right)
 	var rvs []*timeseries
+
 	for _, tss := range mLeft {
 		rvs = append(rvs, tss...)
 	}
+	// Sort left-hand-side series by metric name as Prometheus does.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393
+	sortSeriesByMetricName(rvs)
+	rvsLen := len(rvs)
+
 	for k, tssRight := range mRight {
 		tssLeft := mLeft[k]
 		if tssLeft == nil {
@@ -415,6 +421,10 @@ func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
 		}
 		fillLeftNaNsWithRightValues(tssLeft, tssRight)
 	}
+	// Sort the added right-hand-side series by metric name as Prometheus does.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393
+	sortSeriesByMetricName(rvs[rvsLen:])
+
 	return rvs, nil
 }
 
diff --git a/app/vmselect/promql/exec.go b/app/vmselect/promql/exec.go
index 2317f800ad..6798f85460 100644
--- a/app/vmselect/promql/exec.go
+++ b/app/vmselect/promql/exec.go
@@ -111,6 +111,7 @@ func maySortResults(e metricsql.Expr) bool {
 		case "sort", "sort_desc",
 			"sort_by_label", "sort_by_label_desc",
 			"sort_by_label_numeric", "sort_by_label_numeric_desc":
+			// Results already sorted
 			return false
 		}
 	case *metricsql.AggrFuncExpr:
@@ -118,6 +119,7 @@ func maySortResults(e metricsql.Expr) bool {
 		case "topk", "bottomk", "outliersk",
 			"topk_max", "topk_min", "topk_avg", "topk_median", "topk_last",
 			"bottomk_max", "bottomk_min", "bottomk_avg", "bottomk_median", "bottomk_last":
+			// Results already sorted
 			return false
 		}
 	case *metricsql.BinaryOpExpr:
@@ -132,6 +134,10 @@ func maySortResults(e metricsql.Expr) bool {
 
 func timeseriesToResult(tss []*timeseries, maySort bool) ([]netstorage.Result, error) {
 	tss = removeEmptySeries(tss)
+	if maySort {
+		sortSeriesByMetricName(tss)
+	}
+
 	result := make([]netstorage.Result, len(tss))
 	m := make(map[string]struct{}, len(tss))
 	bb := bbPool.Get()
@@ -152,15 +158,15 @@ func timeseriesToResult(tss []*timeseries, maySort bool) ([]netstorage.Result, e
 	}
 	bbPool.Put(bb)
 
-	if maySort {
-		sort.Slice(result, func(i, j int) bool {
-			return metricNameLess(&result[i].MetricName, &result[j].MetricName)
-		})
-	}
-
 	return result, nil
 }
 
+func sortSeriesByMetricName(tss []*timeseries) {
+	sort.Slice(tss, func(i, j int) bool {
+		return metricNameLess(&tss[i].MetricName, &tss[j].MetricName)
+	})
+}
+
 func metricNameLess(a, b *storage.MetricName) bool {
 	if string(a.MetricGroup) != string(b.MetricGroup) {
 		return string(a.MetricGroup) < string(b.MetricGroup)
diff --git a/app/vmselect/promql/exec_test.go b/app/vmselect/promql/exec_test.go
index b5a674f7fe..3d5cc56d6e 100644
--- a/app/vmselect/promql/exec_test.go
+++ b/app/vmselect/promql/exec_test.go
@@ -3059,6 +3059,51 @@ func TestExecSuccess(t *testing.T) {
 		resultExpected := []netstorage.Result{r}
 		f(q, resultExpected)
 	})
+	t.Run(`series or series`, func(t *testing.T) {
+		t.Parallel()
+		q := `(
+			label_set(time(), "x", "foo"),
+			label_set(time()+1, "x", "bar"),
+		) or (
+			label_set(time()+2, "x", "foo"),
+			label_set(time()+3, "x", "baz"),
+		)`
+		r1 := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{1001, 1201, 1401, 1601, 1801, 2001},
+			Timestamps: timestampsExpected,
+		}
+		r1.MetricName.Tags = []storage.Tag{
+			{
+				Key:   []byte("x"),
+				Value: []byte("bar"),
+			},
+		}
+		r2 := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{1000, 1200, 1400, 1600, 1800, 2000},
+			Timestamps: timestampsExpected,
+		}
+		r2.MetricName.Tags = []storage.Tag{
+			{
+				Key:   []byte("x"),
+				Value: []byte("foo"),
+			},
+		}
+		r3 := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{1003, 1203, 1403, 1603, 1803, 2003},
+			Timestamps: timestampsExpected,
+		}
+		r3.MetricName.Tags = []storage.Tag{
+			{
+				Key:   []byte("x"),
+				Value: []byte("baz"),
+			},
+		}
+		resultExpected := []netstorage.Result{r1, r2, r3}
+		f(q, resultExpected)
+	})
 	t.Run(`scalar or scalar`, func(t *testing.T) {
 		t.Parallel()
 		q := `time() > 1400 or 123`
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 9eeca32e0c..a47eef978c 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -50,6 +50,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: `vmstorage`: properly check for `storage/prefetchedMetricIDs` cache expiration deadline. Before, this cache was limited only by size.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
+* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
 * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Before, retries happened only on writes into a network connection between source and destination. But errors returned by server after all the data was transmitted were logged, but not retried.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.