From c97d6ed6a4e8825665bae8d73b78fd4e1d40ebbb Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Thu, 5 Jan 2023 02:08:24 -0800
Subject: [PATCH] lib/streamaggr: sort `by` and `without` labels in the
 aggregate output metric name

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460
---
 docs/stream-aggregation.md        | 16 ++++++-----
 go.mod                            |  3 +--
 lib/streamaggr/streamaggr.go      | 20 ++++++++++++--
 lib/streamaggr/streamaggr_test.go | 45 ++++++++++++++++++++++---------
 4 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/docs/stream-aggregation.md b/docs/stream-aggregation.md
index 50234ae8df..11a0327abd 100644
--- a/docs/stream-aggregation.md
+++ b/docs/stream-aggregation.md
@@ -132,7 +132,7 @@ In this case the following stream aggregation can be used for reducing the numbe
   outputs: [total]
 ```
 
-This config specifies labels, which must be removed from the aggregate outpit, in the `without` list.
+This config specifies labels, which must be removed from the aggregate output, in the `without` list.
 See [these docs](#aggregating-by-labels) for more details.
 
 The aggregated output metric has the following name according to [output metric naming](#output-metric-names):
@@ -296,9 +296,9 @@ Output metric names for stream aggregation are constructed according to the foll
 
 - `<metric_name>` is the original metric name.
 - `<interval>` is the interval specified in the [stream aggregation config](#stream-aggregation-config).
-- `<by_labels>` is `_`-delimited list of `by` labels specified in the [stream aggregation config](#stream-aggregation-config).
+- `<by_labels>` is `_`-delimited sorted list of `by` labels specified in the [stream aggregation config](#stream-aggregation-config).
   If the `by` list is missing in the config, then the `_by_<by_labels>` part isn't included in the output metric name.
-- `<without_labels>` is an optional `_`-delimited list of `without` labels specified in the [stream aggregation config](#stream-aggregation-config).
+- `<without_labels>` is an optional `_`-delimited sorted list of `without` labels specified in the [stream aggregation config](#stream-aggregation-config).
   If the `without` list is missing in the config, then the `_without_<without_labels>` part isn't included in the output metric name.
 - `<output>` is the aggregate used for constucting the output metric. The aggregate name is taken from the `outputs` list
   at the corresponding [stream aggregation config](#stream-aggregation-config).
@@ -324,7 +324,7 @@ For example, the following config removes the `:1m_sum_samples` suffix added [to
 
 ## Aggregation outputs
 
-The following aggregation outputs are supported in the `outputs` list of the [stream aggregation config](#stream-aggregation-config):
+The following aggregation outputs can be put in the `outputs` list at [stream aggregation config](#stream-aggregation-config):
 
 * `total` generates output [counter](https://docs.victoriametrics.com/keyConcepts.html#counter) by summing the input counters.
   The `total` handler properly handles input counter resets.
@@ -375,7 +375,7 @@ removes the `instance` label from output metrics by summing input samples across
 ```
 
 In this case the `foo{app="bar",instance="..."}` input metrics are transformed into `foo:1m_without_instance_sum_samples{app="bar"}`
-output metric.
+output metric according to [output metric naming](#output-metric-names).
 
 It is possible specifying the exact list of labels in the output metrics via `by` list.
 For example, the following config sums input samples by the `app` label:
@@ -387,7 +387,11 @@ For example, the following config sums input samples by the `app` label:
 ```
 
 In this case the `foo{app="bar",instance="..."}` input metrics are transformed into `foo:1m_by_app_sum_samples{app="bar"}`
-output metric.
+output metric according to [output metric naming](#output-metric-names).
+
+The labels used in `by` and `without` lists can be modified via `input_relabel_configs` section - see [these docs](#relabeling).
+
+See also [aggregation outputs](#aggregation-outputs).
 
 
 ## Stream aggregation config
diff --git a/go.mod b/go.mod
index eee96d5a54..30af610906 100644
--- a/go.mod
+++ b/go.mod
@@ -30,6 +30,7 @@ require (
 	github.com/valyala/fastrand v1.1.0
 	github.com/valyala/fasttemplate v1.2.2
 	github.com/valyala/gozstd v1.17.0
+	github.com/valyala/histogram v1.2.0
 	github.com/valyala/quicktemplate v1.7.0
 	golang.org/x/net v0.4.0
 	golang.org/x/oauth2 v0.3.0
@@ -38,8 +39,6 @@ require (
 	gopkg.in/yaml.v2 v2.4.0
 )
 
-require github.com/valyala/histogram v1.2.0
-
 require (
 	cloud.google.com/go v0.107.0 // indirect
 	cloud.google.com/go/compute v1.14.0 // indirect
diff --git a/lib/streamaggr/streamaggr.go b/lib/streamaggr/streamaggr.go
index ff9bab6a69..d963b86ffd 100644
--- a/lib/streamaggr/streamaggr.go
+++ b/lib/streamaggr/streamaggr.go
@@ -3,6 +3,7 @@ package streamaggr
 import (
 	"fmt"
 	"math"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -225,8 +226,8 @@ func newAggregator(cfg *Config, pushFunc PushFunc) (*aggregator, error) {
 	}
 
 	// check by and without lists
-	by := cfg.By
-	without := cfg.Without
+	by := sortAndRemoveDuplicates(cfg.By)
+	without := sortAndRemoveDuplicates(cfg.Without)
 	if len(by) > 0 && len(without) > 0 {
 		return nil, fmt.Errorf("`by: %s` and `without: %s` lists cannot be set simultaneously", by, without)
 	}
@@ -639,3 +640,18 @@ func removeUnderscoreName(labels []string) []string {
 	}
 	return result
 }
+
+func sortAndRemoveDuplicates(a []string) []string {
+	if len(a) == 0 {
+		return nil
+	}
+	a = append([]string{}, a...)
+	sort.Strings(a)
+	dst := a[:1]
+	for _, v := range a[1:] {
+		if v != dst[len(dst)-1] {
+			dst = append(dst, v)
+		}
+	}
+	return dst
+}
diff --git a/lib/streamaggr/streamaggr_test.go b/lib/streamaggr/streamaggr_test.go
index bc2a11388c..672032e459 100644
--- a/lib/streamaggr/streamaggr_test.go
+++ b/lib/streamaggr/streamaggr_test.go
@@ -191,7 +191,7 @@ foo:1m_sum_samples{abc="123"} 12.5
 foo:1m_sum_samples{abc="456",de="fg"} 8
 `)
 
-	// Special case: __name__ in by list
+	// Special case: __name__ in `by` list - this is the same as empty `by` list
 	f(`
 - interval: 1m
   by: [__name__]
@@ -209,7 +209,7 @@ foo:1m_count_series 2
 foo:1m_sum_samples 20.5
 `)
 
-	// Non-empty by list with non-existing labels
+	// Non-empty `by` list with non-existing labels
 	f(`
 - interval: 1m
   by: [foo, bar]
@@ -219,15 +219,15 @@ foo{abc="123"} 4
 bar 5
 foo{abc="123"} 8.5
 foo{abc="456",de="fg"} 8
-`, `bar:1m_by_foo_bar_count_samples 1
-bar:1m_by_foo_bar_count_series 1
-bar:1m_by_foo_bar_sum_samples 5
-foo:1m_by_foo_bar_count_samples 3
-foo:1m_by_foo_bar_count_series 2
-foo:1m_by_foo_bar_sum_samples 20.5
+`, `bar:1m_by_bar_foo_count_samples 1
+bar:1m_by_bar_foo_count_series 1
+bar:1m_by_bar_foo_sum_samples 5
+foo:1m_by_bar_foo_count_samples 3
+foo:1m_by_bar_foo_count_series 2
+foo:1m_by_bar_foo_sum_samples 20.5
 `)
 
-	// Non-empty by list with existing label
+	// Non-empty `by` list with existing label
 	f(`
 - interval: 1m
   by: [abc]
@@ -248,7 +248,28 @@ foo:1m_by_abc_sum_samples{abc="123"} 12.5
 foo:1m_by_abc_sum_samples{abc="456"} 8
 `)
 
-	// Non-empty without list with non-existing labels
+	// Non-empty `by` list with duplicate existing label
+	f(`
+- interval: 1m
+  by: [abc, abc]
+  outputs: [count_samples, sum_samples, count_series]
+`, `
+foo{abc="123"} 4
+bar 5
+foo{abc="123"} 8.5
+foo{abc="456",de="fg"} 8
+`, `bar:1m_by_abc_count_samples 1
+bar:1m_by_abc_count_series 1
+bar:1m_by_abc_sum_samples 5
+foo:1m_by_abc_count_samples{abc="123"} 2
+foo:1m_by_abc_count_samples{abc="456"} 1
+foo:1m_by_abc_count_series{abc="123"} 1
+foo:1m_by_abc_count_series{abc="456"} 1
+foo:1m_by_abc_sum_samples{abc="123"} 12.5
+foo:1m_by_abc_sum_samples{abc="456"} 8
+`)
+
+	// Non-empty `without` list with non-existing labels
 	f(`
 - interval: 1m
   without: [foo]
@@ -269,7 +290,7 @@ foo:1m_without_foo_sum_samples{abc="123"} 12.5
 foo:1m_without_foo_sum_samples{abc="456",de="fg"} 8
 `)
 
-	// Non-empty without list with existing labels
+	// Non-empty `without` list with existing labels
 	f(`
 - interval: 1m
   without: [abc]
@@ -290,7 +311,7 @@ foo:1m_without_abc_sum_samples 12.5
 foo:1m_without_abc_sum_samples{de="fg"} 8
 `)
 
-	// Special case: __name__ in without list
+	// Special case: __name__ in `without` list
 	f(`
 - interval: 1m
   without: [__name__]