app/vmselect/promql: add outliers_iqr(q) and outlier_iqr_over_time(m[d]) functions

These functions allow detecting anomalies in series and samples using Interquartile range method. See Outliers section at https://en.wikipedia.org/wiki/Interquartile_range for more details.
2024-11-21 14:44:00 +00:00 · 2023-10-31 22:10:29 +01:00 · 2023-10-31 22:10:29 +01:00 · ea81f6fc36
commit ea81f6fc36
parent fba93dbe0b
11 changed files with 173 additions and 9 deletions
--- a/app/vmselect/promql/aggr.go
+++ b/app/vmselect/promql/aggr.go
@ -38,6 +38,7 @@ var aggrFuncs = map[string]aggrFunc{
 	"median":         aggrFuncMedian,
 	"min":            newAggrFunc(aggrFuncMin),
 	"mode":           newAggrFunc(aggrFuncMode),
+	"outliers_iqr":   aggrFuncOutliersIQR,
 	"outliers_mad":   aggrFuncOutliersMAD,
 	"outliersk":      aggrFuncOutliersK,
 	"quantile":       aggrFuncQuantile,
@ -944,6 +945,58 @@ func aggrFuncMAD(tss []*timeseries) []*timeseries {
 	return tss[:1]
 }

+// aggrFuncOutliersIQR implements outliers_iqr(q) aggregate function.
+//
+// It expects exactly one arg. It builds a filter, which leaves only the series
+// with at least a single point outside the per-point bounds returned
+// by getPerPointIQRBounds, and passes this filter to aggrFuncExt together
+// with the modifier and the limit from afa.ae.
+func aggrFuncOutliersIQR(afa *aggrFuncArg) ([]*timeseries, error) {
+	if err := expectTransformArgsNum(afa.args, 1); err != nil {
+		return nil, err
+	}
+	keepOutliers := func(tss []*timeseries, _ *metricsql.ModifierExpr) []*timeseries {
+		// Per-point outlier bounds across tss. See Outliers section
+		// at https://en.wikipedia.org/wiki/Interquartile_range
+		lo, hi := getPerPointIQRBounds(tss)
+		// Filter tss in place by re-using its backing array for the result.
+		result := tss[:0]
+		for _, ts := range tss {
+			isOutlier := false
+			for i, v := range ts.Values {
+				if v < lo[i] || v > hi[i] {
+					isOutlier = true
+					break
+				}
+			}
+			if isOutlier {
+				result = append(result, ts)
+			}
+		}
+		return result
+	}
+	return aggrFuncExt(keepOutliers, afa.args[0], &afa.ae.Modifier, afa.ae.Limit, true)
+}
+
+// getPerPointIQRBounds returns lower (q25-1.5*iqr) and upper (q75+1.5*iqr) outlier bounds
+// calculated independently per each point across tss, where iqr = q75-q25.
+//
+// NaN values are skipped when collecting per-point values.
+// It returns nil, nil if tss is empty.
+//
+// The number of points is taken from the first series - it is assumed
+// that all the series in tss have the same number of points.
+func getPerPointIQRBounds(tss []*timeseries) ([]float64, []float64) {
+	if len(tss) == 0 {
+		return nil, nil
+	}
+	pointsLen := len(tss[0].Values)
+	values := make([]float64, 0, len(tss))
+	var qs []float64
+	lower := make([]float64, pointsLen)
+	upper := make([]float64, pointsLen)
+	for i := 0; i < pointsLen; i++ {
+		values = values[:0]
+		for _, ts := range tss {
+			v := ts.Values[i]
+			if !math.IsNaN(v) {
+				values = append(values, v)
+			}
+		}
+		// Use `=` instead of `:=` here, so the qs buffer declared above is really re-used
+		// across points instead of being shadowed by a new per-iteration variable.
+		qs = quantiles(qs[:0], iqrPhis, values)
+		iqr := 1.5 * (qs[1] - qs[0])
+		lower[i] = qs[0] - iqr
+		upper[i] = qs[1] + iqr
+	}
+	return lower, upper
+}
+
+// iqrPhis contains phi values for the 25th and 75th percentiles.
+//
+// It is passed to quantiles() by the IQR-based outlier functions.
+var iqrPhis = []float64{0.25, 0.75}
+
 func aggrFuncOutliersMAD(afa *aggrFuncArg) ([]*timeseries, error) {
 	args := afa.args
 	if err := expectTransformArgsNum(args, 2); err != nil {
--- a/app/vmselect/promql/exec_test.go
+++ b/app/vmselect/promql/exec_test.go
@ -6910,6 +6910,30 @@ func TestExecSuccess(t *testing.T) {
 		resultExpected := []netstorage.Result{r}
 		f(q, resultExpected)
 	})
+	t.Run(`outliers_iqr()`, func(t *testing.T) {
+		t.Parallel()
+		q := `sort(outliers_iqr((
+			alias(time(), "m1"),
+			alias(time()*1.5, "m2"),
+			alias(time()*10, "m3"),
+			alias(time()*1.2, "m4"),
+			alias(time()*0.1, "m5"),
+		)))`
+		r1 := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{100, 120, 140, 160, 180, 200},
+			Timestamps: timestampsExpected,
+		}
+		r1.MetricName.MetricGroup = []byte("m5")
+		r2 := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{10000, 12000, 14000, 16000, 18000, 20000},
+			Timestamps: timestampsExpected,
+		}
+		r2.MetricName.MetricGroup = []byte("m3")
+		resultExpected := []netstorage.Result{r1, r2}
+		f(q, resultExpected)
+	})
 	t.Run(`outliers_mad(1)`, func(t *testing.T) {
 		t.Parallel()
 		q := `outliers_mad(1, (
--- a/app/vmselect/promql/rollup.go
+++ b/app/vmselect/promql/rollup.go
@ -62,6 +62,7 @@ var rollupFuncs = map[string]newRollupFunc{
 	"median_over_time":        newRollupFuncOneArg(rollupMedian),
 	"min_over_time":           newRollupFuncOneArg(rollupMin),
 	"mode_over_time":          newRollupFuncOneArg(rollupModeOverTime),
+	"outlier_iqr_over_time":   newRollupFuncOneArg(rollupOutlierIQR),
 	"predict_linear":          newRollupPredictLinear,
 	"present_over_time":       newRollupFuncOneArg(rollupPresent),
 	"quantile_over_time":      newRollupQuantile,
@ -122,6 +123,7 @@ var rollupAggrFuncs = map[string]rollupFunc{
 	"increases_over_time":     rollupIncreases,
 	"integrate":               rollupIntegrate,
 	"irate":                   rollupIderiv,
+	"outlier_iqr_over_time":   rollupOutlierIQR,
 	"lag":                     rollupLag,
 	"last_over_time":          rollupLast,
 	"lifetime":                rollupLifetime,
@ -225,6 +227,7 @@ var rollupFuncsKeepMetricName = map[string]bool{
 	"hoeffding_bound_lower": true,
 	"hoeffding_bound_upper": true,
 	"holt_winters":          true,
+	"outlier_iqr_over_time": true,
 	"last_over_time":        true,
 	"max_over_time":         true,
 	"median_over_time":      true,
@ -1287,6 +1290,29 @@ func newRollupQuantiles(args []interface{}) (rollupFunc, error) {
 	return rf, nil
 }

+// rollupOutlierIQR returns the last value from rfa.values if it is smaller than q25-1.5*iqr
+// or bigger than q75+1.5*iqr, where q25, q75 and iqr=q75-q25 are calculated over rfa.values.
+// Otherwise nan is returned. nan is also returned if rfa.values contains less than two values.
+//
+// See Outliers section at https://en.wikipedia.org/wiki/Interquartile_range
+func rollupOutlierIQR(rfa *rollupFuncArg) float64 {
+	// There is no need to handle NaNs here, since they must be cleaned up
+	// before calling rollup funcs.
+	samples := rfa.values
+	if len(samples) < 2 {
+		return nan
+	}
+
+	// Borrow a scratch buffer for the quantiles and give it back
+	// as soon as the needed values are copied out of it.
+	buf := getFloat64s()
+	buf.A = quantiles(buf.A[:0], iqrPhis, samples)
+	q25, q75 := buf.A[0], buf.A[1]
+	putFloat64s(buf)
+
+	delta := 1.5 * (q75 - q25)
+	lowerBound := q25 - delta
+	upperBound := q75 + delta
+
+	last := samples[len(samples)-1]
+	if last > upperBound || last < lowerBound {
+		return last
+	}
+	return nan
+}
+
 func newRollupQuantile(args []interface{}) (rollupFunc, error) {
 	if err := expectRollupArgsNum(args, 2); err != nil {
 		return nil, err
--- a/app/vmselect/promql/rollup_test.go
+++ b/app/vmselect/promql/rollup_test.go
@ -12,6 +12,35 @@ var (
 	testTimestamps = []int64{5, 15, 24, 36, 49, 60, 78, 80, 97, 115, 120, 130}
 )

+// TestRollupOutlierIQR checks rollupOutlierIQR results against the expected values.
+//
+// NaN result matches only NaN expectation; non-NaN results must be equal exactly.
+func TestRollupOutlierIQR(t *testing.T) {
+	f := func(values []float64, resultExpected float64) {
+		t.Helper()
+		result := rollupOutlierIQR(&rollupFuncArg{
+			values:     values,
+			timestamps: nil,
+		})
+		// NaN never equals itself, so compare NaN-ness first
+		// and compare the values only when the result is a number.
+		gotNaN, wantNaN := math.IsNaN(result), math.IsNaN(resultExpected)
+		if gotNaN != wantNaN || (!gotNaN && result != resultExpected) {
+			t.Fatalf("unexpected value; got %v; want %v", result, resultExpected)
+		}
+	}
+
+	// Cases with nan expectation verify that no outlier is reported;
+	// the remaining cases verify that the last value is returned as an outlier.
+	f([]float64{1, 2, 3, 4, 5}, nan)
+	f([]float64{1, 2, 3, 4, 7}, nan)
+	f([]float64{1, 2, 3, 4, 8}, 8)
+	f([]float64{1, 2, 3, 4, -2}, nan)
+	f([]float64{1, 2, 3, 4, -3}, -3)
+}
+
 func TestRollupIderivDuplicateTimestamps(t *testing.T) {
 	rfa := &rollupFuncArg{
 		values:     []float64{1, 2, 3, 4, 5},
@ -186,6 +215,9 @@ func testRollupFunc(t *testing.T, funcName string, args []interface{}, vExpected
 				t.Fatalf("unexpected value; got %v; want %v", v, vExpected)
 			}
 		} else {
+			if math.IsNaN(v) {
+				t.Fatalf("unexpected value; got %v want %v", v, vExpected)
+			}
 			eps := math.Abs(v - vExpected)
 			if eps > 1e-14 {
 				t.Fatalf("unexpected value; got %v; want %v", v, vExpected)
@ -514,6 +546,7 @@ func TestRollupNewRollupFuncSuccess(t *testing.T) {
 	f("increase", 398)
 	f("increase_prometheus", 275)
 	f("irate", 0)
+	f("outlier_iqr_over_time", nan)
 	f("rate", 2200)
 	f("resets", 5)
 	f("range_over_time", 111)
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -44,6 +44,7 @@ The sandbox cluster installation is running under the constant load generated by
 * FEATURE: `vmselect`: expose `vm_memory_intensive_queries_total` counter metric which gets increased each time `-search.logQueryMemoryUsage` memory limit is exceeded by a query. This metric should help to identify expensive and heavy queries without inspecting the logs.
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [drop_empty_series()](https://docs.victoriametrics.com/MetricsQL.html#drop_empty_series) function, which can be used for filtering out empty series before performing additional calculations as shown in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071).
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [labels_equal()](https://docs.victoriametrics.com/MetricsQL.html#labels_equal) function, which can be used for searching series with identical values for the given labels. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5148).
+* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [`outlier_iqr_over_time(m[d])`](https://docs.victoriametrics.com/MetricsQL.html#outlier_iqr_over_time) and [`outliers_iqr(q)`](https://docs.victoriametrics.com/MetricsQL.html#outliers_iqr) functions, which allow detecting anomalies in samples and series using [Interquartile range method](https://en.wikipedia.org/wiki/Interquartile_range).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `eval_alignment` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups), it will align group query requests timestamp with interval like `datasource.queryTimeAlignment` did.
  This also means that `datasource.queryTimeAlignment` command-line flag becomes deprecated now and will have no effect if configured. If `datasource.queryTimeAlignment` was set to `false` before, then `eval_alignment` has to be set to `false` explicitly under group.
  See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049).
--- a/docs/MetricsQL.md
+++ b/docs/MetricsQL.md
@ -532,7 +532,7 @@ See also [duration_over_time](#duration_over_time) and [lag](#lag).
 `mad_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which calculates [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation)
 over raw samples on the given lookbehind window `d` per each time series returned from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering).

-See also [mad](#mad) and [range_mad](#range_mad).
+See also [mad](#mad), [range_mad](#range_mad) and [outlier_iqr_over_time](#outlier_iqr_over_time).

 #### max_over_time

@ -562,6 +562,18 @@ This function is supported by PromQL. See also [tmin_over_time](#tmin_over_time)
 for raw samples on the given lookbehind window `d`. It is calculated individually per each time series returned
 from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering). It is expected that raw sample values are discrete.

+#### outlier_iqr_over_time
+
+`outlier_iqr_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which returns the last sample on the given lookbehind window `d`
+if its value is either smaller than the `q25-1.5*iqr` or bigger than `q75+1.5*iqr` where:
+- `iqr` is an [Interquartile range](https://en.wikipedia.org/wiki/Interquartile_range) over raw samples on the lookbehind window `d`
+- `q25` and `q75` are 25th and 75th [percentiles](https://en.wikipedia.org/wiki/Percentile) over raw samples on the lookbehind window `d`.
+
+The `outlier_iqr_over_time()` is useful for detecting anomalies in gauge values based on the previous history of values.
+For example, `outlier_iqr_over_time(memory_usage_bytes[1h])` triggers when `memory_usage_bytes` suddenly goes outside the usual value range for the last hour.
+
+See also [outliers_iqr](#outliers_iqr).
+
 #### predict_linear

 `predict_linear(series_selector[d], t)` is a [rollup function](#rollup-functions), which calculates the value `t` seconds in the future using
@ -866,7 +878,7 @@ from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.ht

 Metric names are stripped from the resulting rollups. Add [keep_metric_names](#keep_metric_names) modifier in order to keep metric names.

-See also [zscore](#zscore) and [range_trim_zscore](#range_trim_zscore).
+See also [zscore](#zscore), [range_trim_zscore](#range_trim_zscore) and [outlier_iqr_over_time](#outlier_iqr_over_time).


 ### Transform functions
@ -1858,20 +1870,33 @@ This function is supported by PromQL.
 `mode(q) by (group_labels)` is [aggregate function](#aggregate-functions), which returns [mode](https://en.wikipedia.org/wiki/Mode_(statistics))
 per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp.

+#### outliers_iqr
+
+`outliers_iqr(q)` is [aggregate function](#aggregate-functions), which returns time series from `q` with at least a single point
+outside the [Interquartile range outlier bounds](https://en.wikipedia.org/wiki/Interquartile_range) `[q25-1.5*iqr .. q75+1.5*iqr]`
+compared to other time series at the given point, where:
+- `iqr` is an [Interquartile range](https://en.wikipedia.org/wiki/Interquartile_range) calculated independently per each point on the graph across `q` series.
+- `q25` and `q75` are 25th and 75th [percentiles](https://en.wikipedia.org/wiki/Percentile) calculated independently per each point on the graph across `q` series.
+
+The `outliers_iqr()` is useful for detecting anomalous series in the group of series. For example, `outliers_iqr(temperature) by (country)` returns
+per-country series with anomalous outlier values compared to the rest of per-country series.
+
+See also [outliers_mad](#outliers_mad), [outliersk](#outliersk) and [outlier_iqr_over_time](#outlier_iqr_over_time).
+
 #### outliers_mad

 `outliers_mad(tolerance, q)` is [aggregate function](#aggregate-functions), which returns time series from `q` with at least
 a single point outside [Median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) (aka MAD) multiplied by `tolerance`.
 E.g. it returns time series with at least a single point below `median(q) - mad(q)` or a single point above `median(q) + mad(q)`.

-See also [outliersk](#outliersk) and [mad](#mad).
+See also [outliers_iqr](#outliers_iqr), [outliersk](#outliersk) and [mad](#mad).

 #### outliersk

 `outliersk(k, q)` is [aggregate function](#aggregate-functions), which returns up to `k` time series with the biggest standard deviation (aka outliers)
 out of time series returned by `q`.

-See also [outliers_mad](#outliers_mad).
+See also [outliers_iqr](#outliers_iqr) and [outliers_mad](#outliers_mad).

 #### quantile

@ -1991,7 +2016,7 @@ See also [bottomk_min](#bottomk_min).
 per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp.
 This function is useful for detecting anomalies in the group of related time series.

-See also [zscore_over_time](#zscore_over_time) and [range_trim_zscore](#range_trim_zscore).
+See also [zscore_over_time](#zscore_over_time), [range_trim_zscore](#range_trim_zscore) and [outliers_iqr](#outliers_iqr).

 ## Subqueries

--- a/go.mod
+++ b/go.mod
@ -12,7 +12,7 @@ require (
 	// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
 	github.com/VictoriaMetrics/fasthttp v1.2.0
 	github.com/VictoriaMetrics/metrics v1.24.0
-	github.com/VictoriaMetrics/metricsql v0.68.0
+	github.com/VictoriaMetrics/metricsql v0.69.0
 	github.com/aws/aws-sdk-go-v2 v1.22.0
 	github.com/aws/aws-sdk-go-v2/config v1.20.0
 	github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.12.0
--- a/go.sum
+++ b/go.sum
@ -70,8 +70,8 @@ github.com/VictoriaMetrics/fasthttp v1.2.0 h1:nd9Wng4DlNtaI27WlYh5mGXCJOmee/2c2b
 github.com/VictoriaMetrics/fasthttp v1.2.0/go.mod h1:zv5YSmasAoSyv8sBVexfArzFDIGGTN4TfCKAtAw7IfE=
 github.com/VictoriaMetrics/metrics v1.24.0 h1:ILavebReOjYctAGY5QU2F9X0MYvkcrG3aEn2RKa1Zkw=
 github.com/VictoriaMetrics/metrics v1.24.0/go.mod h1:eFT25kvsTidQFHb6U0oa0rTrDRdz4xTYjpL8+UPohys=
-github.com/VictoriaMetrics/metricsql v0.68.0 h1:fAzYPjYkEipM/L/+WYbAK/gYuqt5rQHnb3cTY2cN628=
-github.com/VictoriaMetrics/metricsql v0.68.0/go.mod h1:k4UaP/+CjuZslIjd+kCigNG9TQmUqh5v0TP/nMEy90I=
+github.com/VictoriaMetrics/metricsql v0.69.0 h1:6np68zGOnMiGEJR/rCvywS1gbLGXVrmQC3BKydsbWHw=
+github.com/VictoriaMetrics/metricsql v0.69.0/go.mod h1:k4UaP/+CjuZslIjd+kCigNG9TQmUqh5v0TP/nMEy90I=
 github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow=
 github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4=
 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
--- a/vendor/github.com/VictoriaMetrics/metricsql/aggr.go
+++ b/vendor/github.com/VictoriaMetrics/metricsql/aggr.go
@ -25,6 +25,7 @@ var aggrFuncs = map[string]bool{
 	"median":         true,
 	"min":            true,
 	"mode":           true,
+	"outliers_iqr":   true,
 	"outliers_mad":   true,
 	"outliersk":      true,
 	"quantile":       true,
--- a/vendor/github.com/VictoriaMetrics/metricsql/rollup.go
+++ b/vendor/github.com/VictoriaMetrics/metricsql/rollup.go
@ -47,6 +47,7 @@ var rollupFuncs = map[string]bool{
 	"median_over_time":        true,
 	"min_over_time":           true,
 	"mode_over_time":          true,
+	"outlier_iqr_over_time":   true,
 	"predict_linear":          true,
 	"present_over_time":       true,
 	"quantile_over_time":      true,
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -99,7 +99,7 @@ github.com/VictoriaMetrics/fasthttp/stackless
 # github.com/VictoriaMetrics/metrics v1.24.0
 ## explicit; go 1.20
 github.com/VictoriaMetrics/metrics
-# github.com/VictoriaMetrics/metricsql v0.68.0
+# github.com/VictoriaMetrics/metricsql v0.69.0
 ## explicit; go 1.13
 github.com/VictoriaMetrics/metricsql
 github.com/VictoriaMetrics/metricsql/binaryop