From ea81f6fc3623246985b082748cafefe8b58b808e Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 31 Oct 2023 22:10:29 +0100 Subject: [PATCH] app/vmselect/promql: add outliers_iqr(q) and outlier_iqr_over_time(m[d]) functions These functions allow detecting anomalies in series and samples using Interquartile range method. See Outliers section at https://en.wikipedia.org/wiki/Interquartile_range for more details. --- app/vmselect/promql/aggr.go | 53 +++++++++++++++++++ app/vmselect/promql/exec_test.go | 24 +++++++++ app/vmselect/promql/rollup.go | 26 +++++++++ app/vmselect/promql/rollup_test.go | 33 ++++++++++++ docs/CHANGELOG.md | 1 + docs/MetricsQL.md | 35 ++++++++++-- go.mod | 2 +- go.sum | 4 +- .../VictoriaMetrics/metricsql/aggr.go | 1 + .../VictoriaMetrics/metricsql/rollup.go | 1 + vendor/modules.txt | 2 +- 11 files changed, 173 insertions(+), 9 deletions(-) diff --git a/app/vmselect/promql/aggr.go b/app/vmselect/promql/aggr.go index c3310ee22..943be4210 100644 --- a/app/vmselect/promql/aggr.go +++ b/app/vmselect/promql/aggr.go @@ -38,6 +38,7 @@ var aggrFuncs = map[string]aggrFunc{ "median": aggrFuncMedian, "min": newAggrFunc(aggrFuncMin), "mode": newAggrFunc(aggrFuncMode), + "outliers_iqr": aggrFuncOutliersIQR, "outliers_mad": aggrFuncOutliersMAD, "outliersk": aggrFuncOutliersK, "quantile": aggrFuncQuantile, @@ -944,6 +945,58 @@ func aggrFuncMAD(tss []*timeseries) []*timeseries { return tss[:1] } +func aggrFuncOutliersIQR(afa *aggrFuncArg) ([]*timeseries, error) { + args := afa.args + if err := expectTransformArgsNum(args, 1); err != nil { + return nil, err + } + afe := func(tss []*timeseries, modifier *metricsql.ModifierExpr) []*timeseries { + // Calculate lower and upper bounds for interquartile range per each point across tss + // according to Outliers section at https://en.wikipedia.org/wiki/Interquartile_range + lower, upper := getPerPointIQRBounds(tss) + // Leave only time series with outliers above upper bound or below lower bound + tssDst := 
tss[:0] + for _, ts := range tss { + values := ts.Values + for i, v := range values { + if v > upper[i] || v < lower[i] { + tssDst = append(tssDst, ts) + break + } + } + } + return tssDst + } + return aggrFuncExt(afe, args[0], &afa.ae.Modifier, afa.ae.Limit, true) +} + +func getPerPointIQRBounds(tss []*timeseries) ([]float64, []float64) { + if len(tss) == 0 { + return nil, nil + } + pointsLen := len(tss[0].Values) + values := make([]float64, 0, len(tss)) + var qs []float64 + lower := make([]float64, pointsLen) + upper := make([]float64, pointsLen) + for i := 0; i < pointsLen; i++ { + values = values[:0] + for _, ts := range tss { + v := ts.Values[i] + if !math.IsNaN(v) { + values = append(values, v) + } + } + qs = quantiles(qs[:0], iqrPhis, values) + iqr := 1.5 * (qs[1] - qs[0]) + lower[i] = qs[0] - iqr + upper[i] = qs[1] + iqr + } + return lower, upper +} + +var iqrPhis = []float64{0.25, 0.75} + func aggrFuncOutliersMAD(afa *aggrFuncArg) ([]*timeseries, error) { args := afa.args if err := expectTransformArgsNum(args, 2); err != nil { diff --git a/app/vmselect/promql/exec_test.go b/app/vmselect/promql/exec_test.go index 52e72f9a2..049789082 100644 --- a/app/vmselect/promql/exec_test.go +++ b/app/vmselect/promql/exec_test.go @@ -6910,6 +6910,30 @@ func TestExecSuccess(t *testing.T) { resultExpected := []netstorage.Result{r} f(q, resultExpected) }) + t.Run(`outliers_iqr()`, func(t *testing.T) { + t.Parallel() + q := `sort(outliers_iqr(( + alias(time(), "m1"), + alias(time()*1.5, "m2"), + alias(time()*10, "m3"), + alias(time()*1.2, "m4"), + alias(time()*0.1, "m5"), + )))` + r1 := netstorage.Result{ + MetricName: metricNameExpected, + Values: []float64{100, 120, 140, 160, 180, 200}, + Timestamps: timestampsExpected, + } + r1.MetricName.MetricGroup = []byte("m5") + r2 := netstorage.Result{ + MetricName: metricNameExpected, + Values: []float64{10000, 12000, 14000, 16000, 18000, 20000}, + Timestamps: timestampsExpected, + } + r2.MetricName.MetricGroup = []byte("m3") + 
resultExpected := []netstorage.Result{r1, r2} + f(q, resultExpected) + }) t.Run(`outliers_mad(1)`, func(t *testing.T) { t.Parallel() q := `outliers_mad(1, ( diff --git a/app/vmselect/promql/rollup.go b/app/vmselect/promql/rollup.go index 9f84f74a7..7b7075128 100644 --- a/app/vmselect/promql/rollup.go +++ b/app/vmselect/promql/rollup.go @@ -62,6 +62,7 @@ var rollupFuncs = map[string]newRollupFunc{ "median_over_time": newRollupFuncOneArg(rollupMedian), "min_over_time": newRollupFuncOneArg(rollupMin), "mode_over_time": newRollupFuncOneArg(rollupModeOverTime), + "outlier_iqr_over_time": newRollupFuncOneArg(rollupOutlierIQR), "predict_linear": newRollupPredictLinear, "present_over_time": newRollupFuncOneArg(rollupPresent), "quantile_over_time": newRollupQuantile, @@ -122,6 +123,7 @@ var rollupAggrFuncs = map[string]rollupFunc{ "increases_over_time": rollupIncreases, "integrate": rollupIntegrate, "irate": rollupIderiv, + "outlier_iqr_over_time": rollupOutlierIQR, "lag": rollupLag, "last_over_time": rollupLast, "lifetime": rollupLifetime, @@ -225,6 +227,7 @@ var rollupFuncsKeepMetricName = map[string]bool{ "hoeffding_bound_lower": true, "hoeffding_bound_upper": true, "holt_winters": true, + "outlier_iqr_over_time": true, "last_over_time": true, "max_over_time": true, "median_over_time": true, @@ -1287,6 +1290,29 @@ func newRollupQuantiles(args []interface{}) (rollupFunc, error) { return rf, nil } +func rollupOutlierIQR(rfa *rollupFuncArg) float64 { + // There is no need in handling NaNs here, since they must be cleaned up + // before calling rollup funcs. 
+ + // See Outliers section at https://en.wikipedia.org/wiki/Interquartile_range + values := rfa.values + if len(values) < 2 { + return nan + } + qs := getFloat64s() + qs.A = quantiles(qs.A[:0], iqrPhis, values) + q25 := qs.A[0] + q75 := qs.A[1] + iqr := 1.5 * (q75 - q25) + putFloat64s(qs) + + v := values[len(values)-1] + if v > q75+iqr || v < q25-iqr { + return v + } + return nan +} + func newRollupQuantile(args []interface{}) (rollupFunc, error) { if err := expectRollupArgsNum(args, 2); err != nil { return nil, err diff --git a/app/vmselect/promql/rollup_test.go b/app/vmselect/promql/rollup_test.go index 5b967b13d..12a3b0acb 100644 --- a/app/vmselect/promql/rollup_test.go +++ b/app/vmselect/promql/rollup_test.go @@ -12,6 +12,35 @@ var ( testTimestamps = []int64{5, 15, 24, 36, 49, 60, 78, 80, 97, 115, 120, 130} ) +func TestRollupOutlierIQR(t *testing.T) { + f := func(values []float64, resultExpected float64) { + t.Helper() + rfa := &rollupFuncArg{ + values: values, + timestamps: nil, + } + result := rollupOutlierIQR(rfa) + if math.IsNaN(result) { + if !math.IsNaN(resultExpected) { + t.Fatalf("unexpected value; got %v; want %v", result, resultExpected) + } + } else { + if math.IsNaN(resultExpected) { + t.Fatalf("unexpected value; got %v; want %v", result, resultExpected) + } + if result != resultExpected { + t.Fatalf("unexpected value; got %v; want %v", result, resultExpected) + } + } + } + + f([]float64{1, 2, 3, 4, 5}, nan) + f([]float64{1, 2, 3, 4, 7}, nan) + f([]float64{1, 2, 3, 4, 8}, 8) + f([]float64{1, 2, 3, 4, -2}, nan) + f([]float64{1, 2, 3, 4, -3}, -3) +} + func TestRollupIderivDuplicateTimestamps(t *testing.T) { rfa := &rollupFuncArg{ values: []float64{1, 2, 3, 4, 5}, @@ -186,6 +215,9 @@ func testRollupFunc(t *testing.T, funcName string, args []interface{}, vExpected t.Fatalf("unexpected value; got %v; want %v", v, vExpected) } } else { + if math.IsNaN(v) { + t.Fatalf("unexpected value; got %v want %v", v, vExpected) + } eps := math.Abs(v - vExpected) if 
eps > 1e-14 { t.Fatalf("unexpected value; got %v; want %v", v, vExpected) @@ -514,6 +546,7 @@ func TestRollupNewRollupFuncSuccess(t *testing.T) { f("increase", 398) f("increase_prometheus", 275) f("irate", 0) + f("outlier_iqr_over_time", nan) f("rate", 2200) f("resets", 5) f("range_over_time", 111) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 36428ff5e..ff2677fa2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -44,6 +44,7 @@ The sandbox cluster installation is running under the constant load generated by * FEATURE: `vmselect`: expose `vm_memory_intensive_queries_total` counter metric which gets increased each time `-search.logQueryMemoryUsage` memory limit is exceeded by a query. This metric should help to identify expensive and heavy queries without inspecting the logs. * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [drop_empty_series()](https://docs.victoriametrics.com/MetricsQL.html#drop_empty_series) function, which can be used for filtering out empty series before performing additional calculations as shown in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071). * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [labels_equal()](https://docs.victoriametrics.com/MetricsQL.html#labels_equal) function, which can be used for searching series with identical values for the given labels. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5148). +* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [`outlier_iqr_over_time(m[d])`](https://docs.victoriametrics.com/MetricsQL.html#outlier_iqr_over_time) and [`outliers_iqr(q)`](https://docs.victoriametrics.com/MetricsQL.html#outliers_iqr) functions, which allow detecting anomalies in samples and series using [Interquartile range method](https://en.wikipedia.org/wiki/Interquartile_range). 
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `eval_alignment` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups), it will align group query requests timestamp with interval like `datasource.queryTimeAlignment` did. This also means that `datasource.queryTimeAlignment` command-line flag becomes deprecated now and will have no effect if configured. If `datasource.queryTimeAlignment` was set to `false` before, then `eval_alignment` has to be set to `false` explicitly under group. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5049). diff --git a/docs/MetricsQL.md b/docs/MetricsQL.md index 876daddd2..5be2454a9 100644 --- a/docs/MetricsQL.md +++ b/docs/MetricsQL.md @@ -532,7 +532,7 @@ See also [duration_over_time](#duration_over_time) and [lag](#lag). `mad_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which calculates [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over raw samples on the given lookbehind window `d` per each time series returned from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering). -See also [mad](#mad) and [range_mad](#range_mad). +See also [mad](#mad), [range_mad](#range_mad) and [outlier_iqr_over_time](#outlier_iqr_over_time). #### max_over_time @@ -562,6 +562,18 @@ This function is supported by PromQL. See also [tmin_over_time](#tmin_over_time) for raw samples on the given lookbehind window `d`. It is calculated individually per each time series returned from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering). It is expected that raw sample values are discrete. 
+#### outlier_iqr_over_time + +`outlier_iqr_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which returns the last sample on the given lookbehind window `d` +if its value is either smaller than the `q25-1.5*iqr` or bigger than `q75+1.5*iqr` where: +- `iqr` is an [Interquartile range](https://en.wikipedia.org/wiki/Interquartile_range) over raw samples on the lookbehind window `d` +- `q25` and `q75` are 25th and 75th [percentiles](https://en.wikipedia.org/wiki/Percentile) over raw samples on the lookbehind window `d`. + +The `outlier_iqr_over_time()` is useful for detecting anomalies in gauge values based on the previous history of values. +For example, `outlier_iqr_over_time(memory_usage_bytes[1h])` triggers when `memory_usage_bytes` suddenly goes outside the usual value range for the last hour. + +See also [outliers_iqr](#outliers_iqr). + #### predict_linear `predict_linear(series_selector[d], t)` is a [rollup function](#rollup-functions), which calculates the value `t` seconds in the future using @@ -866,7 +878,7 @@ from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.ht Metric names are stripped from the resulting rollups. Add [keep_metric_names](#keep_metric_names) modifier in order to keep metric names. -See also [zscore](#zscore) and [range_trim_zscore](#range_trim_zscore). +See also [zscore](#zscore), [range_trim_zscore](#range_trim_zscore) and [outlier_iqr_over_time](#outlier_iqr_over_time). ### Transform functions @@ -1858,20 +1870,33 @@ This function is supported by PromQL. `mode(q) by (group_labels)` is [aggregate function](#aggregate-functions), which returns [mode](https://en.wikipedia.org/wiki/Mode_(statistics)) per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp. 
+#### outliers_iqr + +`outliers_iqr(q)` is [aggregate function](#aggregate-functions), which returns time series from `q` with at least a single point +outside [Interquartile range outlier bounds](https://en.wikipedia.org/wiki/Interquartile_range) `[q25-1.5*iqr .. q75+1.5*iqr]` +compared to other time series at the given point, where: +- `iqr` is an [Interquartile range](https://en.wikipedia.org/wiki/Interquartile_range) calculated independently per each point on the graph across `q` series. +- `q25` and `q75` are 25th and 75th [percentiles](https://en.wikipedia.org/wiki/Percentile) calculated independently per each point on the graph across `q` series. + +The `outliers_iqr()` is useful for detecting anomalous series in the group of series. For example, `outliers_iqr(temperature) by (country)` returns +per-country series with anomalous outlier values compared to the rest of per-country series. + +See also [outliers_mad](#outliers_mad), [outliersk](#outliersk) and [outlier_iqr_over_time](#outlier_iqr_over_time). + #### outliers_mad `outliers_mad(tolerance, q)` is [aggregate function](#aggregate-functions), which returns time series from `q` with at least a single point outside [Median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) (aka MAD) multiplied by `tolerance`. E.g. it returns time series with at least a single point below `median(q) - mad(q)` or a single point above `median(q) + mad(q)`. -See also [outliersk](#outliersk) and [mad](#mad). +See also [outliers_iqr](#outliers_iqr), [outliersk](#outliersk) and [mad](#mad). #### outliersk `outliersk(k, q)` is [aggregate function](#aggregate-functions), which returns up to `k` time series with the biggest standard deviation (aka outliers) out of time series returned by `q`. -See also [outliers_mad](#outliers_mad). +See also [outliers_iqr](#outliers_iqr) and [outliers_mad](#outliers_mad). #### quantile @@ -1991,7 +2016,7 @@ See also [bottomk_min](#bottomk_min). 
per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp. This function is useful for detecting anomalies in the group of related time series. -See also [zscore_over_time](#zscore_over_time) and [range_trim_zscore](#range_trim_zscore). +See also [zscore_over_time](#zscore_over_time), [range_trim_zscore](#range_trim_zscore) and [outliers_iqr](#outliers_iqr). ## Subqueries diff --git a/go.mod b/go.mod index 5583663a0..879b59e8f 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( // like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b github.com/VictoriaMetrics/fasthttp v1.2.0 github.com/VictoriaMetrics/metrics v1.24.0 - github.com/VictoriaMetrics/metricsql v0.68.0 + github.com/VictoriaMetrics/metricsql v0.69.0 github.com/aws/aws-sdk-go-v2 v1.22.0 github.com/aws/aws-sdk-go-v2/config v1.20.0 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.12.0 diff --git a/go.sum b/go.sum index 56fa9d5a0..7f1de4b4a 100644 --- a/go.sum +++ b/go.sum @@ -70,8 +70,8 @@ github.com/VictoriaMetrics/fasthttp v1.2.0 h1:nd9Wng4DlNtaI27WlYh5mGXCJOmee/2c2b github.com/VictoriaMetrics/fasthttp v1.2.0/go.mod h1:zv5YSmasAoSyv8sBVexfArzFDIGGTN4TfCKAtAw7IfE= github.com/VictoriaMetrics/metrics v1.24.0 h1:ILavebReOjYctAGY5QU2F9X0MYvkcrG3aEn2RKa1Zkw= github.com/VictoriaMetrics/metrics v1.24.0/go.mod h1:eFT25kvsTidQFHb6U0oa0rTrDRdz4xTYjpL8+UPohys= -github.com/VictoriaMetrics/metricsql v0.68.0 h1:fAzYPjYkEipM/L/+WYbAK/gYuqt5rQHnb3cTY2cN628= -github.com/VictoriaMetrics/metricsql v0.68.0/go.mod h1:k4UaP/+CjuZslIjd+kCigNG9TQmUqh5v0TP/nMEy90I= +github.com/VictoriaMetrics/metricsql v0.69.0 h1:6np68zGOnMiGEJR/rCvywS1gbLGXVrmQC3BKydsbWHw= +github.com/VictoriaMetrics/metricsql v0.69.0/go.mod h1:k4UaP/+CjuZslIjd+kCigNG9TQmUqh5v0TP/nMEy90I= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= github.com/VividCortex/ewma v1.2.0/go.mod 
h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= diff --git a/vendor/github.com/VictoriaMetrics/metricsql/aggr.go b/vendor/github.com/VictoriaMetrics/metricsql/aggr.go index 51bc36a0f..9bc8e9a4a 100644 --- a/vendor/github.com/VictoriaMetrics/metricsql/aggr.go +++ b/vendor/github.com/VictoriaMetrics/metricsql/aggr.go @@ -25,6 +25,7 @@ var aggrFuncs = map[string]bool{ "median": true, "min": true, "mode": true, + "outliers_iqr": true, "outliers_mad": true, "outliersk": true, "quantile": true, diff --git a/vendor/github.com/VictoriaMetrics/metricsql/rollup.go b/vendor/github.com/VictoriaMetrics/metricsql/rollup.go index 99d8f56bc..921367f18 100644 --- a/vendor/github.com/VictoriaMetrics/metricsql/rollup.go +++ b/vendor/github.com/VictoriaMetrics/metricsql/rollup.go @@ -47,6 +47,7 @@ var rollupFuncs = map[string]bool{ "median_over_time": true, "min_over_time": true, "mode_over_time": true, + "outlier_iqr_over_time": true, "predict_linear": true, "present_over_time": true, "quantile_over_time": true, diff --git a/vendor/modules.txt b/vendor/modules.txt index 2d97c8b2c..5021879be 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -99,7 +99,7 @@ github.com/VictoriaMetrics/fasthttp/stackless # github.com/VictoriaMetrics/metrics v1.24.0 ## explicit; go 1.20 github.com/VictoriaMetrics/metrics -# github.com/VictoriaMetrics/metricsql v0.68.0 +# github.com/VictoriaMetrics/metricsql v0.69.0 ## explicit; go 1.13 github.com/VictoriaMetrics/metricsql github.com/VictoriaMetrics/metricsql/binaryop