app/vmselect/promql: add range_zscore(q) and range_trim_zscore(z, q) functions

These functions may be useful for dropping outliers at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759
This commit is contained in:
Aliaksandr Valialkin 2023-02-18 16:56:44 -08:00
parent 1030be91ae
commit c86f1f1d1b
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
9 changed files with 116 additions and 14 deletions

View file

@ -6525,6 +6525,28 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r} resultExpected := []netstorage.Result{r}
f(q, resultExpected) f(q, resultExpected)
}) })
t.Run(`range_trim_zscore()`, func(t *testing.T) {
t.Parallel()
q := `range_trim_zscore(0.9, time())`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{nan, 1200, 1400, 1600, 1800, nan},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`range_zscore()`, func(t *testing.T) {
t.Parallel()
q := `round(range_zscore(time()), 0.1)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{-1.5, -0.9, -0.3, 0.3, 0.9, 1.5},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`range_quantile(0.5)`, func(t *testing.T) { t.Run(`range_quantile(0.5)`, func(t *testing.T) {
t.Parallel() t.Parallel()
q := `range_quantile(0.5, time())` q := `range_quantile(0.5, time())`
@ -8341,8 +8363,10 @@ func TestExecError(t *testing.T) {
f(`running_sum(1, 2)`) f(`running_sum(1, 2)`)
f(`range_mad()`) f(`range_mad()`)
f(`range_sum(1, 2)`) f(`range_sum(1, 2)`)
f(`range_trim_spikes()`)
f(`range_trim_outliers()`) f(`range_trim_outliers()`)
f(`range_trim_spikes()`)
f(`range_trim_zscore()`)
f(`range_zscore()`)
f(`range_first(1, 2)`) f(`range_first(1, 2)`)
f(`range_last(1, 2)`) f(`range_last(1, 2)`)
f(`range_linear_regression(1, 2)`) f(`range_linear_regression(1, 2)`)

View file

@ -99,6 +99,8 @@ var transformFuncs = map[string]transformFunc{
"range_sum": newTransformFuncRange(runningSum), "range_sum": newTransformFuncRange(runningSum),
"range_trim_outliers": transformRangeTrimOutliers, "range_trim_outliers": transformRangeTrimOutliers,
"range_trim_spikes": transformRangeTrimSpikes, "range_trim_spikes": transformRangeTrimSpikes,
"range_trim_zscore": transformRangeTrimZscore,
"range_zscore": transformRangeZscore,
"remove_resets": transformRemoveResets, "remove_resets": transformRemoveResets,
"round": transformRound, "round": transformRound,
"running_avg": newTransformFuncRunning(runningAvg), "running_avg": newTransformFuncRunning(runningAvg),
@ -1277,6 +1279,64 @@ func transformRangeNormalize(tfa *transformFuncArg) ([]*timeseries, error) {
return rvs, nil return rvs, nil
} }
// transformRangeTrimZscore implements range_trim_zscore(z, q).
//
// For every time series in q it replaces with NaN each point whose absolute
// z-score, abs(v - mean) / stddev computed over that series, exceeds abs(z).
// The series are modified in place and returned.
//
// An error is returned if the arguments count check fails or if the first
// argument cannot be obtained via getScalar.
func transformRangeTrimZscore(tfa *transformFuncArg) ([]*timeseries, error) {
	args := tfa.args
	if err := expectTransformArgsNum(args, 2); err != nil {
		return nil, err
	}
	zs, err := getScalar(args[0], 0)
	if err != nil {
		return nil, err
	}
	// The sign of z is irrelevant; the threshold stays at zero when no scalar values are available.
	var threshold float64
	if len(zs) > 0 {
		threshold = math.Abs(zs[0])
	}

	series := args[1]
	for _, ts := range series {
		points := ts.Values
		sd := stddev(points)
		center := mean(points)
		for idx := range points {
			// A NaN score (NaN point or 0/0 for a zero stddev) never compares
			// greater than the threshold, so such points are left untouched.
			score := math.Abs(points[idx]-center) / sd
			if score > threshold {
				points[idx] = nan
			}
		}
	}
	return series, nil
}
// transformRangeZscore implements range_zscore(q).
//
// Every point v of each time series in q is replaced in place with
// (v - mean) / stddev, where mean and stddev are computed over the values
// of that same series. The modified series are returned.
//
// An error is returned if the arguments count check fails.
func transformRangeZscore(tfa *transformFuncArg) ([]*timeseries, error) {
	args := tfa.args
	if err := expectTransformArgsNum(args, 1); err != nil {
		return nil, err
	}

	series := args[0]
	for _, ts := range series {
		points := ts.Values
		// Both statistics must be taken before any point is overwritten.
		sd := stddev(points)
		center := mean(points)
		for idx := range points {
			points[idx] = (points[idx] - center) / sd
		}
	}
	return series, nil
}
// mean returns the arithmetic mean of the non-NaN entries in values.
//
// NaN entries are skipped. If values contains no non-NaN entries
// (this includes an empty slice), the result is NaN, since it is
// computed as 0/0.
func mean(values []float64) float64 {
	total := 0.0
	count := 0
	for i := range values {
		x := values[i]
		if math.IsNaN(x) {
			continue
		}
		total += x
		count++
	}
	return total / float64(count)
}
func transformRangeTrimOutliers(tfa *transformFuncArg) ([]*timeseries, error) { func transformRangeTrimOutliers(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args args := tfa.args
if err := expectTransformArgsNum(args, 2); err != nil { if err := expectTransformArgsNum(args, 2); err != nil {
@ -1290,7 +1350,7 @@ func transformRangeTrimOutliers(tfa *transformFuncArg) ([]*timeseries, error) {
if len(ks) > 0 { if len(ks) > 0 {
k = ks[0] k = ks[0]
} }
// Trim samples v satisfying the `abs(v - range_median(q)) > k*range_mad(q)` // Trim samples satisfying the `abs(v - range_median(q)) > k*range_mad(q)`
rvs := args[1] rvs := args[1]
for _, ts := range rvs { for _, ts := range rvs {
values := ts.Values values := ts.Values

View file

@ -24,7 +24,9 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmalert enterprise](https://docs.victoriametrics.com/vmalert.html): add ability to read alerting and recording rules from S3, GCS or S3-compatible object storage. See [these docs](https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage). * FEATURE: [vmalert enterprise](https://docs.victoriametrics.com/vmalert.html): add ability to read alerting and recording rules from S3, GCS or S3-compatible object storage. See [these docs](https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage).
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `mad_over_time(m[d])` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over raw samples on the lookbehind window `d`. See [this feature request](https://github.com/prometheus/prometheus/issues/5514). * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `mad_over_time(m[d])` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over raw samples on the lookbehind window `d`. See [this feature request](https://github.com/prometheus/prometheus/issues/5514).
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_mad(q)` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over points per each time series returned by `q`. * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_mad(q)` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over points per each time series returned by `q`.
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_outliers(k, q)` function for dropping outliers farther than `k*range_mad(q)` from the `range_median(q)`. This should removing outliers at query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759). * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_zscore(q)` function for calculating [z-score](https://en.wikipedia.org/wiki/Standard_score) over points per each time series returned from `q`.
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_outliers(k, q)` function for dropping outliers located farther than `k*range_mad(q)` from the `range_median(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759).
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_zscore(z, q)` function for dropping outliers located farther than `z*range_stddev(q)` from `range_avg(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show `median` instead of `avg` in graph tooltip and line legend, since `median` is more tolerant against spikes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3706). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show `median` instead of `avg` in graph tooltip and line legend, since `median` is more tolerant against spikes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3706).
* BUGFIX: prevent from possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with small number of CPU cores (1 or 2 CPU cores). The issue has been introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790). * BUGFIX: prevent from possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with small number of CPU cores (1 or 2 CPU cores). The issue has been introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).

View file

@ -809,12 +809,14 @@ See also [min_over_time](#min_over_time).
#### zscore_over_time #### zscore_over_time
`zscore_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which calculates returns [z-score](https://en.wikipedia.org/wiki/Standard_score) `zscore_over_time(series_selector[d])` is a [rollup function](#rollup-functions), which returns [z-score](https://en.wikipedia.org/wiki/Standard_score)
for raw samples on the given lookbehind window `d`. It is calculated independently per each time series returned for raw samples on the given lookbehind window `d`. It is calculated independently per each time series returned
from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering). from the given [series_selector](https://docs.victoriametrics.com/keyConcepts.html#filtering).
Metric names are stripped from the resulting rollups. Add [keep_metric_names](#keep_metric_names) modifier in order to keep metric names. Metric names are stripped from the resulting rollups. Add [keep_metric_names](#keep_metric_names) modifier in order to keep metric names.
See also [zscore](#zscore) and [range_trim_zscore](#range_trim_zscore).
### Transform functions ### Transform functions
@ -1267,18 +1269,28 @@ per each time series returned by `q` on the selected time range.
#### range_trim_outliers #### range_trim_outliers
`range_trim_outliers(k, q)` is a [transform function](#transform-functions), which drops points located farther than `k*range_mad(q)` `range_trim_outliers(k, q)` is a [transform function](#transform-functions), which drops points located farther than `k*range_mad(q)`
from the `range_median(q)`. E.g., it is equivalent to the following query: `q ifnot (abs(q - range_median(q)) > k*range_mad(q))`. from the `range_median(q)`. E.g. it is equivalent to the following query: `q ifnot (abs(q - range_median(q)) > k*range_mad(q))`.
The `phi` must be in the range `[0..1]`, where `0` means `0%` and `1` means `100%`. See also [range_trim_spikes](#range_trim_spikes) and [range_trim_zscore](#range_trim_zscore).
See also [range_trim_outliers](#range_trim_outliers).
#### range_trim_spikes #### range_trim_spikes
`range_trim_spikes(phi, q)` is a [transform function](#transform-functions), which drops `phi` percent of biggest spikes from time series returned by `q`. `range_trim_spikes(phi, q)` is a [transform function](#transform-functions), which drops `phi` percent of biggest spikes from time series returned by `q`.
The `phi` must be in the range `[0..1]`, where `0` means `0%` and `1` means `100%`. The `phi` must be in the range `[0..1]`, where `0` means `0%` and `1` means `100%`.
See also [range_trim_outliers](#range_trim_outliers). See also [range_trim_outliers](#range_trim_outliers) and [range_trim_zscore](#range_trim_zscore).
#### range_trim_zscore
`range_trim_zscore(z, q)` is a [transform function](#transform-functions), which drops points located farther than `z*range_stddev(q)`
from the `range_avg(q)`. E.g. it is equivalent to the following query: `q ifnot (abs(q - range_avg(q)) > z*range_stddev(q))`.
See also [range_trim_outliers](#range_trim_outliers) and [range_trim_spikes](#range_trim_spikes).
#### range_zscore
`range_zscore(q)` is a [transform function](#transform-functions), which calculates [z-score](https://en.wikipedia.org/wiki/Standard_score)
for points returned by `q`, e.g. it is equivalent to the following query: `(q - range_avg(q)) / range_stddev(q)`.
#### remove_resets #### remove_resets
@ -1890,6 +1902,8 @@ See also [bottomk_min](#bottomk_min).
per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp. per each `group_labels` for all the time series returned by `q`. The aggregate is calculated individually per each group of points with the same timestamp.
This function is useful for detecting anomalies in the group of related time series. This function is useful for detecting anomalies in the group of related time series.
See also [zscore_over_time](#zscore_over_time) and [range_trim_zscore](#range_trim_zscore).
## Subqueries ## Subqueries
MetricsQL supports and extends PromQL subqueries. See [this article](https://valyala.medium.com/prometheus-subqueries-in-victoriametrics-9b1492b720b3) for details. MetricsQL supports and extends PromQL subqueries. See [this article](https://valyala.medium.com/prometheus-subqueries-in-victoriametrics-9b1492b720b3) for details.

2
go.mod
View file

@ -12,7 +12,7 @@ require (
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b // like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
github.com/VictoriaMetrics/fasthttp v1.1.0 github.com/VictoriaMetrics/fasthttp v1.1.0
github.com/VictoriaMetrics/metrics v1.23.1 github.com/VictoriaMetrics/metrics v1.23.1
github.com/VictoriaMetrics/metricsql v0.54.0 github.com/VictoriaMetrics/metricsql v0.55.0
github.com/aws/aws-sdk-go-v2 v1.17.4 github.com/aws/aws-sdk-go-v2 v1.17.4
github.com/aws/aws-sdk-go-v2/config v1.18.13 github.com/aws/aws-sdk-go-v2/config v1.18.13
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.53 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.53

4
go.sum
View file

@ -69,8 +69,8 @@ github.com/VictoriaMetrics/fasthttp v1.1.0/go.mod h1:/7DMcogqd+aaD3G3Hg5kFgoFwlR
github.com/VictoriaMetrics/metrics v1.18.1/go.mod h1:ArjwVz7WpgpegX/JpB0zpNF2h2232kErkEnzH1sxMmA= github.com/VictoriaMetrics/metrics v1.18.1/go.mod h1:ArjwVz7WpgpegX/JpB0zpNF2h2232kErkEnzH1sxMmA=
github.com/VictoriaMetrics/metrics v1.23.1 h1:/j8DzeJBxSpL2qSIdqnRFLvQQhbJyJbbEi22yMm7oL0= github.com/VictoriaMetrics/metrics v1.23.1 h1:/j8DzeJBxSpL2qSIdqnRFLvQQhbJyJbbEi22yMm7oL0=
github.com/VictoriaMetrics/metrics v1.23.1/go.mod h1:rAr/llLpEnAdTehiNlUxKgnjcOuROSzpw0GvjpEbvFc= github.com/VictoriaMetrics/metrics v1.23.1/go.mod h1:rAr/llLpEnAdTehiNlUxKgnjcOuROSzpw0GvjpEbvFc=
github.com/VictoriaMetrics/metricsql v0.54.0 h1:dKAIJtWcSPKnMNhRY5MYpqC77ZyHtA1xuDRr1pJuN5Q= github.com/VictoriaMetrics/metricsql v0.55.0 h1:GZMZ1dUKPMhKsSPtVTRHfMChwRZ4KrXBxnSQgr3mjSg=
github.com/VictoriaMetrics/metricsql v0.54.0/go.mod h1:6pP1ZeLVJHqJrHlF6Ij3gmpQIznSsgktEcZgsAWYel0= github.com/VictoriaMetrics/metricsql v0.55.0/go.mod h1:6pP1ZeLVJHqJrHlF6Ij3gmpQIznSsgktEcZgsAWYel0=
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA= github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=
github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow=
github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4= github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4=

View file

@ -393,7 +393,7 @@ func getTransformArgIdxForOptimization(funcName string, args []Expr) int {
case "limit_offset": case "limit_offset":
return 2 return 2
case "buckets_limit", "histogram_quantile", "histogram_share", "range_quantile", case "buckets_limit", "histogram_quantile", "histogram_share", "range_quantile",
"range_trim_outliers", "range_trim_spikes": "range_trim_outliers", "range_trim_spikes", "range_trim_zscore":
return 1 return 1
case "histogram_quantiles": case "histogram_quantiles":
return len(args) - 1 return len(args) - 1

View file

@ -84,6 +84,8 @@ var transformFuncs = map[string]bool{
"range_sum": true, "range_sum": true,
"range_trim_outliers": true, "range_trim_outliers": true,
"range_trim_spikes": true, "range_trim_spikes": true,
"range_trim_zscore": true,
"range_zscore": true,
"remove_resets": true, "remove_resets": true,
"round": true, "round": true,
"running_avg": true, "running_avg": true,

2
vendor/modules.txt vendored
View file

@ -71,7 +71,7 @@ github.com/VictoriaMetrics/fasthttp/stackless
# github.com/VictoriaMetrics/metrics v1.23.1 # github.com/VictoriaMetrics/metrics v1.23.1
## explicit; go 1.15 ## explicit; go 1.15
github.com/VictoriaMetrics/metrics github.com/VictoriaMetrics/metrics
# github.com/VictoriaMetrics/metricsql v0.54.0 # github.com/VictoriaMetrics/metricsql v0.55.0
## explicit; go 1.13 ## explicit; go 1.13
github.com/VictoriaMetrics/metricsql github.com/VictoriaMetrics/metricsql
github.com/VictoriaMetrics/metricsql/binaryop github.com/VictoriaMetrics/metricsql/binaryop