app/vmselect: quantile func compatiblity with Prometheus (#1646)

* app/vmselect: `quantile` func compatiblity with Prometheus

The `quantile` func was previously calculated by https://github.com/valyala/histogram
package. The result of such calculation was always the closest real value to
requested quantile. While in Prometheus implementation interpolation is used.
Such difference may result into discrepancy in output between Prometheus and
VictoriaMetrics.

This commit adds a Prometheus-like `quantile` function. It also used by other
functions which depend on it, such as `quantiles`, `quantile_over_time`, `median` etc.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1625

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* app/vmselect: `quantile` review fixes

* quantile functions were split into multiple to provide
different API for already sorted data;
* float64sPool is used for reducing allocations. Items in pool may have
different sizes, but defining a new pool was complicates due to name collisions;

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2021-09-27 18:02:41 +03:00 committed by GitHub
parent 80b0b92d2f
commit 526dd93b32
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 116 additions and 63 deletions

View file

@ -2,16 +2,15 @@ package promql
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/histogram"
"math"
"sort"
"strconv"
"strings"
)
var aggrFuncs = map[string]aggrFunc{
@ -801,15 +800,71 @@ func avgValue(values []float64) float64 {
}
func medianValue(values []float64) float64 {
h := histogram.GetFast()
for _, v := range values {
if !math.IsNaN(v) {
h.Update(v)
}
return quantile(0.5, values)
}
// quantiles calculates the given phis from originValues
// without modifying originValues
func quantiles(phis []float64, originValues []float64) []float64 {
a := float64sPool.Get().(*float64s)
a.A = prepareForQuantileFloat64(a.A[:0], originValues)
res := quantilesSorted(phis, a.A)
float64sPool.Put(a)
return res
}
func quantilesSorted(phis []float64, values []float64) []float64 {
res := make([]float64, len(phis))
for i, phi := range phis {
res[i] = quantileSorted(phi, values)
}
value := h.Quantile(0.5)
histogram.PutFast(h)
return value
return res
}
// quantile calculates the given phi from originValues
// without modifying originValues
func quantile(phi float64, originValues []float64) float64 {
a := float64sPool.Get().(*float64s)
a.A = prepareForQuantileFloat64(a.A[:0], originValues)
res := quantileSorted(phi, a.A)
float64sPool.Put(a)
return res
}
// prepareForQuantileFloat64 copies items from src
// to dst but removes NaNs and sorts the dst
func prepareForQuantileFloat64(dst, src []float64) []float64 {
for _, v := range src {
if math.IsNaN(v) {
continue
}
dst = append(dst, v)
}
sort.Float64s(dst)
return dst
}
// quantileSorted calculates the given quantile of a sorted list of values.
// It is expected that values won't contain NaN items.
// The implementation mimics Prometheus implementation for compatibility's sake.
func quantileSorted(phi float64, values []float64) float64 {
if len(values) == 0 || math.IsNaN(phi) {
return nan
}
if phi < 0 {
return math.Inf(-1)
}
if phi > 1 {
return math.Inf(+1)
}
n := float64(len(values))
rank := phi * (n - 1)
lowerIndex := math.Max(0, math.Floor(rank))
upperIndex := math.Min(n-1, lowerIndex+1)
weight := rank - math.Floor(rank)
return values[int(lowerIndex)]*(1-weight) + values[int(upperIndex)]*weight
}
func aggrFuncMAD(tss []*timeseries) []*timeseries {
@ -987,22 +1042,25 @@ func aggrFuncQuantiles(afa *aggrFuncArg) ([]*timeseries, error) {
ts.MetricName.AddTag(dstLabel, fmt.Sprintf("%g", phis[j]))
tssDst[j] = ts
}
h := histogram.GetFast()
defer histogram.PutFast(h)
var qs []float64
values := float64sPool.Get().(*float64s)
for n := range tss[0].Values {
h.Reset()
values.A = values.A[:0]
for j := range tss {
v := tss[j].Values[n]
if !math.IsNaN(v) {
h.Update(v)
if math.IsNaN(v) {
continue
}
values.A = append(values.A, v)
}
qs = h.Quantiles(qs[:0], phis)
sort.Float64s(values.A)
qs = quantilesSorted(phis, values.A)
for j := range tssDst {
tssDst[j].Values[n] = qs[j]
}
}
float64sPool.Put(values)
return tssDst
}
return aggrFuncExt(afe, argOrig, &afa.ae.Modifier, afa.ae.Limit, false)
@ -1034,18 +1092,19 @@ func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
func newAggrQuantileFunc(phis []float64) func(tss []*timeseries, modifier *metricsql.ModifierExpr) []*timeseries {
return func(tss []*timeseries, modifier *metricsql.ModifierExpr) []*timeseries {
dst := tss[0]
h := histogram.GetFast()
defer histogram.PutFast(h)
var values []float64
for n := range dst.Values {
h.Reset()
values = values[:0]
for j := range tss {
v := tss[j].Values[n]
if !math.IsNaN(v) {
h.Update(v)
if math.IsNaN(v) {
continue
}
values = append(values, v)
}
phi := phis[n]
dst.Values[n] = h.Quantile(phi)
sort.Float64s(values)
dst.Values[n] = quantileSorted(phi, values)
}
tss[0] = dst
return tss[:1]

View file

@ -1,6 +1,7 @@
package promql
import (
"math"
"testing"
"time"
@ -4620,7 +4621,7 @@ func TestExecSuccess(t *testing.T) {
q := `quantile_over_time(0.9, label_set(round(rand(0), 0.01), "__name__", "foo", "xx", "yy")[200s:5s])`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{0.89, 0.89, 0.95, 0.87, 0.92, 0.89},
Values: []float64{0.893, 0.892, 0.9510000000000001, 0.8730000000000001, 0.9250000000000002, 0.891},
Timestamps: timestampsExpected,
}
r.MetricName.MetricGroup = []byte("foo")
@ -4643,7 +4644,7 @@ func TestExecSuccess(t *testing.T) {
)`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{0.47, 0.57, 0.49, 0.54, 0.56, 0.53},
Values: []float64{0.46499999999999997, 0.57, 0.485, 0.54, 0.555, 0.515},
Timestamps: timestampsExpected,
}
r1.MetricName.MetricGroup = []byte("foo")
@ -4659,7 +4660,7 @@ func TestExecSuccess(t *testing.T) {
}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{0.89, 0.89, 0.95, 0.87, 0.92, 0.89},
Values: []float64{0.893, 0.892, 0.9510000000000001, 0.8730000000000001, 0.9250000000000002, 0.891},
Timestamps: timestampsExpected,
}
r2.MetricName.MetricGroup = []byte("foo")
@ -5278,7 +5279,7 @@ func TestExecSuccess(t *testing.T) {
})
t.Run(`bottomk_median(1)`, func(t *testing.T) {
t.Parallel()
q := `sort(bottomk_median(1, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss")))`
q := `sort(bottomk_median(1, label_set(10, "foo", "bar") or label_set(time()/15, "baz", "sss")))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, nan, nan, nan},
@ -5532,9 +5533,10 @@ func TestExecSuccess(t *testing.T) {
t.Run(`quantile(-2)`, func(t *testing.T) {
t.Parallel()
q := `quantile(-2, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
inf := math.Inf(-1)
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{6.666666666666667, 8, 9.333333333333334, 10, 10, 10},
Values: []float64{inf, inf, inf, inf, inf, inf},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5545,7 +5547,7 @@ func TestExecSuccess(t *testing.T) {
q := `quantile(0.2, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{6.666666666666667, 8, 9.333333333333334, 10, 10, 10},
Values: []float64{7.333333333333334, 8.4, 9.466666666666669, 10.133333333333333, 10.4, 10.666666666666668},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5556,7 +5558,7 @@ func TestExecSuccess(t *testing.T) {
q := `quantile(0.5, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
Values: []float64{8.333333333333334, 9, 9.666666666666668, 10.333333333333332, 11, 11.666666666666668},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5567,7 +5569,7 @@ func TestExecSuccess(t *testing.T) {
q := `sort(quantiles("phi", 0.2, 0.5, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss")))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{6.666666666666667, 8, 9.333333333333334, 10, 10, 10},
Values: []float64{7.333333333333334, 8.4, 9.466666666666669, 10.133333333333333, 10.4, 10.666666666666668},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{{
@ -5576,7 +5578,7 @@ func TestExecSuccess(t *testing.T) {
}}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
Values: []float64{8.333333333333334, 9, 9.666666666666668, 10.333333333333332, 11, 11.666666666666668},
Timestamps: timestampsExpected,
}
r2.MetricName.Tags = []storage.Tag{{
@ -5591,7 +5593,7 @@ func TestExecSuccess(t *testing.T) {
q := `median(label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
Values: []float64{8.333333333333334, 9, 9.666666666666668, 10.333333333333332, 11, 11.666666666666668},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5611,9 +5613,10 @@ func TestExecSuccess(t *testing.T) {
t.Run(`quantile(3)`, func(t *testing.T) {
t.Parallel()
q := `quantile(3, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
inf := math.Inf(+1)
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
Values: []float64{inf, inf, inf, inf, inf, inf},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5725,7 +5728,8 @@ func TestExecSuccess(t *testing.T) {
q := `range_quantile(0.5, time())`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1600, 1600, 1600, 1600, 1600, 1600},
// time() results in [1000 1200 1400 1600 1800 2000]
Values: []float64{1500, 1500, 1500, 1500, 1500, 1500},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -5736,7 +5740,8 @@ func TestExecSuccess(t *testing.T) {
q := `range_median(time())`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1600, 1600, 1600, 1600, 1600, 1600},
// time() results in [1000 1200 1400 1600 1800 2000]
Values: []float64{1500, 1500, 1500, 1500, 1500, 1500},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}

View file

@ -1066,12 +1066,7 @@ func newRollupQuantiles(args []interface{}) (rollupFunc, error) {
// Fast path - only a single value.
return values[0]
}
hf := histogram.GetFast()
for _, v := range values {
hf.Update(v)
}
qs := hf.Quantiles(nil, phis)
histogram.PutFast(hf)
qs := quantiles(phis, values)
idx := rfa.idx
tsm := rfa.tsm
for i, phiStr := range phiStrs {
@ -1102,13 +1097,8 @@ func newRollupQuantile(args []interface{}) (rollupFunc, error) {
// Fast path - only a single value.
return values[0]
}
hf := histogram.GetFast()
for _, v := range values {
hf.Update(v)
}
phi := phis[rfa.idx]
qv := hf.Quantile(phi)
histogram.PutFast(hf)
qv := quantile(phi, values)
return qv
}
return rf, nil

View file

@ -335,14 +335,14 @@ func TestRollupQuantileOverTime(t *testing.T) {
testRollupFunc(t, "quantile_over_time", args, &me, vExpected)
}
f(-123, 12)
f(-0.5, 12)
f(-123, math.Inf(-1))
f(-0.5, math.Inf(-1))
f(0, 12)
f(0.1, 21)
f(0.1, 22.1)
f(0.5, 34)
f(0.9, 99)
f(0.9, 94.50000000000001)
f(1, 123)
f(234, 123)
f(234, math.Inf(+1))
}
func TestRollupPredictLinear(t *testing.T) {

View file

@ -14,7 +14,6 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metricsql"
"github.com/valyala/histogram"
)
var transformFuncsKeepMetricGroup = map[string]bool{
@ -1178,23 +1177,23 @@ func transformRangeQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
}
phi := phis[0]
rvs := args[1]
hf := histogram.GetFast()
var values []float64
for _, ts := range rvs {
hf.Reset()
lastIdx := -1
values := ts.Values
for i, v := range values {
originValues := ts.Values
values = values[:0]
for i, v := range originValues {
if math.IsNaN(v) {
continue
}
hf.Update(v)
values = append(values, v)
lastIdx = i
}
if lastIdx >= 0 {
values[lastIdx] = hf.Quantile(phi)
sort.Float64s(values)
originValues[lastIdx] = quantileSorted(phi, values)
}
}
histogram.PutFast(hf)
setLastValues(rvs)
return rvs, nil
}