app/vmselect/promql: properly calculate histogram_quantile over unexpected le buckets

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2819
2025-03-11 15:34:56 +00:00 · 2022-07-05 13:18:55 +03:00 · 2022-07-05 13:18:55 +03:00 · 8516670582
commit 8516670582
parent 82581ab728
4 changed files with 62 additions and 11 deletions
--- a/app/vmselect/promql/exec_test.go
+++ b/app/vmselect/promql/exec_test.go
@ -3849,14 +3849,14 @@ func TestExecSuccess(t *testing.T) {
 	})
 	t.Run(`histogram_quantile(nan-bucket-count-some)`, func(t *testing.T) {
 		t.Parallel()
-		q := `histogram_quantile(0.6,
+		q := `round(histogram_quantile(0.6,
 			label_set(90, "foo", "bar", "le", "10")
 			or label_set(NaN, "foo", "bar", "le", "30")
 			or label_set(300, "foo", "bar", "le", "+Inf")
-		)`
+		),0.01)`
 		r := netstorage.Result{
 			MetricName: metricNameExpected,
-			Values:     []float64{30, 30, 30, 30, 30, 30},
+			Values:     []float64{18.57, 18.57, 18.57, 18.57, 18.57, 18.57},
 			Timestamps: timestampsExpected,
 		}
 		r.MetricName.Tags = []storage.Tag{{
--- a/app/vmselect/promql/transform.go
+++ b/app/vmselect/promql/transform.go
@ -994,16 +994,32 @@ func groupLeTimeseries(tss []*timeseries) map[string][]leTimeseries {
 }

 func fixBrokenBuckets(i int, xss []leTimeseries) {
-	// Fix broken buckets.
-	// They are already sorted by le, so their values must be in ascending order,
+	// Buckets are already sorted by le, so their values must be in ascending order,
 	// since the next bucket includes all the previous buckets.
-	vPrev := float64(0)
-	for _, xs := range xss {
-		v := xs.ts.Values[i]
-		if v < vPrev || math.IsNaN(v) {
-			xs.ts.Values[i] = vPrev
+	// If the next bucket has lower value than the current bucket,
+	// then the current bucket must be substituted with the next bucket value.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2819
+	if len(xss) < 2 {
+		return
+	}
+	for j := len(xss)-1; j>=0; j-- {
+		v := xss[j].ts.Values[i]
+		if !math.IsNaN(v) {
+			j++
+			for j < len(xss) {
+				xss[j].ts.Values[i] = v
+				j++
+			}
+			break
+		}
+	}
+	vNext := xss[len(xss)-1].ts.Values[i]
+	for j := len(xss)-2; j >= 0; j-- {
+		v := xss[j].ts.Values[i]
+		if math.IsNaN(v) || v > vNext {
+			xss[j].ts.Values[i] = vNext
 		} else {
-			vPrev = v
+			vNext = v
 		}
 	}
 }
--- a/app/vmselect/promql/transform_test.go
+++ b/app/vmselect/promql/transform_test.go
@ -0,0 +1,34 @@
+package promql
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestFixBrokenBuckets(t *testing.T) {
+	f := func(values, expectedResult []float64) {
+		t.Helper()
+		xss := make([]leTimeseries, len(values))
+		for i, v := range values {
+			xss[i].ts = &timeseries{
+				Values: []float64{v},
+			}
+		}
+		fixBrokenBuckets(0, xss)
+		result := make([]float64, len(values))
+		for i, xs := range xss {
+			result[i] = xs.ts.Values[0]
+		}
+		if !reflect.DeepEqual(result, expectedResult) {
+			t.Fatalf("unexpected result for values=%v\ngot\n%v\nwant\n%v", values, result, expectedResult)
+		}
+	}
+	f(nil, []float64{})
+	f([]float64{1}, []float64{1})
+	f([]float64{1,2}, []float64{1,2})
+	f([]float64{2,1}, []float64{1,1})
+	f([]float64{1,2,3,nan,nan},[]float64{1,2,3,3,3})
+	f([]float64{5,1,2,3,nan}, []float64{1,1,2,3,3})
+	f([]float64{1,5,2,nan,6,3}, []float64{1,2,2,3,3,3})
+	f([]float64{5,10,4,3}, []float64{3,3,3,3})
+}
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -53,6 +53,7 @@ scrape_configs:

 * BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple of gigabytes). Now the max cache size for parsed regexps is limited to a a few megabytes.
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle partial counter resets when calculating [rate](https://docs.victoriametrics.com/MetricsQL.html#rate), [irate](https://docs.victoriametrics.com/MetricsQL.html#irate) and [increase](https://docs.victoriametrics.com/MetricsQL.html#increase) functions. Previously these functions could return zero values after partial counter resets until the counter increases to the last value before partial counter reset. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787).
+* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate [histogram_quantile](https://docs.victoriametrics.com/MetricsQL.html#histogram_quantile) over Prometheus buckets with unexpected values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2819).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): make sure that [stale markers](https://docs.victoriametrics.com/vmagent.html#prometheus-staleness-markers) are generated with the actual timestamp when unsuccessful scrape occurs. This should prevent from possible time series overlap on scrape target restart in dynmaic envirnoments such as Kubernetes.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly reload changed `-promscrape.config` file when `-promscrape.configCheckInterval` option is set. The changed config file wasn't reloaded in this case since [v1.69.0](#v1690). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2786). Thanks to @ttyv for the fix.
 * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): assume that the response is complete if `-search.denyPartialResponse` is enabled and up to `-replicationFactor - 1` `vmstorage` nodes are unavailable. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1767).