app/vmselect: properly adjust the lower bound for the time range where raw samples must be selected for default_rollup() function

Previously the lower bound could be too small, which could result in missing values at the beginning of the graph for default_rollup() function. This function is automatically applied to all the series selectors if they aren't explicitly wrapped into a rollup function - see https://docs.victoriametrics.com/MetricsQL.html#implicit-query-conversions While at it, properly take into account `-search.minStalenessInterval` command-line flag when adjusting the lower bound for the selected time range. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388
2024-11-21 14:44:00 +00:00 · 2023-12-06 14:15:48 +02:00 · 2023-12-06 14:15:48 +02:00 · 509339bf63
commit 509339bf63
parent 065f5a7f9e
3 changed files with 19 additions and 10 deletions
--- a/app/vmselect/promql/eval.go
+++ b/app/vmselect/promql/eval.go
@ -945,7 +945,7 @@ func evalRollupFuncWithSubquery(qt *querytracer.Tracer, ec *EvalConfig, funcName
 	}

 	ecSQ := copyEvalConfig(ec)
-	ecSQ.Start -= window + maxSilenceInterval + step
+	ecSQ.Start -= window + step + maxSilenceInterval()
 	ecSQ.End += step
 	ecSQ.Step = step
 	ecSQ.MaxPointsPerSeries = *maxPointsSubqueryPerTimeseries
@ -1699,12 +1699,12 @@ func evalRollupFuncNoCache(qt *querytracer.Tracer, ec *EvalConfig, funcName stri
 		return nil, err
 	}

-	// Fetch the remaining part of the result.
+	// Fetch the result.
 	tfss := searchutils.ToTagFilterss(me.LabelFilterss)
 	tfss = searchutils.JoinTagFilterss(tfss, ec.EnforcedTagFilterss)
 	minTimestamp := ec.Start
 	if needSilenceIntervalForRollupFunc(funcName) {
-		minTimestamp -= maxSilenceInterval
+		minTimestamp -= maxSilenceInterval()
 	}
 	if window > ec.Step {
 		minTimestamp -= window
@ -1800,10 +1800,22 @@ func getRollupMemoryLimiter() *memoryLimiter {
 	return &rollupMemoryLimiter
 }

+func maxSilenceInterval() int64 {
+	d := minStalenessInterval.Milliseconds()
+	if d <= 0 {
+		d = 5 * 60 * 1000
+	}
+	return d
+}
+
 func needSilenceIntervalForRollupFunc(funcName string) bool {
-	// All rollup the functions, which do not rely on the previous sample
-	// before the lookbehind window (aka prevValue), do not need silence interval.
+	// All the rollup functions, which do not rely on the previous sample
+	// before the lookbehind window (aka prevValue and realPrevValue), do not need silence interval.
 	switch strings.ToLower(funcName) {
+	case "default_rollup":
+		// The default_rollup implicitly relies on the previous samples in order to fill gaps.
+		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388
+		return true
 	case
 		"absent_over_time",
 		"avg_over_time",
@ -1812,7 +1824,6 @@ func needSilenceIntervalForRollupFunc(funcName string) bool {
 		"count_le_over_time",
 		"count_ne_over_time",
 		"count_over_time",
-		"default_rollup",
 		"first_over_time",
 		"histogram_over_time",
 		"hoeffding_bound_lower",
--- a/app/vmselect/promql/rollup.go
+++ b/app/vmselect/promql/rollup.go
@ -561,9 +561,6 @@ var (
 	inf = math.Inf(1)
 )

-// The maximum interval without previous rows.
-const maxSilenceInterval = 5 * 60 * 1000
-
 type timeseriesMap struct {
 	origin *timeseries
 	h      metrics.Histogram
@ -620,7 +617,7 @@ func (tsm *timeseriesMap) GetOrCreateTimeseries(labelName, labelValue string) *t
 //
 // rc.Timestamps are used as timestamps for dstValues.
 //
-// timestamps must cover time range [rc.Start - rc.Window - maxSilenceInterval ... rc.End].
+// It is expected that timestamps cover the time range [rc.Start - rc.Window ... rc.End].
 //
 // Do cannot be called from concurrent goroutines.
 func (rc *rollupConfig) Do(dstValues []float64, values []float64, timestamps []int64) ([]float64, uint64) {
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -49,6 +49,7 @@ The sandbox cluster installation is running under the constant load generated by
  * `go_scavenge_cpu_seconds_total` - the [counter](https://docs.victoriametrics.com/keyConcepts.html#counter), which shows the total CPU time spent by Go runtime for returning memory to the Operating System.
  * `go_memlimit_bytes` - the value of [GOMEMLIMIT](https://pkg.go.dev/runtime#hdr-Environment_Variables) environment variable.

+* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate values for the first point on the graph for queries, which do not use [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, previously `count(up)` could return lower than expected values for the first point on the graph. This also could result in lower than expected values in the middle of the graph like in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388) when the response caching isn't disabled. The issue has been introduced in [v1.95.0](https://docs.victoriametrics.com/CHANGELOG.html#v1950).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): prevent from `FATAL: cannot flush metainfo` panic when [`-remoteWrite.multitenantURL`](https://docs.victoriametrics.com/vmagent.html#multitenancy) command-line flag is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5357).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly decode zstd-encoded data blocks received via [VictoriaMetrics remote_write protocol](https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol). See [this issue comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5301#issuecomment-1815871992).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly add new labels at `output_relabel_configs` during [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html). Previously this could lead to corrupted labels in output samples. Thanks to @ChengChung for providing [detailed report for this bug](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5402).