app/vmselect/promql: properly handle partial counter resets in rate(), irate(), increase() and remove_resets() functions

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
This commit is contained in:
Aliaksandr Valialkin 2022-06-30 22:39:38 +03:00
parent 1e6b0a1f54
commit fa08220d27
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
5 changed files with 32 additions and 15 deletions

View file

@ -6568,7 +6568,7 @@ func TestExecSuccess(t *testing.T) {
q := `rate((2000-time())[100s:100s])`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{5.5, 4.5, 6.5, 4.5, 2.5, 0.5},
Values: []float64{0, 0, 6.5, 4.5, 2.5, 0.5},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -6579,7 +6579,7 @@ func TestExecSuccess(t *testing.T) {
q := `rate((2000-time())[100s:100s] offset 100s)`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{6, 5, 7.5, 5.5, 3.5, 1.5},
Values: []float64{0, 0, 3.5, 5.5, 3.5, 1.5},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -6590,7 +6590,7 @@ func TestExecSuccess(t *testing.T) {
q := `rate((2000-time())[100s:100s] offset 100s)[:] offset 100s`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{7, 6, 5, 7.5, 5.5, 3.5},
Values: []float64{0, 0, 0, 3.5, 5.5, 3.5},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
@ -6746,7 +6746,7 @@ func TestExecSuccess(t *testing.T) {
})
t.Run(`remove_resets()`, func(t *testing.T) {
t.Parallel()
q := `remove_resets( abs(1500-time()) )`
q := `remove_resets(abs(1500-time()))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{500, 800, 900, 900, 1100, 1300},
@ -6755,6 +6755,20 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`remove_resets(sum)`, func(t *testing.T) {
t.Parallel()
q := `remove_resets(sum(
alias(time(), "full"),
alias(time()/5 < 300, "partial"),
))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1200, 1440, 1680, 1680, 1880, 2080},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`range_avg(time())`, func(t *testing.T) {
t.Parallel()
q := `range_avg(time())`
@ -6945,10 +6959,10 @@ func TestExecSuccess(t *testing.T) {
})
t.Run(`aggr_over_time(single-func)`, func(t *testing.T) {
t.Parallel()
q := `aggr_over_time("increase", rand(0)[:10s])`
q := `round(aggr_over_time("increase", rand(0)[:10s]),0.01)`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{5.465672601448873, 6.642207999066246, 6.8400051805114295, 7.182425481980655, 5.1677922402706, 6.594060518641982},
Values: []float64{5.47, 6.64, 6.84, 7.24, 5.17, 6.59},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{{

View file

@ -704,9 +704,9 @@ func removeCounterResets(values []float64) {
d := v - prevValue
if d < 0 {
if (-d * 8) < prevValue {
// This is likely jitter from `Prometheus HA pairs`.
// Just substitute v with prevValue.
v = prevValue
// This is likely a partial counter reset.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
correction += prevValue - v
} else {
correction += prevValue
}

View file

@ -100,10 +100,11 @@ func TestRemoveCounterResets(t *testing.T) {
timestampsExpected := []int64{0, 1, 2, 3}
testRowsEqual(t, values, timestampsExpected, valuesExpected, timestampsExpected)
// verify how jitter from `Prometheus HA pairs` is handled
values = []float64{100, 95, 120, 140, 137, 50}
// verify how partial counter reset is handled.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
values = []float64{100, 95, 120, 119, 139, 50}
removeCounterResets(values)
valuesExpected = []float64{100, 100, 120, 140, 140, 190}
valuesExpected = []float64{100, 100, 125, 125, 145, 195}
timestampsExpected = []int64{0, 1, 2, 3, 4, 5}
testRowsEqual(t, values, timestampsExpected, valuesExpected, timestampsExpected)
}

View file

@ -2329,9 +2329,9 @@ func removeCounterResetsMaybeNaNs(values []float64) {
d := v - prevValue
if d < 0 {
if (-d * 8) < prevValue {
// This is likely jitter from `Prometheus HA pairs`.
// Just substitute v with prevValue.
v = prevValue
// This is likely a partial counter reset.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
correction += prevValue - v
} else {
correction += prevValue
}

View file

@ -35,6 +35,7 @@ scrape_configs:
* FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`. This improves traces' readability.
* FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return a big number of time series.
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle partial counter resets in [remove_resets](https://docs.victoriametrics.com/MetricsQL.html#remove_resets) function. Now `remove_resets(sum(m))` should return the expected increasing line when some time series matching `m` disappear on the selected time range. Previously such a query would return a horizontal line after the disappeared series.
* FEATURE: expose additional histogram metrics at `http://victoriametrics:8428/metrics`, which may help understanding query workload:
* `vm_rows_read_per_query` - the number of raw samples read per query.
@ -51,6 +52,7 @@ scrape_configs:
{% endraw %}
* BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple gigabytes). Now the max cache size for parsed regexps is limited to a few megabytes.
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle partial counter resets when calculating [rate](https://docs.victoriametrics.com/MetricsQL.html#rate), [irate](https://docs.victoriametrics.com/MetricsQL.html#irate) and [increase](https://docs.victoriametrics.com/MetricsQL.html#increase) functions. Previously these functions could return zero values after partial counter resets until the counter increases to the last value before partial counter reset. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): make sure that [stale markers](https://docs.victoriametrics.com/vmagent.html#prometheus-staleness-markers) are generated with the actual timestamp when an unsuccessful scrape occurs. This should prevent possible time series overlap on scrape target restart in dynamic environments such as Kubernetes.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly reload changed `-promscrape.config` file when `-promscrape.configCheckInterval` option is set. The changed config file wasn't reloaded in this case since [v1.69.0](#v1690). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2786). Thanks to @ttyv for the fix.
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): assume that the response is complete if `-search.denyPartialResponse` is enabled and up to `-replicationFactor - 1` `vmstorage` nodes are unavailable. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1767).