app/vmselect/netstorage: consistently select the sample with the biggest value out of samples with identical timestamps

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3333

This fix is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3620 ,
but doesn't slow down the common case with merging replicated data blocks so significantly.

Benchmark results:

Before the change:

BenchmarkMergeSortBlocks/replicationFactor-1-4         	   13968	     85643 ns/op	 956.53 MB/s	    1700 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-2-4         	   10806	    109171 ns/op	1500.77 MB/s	    2191 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-3-4         	    8887	    130623 ns/op	1881.45 MB/s	    2660 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-4-4         	    7440	    157348 ns/op	2082.52 MB/s	    3174 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-5-4         	    6534	    184473 ns/op	2220.38 MB/s	    3612 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/overlapped-blocks-bestcase-4  	   13419	     85205 ns/op	 961.44 MB/s	    2213 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/overlapped-blocks-worstcase-4 	     579	   1894900 ns/op	  43.23 MB/s	   46760 B/op	       1 allocs/op

After the change:

BenchmarkMergeSortBlocks/replicationFactor-1-4         	   13832	     85298 ns/op	 960.40 MB/s	    1716 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-2-4         	    8833	    134222 ns/op	1220.66 MB/s	    2675 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-3-4         	    6487	    184830 ns/op	1329.65 MB/s	    3636 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-4-4         	    4977	    236318 ns/op	1386.61 MB/s	    4733 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/replicationFactor-5-4         	    4088	    296734 ns/op	1380.36 MB/s	    5761 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/overlapped-blocks-bestcase-4  	   14083	     84067 ns/op	 974.47 MB/s	    2110 B/op	       1 allocs/op
BenchmarkMergeSortBlocks/overlapped-blocks-worstcase-4 	     536	   2043534 ns/op	  40.09 MB/s	   50511 B/op	       1 allocs/op
This commit is contained in:
Aliaksandr Valialkin 2023-01-09 12:57:43 -08:00
parent fe0e199859
commit 1f9d605988
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
3 changed files with 40 additions and 9 deletions

View file

@ -508,18 +508,17 @@ func mergeSortBlocks(dst *Result, sbh sortBlocksHeap, dedupInterval int64) {
}
sbNext := sbh.getNextBlock()
tsNext := sbNext.Timestamps[sbNext.NextIdx]
topTimestamps := top.Timestamps
topNextIdx := top.NextIdx
if n := equalTimestampsPrefix(topTimestamps[topNextIdx:], sbNext.Timestamps[sbNext.NextIdx:]); n > 0 && dedupInterval > 0 {
if n := equalSamplesPrefix(top, sbNext); n > 0 && dedupInterval > 0 {
// Skip n replicated samples at top if deduplication is enabled.
top.NextIdx = topNextIdx + n
} else {
// Copy samples from top to dst with timestamps not exceeding tsNext.
top.NextIdx = topNextIdx + binarySearchTimestamps(topTimestamps[topNextIdx:], tsNext)
dst.Timestamps = append(dst.Timestamps, topTimestamps[topNextIdx:top.NextIdx]...)
top.NextIdx = topNextIdx + binarySearchTimestamps(top.Timestamps[topNextIdx:], tsNext)
dst.Timestamps = append(dst.Timestamps, top.Timestamps[topNextIdx:top.NextIdx]...)
dst.Values = append(dst.Values, top.Values[topNextIdx:top.NextIdx]...)
}
if top.NextIdx < len(topTimestamps) {
if top.NextIdx < len(top.Timestamps) {
heap.Fix(&sbh, 0)
} else {
heap.Pop(&sbh)
@ -535,6 +534,14 @@ func mergeSortBlocks(dst *Result, sbh sortBlocksHeap, dedupInterval int64) {
var dedupsDuringSelect = metrics.NewCounter(`vm_deduplicated_samples_total{type="select"}`)
func equalSamplesPrefix(a, b *sortBlock) int {
n := equalTimestampsPrefix(a.Timestamps[a.NextIdx:], b.Timestamps[b.NextIdx:])
if n == 0 {
return 0
}
return equalValuesPrefix(a.Values[a.NextIdx:a.NextIdx+n], b.Values[b.NextIdx:b.NextIdx+n])
}
func equalTimestampsPrefix(a, b []int64) int {
for i, v := range a {
if i >= len(b) || v != b[i] {
@ -544,6 +551,15 @@ func equalTimestampsPrefix(a, b []int64) int {
return len(a)
}
func equalValuesPrefix(a, b []float64) int {
for i, v := range a {
if i >= len(b) || v != b[i] {
return i
}
}
return len(a)
}
func binarySearchTimestamps(timestamps []int64, ts int64) int {
// The code has been adapted from sort.Search.
n := len(timestamps)

View file

@ -102,21 +102,35 @@ func TestMergeSortBlocks(t *testing.T) {
Values: []float64{9, 4.2, 2.1, 5.2, 42, 6.1},
})
// Multiple blocks with identical timestamps.
// Multiple blocks with identical timestamps and identical values.
f([]*sortBlock{
{
Timestamps: []int64{1, 2, 4, 5},
Values: []float64{9, 5.2, 6.1, 9},
},
{
Timestamps: []int64{1, 2, 4},
Values: []float64{9, 5.2, 6.1},
},
}, 1, &Result{
Timestamps: []int64{1, 2, 4, 5},
Values: []float64{9, 5.2, 6.1, 9},
})
// Multiple blocks with identical timestamps.
f([]*sortBlock{
{
Timestamps: []int64{1, 2, 4, 5},
Values: []float64{9, 5.2, 6.1, 9},
},
{
Timestamps: []int64{1, 2, 4},
Values: []float64{4.2, 2.1, 42},
},
}, 1, &Result{
Timestamps: []int64{1, 2, 4},
Values: []float64{4.2, 2.1, 42},
Timestamps: []int64{1, 2, 4, 5},
Values: []float64{9, 5.2, 42, 9},
})
// Multiple blocks with identical timestamps, disabled deduplication.
f([]*sortBlock{
{

View file

@ -45,6 +45,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): [dockerswarm_sd_configs](https://docs.victoriametrics.com/sd_configs.html#dockerswarm_sd_configs): properly encode `filters` field. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3579).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): fix possible resource leak after hot reload of the updated [consul_sd_configs](https://docs.victoriametrics.com/sd_configs.html#consul_sd_configs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3468).
* BUGFIX: properly return label names starting from uppercase such as `CamelCaseLabel` from [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3566).
* BUGFIX: consistently select the sample with the biggest value out of samples with identical timestamps during querying when the [deduplication](https://docs.victoriametrics.com/#deduplication) is enabled according to [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3333). Previously random samples could be selected during querying.
## [v1.85.3](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.3)