VictoriaMetrics/lib/streamaggr/deduplicator_test.go

package streamaggr

import (
	"sync"
	"testing"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)

func TestDeduplicator(t *testing.T) {
	var tssResult []prompbmarshal.TimeSeries
	var tssResultLock sync.Mutex
	pushFunc := func(tss []prompbmarshal.TimeSeries) {
		tssResultLock.Lock()
		tssResult = appendClonedTimeseries(tssResult, tss)
		tssResultLock.Unlock()
	}

	offsetMsecs := time.Now().Add(time.Minute).UnixMilli()
	tss := prompbmarshal.MustParsePromMetrics(`
foo{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 123
bar{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 34.54
x 8943 1
baz_aaa_aaa_fdd{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} -34.34
x 90984
x 433 1
asfjkldsf{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 12322
foo{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 894
baz_aaa_aaa_fdd{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} -2.3
`, offsetMsecs)

	dedupInterval := time.Hour
	d := NewDeduplicator(pushFunc, true, dedupInterval, []string{"node", "instance"}, "global")
	for i := 0; i < 10; i++ {
		d.Push(tss)
	}

	d.flush(pushFunc)
	d.MustStop()

	result := timeSeriessToString(tssResult)
	resultExpected := `asfjkldsf{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 12322
bar{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 34.54
baz_aaa_aaa_fdd{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} -2.3
foo{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 894
x 8943
`
	if result != resultExpected {
		t.Fatalf("unexpected result; got\n%s\nwant\n%s", result, resultExpected)
	}
}
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`package streamaggr`

			`import (`
			`"sync"`
			`"testing"`
			`"time"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"`
			`)`

			`func TestDeduplicator(t *testing.T) {`
			`var tssResult []prompbmarshal.TimeSeries`
			`var tssResultLock sync.Mutex`
			`pushFunc := func(tss []prompbmarshal.TimeSeries) {`
			`tssResultLock.Lock()`
			`tssResult = appendClonedTimeseries(tssResult, tss)`
			`tssResultLock.Unlock()`
			`}`

lib/streamaggr: added aggregation windows (#6314) ### Describe Your Changes By default, stream aggregation and deduplication stores a single state per each aggregation output result. The data for each aggregator is flushed independently once per aggregation interval. But there's no guarantee that incoming samples with timestamps close to the aggregation interval's end will get into it. For example, when aggregating with `interval: 1m` a data sample with timestamp 1739473078 (18:57:59) can fall into aggregation round `18:58:00` or `18:59:00`. It depends on network lag, load, clock synchronization, etc. In most scenarios it doesn't impact aggregation or deduplication results, which are consistent within margin of error. But for metrics represented as a collection of series, like [histograms](https://docs.victoriametrics.com/keyconcepts/#histogram), such inaccuracy leads to invalid aggregation results. For this case, streaming aggregation and deduplication support mode with aggregation windows for current and previous state. With this mode, flush doesn't happen immediately but is shifted by a calculated samples lag that improves correctness for delayed data. Enabling of this mode has increased resource usage: memory usage is expected to double as aggregation will store two states instead of one. However, this significantly improves accuracy of calculations. Aggregation windows can be enabled via the following settings: - `-streamAggr.enableWindows` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/). At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr.enableWindows` flag can be specified individually per each `-remoteWrite.url`. If one of these flags is set, then all aggregators will be using fixed windows. In conjunction with `-remoteWrite.streamAggr.dedupInterval` or `-streamAggr.dedupInterval` fixed aggregation windows are enabled on deduplicator as well. - `enable_windows` option in [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It allows enabling aggregation windows for a specific aggregator. ### Checklist The following checks are mandatory: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commit c8fc90366944fe07115c3571b9f7cb7864b05886) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2025-02-19 12:19:33 +00:00			`offsetMsecs := time.Now().Add(time.Minute).UnixMilli()`
app/vmagent/remotewrite,lib/streamaggr: re-use common code in tests after 879771808b7f7fe6bd3020957fb6835a02424846 - Export streamaggr.LoadFromData() function, so it could be used in tests outside the lib/streamaggr package. This allows removing a hack with creation of temporary files at TestRemoteWriteContext_TryPush_ImmutableTimeseries. - Move common code for mustParsePromMetrics() function into lib/prompbmarshal package, so it could be used in tests for building []prompbmarshal.TimeSeries from string. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6205 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6206 2024-07-03 13:10:09 +00:00			tss := prompbmarshal.MustParsePromMetrics(`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`foo{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 123`
			`bar{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 34.54`
lib/streamaggr: added aggregation windows (#6314) ### Describe Your Changes By default, stream aggregation and deduplication stores a single state per each aggregation output result. The data for each aggregator is flushed independently once per aggregation interval. But there's no guarantee that incoming samples with timestamps close to the aggregation interval's end will get into it. For example, when aggregating with `interval: 1m` a data sample with timestamp 1739473078 (18:57:59) can fall into aggregation round `18:58:00` or `18:59:00`. It depends on network lag, load, clock synchronization, etc. In most scenarios it doesn't impact aggregation or deduplication results, which are consistent within margin of error. But for metrics represented as a collection of series, like [histograms](https://docs.victoriametrics.com/keyconcepts/#histogram), such inaccuracy leads to invalid aggregation results. For this case, streaming aggregation and deduplication support mode with aggregation windows for current and previous state. With this mode, flush doesn't happen immediately but is shifted by a calculated samples lag that improves correctness for delayed data. Enabling of this mode has increased resource usage: memory usage is expected to double as aggregation will store two states instead of one. However, this significantly improves accuracy of calculations. Aggregation windows can be enabled via the following settings: - `-streamAggr.enableWindows` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/). At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr.enableWindows` flag can be specified individually per each `-remoteWrite.url`. If one of these flags is set, then all aggregators will be using fixed windows. In conjunction with `-remoteWrite.streamAggr.dedupInterval` or `-streamAggr.dedupInterval` fixed aggregation windows are enabled on deduplicator as well. - `enable_windows` option in [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It allows enabling aggregation windows for a specific aggregator. ### Checklist The following checks are mandatory: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commit c8fc90366944fe07115c3571b9f7cb7864b05886) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2025-02-19 12:19:33 +00:00			`x 8943 1`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`baz_aaa_aaa_fdd{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} -34.34`
lib/streamaggr: added aggregation windows (#6314) ### Describe Your Changes By default, stream aggregation and deduplication stores a single state per each aggregation output result. The data for each aggregator is flushed independently once per aggregation interval. But there's no guarantee that incoming samples with timestamps close to the aggregation interval's end will get into it. For example, when aggregating with `interval: 1m` a data sample with timestamp 1739473078 (18:57:59) can fall into aggregation round `18:58:00` or `18:59:00`. It depends on network lag, load, clock synchronization, etc. In most scenarios it doesn't impact aggregation or deduplication results, which are consistent within margin of error. But for metrics represented as a collection of series, like [histograms](https://docs.victoriametrics.com/keyconcepts/#histogram), such inaccuracy leads to invalid aggregation results. For this case, streaming aggregation and deduplication support mode with aggregation windows for current and previous state. With this mode, flush doesn't happen immediately but is shifted by a calculated samples lag that improves correctness for delayed data. Enabling of this mode has increased resource usage: memory usage is expected to double as aggregation will store two states instead of one. However, this significantly improves accuracy of calculations. Aggregation windows can be enabled via the following settings: - `-streamAggr.enableWindows` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/). At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr.enableWindows` flag can be specified individually per each `-remoteWrite.url`. If one of these flags is set, then all aggregators will be using fixed windows. In conjunction with `-remoteWrite.streamAggr.dedupInterval` or `-streamAggr.dedupInterval` fixed aggregation windows are enabled on deduplicator as well. - `enable_windows` option in [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It allows enabling aggregation windows for a specific aggregator. ### Checklist The following checks are mandatory: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commit c8fc90366944fe07115c3571b9f7cb7864b05886) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2025-02-19 12:19:33 +00:00			`x 90984`
			`x 433 1`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`asfjkldsf{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 12322`
			`foo{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} 894`
			`baz_aaa_aaa_fdd{instance="x",job="aaa",pod="sdfd-dfdfdfs",node="aosijjewrerfd",namespace="asdff",container="ohohffd"} -2.3`
app/vmagent/remotewrite,lib/streamaggr: re-use common code in tests after 879771808b7f7fe6bd3020957fb6835a02424846 - Export streamaggr.LoadFromData() function, so it could be used in tests outside the lib/streamaggr package. This allows removing a hack with creation of temporary files at TestRemoteWriteContext_TryPush_ImmutableTimeseries. - Move common code for mustParsePromMetrics() function into lib/prompbmarshal package, so it could be used in tests for building []prompbmarshal.TimeSeries from string. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6205 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6206 2024-07-03 13:10:09 +00:00			`, offsetMsecs)
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00
lib/streamaggr: added aggregation windows (#6314) ### Describe Your Changes By default, stream aggregation and deduplication stores a single state per each aggregation output result. The data for each aggregator is flushed independently once per aggregation interval. But there's no guarantee that incoming samples with timestamps close to the aggregation interval's end will get into it. For example, when aggregating with `interval: 1m` a data sample with timestamp 1739473078 (18:57:59) can fall into aggregation round `18:58:00` or `18:59:00`. It depends on network lag, load, clock synchronization, etc. In most scenarios it doesn't impact aggregation or deduplication results, which are consistent within margin of error. But for metrics represented as a collection of series, like [histograms](https://docs.victoriametrics.com/keyconcepts/#histogram), such inaccuracy leads to invalid aggregation results. For this case, streaming aggregation and deduplication support mode with aggregation windows for current and previous state. With this mode, flush doesn't happen immediately but is shifted by a calculated samples lag that improves correctness for delayed data. Enabling of this mode has increased resource usage: memory usage is expected to double as aggregation will store two states instead of one. However, this significantly improves accuracy of calculations. Aggregation windows can be enabled via the following settings: - `-streamAggr.enableWindows` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/). At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr.enableWindows` flag can be specified individually per each `-remoteWrite.url`. If one of these flags is set, then all aggregators will be using fixed windows. In conjunction with `-remoteWrite.streamAggr.dedupInterval` or `-streamAggr.dedupInterval` fixed aggregation windows are enabled on deduplicator as well. - `enable_windows` option in [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It allows enabling aggregation windows for a specific aggregator. ### Checklist The following checks are mandatory: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commit c8fc90366944fe07115c3571b9f7cb7864b05886) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2025-02-19 12:19:33 +00:00			`dedupInterval := time.Hour`
			`d := NewDeduplicator(pushFunc, true, dedupInterval, []string{"node", "instance"}, "global")`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`for i := 0; i < 10; i++ {`
			`d.Push(tss)`
			`}`
lib/streamaggr: added aggregation windows (#6314) ### Describe Your Changes By default, stream aggregation and deduplication stores a single state per each aggregation output result. The data for each aggregator is flushed independently once per aggregation interval. But there's no guarantee that incoming samples with timestamps close to the aggregation interval's end will get into it. For example, when aggregating with `interval: 1m` a data sample with timestamp 1739473078 (18:57:59) can fall into aggregation round `18:58:00` or `18:59:00`. It depends on network lag, load, clock synchronization, etc. In most scenarios it doesn't impact aggregation or deduplication results, which are consistent within margin of error. But for metrics represented as a collection of series, like [histograms](https://docs.victoriametrics.com/keyconcepts/#histogram), such inaccuracy leads to invalid aggregation results. For this case, streaming aggregation and deduplication support mode with aggregation windows for current and previous state. With this mode, flush doesn't happen immediately but is shifted by a calculated samples lag that improves correctness for delayed data. Enabling of this mode has increased resource usage: memory usage is expected to double as aggregation will store two states instead of one. However, this significantly improves accuracy of calculations. Aggregation windows can be enabled via the following settings: - `-streamAggr.enableWindows` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and [vmagent](https://docs.victoriametrics.com/vmagent/). At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr.enableWindows` flag can be specified individually per each `-remoteWrite.url`. If one of these flags is set, then all aggregators will be using fixed windows. In conjunction with `-remoteWrite.streamAggr.dedupInterval` or `-streamAggr.dedupInterval` fixed aggregation windows are enabled on deduplicator as well. - `enable_windows` option in [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It allows enabling aggregation windows for a specific aggregator. ### Checklist The following checks are mandatory: - [ ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commit c8fc90366944fe07115c3571b9f7cb7864b05886) Signed-off-by: hagen1778 <roman@victoriametrics.com> 2025-02-19 12:19:33 +00:00
			`d.flush(pushFunc)`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`d.MustStop()`

			`result := timeSeriessToString(tssResult)`
app/{vmagent,vminsert}: add `-streamAggr.dropInputSamples` command-line flag for dropping the specified labels from input samples before deduplication and streaming aggregation 2024-03-05 00:13:21 +00:00			resultExpected := `asfjkldsf{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 12322
			`bar{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 34.54`
			`baz_aaa_aaa_fdd{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} -2.3`
			`foo{container="ohohffd",job="aaa",namespace="asdff",pod="sdfd-dfdfdfs"} 894`
lib/streamaggr: follow-up for 15e33d56f17522e80e9a4fbd86e17193e4c24c0f - Properly set pushSample.timestamp when flushing de-duplicated samples to stream aggregation This is needed for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5931 - Re-classify this change as feature instead of bugfix at docs/CHANGELOG.md - Verify de-duplication logic for samples with different timestamps Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5643 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5939 2024-03-17 19:37:14 +00:00			`x 8943`
app/{vminsert,vmagent}: allow using -streamAggr.dedupInterval without -streamAggr.config This allows performing online de-duplication of incoming samples 2024-03-04 22:45:22 +00:00			`
			`if result != resultExpected {`
			`t.Fatalf("unexpected result; got\n%s\nwant\n%s", result, resultExpected)`
			`}`
			`}`