VictoriaMetrics/lib/streamaggr/quantiles.go

package streamaggr

import (
	"strconv"
	"strings"
	"sync"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
	"github.com/valyala/histogram"
)

// quantilesAggrState calculates output=quantiles, i.e. the given quantiles over the input samples.
type quantilesAggrState struct {
	// m maps an output key (string) to the *quantilesStateValue holding
	// the histogram for that key. Entries are created by pushSamples
	// and removed by appendSeriesForFlush.
	m sync.Map

	// phis is the list of quantile ranks passed to the histogram on every flush.
	// One output sample is emitted per phi for every output key.
	phis []float64
}

// quantilesStateValue is the per-output-key entry stored in quantilesAggrState.m.
//
// mu guards h and deleted.
//
// h accumulates the sample values seen since the entry was created.
// It is obtained via histogram.GetFast in pushSamples and handed back
// via histogram.PutFast once appendSeriesForFlush has read the quantiles.
//
// deleted is set by appendSeriesForFlush after the entry has been removed from the map.
// pushSamples checks it under mu and retries with a fresh entry instead of updating h.
type quantilesStateValue struct {
	mu      sync.Mutex
	h       *histogram.Fast
	deleted bool
}

// newQuantilesAggrState returns an empty aggregation state, which reports
// the quantiles listed in phis.
//
// The phis slice is stored as is, without copying.
func newQuantilesAggrState(phis []float64) *quantilesAggrState {
	var as quantilesAggrState
	as.phis = phis
	return &as
}

// pushSamples adds the value of every sample to the histogram registered
// under the sample's output key, creating the histogram on first use.
//
// It may be called concurrently with appendSeriesForFlush: if the entry picked
// from the map has been flushed in the meantime, the sample is re-applied
// to a fresh entry, so it is accounted for by the next flush.
func (as *quantilesAggrState) pushSamples(samples []pushSample) {
	for idx := range samples {
		sample := &samples[idx]
		key := getOutputKey(sample.key)

		for {
			entry, found := as.m.Load(key)
			if !found {
				// No entry for this key yet. Register a new one.
				hist := histogram.GetFast()
				// Clone the key before it is stored in the map.
				key = strings.Clone(key)
				stored, loaded := as.m.LoadOrStore(key, &quantilesStateValue{
					h: hist,
				})
				if loaded {
					// A concurrent goroutine has registered the entry first.
					// Return the unused histogram to the pool.
					histogram.PutFast(hist)
				}
				// stored is either the just registered entry or the one
				// registered by the concurrent goroutine.
				entry = stored
			}

			sv := entry.(*quantilesStateValue)
			sv.mu.Lock()
			alive := !sv.deleted
			if alive {
				sv.h.Update(sample.value)
			}
			sv.mu.Unlock()

			if alive {
				break
			}
			// The entry has been flushed by a concurrent appendSeriesForFlush call.
			// Look the key up again in order to update a live entry.
		}
	}
}

// appendSeriesForFlush drains every histogram accumulated in as.m and appends
// one output sample per configured phi and per output key to ctx.
//
// Every output sample carries the "quantiles" output name, the timestamp taken
// at the start of the flush and an extra "quantile" label with the formatted phi.
//
// Each map entry is removed before it is read, so samples pushed afterwards
// land in a new entry, which is picked up by the next flush.
func (as *quantilesAggrState) appendSeriesForFlush(ctx *flushCtx) {
	currentTimeMsec := int64(fasttime.UnixTimestamp()) * 1000
	m := &as.m
	phis := as.phis

	// The "quantile" label values depend only on phis, so format and intern them
	// once per flush instead of once per (output key, phi) pair.
	// This relies on bytesutil.InternBytes returning a string, which remains valid
	// after b is reused for the next phi.
	phiStrs := make([]string, len(phis))
	var b []byte
	for i, phi := range phis {
		b = strconv.AppendFloat(b[:0], phi, 'g', -1, 64)
		phiStrs[i] = bytesutil.InternBytes(b)
	}

	// quantiles is reused across map entries in order to avoid per-entry allocations.
	var quantiles []float64
	m.Range(func(k, v interface{}) bool {
		// Atomically delete the entry from the map, so new entry is created for the next flush.
		m.Delete(k)

		sv := v.(*quantilesStateValue)
		sv.mu.Lock()
		quantiles = sv.h.Quantiles(quantiles[:0], phis)
		histogram.PutFast(sv.h)
		// Mark the entry as deleted, so it won't be updated anymore by concurrent pushSamples() calls.
		sv.deleted = true
		sv.mu.Unlock()

		key := k.(string)
		for i, quantile := range quantiles {
			ctx.appendSeriesWithExtraLabel(key, "quantiles", currentTimeMsec, quantile, "quantile", phiStrs[i])
		}
		return true
	})
}
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 06:19:18 +00:00			`package streamaggr`

			`import (`
			`"strconv"`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 00:42:26 +00:00			`"strings"`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 06:19:18 +00:00			`"sync"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"`
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"`
			`"github.com/valyala/histogram"`
			`)`

			`// quantilesAggrState calculates output=quantiles, i.e. the given quantiles over the input samples.`
			`type quantilesAggrState struct {`
			`m sync.Map`

			`phis []float64`
			`}`

			`type quantilesStateValue struct {`
			`mu sync.Mutex`
			`h *histogram.Fast`
			`deleted bool`
			`}`

			`func newQuantilesAggrState(phis []float64) *quantilesAggrState {`
			`return &quantilesAggrState{`
			`phis: phis,`
			`}`
			`}`

lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 00:42:26 +00:00			`func (as *quantilesAggrState) pushSamples(samples []pushSample) {`
			`for i := range samples {`
			`s := &samples[i]`
			`outputKey := getOutputKey(s.key)`

			`again:`
			`v, ok := as.m.Load(outputKey)`
			`if !ok {`
			`// The entry is missing in the map. Try creating it.`
			`h := histogram.GetFast()`
			`v = &quantilesStateValue{`
			`h: h,`
			`}`
			`outputKey = strings.Clone(outputKey)`
			`vNew, loaded := as.m.LoadOrStore(outputKey, v)`
			`if loaded {`
			`// Use the entry created by a concurrent goroutine.`
			`histogram.PutFast(h)`
			`v = vNew`
			`}`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 06:19:18 +00:00			`}`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 00:42:26 +00:00			`sv := v.(*quantilesStateValue)`
			`sv.mu.Lock()`
			`deleted := sv.deleted`
			`if !deleted {`
			`sv.h.Update(s.value)`
			`}`
			`sv.mu.Unlock()`
			`if deleted {`
			`// The entry has been deleted by the concurrent call to appendSeriesForFlush`
			`// Try obtaining and updating the entry again.`
			`goto again`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 06:19:18 +00:00			`}`
			`}`
			`}`

			`func (as *quantilesAggrState) appendSeriesForFlush(ctx *flushCtx) {`
			`currentTimeMsec := int64(fasttime.UnixTimestamp()) * 1000`
			`m := &as.m`
			`phis := as.phis`
			`var quantiles []float64`
			`var b []byte`
			`m.Range(func(k, v interface{}) bool {`
			`// Atomically delete the entry from the map, so new entry is created for the next flush.`
			`m.Delete(k)`

			`sv := v.(*quantilesStateValue)`
			`sv.mu.Lock()`
			`quantiles = sv.h.Quantiles(quantiles[:0], phis)`
			`histogram.PutFast(sv.h)`
			`// Mark the entry as deleted, so it won't be updated anymore by concurrent pushSample() calls.`
			`sv.deleted = true`
			`sv.mu.Unlock()`

			`key := k.(string)`
			`for i, quantile := range quantiles {`
			`b = strconv.AppendFloat(b[:0], phis[i], 'g', -1, 64)`
			`phiStr := bytesutil.InternBytes(b)`
			`ctx.appendSeriesWithExtraLabel(key, "quantiles", currentTimeMsec, quantile, "quantile", phiStr)`
			`}`
			`return true`
			`})`
			`}`