VictoriaMetrics/lib/logstorage/stats_uniq_count.go

368 lines
9.2 KiB
Go
Raw Normal View History

2024-04-29 01:20:43 +00:00
package logstorage
import (
"slices"
"strconv"
"unsafe"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
2024-05-07 21:44:12 +00:00
// statsUniqCount holds the configuration of the `uniq_count(...)` stats function.
type statsUniqCount struct {
	// fields is the list of field names passed to uniq_count().
	fields []string

	// containsStar is set when fields contains "*".
	containsStar bool
}
2024-05-07 21:44:12 +00:00
func (su *statsUniqCount) String() string {
return "uniq_count(" + fieldNamesString(su.fields) + ")"
2024-04-29 01:20:43 +00:00
}
2024-05-07 21:44:12 +00:00
func (su *statsUniqCount) neededFields() []string {
2024-04-29 01:20:43 +00:00
return su.fields
}
2024-05-07 21:44:12 +00:00
func (su *statsUniqCount) newStatsProcessor() (statsProcessor, int) {
sup := &statsUniqCountProcessor{
2024-04-29 01:20:43 +00:00
su: su,
m: make(map[string]struct{}),
}
return sup, int(unsafe.Sizeof(*sup))
}
2024-05-07 21:44:12 +00:00
type statsUniqCountProcessor struct {
su *statsUniqCount
2024-04-29 01:20:43 +00:00
m map[string]struct{}
columnValues [][]string
keyBuf []byte
}
2024-05-07 21:44:12 +00:00
func (sup *statsUniqCountProcessor) updateStatsForAllRows(br *blockResult) int {
2024-04-29 01:20:43 +00:00
fields := sup.su.fields
m := sup.m
stateSizeIncrease := 0
2024-05-03 12:03:17 +00:00
if sup.su.containsStar {
2024-04-29 01:20:43 +00:00
// Count unique rows
2024-04-30 21:03:34 +00:00
columns := br.getColumns()
keyBuf := sup.keyBuf[:0]
for i := range br.timestamps {
2024-04-29 01:20:43 +00:00
seenKey := true
for _, c := range columns {
2024-04-30 21:03:34 +00:00
values := c.getValues(br)
2024-04-29 01:20:43 +00:00
if i == 0 || values[i-1] != values[i] {
seenKey = false
break
}
}
if seenKey {
2024-04-30 21:03:34 +00:00
// This key has been already counted.
2024-04-29 01:20:43 +00:00
continue
}
allEmptyValues := true
keyBuf = keyBuf[:0]
for _, c := range columns {
2024-04-30 21:03:34 +00:00
v := c.getValueAtRow(br, i)
2024-04-29 01:20:43 +00:00
if v != "" {
allEmptyValues = false
}
// Put column name into key, since every block can contain different set of columns for '*' selector.
2024-04-30 21:03:34 +00:00
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(c.name))
2024-04-29 01:20:43 +00:00
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
}
if allEmptyValues {
// Do not count empty values
continue
}
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
if len(fields) == 1 {
2024-04-30 21:03:34 +00:00
// Fast path for a single column.
2024-05-04 22:28:01 +00:00
// The unique key is formed as "<is_time> <value>",
2024-05-03 10:54:37 +00:00
// This guarantees that keys do not clash for different column types across blocks.
2024-04-30 21:03:34 +00:00
c := br.getColumnByName(fields[0])
if c.isTime {
// Count unique br.timestamps
timestamps := br.timestamps
keyBuf := sup.keyBuf[:0]
for i, timestamp := range timestamps {
if i > 0 && timestamps[i-1] == timestamps[i] {
// This timestamp has been already counted.
2024-04-29 01:20:43 +00:00
continue
}
2024-04-30 21:03:34 +00:00
keyBuf = append(keyBuf[:0], 1)
keyBuf = encoding.MarshalInt64(keyBuf, timestamp)
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
if c.isConst {
// count unique const values
v := c.encodedValues[0]
if v == "" {
// Do not count empty values
return stateSizeIncrease
}
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
2024-04-30 21:03:34 +00:00
keyBuf = append(keyBuf, v...)
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
if c.valueType == valueTypeDict {
// count unique non-zero c.dictValues
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
for _, v := range c.dictValues {
2024-04-30 21:03:34 +00:00
if v == "" {
// Do not count empty values
2024-04-29 01:20:43 +00:00
continue
}
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
keyBuf = append(keyBuf, v...)
2024-04-30 21:03:34 +00:00
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
2024-04-29 01:20:43 +00:00
}
}
2024-04-30 21:03:34 +00:00
sup.keyBuf = keyBuf
return stateSizeIncrease
}
// Count unique values across encodedValues
2024-05-04 22:28:01 +00:00
values := c.getValues(br)
2024-04-30 21:03:34 +00:00
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
for i, v := range values {
if v == "" {
2024-04-30 21:03:34 +00:00
// Do not count empty values
continue
}
2024-05-04 22:28:01 +00:00
if i > 0 && values[i-1] == v {
2024-04-30 21:03:34 +00:00
// This value has been already counted.
continue
}
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
2024-04-30 21:03:34 +00:00
keyBuf = append(keyBuf, v...)
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
2024-04-29 01:20:43 +00:00
}
2024-04-30 21:03:34 +00:00
keyBuf = sup.keyBuf
2024-04-29 01:20:43 +00:00
return stateSizeIncrease
}
// Slow path for multiple columns.
// Pre-calculate column values for byFields in order to speed up building group key in the loop below.
2024-05-03 09:15:09 +00:00
columnValues := sup.columnValues[:0]
for _, f := range fields {
c := br.getColumnByName(f)
values := c.getValues(br)
columnValues = append(columnValues, values)
}
sup.columnValues = columnValues
2024-04-29 01:20:43 +00:00
2024-04-30 21:03:34 +00:00
keyBuf := sup.keyBuf[:0]
for i := range br.timestamps {
2024-04-29 01:20:43 +00:00
seenKey := true
for _, values := range columnValues {
if i == 0 || values[i-1] != values[i] {
seenKey = false
2024-04-30 21:03:34 +00:00
break
2024-04-29 01:20:43 +00:00
}
}
if seenKey {
continue
}
allEmptyValues := true
keyBuf = keyBuf[:0]
for _, values := range columnValues {
v := values[i]
if v != "" {
allEmptyValues = false
}
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
}
if allEmptyValues {
// Do not count empty values
continue
}
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
2024-05-07 21:44:12 +00:00
func (sup *statsUniqCountProcessor) updateStatsForRow(br *blockResult, rowIdx int) int {
2024-04-29 01:20:43 +00:00
fields := sup.su.fields
m := sup.m
stateSizeIncrease := 0
2024-05-03 12:03:17 +00:00
if sup.su.containsStar {
2024-04-29 01:20:43 +00:00
// Count unique rows
allEmptyValues := true
keyBuf := sup.keyBuf[:0]
2024-04-30 21:03:34 +00:00
for _, c := range br.getColumns() {
v := c.getValueAtRow(br, rowIdx)
2024-04-29 01:20:43 +00:00
if v != "" {
allEmptyValues = false
}
// Put column name into key, since every block can contain different set of columns for '*' selector.
2024-04-30 21:03:34 +00:00
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(c.name))
2024-04-29 01:20:43 +00:00
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
}
sup.keyBuf = keyBuf
if allEmptyValues {
// Do not count empty values
return stateSizeIncrease
}
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
return stateSizeIncrease
}
if len(fields) == 1 {
2024-04-30 21:03:34 +00:00
// Fast path for a single column.
2024-05-04 22:28:01 +00:00
// The unique key is formed as "<is_time> <value>",
2024-05-03 10:54:37 +00:00
// This guarantees that keys do not clash for different column types across blocks.
2024-04-30 21:03:34 +00:00
c := br.getColumnByName(fields[0])
if c.isTime {
// Count unique br.timestamps
keyBuf := sup.keyBuf[:0]
keyBuf = append(keyBuf[:0], 1)
keyBuf = encoding.MarshalInt64(keyBuf, br.timestamps[rowIdx])
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
if c.isConst {
// count unique const values
v := c.encodedValues[0]
2024-04-29 01:20:43 +00:00
if v == "" {
// Do not count empty values
return stateSizeIncrease
}
2024-04-30 21:03:34 +00:00
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
2024-04-30 21:03:34 +00:00
keyBuf = append(keyBuf, v...)
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
2024-04-29 01:20:43 +00:00
}
2024-04-30 21:03:34 +00:00
sup.keyBuf = keyBuf
return stateSizeIncrease
2024-04-29 01:20:43 +00:00
}
2024-04-30 21:03:34 +00:00
if c.valueType == valueTypeDict {
// count unique non-zero c.dictValues
dictIdx := c.encodedValues[rowIdx][0]
2024-05-04 22:28:01 +00:00
v := c.dictValues[dictIdx]
if v == "" {
2024-04-30 21:03:34 +00:00
// Do not count empty values
return stateSizeIncrease
}
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
keyBuf = append(keyBuf, v...)
2024-04-30 21:03:34 +00:00
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
sup.keyBuf = keyBuf
return stateSizeIncrease
}
2024-05-03 10:54:37 +00:00
// Count unique values for the given rowIdx
2024-05-04 22:28:01 +00:00
v := c.getValueAtRow(br, rowIdx)
if v == "" {
2024-04-30 21:03:34 +00:00
// Do not count empty values
return stateSizeIncrease
}
keyBuf := sup.keyBuf[:0]
2024-05-04 22:28:01 +00:00
keyBuf = append(keyBuf[:0], 0)
2024-04-30 21:03:34 +00:00
keyBuf = append(keyBuf, v...)
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
keyBuf = sup.keyBuf
2024-04-29 01:20:43 +00:00
return stateSizeIncrease
}
// Slow path for multiple columns.
allEmptyValues := true
keyBuf := sup.keyBuf[:0]
for _, f := range fields {
2024-04-30 21:03:34 +00:00
c := br.getColumnByName(f)
v := c.getValueAtRow(br, rowIdx)
2024-04-29 01:20:43 +00:00
if v != "" {
allEmptyValues = false
}
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
}
sup.keyBuf = keyBuf
if allEmptyValues {
// Do not count empty values
return stateSizeIncrease
}
if _, ok := m[string(keyBuf)]; !ok {
m[string(keyBuf)] = struct{}{}
stateSizeIncrease += len(keyBuf) + int(unsafe.Sizeof(""))
}
return stateSizeIncrease
}
2024-05-07 21:44:12 +00:00
func (sup *statsUniqCountProcessor) mergeState(sfp statsProcessor) {
src := sfp.(*statsUniqCountProcessor)
2024-04-29 01:20:43 +00:00
m := sup.m
for k := range src.m {
2024-05-03 10:54:37 +00:00
if _, ok := m[k]; !ok {
m[k] = struct{}{}
}
2024-04-29 01:20:43 +00:00
}
}
2024-05-07 21:44:12 +00:00
func (sup *statsUniqCountProcessor) finalizeStats() string {
2024-04-29 01:20:43 +00:00
n := uint64(len(sup.m))
2024-04-30 23:19:22 +00:00
return strconv.FormatUint(n, 10)
2024-04-29 01:20:43 +00:00
}
2024-05-07 21:44:12 +00:00
func parseStatsUniqCount(lex *lexer) (*statsUniqCount, error) {
fields, err := parseFieldNamesForStatsFunc(lex, "uniq_count")
2024-04-29 01:20:43 +00:00
if err != nil {
2024-05-03 09:15:09 +00:00
return nil, err
2024-04-29 01:20:43 +00:00
}
2024-05-07 21:44:12 +00:00
su := &statsUniqCount{
2024-04-29 01:20:43 +00:00
fields: fields,
containsStar: slices.Contains(fields, "*"),
}
return su, nil
}