2024-05-12 14:33:29 +00:00
|
|
|
package logstorage
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"strconv"
|
|
|
|
"unsafe"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
|
|
|
)
|
|
|
|
|
|
|
|
type statsCountUniq struct {
|
2024-05-22 19:01:20 +00:00
|
|
|
fields []string
|
|
|
|
limit uint64
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (su *statsCountUniq) String() string {
|
2024-05-22 19:01:20 +00:00
|
|
|
s := "count_uniq(" + statsFuncFieldsToString(su.fields) + ")"
|
2024-05-12 14:33:29 +00:00
|
|
|
if su.limit > 0 {
|
|
|
|
s += fmt.Sprintf(" limit %d", su.limit)
|
|
|
|
}
|
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
2024-05-20 02:08:30 +00:00
|
|
|
func (su *statsCountUniq) updateNeededFields(neededFields fieldsSet) {
|
2024-05-22 19:01:20 +00:00
|
|
|
updateNeededFieldsForStatsFunc(neededFields, su.fields)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (su *statsCountUniq) newStatsProcessor() (statsProcessor, int) {
|
|
|
|
sup := &statsCountUniqProcessor{
|
|
|
|
su: su,
|
|
|
|
|
|
|
|
m: make(map[string]struct{}),
|
|
|
|
}
|
|
|
|
return sup, int(unsafe.Sizeof(*sup))
|
|
|
|
}
|
|
|
|
|
|
|
|
type statsCountUniqProcessor struct {
|
|
|
|
su *statsCountUniq
|
|
|
|
|
|
|
|
m map[string]struct{}
|
|
|
|
|
|
|
|
columnValues [][]string
|
|
|
|
keyBuf []byte
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sup *statsCountUniqProcessor) updateStatsForAllRows(br *blockResult) int {
|
|
|
|
if sup.limitReached() {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
fields := sup.su.fields
|
|
|
|
|
|
|
|
stateSizeIncrease := 0
|
2024-05-22 19:01:20 +00:00
|
|
|
if len(fields) == 0 {
|
2024-05-12 14:33:29 +00:00
|
|
|
// Count unique rows
|
|
|
|
cs := br.getColumns()
|
2024-05-22 19:01:20 +00:00
|
|
|
|
|
|
|
columnValues := sup.columnValues[:0]
|
|
|
|
for _, c := range cs {
|
|
|
|
values := c.getValues(br)
|
|
|
|
columnValues = append(columnValues, values)
|
|
|
|
}
|
|
|
|
sup.columnValues = columnValues
|
|
|
|
|
2024-05-12 14:33:29 +00:00
|
|
|
keyBuf := sup.keyBuf[:0]
|
2024-09-25 14:16:53 +00:00
|
|
|
for i := 0; i < br.rowsLen; i++ {
|
2024-05-12 14:33:29 +00:00
|
|
|
seenKey := true
|
2024-05-22 19:01:20 +00:00
|
|
|
for _, values := range columnValues {
|
2024-05-12 14:33:29 +00:00
|
|
|
if i == 0 || values[i-1] != values[i] {
|
|
|
|
seenKey = false
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if seenKey {
|
|
|
|
// This key has been already counted.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
allEmptyValues := true
|
|
|
|
keyBuf = keyBuf[:0]
|
2024-05-22 19:01:20 +00:00
|
|
|
for j, values := range columnValues {
|
|
|
|
v := values[i]
|
2024-05-12 14:33:29 +00:00
|
|
|
if v != "" {
|
|
|
|
allEmptyValues = false
|
|
|
|
}
|
|
|
|
// Put column name into key, since every block can contain different set of columns for '*' selector.
|
2024-05-22 19:01:20 +00:00
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(cs[j].name))
|
2024-05-12 14:33:29 +00:00
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
|
|
|
|
}
|
|
|
|
if allEmptyValues {
|
|
|
|
// Do not count empty values
|
|
|
|
continue
|
|
|
|
}
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if len(fields) == 1 {
|
|
|
|
// Fast path for a single column.
|
|
|
|
// The unique key is formed as "<is_time> <value>",
|
|
|
|
// This guarantees that keys do not clash for different column types across blocks.
|
|
|
|
c := br.getColumnByName(fields[0])
|
|
|
|
if c.isTime {
|
2024-09-25 14:16:53 +00:00
|
|
|
// Count unique timestamps
|
|
|
|
timestamps := br.getTimestamps()
|
2024-05-12 14:33:29 +00:00
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
for i, timestamp := range timestamps {
|
|
|
|
if i > 0 && timestamps[i-1] == timestamps[i] {
|
|
|
|
// This timestamp has been already counted.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
keyBuf = append(keyBuf[:0], 1)
|
|
|
|
keyBuf = encoding.MarshalInt64(keyBuf, timestamp)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if c.isConst {
|
|
|
|
// count unique const values
|
2024-05-20 02:08:30 +00:00
|
|
|
v := c.valuesEncoded[0]
|
2024-05-12 14:33:29 +00:00
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if c.valueType == valueTypeDict {
|
|
|
|
// count unique non-zero c.dictValues
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
for _, v := range c.dictValues {
|
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
2024-05-20 02:08:30 +00:00
|
|
|
// Count unique values across values
|
2024-05-12 14:33:29 +00:00
|
|
|
values := c.getValues(br)
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
for i, v := range values {
|
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if i > 0 && values[i-1] == v {
|
|
|
|
// This value has been already counted.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
2024-05-13 23:49:20 +00:00
|
|
|
sup.keyBuf = keyBuf
|
2024-05-12 14:33:29 +00:00
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
|
|
|
// Slow path for multiple columns.
|
|
|
|
|
|
|
|
// Pre-calculate column values for byFields in order to speed up building group key in the loop below.
|
|
|
|
columnValues := sup.columnValues[:0]
|
|
|
|
for _, f := range fields {
|
|
|
|
c := br.getColumnByName(f)
|
|
|
|
values := c.getValues(br)
|
|
|
|
columnValues = append(columnValues, values)
|
|
|
|
}
|
|
|
|
sup.columnValues = columnValues
|
|
|
|
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
2024-09-25 14:16:53 +00:00
|
|
|
for i := 0; i < br.rowsLen; i++ {
|
2024-05-12 14:33:29 +00:00
|
|
|
seenKey := true
|
|
|
|
for _, values := range columnValues {
|
|
|
|
if i == 0 || values[i-1] != values[i] {
|
|
|
|
seenKey = false
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if seenKey {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
allEmptyValues := true
|
|
|
|
keyBuf = keyBuf[:0]
|
|
|
|
for _, values := range columnValues {
|
|
|
|
v := values[i]
|
|
|
|
if v != "" {
|
|
|
|
allEmptyValues = false
|
|
|
|
}
|
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
|
|
|
|
}
|
|
|
|
if allEmptyValues {
|
|
|
|
// Do not count empty values
|
|
|
|
continue
|
|
|
|
}
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sup *statsCountUniqProcessor) updateStatsForRow(br *blockResult, rowIdx int) int {
|
|
|
|
if sup.limitReached() {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
fields := sup.su.fields
|
|
|
|
|
|
|
|
stateSizeIncrease := 0
|
2024-05-22 19:01:20 +00:00
|
|
|
if len(fields) == 0 {
|
2024-05-12 14:33:29 +00:00
|
|
|
// Count unique rows
|
|
|
|
allEmptyValues := true
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
for _, c := range br.getColumns() {
|
|
|
|
v := c.getValueAtRow(br, rowIdx)
|
|
|
|
if v != "" {
|
|
|
|
allEmptyValues = false
|
|
|
|
}
|
|
|
|
// Put column name into key, since every block can contain different set of columns for '*' selector.
|
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(c.name))
|
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
|
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
|
|
|
|
if allEmptyValues {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if len(fields) == 1 {
|
|
|
|
// Fast path for a single column.
|
|
|
|
// The unique key is formed as "<is_time> <value>",
|
|
|
|
// This guarantees that keys do not clash for different column types across blocks.
|
|
|
|
c := br.getColumnByName(fields[0])
|
|
|
|
if c.isTime {
|
2024-09-25 14:16:53 +00:00
|
|
|
// Count unique timestamps
|
|
|
|
timestamps := br.getTimestamps()
|
2024-05-12 14:33:29 +00:00
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
keyBuf = append(keyBuf[:0], 1)
|
2024-09-25 14:16:53 +00:00
|
|
|
keyBuf = encoding.MarshalInt64(keyBuf, timestamps[rowIdx])
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if c.isConst {
|
|
|
|
// count unique const values
|
2024-05-20 02:08:30 +00:00
|
|
|
v := c.valuesEncoded[0]
|
2024-05-12 14:33:29 +00:00
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
if c.valueType == valueTypeDict {
|
|
|
|
// count unique non-zero c.dictValues
|
2024-05-20 02:08:30 +00:00
|
|
|
valuesEncoded := c.getValuesEncoded(br)
|
|
|
|
dictIdx := valuesEncoded[rowIdx][0]
|
2024-05-12 14:33:29 +00:00
|
|
|
v := c.dictValues[dictIdx]
|
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-13 23:49:20 +00:00
|
|
|
sup.keyBuf = keyBuf
|
2024-05-12 14:33:29 +00:00
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
|
|
|
// Count unique values for the given rowIdx
|
|
|
|
v := c.getValueAtRow(br, rowIdx)
|
|
|
|
if v == "" {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
keyBuf = append(keyBuf[:0], 0)
|
|
|
|
keyBuf = append(keyBuf, v...)
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-13 23:49:20 +00:00
|
|
|
sup.keyBuf = keyBuf
|
2024-05-12 14:33:29 +00:00
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
|
|
|
// Slow path for multiple columns.
|
|
|
|
allEmptyValues := true
|
|
|
|
keyBuf := sup.keyBuf[:0]
|
|
|
|
for _, f := range fields {
|
|
|
|
c := br.getColumnByName(f)
|
|
|
|
v := c.getValueAtRow(br, rowIdx)
|
|
|
|
if v != "" {
|
|
|
|
allEmptyValues = false
|
|
|
|
}
|
|
|
|
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))
|
|
|
|
}
|
|
|
|
sup.keyBuf = keyBuf
|
|
|
|
|
|
|
|
if allEmptyValues {
|
|
|
|
// Do not count empty values
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
2024-05-22 19:01:20 +00:00
|
|
|
stateSizeIncrease += sup.updateState(keyBuf)
|
2024-05-12 14:33:29 +00:00
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sup *statsCountUniqProcessor) mergeState(sfp statsProcessor) {
|
|
|
|
if sup.limitReached() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
src := sfp.(*statsCountUniqProcessor)
|
|
|
|
m := sup.m
|
|
|
|
for k := range src.m {
|
|
|
|
if _, ok := m[k]; !ok {
|
|
|
|
m[k] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (sup *statsCountUniqProcessor) finalizeStats() string {
|
|
|
|
n := uint64(len(sup.m))
|
|
|
|
if limit := sup.su.limit; limit > 0 && n > limit {
|
|
|
|
n = limit
|
|
|
|
}
|
|
|
|
return strconv.FormatUint(n, 10)
|
|
|
|
}
|
|
|
|
|
2024-05-22 19:01:20 +00:00
|
|
|
func (sup *statsCountUniqProcessor) updateState(v []byte) int {
|
|
|
|
stateSizeIncrease := 0
|
|
|
|
if _, ok := sup.m[string(v)]; !ok {
|
|
|
|
sup.m[string(v)] = struct{}{}
|
|
|
|
stateSizeIncrease += len(v) + int(unsafe.Sizeof(""))
|
|
|
|
}
|
|
|
|
return stateSizeIncrease
|
|
|
|
}
|
|
|
|
|
2024-05-12 14:33:29 +00:00
|
|
|
func (sup *statsCountUniqProcessor) limitReached() bool {
|
|
|
|
limit := sup.su.limit
|
2024-09-29 08:50:25 +00:00
|
|
|
return limit > 0 && uint64(len(sup.m)) > limit
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func parseStatsCountUniq(lex *lexer) (*statsCountUniq, error) {
|
2024-05-22 19:01:20 +00:00
|
|
|
fields, err := parseStatsFuncFields(lex, "count_uniq")
|
2024-05-12 14:33:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
su := &statsCountUniq{
|
2024-05-22 19:01:20 +00:00
|
|
|
fields: fields,
|
2024-05-12 14:33:29 +00:00
|
|
|
}
|
|
|
|
if lex.isKeyword("limit") {
|
|
|
|
lex.nextToken()
|
|
|
|
n, ok := tryParseUint64(lex.token)
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("cannot parse 'limit %s' for 'count_uniq': %w", lex.token, err)
|
|
|
|
}
|
|
|
|
lex.nextToken()
|
|
|
|
su.limit = n
|
|
|
|
}
|
|
|
|
return su, nil
|
|
|
|
}
|