This commit is contained in:
Aliaksandr Valialkin 2024-05-28 15:50:39 +02:00
parent e8e49405ef
commit bd70c94a27
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
3 changed files with 134 additions and 33 deletions

View file

@ -3,6 +3,7 @@ package logstorage
import ( import (
"fmt" "fmt"
"math" "math"
"slices"
"strings" "strings"
"sync" "sync"
@ -27,7 +28,8 @@ type filterIn struct {
// qFieldName must be set to field name for obtaining values from if q is non-nil. // qFieldName must be set to field name for obtaining values from if q is non-nil.
qFieldName string qFieldName string
tokenSetsOnce sync.Once tokensOnce sync.Once
commonTokens []string
tokenSets [][]string tokenSets [][]string
stringValuesOnce sync.Once stringValuesOnce sync.Once
@ -74,28 +76,15 @@ func (fi *filterIn) updateNeededFields(neededFields fieldsSet) {
neededFields.add(fi.fieldName) neededFields.add(fi.fieldName)
} }
func (fi *filterIn) getTokenSets() [][]string { func (fi *filterIn) getTokens() ([]string, [][]string) {
fi.tokenSetsOnce.Do(fi.initTokenSets) fi.tokensOnce.Do(fi.initTokens)
return fi.tokenSets return fi.commonTokens, fi.tokenSets
} }
// It is faster to match every row in the block instead of checking too big number of tokenSets against bloom filter. func (fi *filterIn) initTokens() {
const maxTokenSetsToInit = 1000 commonTokens, tokenSets := getCommonTokensAndTokenSets(fi.values)
func (fi *filterIn) initTokenSets() { fi.commonTokens = commonTokens
values := fi.values
tokenSetsLen := len(values)
if tokenSetsLen > maxTokenSetsToInit {
tokenSetsLen = maxTokenSetsToInit
}
tokenSets := make([][]string, 0, tokenSetsLen+1)
for _, v := range values {
tokens := tokenizeStrings(nil, []string{v})
tokenSets = append(tokenSets, tokens)
if len(tokens) > maxTokenSetsToInit {
break
}
}
fi.tokenSets = tokenSets fi.tokenSets = tokenSets
} }
@ -385,47 +374,47 @@ func (fi *filterIn) applyToBlockSearch(bs *blockSearch, bm *bitmap) {
return return
} }
tokenSets := fi.getTokenSets() commonTokens, tokenSets := fi.getTokens()
switch ch.valueType { switch ch.valueType {
case valueTypeString: case valueTypeString:
stringValues := fi.getStringValues() stringValues := fi.getStringValues()
matchAnyValue(bs, ch, bm, stringValues, tokenSets) matchAnyValue(bs, ch, bm, stringValues, commonTokens, tokenSets)
case valueTypeDict: case valueTypeDict:
stringValues := fi.getStringValues() stringValues := fi.getStringValues()
matchValuesDictByAnyValue(bs, ch, bm, stringValues) matchValuesDictByAnyValue(bs, ch, bm, stringValues)
case valueTypeUint8: case valueTypeUint8:
binValues := fi.getUint8Values() binValues := fi.getUint8Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeUint16: case valueTypeUint16:
binValues := fi.getUint16Values() binValues := fi.getUint16Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeUint32: case valueTypeUint32:
binValues := fi.getUint32Values() binValues := fi.getUint32Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeUint64: case valueTypeUint64:
binValues := fi.getUint64Values() binValues := fi.getUint64Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeFloat64: case valueTypeFloat64:
binValues := fi.getFloat64Values() binValues := fi.getFloat64Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeIPv4: case valueTypeIPv4:
binValues := fi.getIPv4Values() binValues := fi.getIPv4Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
case valueTypeTimestampISO8601: case valueTypeTimestampISO8601:
binValues := fi.getTimestampISO8601Values() binValues := fi.getTimestampISO8601Values()
matchAnyValue(bs, ch, bm, binValues, tokenSets) matchAnyValue(bs, ch, bm, binValues, commonTokens, tokenSets)
default: default:
logger.Panicf("FATAL: %s: unknown valueType=%d", bs.partPath(), ch.valueType) logger.Panicf("FATAL: %s: unknown valueType=%d", bs.partPath(), ch.valueType)
} }
} }
func matchAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, values map[string]struct{}, tokenSets [][]string) { func matchAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, values map[string]struct{}, commonTokens []string, tokenSets [][]string) {
if len(values) == 0 { if len(values) == 0 {
bm.resetBits() bm.resetBits()
return return
} }
if !matchBloomFilterAnyTokenSet(bs, ch, tokenSets) { if !matchBloomFilterAnyTokenSet(bs, ch, commonTokens, tokenSets) {
bm.resetBits() bm.resetBits()
return return
} }
@ -435,7 +424,10 @@ func matchAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, values map[str
}) })
} }
func matchBloomFilterAnyTokenSet(bs *blockSearch, ch *columnHeader, tokenSets [][]string) bool { func matchBloomFilterAnyTokenSet(bs *blockSearch, ch *columnHeader, commonTokens []string, tokenSets [][]string) bool {
if len(commonTokens) > 0 {
return matchBloomFilterAllTokens(bs, ch, commonTokens)
}
if len(tokenSets) == 0 { if len(tokenSets) == 0 {
return false return false
} }
@ -453,6 +445,9 @@ func matchBloomFilterAnyTokenSet(bs *blockSearch, ch *columnHeader, tokenSets []
return false return false
} }
// It is faster to match every row in the block instead of checking too big number of tokenSets against bloom filter.
const maxTokenSetsToInit = 1000
func matchValuesDictByAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, values map[string]struct{}) { func matchValuesDictByAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, values map[string]struct{}) {
bb := bbPool.Get() bb := bbPool.Get()
for _, v := range ch.valuesDict.values { for _, v := range ch.valuesDict.values {
@ -465,3 +460,44 @@ func matchValuesDictByAnyValue(bs *blockSearch, ch *columnHeader, bm *bitmap, va
matchEncodedValuesDict(bs, ch, bm, bb.B) matchEncodedValuesDict(bs, ch, bm, bb.B)
bbPool.Put(bb) bbPool.Put(bb)
} }
// getCommonTokensAndTokenSets tokenizes values and returns either the tokens
// shared by every value (commonTokens) or the per-value token sets, never both:
// exactly one of the two results is non-nil for non-empty input.
func getCommonTokensAndTokenSets(values []string) ([]string, [][]string) {
	// Tokenize every value up front.
	tokenSets := make([][]string, len(values))
	for i := range values {
		tokenSets[i] = tokenizeStrings(nil, []string{values[i]})
	}

	// When all the values share at least one token, the shared tokens alone are
	// sufficient for bloom-filter pre-filtering; the per-value sets can be dropped.
	if commonTokens := getCommonTokens(tokenSets); len(commonTokens) > 0 {
		return commonTokens, nil
	}
	return nil, tokenSets
}
func getCommonTokens(tokenSets [][]string) []string {
if len(tokenSets) == 0 {
return nil
}
m := make(map[string]struct{}, len(tokenSets[0]))
for _, token := range tokenSets[0] {
m[token] = struct{}{}
}
for _, tokens := range tokenSets[1:] {
if len(m) == 0 {
return nil
}
for token := range m {
if !slices.Contains(tokens, token) {
delete(m, token)
}
}
}
tokens := make([]string, 0, len(m))
for token := range m {
tokens = append(tokens, token)
}
return tokens
}

View file

@ -1,6 +1,8 @@
package logstorage package logstorage
import ( import (
"reflect"
"slices"
"testing" "testing"
) )
@ -688,3 +690,32 @@ func TestFilterIn(t *testing.T) {
testFilterMatchForColumns(t, columns, fi, "_msg", nil) testFilterMatchForColumns(t, columns, fi, "_msg", nil)
}) })
} }
// TestGetCommonTokensAndTokenSets verifies that getCommonTokensAndTokenSets
// returns common tokens when every value shares a token, and per-value token
// sets otherwise. Results are sorted before comparison since token order is
// unspecified.
func TestGetCommonTokensAndTokenSets(t *testing.T) {
	f := func(values []string, commonTokensExpected []string, tokenSetsExpected [][]string) {
		t.Helper()

		commonTokens, tokenSets := getCommonTokensAndTokenSets(values)
		slices.Sort(commonTokens)
		if !reflect.DeepEqual(commonTokens, commonTokensExpected) {
			t.Fatalf("unexpected commonTokens for values=%q\ngot\n%q\nwant\n%q", values, commonTokens, commonTokensExpected)
		}
		// Fail cleanly on a length mismatch instead of panicking on an
		// out-of-range tokenSetsExpected[i] access in the loop below.
		if len(tokenSets) != len(tokenSetsExpected) {
			t.Fatalf("unexpected len(tokenSets) for values=%q; got %d; want %d", values, len(tokenSets), len(tokenSetsExpected))
		}
		for i, tokens := range tokenSets {
			slices.Sort(tokens)
			tokensExpected := tokenSetsExpected[i]
			if !reflect.DeepEqual(tokens, tokensExpected) {
				t.Fatalf("unexpected tokens for value=%q\ngot\n%q\nwant\n%q", values[i], tokens, tokensExpected)
			}
		}
	}

	f(nil, nil, nil)
	f([]string{"foo"}, []string{"foo"}, nil)
	f([]string{"foo", "foo"}, []string{"foo"}, nil)
	f([]string{"foo", "bar", "bar", "foo"}, nil, [][]string{{"foo"}, {"bar"}, {"bar"}, {"foo"}})
	f([]string{"foo", "foo bar", "bar foo"}, []string{"foo"}, nil)
	f([]string{"a foo bar", "bar abc foo", "foo abc a bar"}, []string{"bar", "foo"}, nil)
	f([]string{"a xfoo bar", "xbar abc foo", "foo abc a bar"}, nil, [][]string{{"a", "bar", "xfoo"}, {"abc", "foo", "xbar"}, {"a", "abc", "bar", "foo"}})
}

View file

@ -167,3 +167,37 @@ func TestPipeMath(t *testing.T) {
}, },
}) })
} }
// TestPipeMathUpdateNeededFields verifies how the `math` pipe propagates
// needed/unneeded field sets across every combination of source (x) and
// destination (y) field membership.
func TestPipeMathUpdateNeededFields(t *testing.T) {
	check := func(pipeStr, needed, unneeded, wantNeeded, wantUnneeded string) {
		t.Helper()
		expectPipeNeededFields(t, pipeStr, needed, unneeded, wantNeeded, wantUnneeded)
	}

	// all the needed fields
	check("math (x + 1) as y", "*", "", "*", "y")

	// all the needed fields, unneeded fields do not intersect with src and dst
	check("math (x + 1) as y", "*", "f1,f2", "*", "f1,f2,y")

	// all the needed fields, unneeded fields intersect with src
	check("math (x + 1) as y", "*", "f1,x", "*", "f1,y")

	// all the needed fields, unneeded fields intersect with dst
	check("math (x + 1) as y", "*", "f1,y", "*", "f1,y")

	// all the needed fields, unneeded fields intersect with src and dst
	check("math (x + 1) as y", "*", "f1,x,y", "*", "f1,x,y")

	// needed fields do not intersect with src and dst
	check("math (x + 1) as y", "f1,f2", "", "f1,f2", "")

	// needed fields intersect with src
	check("math (x + 1) as y", "f1,x", "", "f1,x", "")

	// needed fields intersect with dst
	check("math (x + 1) as y", "f1,y", "", "f1,x", "")

	// needed fields intersect with src and dst
	check("math (x + 1) as y", "f1,x,y", "", "f1,x", "")
}