lib/logstorage: preserve the order of tokens to check against bloom filters in AND filters

Previously tokens from AND filters were extracted in random order. This could slow down
checking them against bloom filters if the most specific tokens go at the beginning of the AND filters.
Preserve the original order of tokens when matching them against bloom filters,
so that the user can control the performance of the query by putting the most specific AND filters
at the beginning of the query.

While at it, add tests for getCommonTokensForAndFilters() and getCommonTokensForOrFilters().

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556
This commit is contained in:
Aliaksandr Valialkin 2024-09-08 12:23:45 +02:00
parent eaee2d7db4
commit 45a3713bdb
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
4 changed files with 199 additions and 17 deletions

View file

@ -118,7 +118,7 @@ func (fa *filterAnd) initByFieldTokens() {
} }
func getCommonTokensForAndFilters(filters []filter) []fieldTokens { func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
m := make(map[string]map[string]struct{}) m := make(map[string][]string)
var fieldNames []string var fieldNames []string
mergeFieldTokens := func(fieldName string, tokens []string) { mergeFieldTokens := func(fieldName string, tokens []string) {
@ -127,15 +127,10 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
} }
fieldName = getCanonicalColumnName(fieldName) fieldName = getCanonicalColumnName(fieldName)
mTokens, ok := m[fieldName] if _, ok := m[fieldName]; !ok {
if !ok {
fieldNames = append(fieldNames, fieldName) fieldNames = append(fieldNames, fieldName)
mTokens = make(map[string]struct{})
m[fieldName] = mTokens
}
for _, token := range tokens {
mTokens[token] = struct{}{}
} }
m[fieldName] = append(m[fieldName], tokens...)
} }
for _, f := range filters { for _, f := range filters {
@ -169,8 +164,13 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
var byFieldTokens []fieldTokens var byFieldTokens []fieldTokens
for _, fieldName := range fieldNames { for _, fieldName := range fieldNames {
mTokens := m[fieldName] mTokens := m[fieldName]
seenTokens := make(map[string]struct{})
tokens := make([]string, 0, len(mTokens)) tokens := make([]string, 0, len(mTokens))
for token := range mTokens { for _, token := range mTokens {
if _, ok := seenTokens[token]; ok {
continue
}
seenTokens[token] = struct{}{}
tokens = append(tokens, token) tokens = append(tokens, token)
} }

View file

@ -1,6 +1,7 @@
package logstorage package logstorage
import ( import (
"reflect"
"testing" "testing"
) )
@ -75,3 +76,92 @@ func TestFilterAnd(t *testing.T) {
f(`foo:foo* AND !foo:~bar`, []int{0}) f(`foo:foo* AND !foo:~bar`, []int{0})
f(`foo:foo* AND foo:!~bar`, []int{0}) f(`foo:foo* AND foo:!~bar`, []int{0})
} }
// TestGetCommonTokensForAndFilters verifies that getCommonTokensForAndFilters()
// extracts per-field common tokens from the children of an AND filter,
// preserving the original token order (the point of this commit).
func TestGetCommonTokensForAndFilters(t *testing.T) {
// f parses qStr, requires the top-level filter to be *filterAnd,
// and compares the extracted tokens against tokensExpected.
// The comparison is order-sensitive (reflect.DeepEqual on the token slices),
// which is exactly what pins the order-preservation behavior.
f := func(qStr string, tokensExpected []fieldTokens) {
t.Helper()
q, err := ParseQuery(qStr)
if err != nil {
t.Fatalf("unexpected error in ParseQuery: %s", err)
}
fa, ok := q.f.(*filterAnd)
if !ok {
t.Fatalf("unexpected filter type: %T; want *filterAnd", q.f)
}
tokens := getCommonTokensForAndFilters(fa.filters)
if len(tokens) != len(tokensExpected) {
t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected)
}
// Field order must match too: fieldTokens entries are expected
// in the order the fields first appear in the query.
for i, ft := range tokens {
ftExpected := tokensExpected[i]
if ft.field != ftExpected.field {
t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens)
}
if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) {
t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens)
}
}
}
// Basic case: two word filters on the default _msg field.
f(`foo AND bar`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo", "bar"},
},
})
// Tokens from exact, prefix, phrase, regexp and seq filters are merged,
// deduplicated, and kept in first-seen order.
f(`="foo bar" AND ="a foo"* AND "bar foo" AND "foo bar"* AND ~"foo qwe bar.+" AND seq(x, bar, "foo qwe")`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo", "bar", "a", "qwe", "x"},
},
})
// extract common tokens from OR filters
f(`foo AND (bar OR ~"x bar baz")`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo", "bar"},
},
})
// star matches any non-empty token, so it is skipped
f(`foo bar *`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo", "bar"},
},
})
f(`* *`, nil)
// empty filter must be skipped
f(`foo "" bar`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo", "bar"},
},
})
f(`"" ""`, nil)
// unknown filters must be skipped
f(`_time:5m !foo "bar baz" x`, []fieldTokens{
{
field: "_msg",
tokens: []string{"bar", "baz", "x"},
},
})
// distinct field names
f(`foo:x bar:"a bc" (foo:y OR (bar:qwe AND foo:"z y a"))`, []fieldTokens{
{
field: "foo",
tokens: []string{"x", "y"},
},
{
field: "bar",
tokens: []string{"a", "bc"},
},
})
}

View file

@ -182,17 +182,11 @@ func getCommonTokensForOrFilters(filters []filter) []fieldTokens {
if len(tokenss) != len(filters) { if len(tokenss) != len(filters) {
// The filter for the given fieldName is missing in some OR filters, // The filter for the given fieldName is missing in some OR filters,
// so it is impossible to extract common tokens from these filters. // so it is impossible to extract common tokens from these filters.
// Give up extracting common tokens from the remaining filters, continue
// since they may not cover log entries matching fieldName filters.
// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
return nil
} }
commonTokens := getCommonTokens(tokenss) commonTokens := getCommonTokens(tokenss)
if len(commonTokens) == 0 { if len(commonTokens) == 0 {
// Give up extracting common tokens from the remaining filters, continue
// since they may not cover log entries matching fieldName filters.
// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
return nil
} }
byFieldTokens = append(byFieldTokens, fieldTokens{ byFieldTokens = append(byFieldTokens, fieldTokens{
field: fieldName, field: fieldName,

View file

@ -1,6 +1,7 @@
package logstorage package logstorage
import ( import (
"reflect"
"testing" "testing"
) )
@ -70,3 +71,100 @@ func TestFilterOr(t *testing.T) {
f(`foo:baz or !foo:~foo`, []int{2, 3, 5, 7, 8, 9}) f(`foo:baz or !foo:~foo`, []int{2, 3, 5, 7, 8, 9})
f(`foo:baz or foo:!~foo`, []int{2, 3, 5, 7, 8, 9}) f(`foo:baz or foo:!~foo`, []int{2, 3, 5, 7, 8, 9})
} }
// TestGetCommonTokensForOrFilters verifies that getCommonTokensForOrFilters()
// returns only the tokens guaranteed to be present in EVERY child of an OR
// filter (per field), and returns nil when no such tokens exist.
func TestGetCommonTokensForOrFilters(t *testing.T) {
// f parses qStr, requires the top-level filter to be *filterOr,
// and compares the extracted common tokens against tokensExpected.
// Token slices are compared order-sensitively via reflect.DeepEqual.
f := func(qStr string, tokensExpected []fieldTokens) {
t.Helper()
q, err := ParseQuery(qStr)
if err != nil {
t.Fatalf("unexpected error in ParseQuery: %s", err)
}
fo, ok := q.f.(*filterOr)
if !ok {
t.Fatalf("unexpected filter type: %T; want *filterOr", q.f)
}
tokens := getCommonTokensForOrFilters(fo.filters)
if len(tokens) != len(tokensExpected) {
t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected)
}
for i, ft := range tokens {
ftExpected := tokensExpected[i]
if ft.field != ftExpected.field {
t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens)
}
if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) {
t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens)
}
}
}
// no common tokens
f(`foo OR bar`, nil)
// star filter matches non-empty value; it is skipped, since one of OR filters may contain an empty filter
f(`* OR foo`, nil)
f(`foo or *`, nil)
f(`* or *`, nil)
f(`"" or * or foo`, nil)
// empty filter suppresses all the common tokens.
f(`"" or foo or "foo bar"`, nil)
f(`foo or ""`, nil)
f(`"" or ""`, nil)
// common foo token
f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo"},
},
})
// regexp ending on foo token doesn't contain foo token, since it may match foobar.
f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo"`, nil)
// regexp starting from foo token doesn't contain foo token, since it may match barfoo.
f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"foo bar"`, nil)
// prefix filter ending on foo doesn't contain foo token, since it may match foobar.
f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo"* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, nil)
// bar and foo are common tokens
f(`"bar foo baz" OR (foo AND "bar x" AND foobar)`, []fieldTokens{
{
field: "_msg",
tokens: []string{"bar", "foo"},
},
})
// bar and foo are common tokens, x:foobar should be ignored, since it doesn't present in every OR filter
f(`"bar foo baz" OR (foo AND "bar x" AND x:foobar)`, []fieldTokens{
{
field: "_msg",
tokens: []string{"bar", "foo"},
},
})
// common tokens for distinct fields
f(`(foo AND x:a) OR (x:"b a c" AND ~"aaa foo ")`, []fieldTokens{
{
field: "_msg",
tokens: []string{"foo"},
},
{
field: "x",
tokens: []string{"a"},
},
})
// zero common tokens for distinct fields
f(`(foo AND x:a) OR (x:"b c" AND ~"aaa foo" AND y:z)`, nil)
// negative filter removes all the matching
f(`foo OR !"foo bar"`, nil)
// time filter removes all the matching
f(`_time:5m or foo`, nil)
}