From 45a3713bdb0510cc7982e67a868e8389a9dd7f82 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Sun, 8 Sep 2024 12:23:45 +0200 Subject: [PATCH] lib/logstorage: preserve the order of tokens to check against bloom filters in AND filters Previously tokens from AND filters were extracted in random order. This could slow down checking them agains bloom filters if the most specific tokens go at the beginning of the AND filters. Preserve the original order of tokens when matching them against bloom filters, so the user could control the performance of the query by putting the most specific AND filters at the beginning of the query. While at it, add tests for getCommonTokensForAndFilters() and getCommonTokensForOrFilters(). Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556 --- lib/logstorage/filter_and.go | 18 +++--- lib/logstorage/filter_and_test.go | 90 ++++++++++++++++++++++++++++ lib/logstorage/filter_or.go | 10 +--- lib/logstorage/filter_or_test.go | 98 +++++++++++++++++++++++++++++++ 4 files changed, 199 insertions(+), 17 deletions(-) diff --git a/lib/logstorage/filter_and.go b/lib/logstorage/filter_and.go index 131050ed3..cf0fa8db2 100644 --- a/lib/logstorage/filter_and.go +++ b/lib/logstorage/filter_and.go @@ -118,7 +118,7 @@ func (fa *filterAnd) initByFieldTokens() { } func getCommonTokensForAndFilters(filters []filter) []fieldTokens { - m := make(map[string]map[string]struct{}) + m := make(map[string][]string) var fieldNames []string mergeFieldTokens := func(fieldName string, tokens []string) { @@ -127,15 +127,10 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens { } fieldName = getCanonicalColumnName(fieldName) - mTokens, ok := m[fieldName] - if !ok { + if _, ok := m[fieldName]; !ok { fieldNames = append(fieldNames, fieldName) - mTokens = make(map[string]struct{}) - m[fieldName] = mTokens - } - for _, token := range tokens { - mTokens[token] = struct{}{} } + m[fieldName] = append(m[fieldName], tokens...) } for _, f := range filters { @@ -169,8 +164,13 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens { var byFieldTokens []fieldTokens for _, fieldName := range fieldNames { mTokens := m[fieldName] + seenTokens := make(map[string]struct{}) tokens := make([]string, 0, len(mTokens)) - for token := range mTokens { + for _, token := range mTokens { + if _, ok := seenTokens[token]; ok { + continue + } + seenTokens[token] = struct{}{} tokens = append(tokens, token) } diff --git a/lib/logstorage/filter_and_test.go b/lib/logstorage/filter_and_test.go index 41dd2d19f..223fffb7a 100644 --- a/lib/logstorage/filter_and_test.go +++ b/lib/logstorage/filter_and_test.go @@ -1,6 +1,7 @@ package logstorage import ( + "reflect" "testing" ) @@ -75,3 +76,92 @@ func TestFilterAnd(t *testing.T) { f(`foo:foo* AND !foo:~bar`, []int{0}) f(`foo:foo* AND foo:!~bar`, []int{0}) } + +func TestGetCommonTokensForAndFilters(t *testing.T) { + f := func(qStr string, tokensExpected []fieldTokens) { + t.Helper() + + q, err := ParseQuery(qStr) + if err != nil { + t.Fatalf("unexpected error in ParseQuery: %s", err) + } + fa, ok := q.f.(*filterAnd) + if !ok { + t.Fatalf("unexpected filter type: %T; want *filterAnd", q.f) + } + tokens := getCommonTokensForAndFilters(fa.filters) + + if len(tokens) != len(tokensExpected) { + t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected) + } + for i, ft := range tokens { + ftExpected := tokensExpected[i] + if ft.field != ftExpected.field { + t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens) + } + if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) { + t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens) + } + } + } + + f(`foo AND bar`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + + f(`="foo bar" AND ="a foo"* AND "bar foo" AND "foo bar"* AND ~"foo qwe bar.+" AND seq(x, bar, "foo qwe")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar", "a", "qwe", "x"}, + }, + }) + + // extract common tokens from OR filters + f(`foo AND (bar OR ~"x bar baz")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + + // star matches any non-empty token, so it is skipped + f(`foo bar *`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + f(`* *`, nil) + + // empty filter must be skipped + f(`foo "" bar`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + f(`"" ""`, nil) + + // unknown filters must be skipped + f(`_time:5m !foo "bar baz" x`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "baz", "x"}, + }, + }) + + // distinct field names + f(`foo:x bar:"a bc" (foo:y OR (bar:qwe AND foo:"z y a"))`, []fieldTokens{ + { + field: "foo", + tokens: []string{"x", "y"}, + }, + { + field: "bar", + tokens: []string{"a", "bc"}, + }, + }) +} diff --git a/lib/logstorage/filter_or.go b/lib/logstorage/filter_or.go index ab268f745..39e49a0d6 100644 --- a/lib/logstorage/filter_or.go +++ b/lib/logstorage/filter_or.go @@ -182,17 +182,11 @@ func getCommonTokensForOrFilters(filters []filter) []fieldTokens { if len(tokenss) != len(filters) { // The filter for the given fieldName is missing in some OR filters, // so it is impossible to extract common tokens from these filters. - // Give up extracting common tokens from the remaining filters, - // since they may not cover log entries matching fieldName filters. - // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 - return nil + continue } commonTokens := getCommonTokens(tokenss) if len(commonTokens) == 0 { - // Give up extracting common tokens from the remaining filters, - // since they may not cover log entries matching fieldName filters. - // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 - return nil + continue } byFieldTokens = append(byFieldTokens, fieldTokens{ field: fieldName, diff --git a/lib/logstorage/filter_or_test.go b/lib/logstorage/filter_or_test.go index c7bc860ee..371941a5c 100644 --- a/lib/logstorage/filter_or_test.go +++ b/lib/logstorage/filter_or_test.go @@ -1,6 +1,7 @@ package logstorage import ( + "reflect" "testing" ) @@ -70,3 +71,100 @@ func TestFilterOr(t *testing.T) { f(`foo:baz or !foo:~foo`, []int{2, 3, 5, 7, 8, 9}) f(`foo:baz or foo:!~foo`, []int{2, 3, 5, 7, 8, 9}) } + +func TestGetCommonTokensForOrFilters(t *testing.T) { + f := func(qStr string, tokensExpected []fieldTokens) { + t.Helper() + + q, err := ParseQuery(qStr) + if err != nil { + t.Fatalf("unexpected error in ParseQuery: %s", err) + } + fo, ok := q.f.(*filterOr) + if !ok { + t.Fatalf("unexpected filter type: %T; want *filterOr", q.f) + } + tokens := getCommonTokensForOrFilters(fo.filters) + + if len(tokens) != len(tokensExpected) { + t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected) + } + for i, ft := range tokens { + ftExpected := tokensExpected[i] + if ft.field != ftExpected.field { + t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens) + } + if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) { + t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens) + } + } + } + + // no common tokens + f(`foo OR bar`, nil) + + // star filter matches non-empty value; it is skipped, since one of OR filters may contain an empty filter + f(`* OR foo`, nil) + f(`foo or *`, nil) + f(`* or *`, nil) + f(`"" or * or foo`, nil) + + // empty filter suppresses all the common tokens. + f(`"" or foo or "foo bar"`, nil) + f(`foo or ""`, nil) + f(`"" or ""`, nil) + + // common foo token + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo"}, + }, + }) + + // regexp ending on foo token doesn't contain foo token, since it may match foobar. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo"`, nil) + + // regexp starting from foo token doesn't contain foo token, since it may match barfoo. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"foo bar"`, nil) + + // prefix filter ending on foo doesn't contain foo token, since it may match foobar. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo"* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, nil) + + // bar and foo are common tokens + f(`"bar foo baz" OR (foo AND "bar x" AND foobar)`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "foo"}, + }, + }) + + // bar and foo are common tokens, x:foobar should be ignored, since it doesn't present in every OR filter + f(`"bar foo baz" OR (foo AND "bar x" AND x:foobar)`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "foo"}, + }, + }) + + // common tokens for distinct fields + f(`(foo AND x:a) OR (x:"b a c" AND ~"aaa foo ")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo"}, + }, + { + field: "x", + tokens: []string{"a"}, + }, + }) + + // zero common tokens for distinct fields + f(`(foo AND x:a) OR (x:"b c" AND ~"aaa foo" AND y:z)`, nil) + + // negative filter removes all the matching + f(`foo OR !"foo bar"`, nil) + + // time filter removes all the matching + f(`_time:5m or foo`, nil) +}