Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git
lib/logstorage: preserve the order of tokens to check against bloom filters in AND filters
Previously tokens from AND filters were extracted in random order. This could slow down checking them against bloom filters when the most specific tokens are placed at the beginning of the AND filters. Preserve the original order of tokens when matching them against bloom filters, so the user can control query performance by putting the most specific AND filters at the beginning of the query.

While at it, add tests for getCommonTokensForAndFilters() and getCommonTokensForOrFilters().

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556
parent eaee2d7db4
commit 45a3713bdb

4 changed files with 199 additions and 17 deletions
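The core of the change in getCommonTokensForAndFilters() is to accumulate per-field tokens in a slice (first-seen order) and drop duplicates with a side map, instead of collecting them into a map whose iteration order is random. Below is a minimal, self-contained Go sketch of that technique; appendUniqueTokens and the sample tokens are illustrative only and are not part of the VictoriaMetrics code.

package main

import "fmt"

// appendUniqueTokens is a hypothetical helper illustrating the idea:
// tokens are appended to a slice in first-seen order, and a map is used
// only to skip duplicates. Iterating the slice preserves the original
// order, whereas the previous map-based collection yielded tokens in
// random order.
func appendUniqueTokens(dst []string, tokens ...string) []string {
	seen := make(map[string]struct{}, len(dst))
	for _, t := range dst {
		seen[t] = struct{}{}
	}
	for _, t := range tokens {
		if _, ok := seen[t]; ok {
			continue
		}
		seen[t] = struct{}{}
		dst = append(dst, t)
	}
	return dst
}

func main() {
	// Tokens merged in query order: the most specific token stays first,
	// so it is checked against the bloom filters first.
	var tokens []string
	tokens = appendUniqueTokens(tokens, "trace-4bf1", "error")
	tokens = appendUniqueTokens(tokens, "error", "warning")
	fmt.Println(tokens) // [trace-4bf1 error warning]
}

In query terms this means that an AND query which puts a rare token (for example a request or trace id) before broad tokens such as "error" lets the bloom filters reject non-matching blocks on the most selective token first (the query shape here is an illustrative assumption, not taken from the diff).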
@@ -118,7 +118,7 @@ func (fa *filterAnd) initByFieldTokens() {
 }
 
 func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
-	m := make(map[string]map[string]struct{})
+	m := make(map[string][]string)
 	var fieldNames []string
 
 	mergeFieldTokens := func(fieldName string, tokens []string) {
@@ -127,15 +127,10 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
 		}
 
 		fieldName = getCanonicalColumnName(fieldName)
-		mTokens, ok := m[fieldName]
-		if !ok {
+		if _, ok := m[fieldName]; !ok {
 			fieldNames = append(fieldNames, fieldName)
-			mTokens = make(map[string]struct{})
-			m[fieldName] = mTokens
 		}
-		for _, token := range tokens {
-			mTokens[token] = struct{}{}
-		}
+		m[fieldName] = append(m[fieldName], tokens...)
 	}
 
 	for _, f := range filters {
@@ -169,8 +164,13 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens {
 	var byFieldTokens []fieldTokens
 	for _, fieldName := range fieldNames {
 		mTokens := m[fieldName]
+		seenTokens := make(map[string]struct{})
 		tokens := make([]string, 0, len(mTokens))
-		for token := range mTokens {
+		for _, token := range mTokens {
+			if _, ok := seenTokens[token]; ok {
+				continue
+			}
+			seenTokens[token] = struct{}{}
 			tokens = append(tokens, token)
 		}
 
@@ -1,6 +1,7 @@
 package logstorage
 
 import (
+	"reflect"
 	"testing"
 )
 
@@ -75,3 +76,92 @@ func TestFilterAnd(t *testing.T) {
 	f(`foo:foo* AND !foo:~bar`, []int{0})
 	f(`foo:foo* AND foo:!~bar`, []int{0})
 }
+
+func TestGetCommonTokensForAndFilters(t *testing.T) {
+	f := func(qStr string, tokensExpected []fieldTokens) {
+		t.Helper()
+
+		q, err := ParseQuery(qStr)
+		if err != nil {
+			t.Fatalf("unexpected error in ParseQuery: %s", err)
+		}
+		fa, ok := q.f.(*filterAnd)
+		if !ok {
+			t.Fatalf("unexpected filter type: %T; want *filterAnd", q.f)
+		}
+		tokens := getCommonTokensForAndFilters(fa.filters)
+
+		if len(tokens) != len(tokensExpected) {
+			t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected)
+		}
+		for i, ft := range tokens {
+			ftExpected := tokensExpected[i]
+			if ft.field != ftExpected.field {
+				t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens)
+			}
+			if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) {
+				t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens)
+			}
+		}
+	}
+
+	f(`foo AND bar`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo", "bar"},
+		},
+	})
+
+	f(`="foo bar" AND ="a foo"* AND "bar foo" AND "foo bar"* AND ~"foo qwe bar.+" AND seq(x, bar, "foo qwe")`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo", "bar", "a", "qwe", "x"},
+		},
+	})
+
+	// extract common tokens from OR filters
+	f(`foo AND (bar OR ~"x bar baz")`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo", "bar"},
+		},
+	})
+
+	// star matches any non-empty token, so it is skipped
+	f(`foo bar *`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo", "bar"},
+		},
+	})
+	f(`* *`, nil)
+
+	// empty filter must be skipped
+	f(`foo "" bar`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo", "bar"},
+		},
+	})
+	f(`"" ""`, nil)
+
+	// unknown filters must be skipped
+	f(`_time:5m !foo "bar baz" x`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"bar", "baz", "x"},
+		},
+	})
+
+	// distinct field names
+	f(`foo:x bar:"a bc" (foo:y OR (bar:qwe AND foo:"z y a"))`, []fieldTokens{
+		{
+			field: "foo",
+			tokens: []string{"x", "y"},
+		},
+		{
+			field: "bar",
+			tokens: []string{"a", "bc"},
+		},
+	})
+}
@@ -182,17 +182,11 @@ func getCommonTokensForOrFilters(filters []filter) []fieldTokens {
 		if len(tokenss) != len(filters) {
 			// The filter for the given fieldName is missing in some OR filters,
 			// so it is impossible to extract common tokens from these filters.
-			// Give up extracting common tokens from the remaining filters,
-			// since they may not cover log entries matching fieldName filters.
-			// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
-			return nil
+			continue
 		}
 		commonTokens := getCommonTokens(tokenss)
 		if len(commonTokens) == 0 {
-			// Give up extracting common tokens from the remaining filters,
-			// since they may not cover log entries matching fieldName filters.
-			// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
-			return nil
+			continue
 		}
 		byFieldTokens = append(byFieldTokens, fieldTokens{
 			field: fieldName,
@@ -1,6 +1,7 @@
 package logstorage
 
 import (
+	"reflect"
 	"testing"
 )
 
@@ -70,3 +71,100 @@ func TestFilterOr(t *testing.T) {
 	f(`foo:baz or !foo:~foo`, []int{2, 3, 5, 7, 8, 9})
 	f(`foo:baz or foo:!~foo`, []int{2, 3, 5, 7, 8, 9})
 }
+
+func TestGetCommonTokensForOrFilters(t *testing.T) {
+	f := func(qStr string, tokensExpected []fieldTokens) {
+		t.Helper()
+
+		q, err := ParseQuery(qStr)
+		if err != nil {
+			t.Fatalf("unexpected error in ParseQuery: %s", err)
+		}
+		fo, ok := q.f.(*filterOr)
+		if !ok {
+			t.Fatalf("unexpected filter type: %T; want *filterOr", q.f)
+		}
+		tokens := getCommonTokensForOrFilters(fo.filters)
+
+		if len(tokens) != len(tokensExpected) {
+			t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected)
+		}
+		for i, ft := range tokens {
+			ftExpected := tokensExpected[i]
+			if ft.field != ftExpected.field {
+				t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens)
+			}
+			if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) {
+				t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens)
+			}
+		}
+	}
+
+	// no common tokens
+	f(`foo OR bar`, nil)
+
+	// star filter matches non-empty value; it is skipped, since one of OR filters may contain an empty filter
+	f(`* OR foo`, nil)
+	f(`foo or *`, nil)
+	f(`* or *`, nil)
+	f(`"" or * or foo`, nil)
+
+	// empty filter suppresses all the common tokens.
+	f(`"" or foo or "foo bar"`, nil)
+	f(`foo or ""`, nil)
+	f(`"" or ""`, nil)
+
+	// common foo token
+	f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo"},
+		},
+	})
+
+	// regexp ending on foo token doesn't contain foo token, since it may match foobar.
+	f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo"`, nil)
+
+	// regexp starting from foo token doesn't contain foo token, since it may match barfoo.
+	f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"foo bar"`, nil)
+
+	// prefix filter ending on foo doesn't contain foo token, since it may match foobar.
+	f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo"* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, nil)
+
+	// bar and foo are common tokens
+	f(`"bar foo baz" OR (foo AND "bar x" AND foobar)`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"bar", "foo"},
+		},
+	})
+
+	// bar and foo are common tokens, x:foobar should be ignored, since it doesn't present in every OR filter
+	f(`"bar foo baz" OR (foo AND "bar x" AND x:foobar)`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"bar", "foo"},
+		},
+	})
+
+	// common tokens for distinct fields
+	f(`(foo AND x:a) OR (x:"b a c" AND ~"aaa foo ")`, []fieldTokens{
+		{
+			field: "_msg",
+			tokens: []string{"foo"},
+		},
+		{
+			field: "x",
+			tokens: []string{"a"},
+		},
+	})
+
+	// zero common tokens for distinct fields
+	f(`(foo AND x:a) OR (x:"b c" AND ~"aaa foo" AND y:z)`, nil)
+
+	// negative filter removes all the matching
+	f(`foo OR !"foo bar"`, nil)
+
+	// time filter removes all the matching
+	f(`_time:5m or foo`, nil)
+}