From 5763a957effe5ee79eeb3e8dc6d461733b49705e Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Thu, 5 Sep 2024 14:14:57 +0200
Subject: [PATCH] lib/logstorage: properly fix incorrect extraction of common
 tokens for `OR` filters at distinct log fields

Previously (f1:foo OR f2:bar) was incorrectly returning `foo` token for `f1` and `bar` token for `f2`.
These tokens were used for checking against bloom filter for every data block, so the data block,
which didn't contain simultaneously `foo` token for `f1` field and `bar` token for `f2` field, was skipped.
This was incorrect, since such a block may contain logs matching the original OR filter.

The fix is to return common tokens from `OR`-delimted filters only if these tokens exist at EVERY such filter
for the given field name. If some `OR`-delimited filter misses the given field name, then `OR`-delimited filters
do not contain common tokens, which could be used for checking against bloom filter.

While at it, add more tests covering various edge cases for filters delimited by AND and OR.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556
---
 docs/VictoriaLogs/CHANGELOG.md    |   2 +-
 lib/logstorage/filter_and.go      |   9 +-
 lib/logstorage/filter_and_test.go | 237 +++++++++++++++++++++++++++++-
 lib/logstorage/filter_or.go       |  27 +++-
 lib/logstorage/filter_or_test.go  | 194 ++++++++++++++++++++++++
 5 files changed, 457 insertions(+), 12 deletions(-)

diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md
index 1859237204..5cf6b55040 100644
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@@ -29,7 +29,7 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta
 
 * BUGFIX: properly handle Logstash requests for Elasticsearch configuration when using `outputs.elasticsearch` in Logstash pipelines. Previously, the requests could be rejected with `400 Bad Request` response. Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4750).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix `not found index.js` error when loading vmui in VictoriaLogs. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6764). Thanks to @yincongcyincong for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6770).
-* BUGFIX: fix filtering when logical operators `AND` and `OR` are used in query expression. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554) for details. Thanks to @yincongcyincong for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556).
+* BUGFIX: properly execute queries with `OR` [filters](https://docs.victoriametrics.com/victorialogs/logsql/#logical-filter) for distinct [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). For example, `field1:foo OR field2:bar`. Previously logs matching these filters may be skipped during querying. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554) for details. Thanks to @yincongcyincong for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556).
 
 ## [v0.28.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.28.0-victorialogs)
 
diff --git a/lib/logstorage/filter_and.go b/lib/logstorage/filter_and.go
index 1ad922c1ff..513d3e23bf 100644
--- a/lib/logstorage/filter_and.go
+++ b/lib/logstorage/filter_and.go
@@ -112,10 +112,6 @@ func (fa *filterAnd) getByFieldTokens() []fieldTokens {
 	return fa.byFieldTokens
 }
 
-// https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
-// and filter shouldn't return or filter which result in
-// bloom filter execute error interception.
-// detail see: https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556#issuecomment-2323643507
 func (fa *filterAnd) initByFieldTokens() {
 	m := make(map[string]map[string]struct{})
 	var fieldNames []string
@@ -157,6 +153,11 @@ func (fa *filterAnd) initByFieldTokens() {
 		case *filterSequence:
 			tokens := t.getTokens()
 			mergeFieldTokens(t.fieldName, tokens)
+		case *filterOr:
+			bfts := t.getByFieldTokens()
+			for _, bft := range bfts {
+				mergeFieldTokens(bft.field, bft.tokens)
+			}
 		}
 	}
 
diff --git a/lib/logstorage/filter_and_test.go b/lib/logstorage/filter_and_test.go
index 080ed790bd..c0f9ddf695 100644
--- a/lib/logstorage/filter_and_test.go
+++ b/lib/logstorage/filter_and_test.go
@@ -25,7 +25,8 @@ func TestFilterAnd(t *testing.T) {
 		},
 	}
 
-	//non-empty intersection
+	// non-empty intersection
+	// foo:a AND foo:abc*
 	fa := &filterAnd{
 		filters: []filter{
 			&filterPhrase{
@@ -41,6 +42,7 @@ func TestFilterAnd(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fa, "foo", []int{2, 6})
 
 	// reverse non-empty intersection
+	// foo:abc* AND foo:a
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPrefix{
@@ -56,6 +58,7 @@ func TestFilterAnd(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fa, "foo", []int{2, 6})
 
 	// the first filter mismatch
+	// foo:bc* AND foo:a
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPrefix{
@@ -71,6 +74,7 @@ func TestFilterAnd(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fa, "foo", nil)
 
 	// the last filter mismatch
+	// foo:abc AND foo:foo*
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPhrase{
@@ -86,6 +90,7 @@ func TestFilterAnd(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fa, "foo", nil)
 
 	// empty intersection
+	// foo:foo AND foo:abc*
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPhrase{
@@ -101,6 +106,7 @@ func TestFilterAnd(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fa, "foo", nil)
 
 	// reverse empty intersection
+	// foo:abc* AND foo:foo
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPrefix{
@@ -115,6 +121,88 @@ func TestFilterAnd(t *testing.T) {
 	}
 	testFilterMatchForColumns(t, columns, fa, "foo", nil)
 
+	// empty value
+	// foo:"" AND bar:""
+	fa = &filterAnd{
+		filters: []filter{
+			&filterExact{
+				fieldName: "foo",
+				value:     "",
+			},
+			&filterExact{
+				fieldName: "bar",
+				value:     "",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{5})
+
+	// non-existing field with empty value
+	// foo:foo* AND bar:""
+	fa = &filterAnd{
+		filters: []filter{
+			&filterPrefix{
+				fieldName: "foo",
+				prefix:    "foo",
+			},
+			&filterExact{
+				fieldName: "bar",
+				value:     "",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 3, 4, 6})
+
+	// reverse non-existing field with empty value
+	// bar:"" AND foo:foo*
+	fa = &filterAnd{
+		filters: []filter{
+			&filterExact{
+				fieldName: "bar",
+				value:     "",
+			},
+			&filterPrefix{
+				fieldName: "foo",
+				prefix:    "foo",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 3, 4, 6})
+
+	// non-existing field with non-empty value
+	// foo:foo* AND bar:*
+	fa = &filterAnd{
+		filters: []filter{
+			&filterPrefix{
+				fieldName: "foo",
+				prefix:    "foo",
+			},
+			&filterPrefix{
+				fieldName: "bar",
+				prefix:    "",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", nil)
+
+	// reverse non-existing field with non-empty value
+	// bar:* AND foo:foo*
+	fa = &filterAnd{
+		filters: []filter{
+			&filterPrefix{
+				fieldName: "bar",
+				prefix:    "",
+			},
+			&filterPrefix{
+				fieldName: "foo",
+				prefix:    "foo",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", nil)
+
+	// https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
+	// foo:"a foo"* AND (foo:="a foobar" OR boo:bbbbbbb)
 	fa = &filterAnd{
 		filters: []filter{
 			&filterPrefix{
@@ -136,4 +224,151 @@ func TestFilterAnd(t *testing.T) {
 		},
 	}
 	testFilterMatchForColumns(t, columns, fa, "foo", []int{1})
+
+	// foo:"a foo"* AND (foo:"abcd foobar" OR foo:foobar)
+	fa = &filterAnd{
+		filters: []filter{
+			&filterPrefix{
+				fieldName: "foo",
+				prefix:    "a foo",
+			},
+			&filterOr{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "abcd foobar",
+					},
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foobar",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{1, 6})
+
+	// (foo:foo* OR bar:baz) AND (bar:x OR foo:a)
+	fa = &filterAnd{
+		filters: []filter{
+			&filterOr{
+				filters: []filter{
+					&filterPrefix{
+						fieldName: "foo",
+						prefix:    "foo",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "baz",
+					},
+				},
+			},
+			&filterOr{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "x",
+					},
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "a",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 3, 4, 6})
+
+	// (foo:foo* OR bar:baz) AND (bar:x OR foo:xyz)
+	fa = &filterAnd{
+		filters: []filter{
+			&filterOr{
+				filters: []filter{
+					&filterPrefix{
+						fieldName: "foo",
+						prefix:    "foo",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "baz",
+					},
+				},
+			},
+			&filterOr{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "x",
+					},
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "xyz",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", nil)
+
+	// (foo:foo* OR bar:baz) AND (bar:* OR foo:xyz)
+	fa = &filterAnd{
+		filters: []filter{
+			&filterOr{
+				filters: []filter{
+					&filterPrefix{
+						fieldName: "foo",
+						prefix:    "foo",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "baz",
+					},
+				},
+			},
+			&filterOr{
+				filters: []filter{
+					&filterPrefix{
+						fieldName: "bar",
+						prefix:    "",
+					},
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "xyz",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", nil)
+
+	// (foo:foo* OR bar:baz) AND (bar:"" OR foo:xyz)
+	fa = &filterAnd{
+		filters: []filter{
+			&filterOr{
+				filters: []filter{
+					&filterPrefix{
+						fieldName: "foo",
+						prefix:    "foo",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "baz",
+					},
+				},
+			},
+			&filterOr{
+				filters: []filter{
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "xyz",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 3, 4, 6})
 }
diff --git a/lib/logstorage/filter_or.go b/lib/logstorage/filter_or.go
index fa5aab0217..db281c1b65 100644
--- a/lib/logstorage/filter_or.go
+++ b/lib/logstorage/filter_or.go
@@ -171,13 +171,28 @@ func (fo *filterOr) initByFieldTokens() {
 
 	var byFieldTokens []fieldTokens
 	for _, fieldName := range fieldNames {
-		commonTokens := getCommonTokens(m[fieldName])
-		if len(commonTokens) > 0 {
-			byFieldTokens = append(byFieldTokens, fieldTokens{
-				field:  fieldName,
-				tokens: commonTokens,
-			})
+		tokenss := m[fieldName]
+		if len(tokenss) != len(fo.filters) {
+			// The filter for the given fieldName is missing in some OR filters,
+			// so it is impossible to extract common tokens from these filters.
+			// Give up extracting common tokens from the remaining filters,
+			// since they may not cover log entries matching fieldName filters.
+			// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
+			byFieldTokens = nil
+			break
 		}
+		commonTokens := getCommonTokens(tokenss)
+		if len(commonTokens) == 0 {
+			// Give up extracting common tokens from the remaining filters,
+			// since they may not cover log entries matching fieldName filters.
+			// This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554
+			byFieldTokens = nil
+			break
+		}
+		byFieldTokens = append(byFieldTokens, fieldTokens{
+			field:  fieldName,
+			tokens: commonTokens,
+		})
 	}
 
 	fo.byFieldTokens = byFieldTokens
diff --git a/lib/logstorage/filter_or_test.go b/lib/logstorage/filter_or_test.go
index 944e6cc07b..d5855de70a 100644
--- a/lib/logstorage/filter_or_test.go
+++ b/lib/logstorage/filter_or_test.go
@@ -26,6 +26,7 @@ func TestFilterOr(t *testing.T) {
 	}
 
 	// non-empty union
+	// foo:23 OR foo:abc
 	fo := &filterOr{
 		filters: []filter{
 			&filterPhrase{
@@ -41,6 +42,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{2, 6, 9})
 
 	// reverse non-empty union
+	// foo:abc OR foo:23
 	fo = &filterOr{
 		filters: []filter{
 			&filterPrefix{
@@ -56,6 +58,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{2, 6, 9})
 
 	// first empty result, second non-empty result
+	// foo:xabc* OR foo:23
 	fo = &filterOr{
 		filters: []filter{
 			&filterPrefix{
@@ -71,6 +74,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{9})
 
 	// first non-empty result, second empty result
+	// foo:23 OR foo:xabc*
 	fo = &filterOr{
 		filters: []filter{
 			&filterPhrase{
@@ -86,6 +90,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{9})
 
 	// first match all
+	// foo:a OR foo:23
 	fo = &filterOr{
 		filters: []filter{
 			&filterPhrase{
@@ -101,6 +106,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})
 
 	// second match all
+	// foo:23 OR foo:a
 	fo = &filterOr{
 		filters: []filter{
 			&filterPrefix{
@@ -116,6 +122,7 @@ func TestFilterOr(t *testing.T) {
 	testFilterMatchForColumns(t, columns, fo, "foo", []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})
 
 	// both empty results
+	// foo:x23 OR foo:xabc
 	fo = &filterOr{
 		filters: []filter{
 			&filterPhrase{
@@ -129,4 +136,191 @@ func TestFilterOr(t *testing.T) {
 		},
 	}
 	testFilterMatchForColumns(t, columns, fo, "foo", nil)
+
+	// non-existing column (last)
+	// foo:23 OR bar:xabc*
+	fo = &filterOr{
+		filters: []filter{
+			&filterPhrase{
+				fieldName: "foo",
+				phrase:    "23",
+			},
+			&filterPrefix{
+				fieldName: "bar",
+				prefix:    "xabc",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{9})
+
+	// non-existing column (first)
+	// bar:xabc* OR foo:23
+	fo = &filterOr{
+		filters: []filter{
+			&filterPhrase{
+				fieldName: "foo",
+				phrase:    "23",
+			},
+			&filterPrefix{
+				fieldName: "bar",
+				prefix:    "xabc",
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{9})
+
+	// (foo:23 AND bar:"") OR (foo:foo AND bar:*)
+	fo = &filterOr{
+		filters: []filter{
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "23",
+					},
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+				},
+			},
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foo",
+					},
+					&filterPrefix{
+						fieldName: "bar",
+						prefix:    "",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{9})
+
+	// (foo:23 AND bar:"") OR (foo:foo AND bar:"")
+	fo = &filterOr{
+		filters: []filter{
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "23",
+					},
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+				},
+			},
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foo",
+					},
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{0, 9})
+
+	// (foo:23 AND bar:"") OR (foo:foo AND baz:"")
+	fo = &filterOr{
+		filters: []filter{
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "23",
+					},
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+				},
+			},
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foo",
+					},
+					&filterExact{
+						fieldName: "baz",
+						value:     "",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{0, 9})
+
+	// (foo:23 AND bar:abc) OR (foo:foo AND bar:"")
+	fo = &filterOr{
+		filters: []filter{
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "23",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "abc",
+					},
+				},
+			},
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foo",
+					},
+					&filterExact{
+						fieldName: "bar",
+						value:     "",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", []int{0})
+
+	// (foo:23 AND bar:abc) OR (foo:foo AND bar:*)
+	fo = &filterOr{
+		filters: []filter{
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "23",
+					},
+					&filterPhrase{
+						fieldName: "bar",
+						phrase:    "abc",
+					},
+				},
+			},
+			&filterAnd{
+				filters: []filter{
+					&filterPhrase{
+						fieldName: "foo",
+						phrase:    "foo",
+					},
+					&filterPrefix{
+						fieldName: "bar",
+						prefix:    "",
+					},
+				},
+			},
+		},
+	}
+	testFilterMatchForColumns(t, columns, fo, "foo", nil)
 }