// Patch summary. This preamble sits ahead of the first "diff --git" header,
// so it is outside every hunk below.
//
// lib/logstorage/filter_regexp.go
//   - filterRegexp gains two fields: tokens ([]string) and tokensOnce (sync.Once).
//   - getTokens() runs initTokens through tokensOnce.Do and returns fr.tokens.
//   - initTokens() takes fr.re.GetLiterals(), overwrites each literal in place
//     with skipFirstLastToken(literal), and stores tokenizeStrings(nil, literals)
//     in fr.tokens.
//   - skipFirstLastToken(s) drops runes from the front of s while isTokenRune
//     reports true, then drops runes from the back the same way, and returns
//     the remainder (e.g. "foo bar baz" -> " bar ", "foobar" -> "").
//     NOTE(review): presumably the edge tokens are dropped because a regexp
//     literal may start or end in the middle of a word in the matched value,
//     so only inner tokens are guaranteed to be whole -- confirm with the author.
//   - applyToBlockSearch calls fr.getTokens() once and passes the result to every
//     per-type matcher except matchValuesDictByRegexp, whose call is unchanged.
//   - The String, Uint8, Uint16, Uint32, Uint64, Float64, IPv4 and
//     TimestampISO8601 matchers now begin with
//     matchBloomFilterAllTokens(bs, ch, tokens); when it returns false they call
//     bm.resetBits() and return before visiting any values.
//
// lib/logstorage/filter_regexp_test.go
//   - Adds TestSkipFirstLastToken, covering empty, ASCII and Cyrillic inputs.
//
// lib/regexutil/regex.go
//   - Adds (*Regex).GetLiterals: parses r.exprStr with mustParseRegexp, unwraps
//     enclosing syntax.OpCapture nodes, and then
//       * returns a one-element slice when getLiteral succeeds on the result;
//       * returns nil when the result is not a syntax.OpConcat;
//       * otherwise returns, in order, the values of the direct sub-expressions
//         for which getLiteral succeeds.
//
// lib/regexutil/regex_test.go
//   - Imports "reflect" and adds TestGetLiterals.
//
diff --git a/lib/logstorage/filter_regexp.go b/lib/logstorage/filter_regexp.go index 28aa4aa40..5e88ada0b 100644 --- a/lib/logstorage/filter_regexp.go +++ b/lib/logstorage/filter_regexp.go @@ -2,6 +2,8 @@ package logstorage import ( "fmt" + "sync" + "unicode/utf8" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil" @@ -13,6 +15,9 @@ import ( type filterRegexp struct { fieldName string re *regexutil.Regex + + tokens []string + tokensOnce sync.Once } func (fr *filterRegexp) String() string { @@ -23,6 +28,37 @@ func (fr *filterRegexp) updateNeededFields(neededFields fieldsSet) { neededFields.add(fr.fieldName) } +func (fr *filterRegexp) getTokens() []string { + fr.tokensOnce.Do(fr.initTokens) + return fr.tokens +} + +func (fr *filterRegexp) initTokens() { + literals := fr.re.GetLiterals() + for i, literal := range literals { + literals[i] = skipFirstLastToken(literal) + } + fr.tokens = tokenizeStrings(nil, literals) +} + +func skipFirstLastToken(s string) string { + for { + r, runeSize := utf8.DecodeRuneInString(s) + if !isTokenRune(r) { + break + } + s = s[runeSize:] + } + for { + r, runeSize := utf8.DecodeLastRuneInString(s) + if !isTokenRune(r) { + break + } + s = s[:len(s)-runeSize] + } + return s +} + func (fr *filterRegexp) applyToBlockResult(br *blockResult, bm *bitmap) { re := fr.re applyToBlockResultGeneric(br, bm, fr.fieldName, "", func(v, _ string) bool { @@ -53,31 +89,37 @@ func (fr *filterRegexp) applyToBlockSearch(bs *blockSearch, bm *bitmap) { return } + tokens := fr.getTokens() + switch ch.valueType { case valueTypeString: - matchStringByRegexp(bs, ch, bm, re) + matchStringByRegexp(bs, ch, bm, re, tokens) case valueTypeDict: matchValuesDictByRegexp(bs, ch, bm, re) case valueTypeUint8: - matchUint8ByRegexp(bs, ch, bm, re) + matchUint8ByRegexp(bs, ch, bm, re, tokens) case valueTypeUint16: - matchUint16ByRegexp(bs, ch, bm, re) + matchUint16ByRegexp(bs, ch, bm, re, tokens) case valueTypeUint32: 
- matchUint32ByRegexp(bs, ch, bm, re) + matchUint32ByRegexp(bs, ch, bm, re, tokens) case valueTypeUint64: - matchUint64ByRegexp(bs, ch, bm, re) + matchUint64ByRegexp(bs, ch, bm, re, tokens) case valueTypeFloat64: - matchFloat64ByRegexp(bs, ch, bm, re) + matchFloat64ByRegexp(bs, ch, bm, re, tokens) case valueTypeIPv4: - matchIPv4ByRegexp(bs, ch, bm, re) + matchIPv4ByRegexp(bs, ch, bm, re, tokens) case valueTypeTimestampISO8601: - matchTimestampISO8601ByRegexp(bs, ch, bm, re) + matchTimestampISO8601ByRegexp(bs, ch, bm, re, tokens) default: logger.Panicf("FATAL: %s: unknown valueType=%d", bs.partPath(), ch.valueType) } } -func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toTimestampISO8601String(bs, bb, v) @@ -86,7 +128,11 @@ func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap bbPool.Put(bb) } -func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toIPv4String(bs, bb, v) @@ -95,7 +141,11 @@ func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexu bbPool.Put(bb) } -func matchFloat64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchFloat64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v 
string) bool { s := toFloat64String(bs, bb, v) @@ -117,13 +167,21 @@ func matchValuesDictByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re * bbPool.Put(bb) } -func matchStringByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchStringByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } visitValues(bs, ch, bm, func(v string) bool { return re.MatchString(v) }) } -func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toUint8String(bs, bb, v) @@ -132,7 +190,11 @@ func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regex bbPool.Put(bb) } -func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toUint16String(bs, bb, v) @@ -141,7 +203,11 @@ func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *rege bbPool.Put(bb) } -func matchUint32ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchUint32ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toUint32String(bs, bb, v) @@ -150,7 +216,11 @@ func matchUint32ByRegexp(bs *blockSearch, ch 
*columnHeader, bm *bitmap, re *rege bbPool.Put(bb) } -func matchUint64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) { +func matchUint64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) { + if !matchBloomFilterAllTokens(bs, ch, tokens) { + bm.resetBits() + return + } bb := bbPool.Get() visitValues(bs, ch, bm, func(v string) bool { s := toUint64String(bs, bb, v) diff --git a/lib/logstorage/filter_regexp_test.go b/lib/logstorage/filter_regexp_test.go index 30173290f..8939b6a57 100644 --- a/lib/logstorage/filter_regexp_test.go +++ b/lib/logstorage/filter_regexp_test.go @@ -370,6 +370,25 @@ func TestFilterRegexp(t *testing.T) { }) } +func TestSkipFirstLastToken(t *testing.T) { + f := func(s, resultExpected string) { + t.Helper() + + result := skipFirstLastToken(s) + if result != resultExpected { + t.Fatalf("unexpected result in skipFirstLastToken(%q); got %q; want %q", s, result, resultExpected) + } + } + + f("", "") + f("foobar", "") + f("foo bar", " ") + f("foo bar baz", " bar ") + f(" foo bar baz", " foo bar ") + f(",foo bar baz!", ",foo bar baz!") + f("фыад длоа д!", " длоа д!") +} + func mustCompileRegex(expr string) *regexutil.Regex { re, err := regexutil.NewRegex(expr) if err != nil { diff --git a/lib/regexutil/regex.go b/lib/regexutil/regex.go index 93963de35..8cdfacbf8 100644 --- a/lib/regexutil/regex.go +++ b/lib/regexutil/regex.go @@ -94,6 +94,32 @@ func (r *Regex) MatchString(s string) bool { return r.matchStringWithPrefix(s) } +// GetLiterals returns literals for r. 
+func (r *Regex) GetLiterals() []string { + sre := mustParseRegexp(r.exprStr) + for sre.Op == syntax.OpCapture { + sre = sre.Sub[0] + } + + v, ok := getLiteral(sre) + if ok { + return []string{v} + } + + if sre.Op != syntax.OpConcat { + return nil + } + + var a []string + for _, sub := range sre.Sub { + v, ok := getLiteral(sub) + if ok { + a = append(a, v) + } + } + return a +} + // String returns string represetnation for r func (r *Regex) String() string { return r.exprStr diff --git a/lib/regexutil/regex_test.go b/lib/regexutil/regex_test.go index d0e3a0380..2fedc2d81 100644 --- a/lib/regexutil/regex_test.go +++ b/lib/regexutil/regex_test.go @@ -1,6 +1,7 @@ package regexutil import ( + "reflect" "testing" ) @@ -144,3 +145,27 @@ func TestRegexMatchString(t *testing.T) { f("foo(bar|baz)", "a fooxfooban a", false) f("foo(bar|baz)", "a fooxfooban foobar a", true) } + +func TestGetLiterals(t *testing.T) { + f := func(expr string, literalsExpected []string) { + t.Helper() + + r, err := NewRegex(expr) + if err != nil { + t.Fatalf("cannot parse %q: %s", expr, err) + } + literals := r.GetLiterals() + if !reflect.DeepEqual(literals, literalsExpected) { + t.Fatalf("unexpected literals; got %q; want %q", literals, literalsExpected) + } + } + + f("", nil) + f("foo bar baz", []string{"foo bar baz"}) + f("foo.*bar(a|b)baz.+", []string{"foo", "bar", "baz"}) + f("(foo[ab](?:bar))", []string{"foo", "bar"}) + f("foo|bar", nil) + f("((foo|bar)baz xxx(?:yzabc))", []string{"baz xxxyzabc"}) + f("((foo|bar)baz? xxx(?:yzabc)*)", []string{"baz xxx"}) + f("((foo|bar)baz? xxx(?:yzabc)*)", []string{"ba", " xxx"}) +}