This commit is contained in:
Aliaksandr Valialkin 2024-05-23 22:54:21 +02:00
parent 892afe25d1
commit 03b9d7977d
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
4 changed files with 156 additions and 16 deletions

View file

@ -2,6 +2,8 @@ package logstorage
import (
"fmt"
"sync"
"unicode/utf8"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
@ -13,6 +15,9 @@ import (
// filterRegexp is a filter that matches values of the fieldName field against the re regexp.
type filterRegexp struct {
// fieldName is the name of the field the filter is applied to.
fieldName string
// re is the compiled regexp the field values are matched against.
re *regexutil.Regex
// tokens holds the tokens extracted from the literal parts of re.
// It is filled by initTokens and must be read via getTokens.
tokens []string
// tokensOnce guards the one-time initialization of tokens.
tokensOnce sync.Once
}
func (fr *filterRegexp) String() string {
@ -23,6 +28,37 @@ func (fr *filterRegexp) updateNeededFields(neededFields fieldsSet) {
neededFields.add(fr.fieldName)
}
// getTokens returns the tokens derived from the literal parts of fr.re.
//
// The tokens are computed by initTokens on the first call
// and the stored result is returned on every call.
func (fr *filterRegexp) getTokens() []string {
	fr.tokensOnce.Do(func() {
		fr.initTokens()
	})
	return fr.tokens
}
// initTokens fills fr.tokens with the tokens obtained from the literals of fr.re.
//
// Every literal is passed through skipFirstLastToken before tokenization.
// NOTE(review): presumably the outermost tokens are dropped because a literal
// may start or end in the middle of a longer word in the matched value - confirm.
func (fr *filterRegexp) initTokens() {
	trimmed := fr.re.GetLiterals()
	for idx := range trimmed {
		trimmed[idx] = skipFirstLastToken(trimmed[idx])
	}
	fr.tokens = tokenizeStrings(nil, trimmed)
}
// skipFirstLastToken returns s without its leading and trailing runs of token runes.
//
// Runes are dropped from the start of s for as long as isTokenRune reports true,
// then runes are dropped from the end of the remainder in the same way.
// A string made solely of token runes therefore becomes empty.
func skipFirstLastToken(s string) string {
	// Strip the token at the beginning of s, if any.
	for r, n := utf8.DecodeRuneInString(s); isTokenRune(r); r, n = utf8.DecodeRuneInString(s) {
		s = s[n:]
	}

	// Strip the token at the end of what is left, if any.
	for r, n := utf8.DecodeLastRuneInString(s); isTokenRune(r); r, n = utf8.DecodeLastRuneInString(s) {
		s = s[:len(s)-n]
	}

	return s
}
func (fr *filterRegexp) applyToBlockResult(br *blockResult, bm *bitmap) {
re := fr.re
applyToBlockResultGeneric(br, bm, fr.fieldName, "", func(v, _ string) bool {
@ -53,31 +89,37 @@ func (fr *filterRegexp) applyToBlockSearch(bs *blockSearch, bm *bitmap) {
return
}
tokens := fr.getTokens()
switch ch.valueType {
case valueTypeString:
matchStringByRegexp(bs, ch, bm, re)
matchStringByRegexp(bs, ch, bm, re, tokens)
case valueTypeDict:
matchValuesDictByRegexp(bs, ch, bm, re)
case valueTypeUint8:
matchUint8ByRegexp(bs, ch, bm, re)
matchUint8ByRegexp(bs, ch, bm, re, tokens)
case valueTypeUint16:
matchUint16ByRegexp(bs, ch, bm, re)
matchUint16ByRegexp(bs, ch, bm, re, tokens)
case valueTypeUint32:
matchUint32ByRegexp(bs, ch, bm, re)
matchUint32ByRegexp(bs, ch, bm, re, tokens)
case valueTypeUint64:
matchUint64ByRegexp(bs, ch, bm, re)
matchUint64ByRegexp(bs, ch, bm, re, tokens)
case valueTypeFloat64:
matchFloat64ByRegexp(bs, ch, bm, re)
matchFloat64ByRegexp(bs, ch, bm, re, tokens)
case valueTypeIPv4:
matchIPv4ByRegexp(bs, ch, bm, re)
matchIPv4ByRegexp(bs, ch, bm, re, tokens)
case valueTypeTimestampISO8601:
matchTimestampISO8601ByRegexp(bs, ch, bm, re)
matchTimestampISO8601ByRegexp(bs, ch, bm, re, tokens)
default:
logger.Panicf("FATAL: %s: unknown valueType=%d", bs.partPath(), ch.valueType)
}
}
func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toTimestampISO8601String(bs, bb, v)
@ -86,7 +128,11 @@ func matchTimestampISO8601ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap
bbPool.Put(bb)
}
func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toIPv4String(bs, bb, v)
@ -95,7 +141,11 @@ func matchIPv4ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexu
bbPool.Put(bb)
}
func matchFloat64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchFloat64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toFloat64String(bs, bb, v)
@ -117,13 +167,21 @@ func matchValuesDictByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *
bbPool.Put(bb)
}
func matchStringByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
// matchStringByRegexp keeps in bm only the bits for values in the ch column that match re.
//
// All bits in bm are reset without visiting any values when
// matchBloomFilterAllTokens reports false for the given tokens.
func matchStringByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
	if matchBloomFilterAllTokens(bs, ch, tokens) {
		matches := func(v string) bool {
			return re.MatchString(v)
		}
		visitValues(bs, ch, bm, matches)
		return
	}

	// The bloom filter check failed, so nothing in this block is kept.
	bm.resetBits()
}
func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toUint8String(bs, bb, v)
@ -132,7 +190,11 @@ func matchUint8ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regex
bbPool.Put(bb)
}
func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toUint16String(bs, bb, v)
@ -141,7 +203,11 @@ func matchUint16ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *rege
bbPool.Put(bb)
}
func matchUint32ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchUint32ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toUint32String(bs, bb, v)
@ -150,7 +216,11 @@ func matchUint32ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *rege
bbPool.Put(bb)
}
func matchUint64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex) {
func matchUint64ByRegexp(bs *blockSearch, ch *columnHeader, bm *bitmap, re *regexutil.Regex, tokens []string) {
if !matchBloomFilterAllTokens(bs, ch, tokens) {
bm.resetBits()
return
}
bb := bbPool.Get()
visitValues(bs, ch, bm, func(v string) bool {
s := toUint64String(bs, bb, v)

View file

@ -370,6 +370,25 @@ func TestFilterRegexp(t *testing.T) {
})
}
// TestSkipFirstLastToken verifies that skipFirstLastToken strips the leading
// and the trailing token from the input and leaves everything else intact.
func TestSkipFirstLastToken(t *testing.T) {
	check := func(input, want string) {
		t.Helper()

		if got := skipFirstLastToken(input); got != want {
			t.Fatalf("unexpected result in skipFirstLastToken(%q); got %q; want %q", input, got, want)
		}
	}

	// Empty input and a single token
	check("", "")
	check("foobar", "")

	// Tokens on both sides
	check("foo bar", " ")
	check("foo bar baz", " bar ")

	// Non-token runes at the edges protect the adjacent tokens
	check(" foo bar baz", " foo bar ")
	check(",foo bar baz!", ",foo bar baz!")

	// Multi-byte runes
	check("фыад длоа д!", " длоа д!")
}
func mustCompileRegex(expr string) *regexutil.Regex {
re, err := regexutil.NewRegex(expr)
if err != nil {

View file

@ -94,6 +94,32 @@ func (r *Regex) MatchString(s string) bool {
return r.matchStringWithPrefix(s)
}
// GetLiterals returns literals for r.
//
// The expression is parsed again from r.exprStr and the enclosing capture
// groups are unwrapped. If getLiteral recognizes the whole expression
// as a literal, then it is returned as the only item.
// If the expression is a concatenation, then the literals
// of its direct sub-expressions are returned in their original order.
// Nil is returned in all the other cases.
func (r *Regex) GetLiterals() []string {
	node := mustParseRegexp(r.exprStr)

	// Unwrap the outer capture groups.
	for node.Op == syntax.OpCapture {
		node = node.Sub[0]
	}

	if whole, isLiteral := getLiteral(node); isLiteral {
		return []string{whole}
	}
	if node.Op != syntax.OpConcat {
		return nil
	}

	var literals []string
	for _, part := range node.Sub {
		lit, isLiteral := getLiteral(part)
		if !isLiteral {
			continue
		}
		literals = append(literals, lit)
	}
	return literals
}
// String returns string representation for r
func (r *Regex) String() string {
return r.exprStr

View file

@ -1,6 +1,7 @@
package regexutil
import (
"reflect"
"testing"
)
@ -144,3 +145,27 @@ func TestRegexMatchString(t *testing.T) {
f("foo(bar|baz)", "a fooxfooban a", false)
f("foo(bar|baz)", "a fooxfooban foobar a", true)
}
// TestGetLiterals verifies the literals returned by Regex.GetLiterals for various expressions.
func TestGetLiterals(t *testing.T) {
	check := func(expr string, want []string) {
		t.Helper()

		re, err := NewRegex(expr)
		if err != nil {
			t.Fatalf("cannot parse %q: %s", expr, err)
		}
		if got := re.GetLiterals(); !reflect.DeepEqual(got, want) {
			t.Fatalf("unexpected literals; got %q; want %q", got, want)
		}
	}

	// Empty expression
	check("", nil)

	// The whole expression is a literal
	check("foo bar baz", []string{"foo bar baz"})

	// Literals mixed with non-literal parts
	check("foo.*bar(a|b)baz.+", []string{"foo", "bar", "baz"})
	check("(foo[ab](?:bar))", []string{"foo", "bar"})

	// Top-level alternation
	check("foo|bar", nil)

	// Nested groups
	check("((foo|bar)baz xxx(?:yzabc))", []string{"baz xxxyzabc"})
	check("((foo|bar)baz xxx(?:yzabc)*)", []string{"baz xxx"})
	check("((foo|bar)baz? xxx(?:yzabc)*)", []string{"ba", " xxx"})
}