evaluate the execution cost of all tag filters (#824)

* evaluate the execution cost of all tag filters

* fix suffixes typo
This commit is contained in:
faceair 2020-10-16 16:46:55 -05:00 committed by Aliaksandr Valialkin
parent 35791d9b29
commit 8ddf089deb
3 changed files with 251 additions and 23 deletions

View file

@ -2186,7 +2186,7 @@ func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, filter *uint64set
} }
metricIDs := &uint64set.Set{} metricIDs := &uint64set.Set{}
if len(tf.orSuffixes) > 0 { if len(tf.orSuffixes) > 0 {
// Fast path for orSuffixes - seek for rows for each value from orSuffxies. // Fast path for orSuffixes - seek for rows for each value from orSuffixes.
if err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs); err != nil { if err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs); err != nil {
if err == errFallbackToMetricNameMatch { if err == errFallbackToMetricNameMatch {
return nil, err return nil, err
@ -2596,6 +2596,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter
// This way we limit the amount of work below by applying more specific filters at first. // This way we limit the amount of work below by applying more specific filters at first.
type tagFilterWithCount struct { type tagFilterWithCount struct {
tf *tagFilter tf *tagFilter
cost uint64
count uint64 count uint64
} }
tfsWithCount := make([]tagFilterWithCount, len(tfs.tfs)) tfsWithCount := make([]tagFilterWithCount, len(tfs.tfs))
@ -2611,13 +2612,14 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter
} }
tfsWithCount[i] = tagFilterWithCount{ tfsWithCount[i] = tagFilterWithCount{
tf: tf, tf: tf,
cost: count * tf.matchCost,
count: count, count: count,
} }
} }
sort.Slice(tfsWithCount, func(i, j int) bool { sort.Slice(tfsWithCount, func(i, j int) bool {
a, b := &tfsWithCount[i], &tfsWithCount[j] a, b := &tfsWithCount[i], &tfsWithCount[j]
if a.count != b.count { if a.cost != b.cost {
return a.count < b.count return a.cost < b.cost
} }
return a.tf.Less(b.tf) return a.tf.Less(b.tf)
}) })

View file

@ -165,6 +165,7 @@ type tagFilter struct {
value []byte value []byte
isNegative bool isNegative bool
isRegexp bool isRegexp bool
matchCost uint64
// Prefix always contains {nsPrefixTagToMetricIDs, AccountID, ProjectID, key}. // Prefix always contains {nsPrefixTagToMetricIDs, AccountID, ProjectID, key}.
// Additionally it contains: // Additionally it contains:
@ -285,6 +286,7 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp
// during the search for matching metricIDs. // during the search for matching metricIDs.
tf.orSuffixes = append(tf.orSuffixes[:0], "") tf.orSuffixes = append(tf.orSuffixes[:0], "")
tf.isEmptyMatch = len(prefix) == 0 tf.isEmptyMatch = len(prefix) == 0
tf.matchCost = defaultCost
return nil return nil
} }
rcv, err := getRegexpFromCache(expr) rcv, err := getRegexpFromCache(expr)
@ -293,6 +295,7 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp
} }
tf.orSuffixes = append(tf.orSuffixes[:0], rcv.orValues...) tf.orSuffixes = append(tf.orSuffixes[:0], rcv.orValues...)
tf.reSuffixMatch = rcv.reMatch tf.reSuffixMatch = rcv.reMatch
tf.matchCost = rcv.reCost
tf.isEmptyMatch = len(prefix) == 0 && tf.reSuffixMatch(nil) tf.isEmptyMatch = len(prefix) == 0 && tf.reSuffixMatch(nil)
if !tf.isNegative && len(key) == 0 && strings.IndexByte(rcv.literalSuffix, '.') >= 0 { if !tf.isNegative && len(key) == 0 && strings.IndexByte(rcv.literalSuffix, '.') >= 0 {
// Reverse suffix is needed only for non-negative regexp filters on __name__ that contains dots. // Reverse suffix is needed only for non-negative regexp filters on __name__ that contains dots.
@ -357,6 +360,7 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) {
sExpr := string(expr) sExpr := string(expr)
orValues := getOrValues(sExpr) orValues := getOrValues(sExpr)
var reMatch func(b []byte) bool var reMatch func(b []byte) bool
var reCost uint64
var literalSuffix string var literalSuffix string
if len(orValues) > 0 { if len(orValues) > 0 {
if len(orValues) == 1 { if len(orValues) == 1 {
@ -364,6 +368,7 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) {
reMatch = func(b []byte) bool { reMatch = func(b []byte) bool {
return string(b) == v return string(b) == v
} }
reCost = defaultLiteralCost
} else { } else {
reMatch = func(b []byte) bool { reMatch = func(b []byte) bool {
for _, v := range orValues { for _, v := range orValues {
@ -373,14 +378,16 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) {
} }
return false return false
} }
reCost = uint64(len(orValues)) * defaultLiteralCost
} }
} else { } else {
reMatch, literalSuffix = getOptimizedReMatchFunc(re.Match, sExpr) reMatch, literalSuffix, reCost = getOptimizedReMatchFunc(re.Match, sExpr)
} }
// Put the reMatch in the cache. // Put the reMatch in the cache.
rcv.orValues = orValues rcv.orValues = orValues
rcv.reMatch = reMatch rcv.reMatch = reMatch
rcv.reCost = reCost
rcv.literalSuffix = literalSuffix rcv.literalSuffix = literalSuffix
regexpCacheLock.Lock() regexpCacheLock.Lock()
@ -415,32 +422,40 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) {
// It returns reMatch if it cannot find optimized function. // It returns reMatch if it cannot find optimized function.
// //
// It also returns literal suffix from the expr. // It also returns literal suffix from the expr.
func getOptimizedReMatchFunc(reMatch func(b []byte) bool, expr string) (func(b []byte) bool, string) { func getOptimizedReMatchFunc(reMatch func(b []byte) bool, expr string) (func(b []byte) bool, string, uint64) {
sre, err := syntax.Parse(expr, syntax.Perl) sre, err := syntax.Parse(expr, syntax.Perl)
if err != nil { if err != nil {
logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err) logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err)
} }
if matchFunc, literalSuffix := getOptimizedReMatchFuncExt(reMatch, sre); matchFunc != nil { if matchFunc, literalSuffix, reCost := getOptimizedReMatchFuncExt(reMatch, sre); matchFunc != nil {
// Found optimized function for matching the expr. // Found optimized function for matching the expr.
suffixUnescaped := tagCharsReverseRegexpEscaper.Replace(literalSuffix) suffixUnescaped := tagCharsReverseRegexpEscaper.Replace(literalSuffix)
return matchFunc, suffixUnescaped return matchFunc, suffixUnescaped, reCost
} }
// Fall back to un-optimized reMatch. // Fall back to un-optimized reMatch.
return reMatch, "" return reMatch, "", defaultReCost
} }
func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) (func(b []byte) bool, string) { // The following & default cost values are returned from BenchmarkOptimizedReMatchCost
var (
defaultCost uint64 = 1
defaultLiteralCost uint64 = 3
defaultReCost uint64 = 140
)
func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) (func(b []byte) bool, string, uint64) {
if isDotStar(sre) { if isDotStar(sre) {
// '.*' // '.*'
return func(b []byte) bool { return func(b []byte) bool {
return true return true
}, "" }, "", 1
} }
if isDotPlus(sre) { if isDotPlus(sre) {
// '.+' // '.+'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > 0 return len(b) > 0
}, "" }, "", 1
} }
switch sre.Op { switch sre.Op {
case syntax.OpCapture: case syntax.OpCapture:
@ -448,13 +463,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
return getOptimizedReMatchFuncExt(reMatch, sre.Sub[0]) return getOptimizedReMatchFuncExt(reMatch, sre.Sub[0])
case syntax.OpLiteral: case syntax.OpLiteral:
if !isLiteral(sre) { if !isLiteral(sre) {
return nil, "" return nil, "", 0
} }
s := string(sre.Rune) s := string(sre.Rune)
// Literal match // Literal match
return func(b []byte) bool { return func(b []byte) bool {
return string(b) == s return string(b) == s
}, s }, s, defaultLiteralCost
case syntax.OpConcat: case syntax.OpConcat:
if len(sre.Sub) == 2 { if len(sre.Sub) == 2 {
if isLiteral(sre.Sub[0]) { if isLiteral(sre.Sub[0]) {
@ -463,13 +478,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
// 'prefix.*' // 'prefix.*'
return func(b []byte) bool { return func(b []byte) bool {
return bytes.HasPrefix(b, prefix) return bytes.HasPrefix(b, prefix)
}, "" }, "", 2
} }
if isDotPlus(sre.Sub[1]) { if isDotPlus(sre.Sub[1]) {
// 'prefix.+' // 'prefix.+'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > len(prefix) && bytes.HasPrefix(b, prefix) return len(b) > len(prefix) && bytes.HasPrefix(b, prefix)
}, "" }, "", 2
} }
} }
if isLiteral(sre.Sub[1]) { if isLiteral(sre.Sub[1]) {
@ -478,13 +493,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
// '.*suffix' // '.*suffix'
return func(b []byte) bool { return func(b []byte) bool {
return bytes.HasSuffix(b, suffix) return bytes.HasSuffix(b, suffix)
}, string(suffix) }, string(suffix), 3
} }
if isDotPlus(sre.Sub[0]) { if isDotPlus(sre.Sub[0]) {
// '.+suffix' // '.+suffix'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > len(suffix) && bytes.HasSuffix(b[1:], suffix) return len(b) > len(suffix) && bytes.HasSuffix(b[1:], suffix)
}, string(suffix) }, string(suffix), 3
} }
} }
} }
@ -495,13 +510,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
// '.*middle.*' // '.*middle.*'
return func(b []byte) bool { return func(b []byte) bool {
return bytes.Contains(b, middle) return bytes.Contains(b, middle)
}, "" }, "", 5
} }
if isDotPlus(sre.Sub[2]) { if isDotPlus(sre.Sub[2]) {
// '.*middle.+' // '.*middle.+'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > len(middle) && bytes.Contains(b[:len(b)-1], middle) return len(b) > len(middle) && bytes.Contains(b[:len(b)-1], middle)
}, "" }, "", 5
} }
} }
if isDotPlus(sre.Sub[0]) { if isDotPlus(sre.Sub[0]) {
@ -509,13 +524,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
// '.+middle.*' // '.+middle.*'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > len(middle) && bytes.Contains(b[1:], middle) return len(b) > len(middle) && bytes.Contains(b[1:], middle)
}, "" }, "", 5
} }
if isDotPlus(sre.Sub[2]) { if isDotPlus(sre.Sub[2]) {
// '.+middle.+' // '.+middle.+'
return func(b []byte) bool { return func(b []byte) bool {
return len(b) > len(middle)+1 && bytes.Contains(b[1:len(b)-1], middle) return len(b) > len(middle)+1 && bytes.Contains(b[1:len(b)-1], middle)
}, "" }, "", 5
} }
} }
} }
@ -549,9 +564,9 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp)
} }
// Fall back to slow path. // Fall back to slow path.
return reMatch(bOrig) return reMatch(bOrig)
}, string(suffix) }, string(suffix), defaultReCost
default: default:
return nil, "" return nil, "", 0
} }
} }
@ -738,6 +753,7 @@ var (
type regexpCacheValue struct { type regexpCacheValue struct {
orValues []string orValues []string
reMatch func(b []byte) bool reMatch func(b []byte) bool
reCost uint64
literalSuffix string literalSuffix string
} }

View file

@ -1,6 +1,8 @@
package storage package storage
import ( import (
"bytes"
"regexp"
"testing" "testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@ -307,3 +309,211 @@ func BenchmarkTagFilterMatchSuffix(b *testing.B) {
}) })
}) })
} }
// BenchmarkOptimizedReMatchCost measures the execution cost of each match
// function shape exercised below: a trivial length check, literal and
// or-values comparisons, prefix / suffix / middle matching and a compiled
// regexp.
//
// Run the following commands to get the execution cost of all matches:
//
//	go test -run=none -bench=BenchmarkOptimizedReMatchCost -count 20 | tee cost.txt
//	benchstat ./cost.txt
//
// Then divide the time of every sub-benchmark by the time of the "default"
// sub-benchmark in order to get the cost multiplier for the given match.
//
// Every sub-benchmark has a unique name, so `go test` does not append
// de-duplication suffixes such as "#01" to the reported names.
func BenchmarkOptimizedReMatchCost(b *testing.B) {
	// input is the value matched by most of the sub-benchmarks.
	input := []byte("foo1.bar.baz.sss.ddd")

	// run registers a sub-benchmark with the given name, which calls
	// reMatch(value) in a parallel loop.
	run := func(name string, value []byte, reMatch func(v []byte) bool) {
		b.Run(name, func(b *testing.B) {
			b.ReportAllocs()
			b.SetBytes(1)
			b.RunParallel(func(pb *testing.PB) {
				for pb.Next() {
					reMatch(value)
				}
			})
		})
	}

	// The cheapest possible check. It is the unit other costs are compared to.
	run("default", input, func(v []byte) bool {
		return len(v) == 0
	})

	literal := "foo1.bar.baz.sss.ddd"
	run("literal match", []byte(literal), func(v []byte) bool {
		return string(v) == literal
	})

	// The input matches none of the or-values, so all of them are compared.
	orValues := []string{"foo", "bar", "baz"}
	run("foo|bar|baz", []byte("ddd"), func(v []byte) bool {
		for _, s := range orValues {
			if string(v) == s {
				return true
			}
		}
		return false
	})

	run(".*", input, func(v []byte) bool {
		return true
	})
	run(".+", input, func(v []byte) bool {
		return len(v) > 0
	})

	prefix := []byte("foo1.bar")
	run("prefix.*", input, func(v []byte) bool {
		return bytes.HasPrefix(v, prefix)
	})
	run("prefix.+", input, func(v []byte) bool {
		return len(v) > len(prefix) && bytes.HasPrefix(v, prefix)
	})

	suffix := []byte("sss.ddd")
	run(".*suffix", input, func(v []byte) bool {
		return bytes.HasSuffix(v, suffix)
	})
	run(".+suffix", input, func(v []byte) bool {
		return len(v) > len(suffix) && bytes.HasSuffix(v[1:], suffix)
	})

	middle := []byte("bar.baz")
	run(".*middle.*", input, func(v []byte) bool {
		return bytes.Contains(v, middle)
	})
	run(".*middle.+", input, func(v []byte) bool {
		return len(v) > len(middle) && bytes.Contains(v[:len(v)-1], middle)
	})
	run(".+middle.*", input, func(v []byte) bool {
		return len(v) > len(middle) && bytes.Contains(v[1:], middle)
	})
	run(".+middle.+", input, func(v []byte) bool {
		return len(v) > len(middle)+1 && bytes.Contains(v[1:len(v)-1], middle)
	})

	// A regexp without an optimized match function.
	// This sub-benchmark was previously registered under a second "default"
	// name, which made it indistinguishable from the baseline above.
	re := regexp.MustCompile(`foo[^.]*?\.bar\.baz\.[^.]*?\.ddd`)
	run("regexp", input, func(v []byte) bool {
		return re.Match(v)
	})
}