lib/promrelabel: optimize matching for commonly used regex patterns in if option

The following regex patterns are optimized:

- literal string match, e.g. "foo"
- prefix match, e.g. "foo.*" and "foo.+"
- substring match, e.g. ".*foo.*" and ".+foo.+"
- alternate values match, e.g. "foo|bar|baz"
This commit is contained in:
Aliaksandr Valialkin 2022-08-26 14:53:02 +03:00
parent 0ad3bbadd3
commit 7afe8450fc
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
5 changed files with 318 additions and 10 deletions

View file

@ -23,7 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs. * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs.
* FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922). * FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922).
* FEATURE: improve the performance for `action: keep`, `action: drop`, `action: labelkeep` and `action: labeldrop` relabeling rules for `regex` containing the list of matching values. For example, `regex: "foo|bar|baz"`. * FEATURE: improve the performance for relabeling rules with commonly used regular expressions in `regex` and `if` fields such as `some_string`, `prefix.*`, `prefix.+`, `foo|bar|baz`, `.*foo.*` and `.+foo.+`.
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert/<tenantID>/opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert/<tenantID>/opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015).
* FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details. * FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details.
* FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`. * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`.

View file

@ -3,10 +3,10 @@ package promrelabel
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"regexp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
"github.com/VictoriaMetrics/metricsql" "github.com/VictoriaMetrics/metricsql"
) )
@ -105,7 +105,7 @@ type labelFilter struct {
value string value string
// re contains compiled regexp for `=~` and `!~` op. // re contains compiled regexp for `=~` and `!~` op.
re *regexp.Regexp re *regexutil.PromRegex
} }
func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) { func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
@ -115,10 +115,7 @@ func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
value: mlf.Value, value: mlf.Value,
} }
if lf.op == "=~" || lf.op == "!~" { if lf.op == "=~" || lf.op == "!~" {
// PromQL regexps are anchored by default. re, err := regexutil.NewPromRegex(lf.value)
// See https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors
reString := "^(?:" + lf.value + ")$"
re, err := regexp.Compile(reString)
if err != nil { if err != nil {
return nil, fmt.Errorf("cannot parse regexp for %s: %w", mlf.AppendString(nil), err) return nil, fmt.Errorf("cannot parse regexp for %s: %w", mlf.AppendString(nil), err)
} }
@ -134,9 +131,9 @@ func (lf *labelFilter) match(labels []prompbmarshal.Label) bool {
case "!=": case "!=":
return !lf.equalValue(labels) return !lf.equalValue(labels)
case "=~": case "=~":
return lf.equalRegexp(labels) return lf.matchRegexp(labels)
case "!~": case "!~":
return !lf.equalRegexp(labels) return !lf.matchRegexp(labels)
default: default:
logger.Panicf("BUG: unexpected operation for label filter: %s", lf.op) logger.Panicf("BUG: unexpected operation for label filter: %s", lf.op)
} }
@ -161,7 +158,7 @@ func (lf *labelFilter) equalValue(labels []prompbmarshal.Label) bool {
return false return false
} }
func (lf *labelFilter) equalRegexp(labels []prompbmarshal.Label) bool { func (lf *labelFilter) matchRegexp(labels []prompbmarshal.Label) bool {
labelNameMatches := 0 labelNameMatches := 0
for _, label := range labels { for _, label := range labels {
if toCanonicalLabelName(label.Name) != lf.label { if toCanonicalLabelName(label.Name) != lf.label {

119
lib/regexutil/promregex.go Normal file
View file

@ -0,0 +1,119 @@
package regexutil
import (
"regexp"
"strings"
)
// PromRegex matches strings against a Prometheus-like regex and takes
// shortcuts for the most frequently used regex shapes.
//
// Dedicated fast paths exist for:
//
//   - a plain string, e.g. "foobar"
//   - a list of alternate strings, e.g. "foo|bar|baz"
//   - a prefix match, e.g. "foo.*" or "foo.+"
//   - a substring match, e.g. ".*foo.*" or ".+bar.+"
type PromRegex struct {
	// prefix is the literal prefix of the regex.
	// For instance, prefix="foo" for regex="foo(a|b)".
	prefix string

	// suffix is the part of the regex, which is left after cutting the prefix.
	// For instance, suffix="a|b" for regex="foo(a|b)".
	suffix string

	// substrDotStar holds the literal string when suffix looks like ".*string.*".
	substrDotStar string

	// substrDotPlus holds the literal string when suffix looks like ".+string.+".
	substrDotPlus string

	// orValues holds the alternate values of the suffix regex.
	// For instance, orValues=["foo","bar","baz"] for suffix="foo|bar|baz".
	orValues []string

	// reSuffix is the regexp compiled from the anchored suffix:
	// "^(?:suffix)$"
	reSuffix *regexp.Regexp
}
// NewPromRegex builds PromRegex for the given expr.
//
// A non-nil error is returned if expr cannot be compiled as a regular expression.
func NewPromRegex(expr string) (*PromRegex, error) {
	// Validate expr up front, so invalid expressions are reported to the caller.
	_, err := regexp.Compile(expr)
	if err != nil {
		return nil, err
	}

	prefix, suffix := Simplify(expr)
	pr := &PromRegex{
		prefix: prefix,
		suffix: suffix,
	}

	// Pre-calculate everything needed by the fast paths in MatchString.
	pr.orValues = GetOrValues(suffix)
	pr.substrDotStar = getSubstringLiteral(suffix, ".*")
	pr.substrDotPlus = getSubstringLiteral(suffix, ".+")

	// Simplify is expected to return a valid regexp in suffix, hence MustCompile.
	// The suffix is anchored to the beginning and to the end of the matching string.
	pr.reSuffix = regexp.MustCompile("^(?:" + suffix + ")$")

	return pr, nil
}
// MatchString returns true if s matches pr.
//
// The pr is automatically anchored to the beginning and to the end
// of the matching string with '^' and '$'.
func (pr *PromRegex) MatchString(s string) bool {
	if !strings.HasPrefix(s, pr.prefix) {
		// Fast path - s has another prefix than pr.
		return false
	}
	// The prefix is matched; the remaining checks are applied to the rest of s.
	s = s[len(pr.prefix):]
	if len(pr.orValues) > 0 {
		// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
		for _, v := range pr.orValues {
			if s == v {
				return true
			}
		}
		return false
	}
	if pr.substrDotStar != "" {
		// Fast path - pr contains ".*someText.*"
		return strings.Contains(s, pr.substrDotStar)
	}
	if pr.substrDotPlus != "" {
		// Fast path - pr contains ".+someText.+"
		//
		// Both ".+" must consume at least one char, so someText must occur
		// strictly inside s, e.g. somewhere in s[1:len(s)-1].
		// It isn't enough to inspect only the first occurrence of someText in s:
		// "foofoobar" matches ".+foo.+" thanks to the second "foo",
		// while the first "foo" starts at the offset 0.
		if len(s) < len(pr.substrDotPlus)+2 {
			return false
		}
		return strings.Contains(s[1:len(s)-1], pr.substrDotPlus)
	}
	switch pr.suffix {
	case ".*":
		// Fast path - the pr contains "prefix.*"
		return true
	case ".+":
		// Fast path - the pr contains "prefix.+"
		return len(s) > 0
	}
	// Fall back to slow path by matching the original regexp.
	return pr.reSuffix.MatchString(s)
}
// getSubstringLiteral returns the literal string enclosed into prefixSuffix
// at both sides of expr.
//
// Empty string is returned if expr doesn't start and end with two
// non-overlapping prefixSuffix occurrences, or if Simplify reports
// a non-empty suffix for the enclosed part of expr.
func getSubstringLiteral(expr, prefixSuffix string) string {
	n := len(prefixSuffix)
	// The len check guarantees that the leading and the trailing prefixSuffix do not overlap.
	if len(expr) < 2*n || !strings.HasPrefix(expr, prefixSuffix) || !strings.HasSuffix(expr, prefixSuffix) {
		return ""
	}
	inner := expr[n : len(expr)-n]
	literal, rest := Simplify(inner)
	if rest == "" {
		// The whole inner part has been returned as the prefix by Simplify.
		return literal
	}
	return ""
}

View file

@ -0,0 +1,90 @@
package regexutil
import (
"regexp"
"testing"
)
// TestPromRegexParseFailure verifies that NewPromRegex returns a nil PromRegex
// together with a non-nil error for regular expressions it cannot parse.
func TestPromRegexParseFailure(t *testing.T) {
	f := func(expr string) {
		t.Helper()

		switch pr, err := NewPromRegex(expr); {
		case err == nil:
			t.Fatalf("expecting non-nil error for expr=%s", expr)
		case pr != nil:
			t.Fatalf("expecting nil pr for expr=%s", expr)
		}
	}

	// Unclosed character class
	f("fo[bar")

	// Unclosed group
	f("foo(bar")
}
// TestPromRegex checks PromRegex.MatchString results for various regex shapes
// and cross-checks every expectation against the standard anchored regexp.
func TestPromRegex(t *testing.T) {
	f := func(expr, s string, resultExpected bool) {
		t.Helper()

		pr, err := NewPromRegex(expr)
		if err != nil {
			t.Fatalf("unexpected error: %s", err)
		}
		if got := pr.MatchString(s); got != resultExpected {
			t.Fatalf("unexpected result when matching %s against %s; got %v; want %v", expr, s, got, resultExpected)
		}

		// Sanity check - the standard regexp must agree with the expected result.
		exprAnchored := "^(?:" + expr + ")$"
		if got := regexp.MustCompile(exprAnchored).MatchString(s); got != resultExpected {
			t.Fatalf("unexpected result when matching %s against %s during sanity check; got %v; want %v", exprAnchored, s, got, resultExpected)
		}
	}

	// Empty regex and plain strings
	f("", "", true)
	f("", "foo", false)
	f("foo", "", false)

	// Match-all regexs
	f(".*", "", true)
	f(".*", "foo", true)
	f(".+", "", false)
	f(".+", "foo", true)

	// Prefix match
	f("foo.*", "bar", false)
	f("foo.*", "foo", true)
	f("foo.*", "foobar", true)
	f("foo.+", "bar", false)
	f("foo.+", "foo", false)
	f("foo.+", "foobar", true)

	// Alternate values
	f("foo|bar", "", false)
	f("foo|bar", "a", false)
	f("foo|bar", "foo", true)
	f("foo|bar", "bar", true)
	f("foo|bar", "foobar", false)
	f("foo(bar|baz)", "a", false)
	f("foo(bar|baz)", "foobar", true)
	f("foo(bar|baz)", "foobaz", true)
	f("foo(bar|baz)", "foobaza", false)
	f("foo(bar|baz)", "foobal", false)
	f("^foo|b(ar)$", "foo", true)
	f("^foo|b(ar)$", "bar", true)
	f("^foo|b(ar)$", "ar", false)

	// Substring match with ".*"
	f(".*foo.*", "foo", true)
	f(".*foo.*", "afoobar", true)
	f(".*foo.*", "abc", false)
	f("foo.*bar.*", "foobar", true)
	f("foo.*bar.*", "foo_bar_", true)
	f("foo.*bar.*", "foobaz", false)

	// Substring match with ".+"
	f(".+foo.+", "foo", false)
	f(".+foo.+", "afoobar", true)
	f(".+foo.+", "afoo", false)
	f(".+foo.+", "abc", false)
	f("foo.+bar.+", "foobar", false)
	f("foo.+bar.+", "foo_bar_", true)
	f("foo.+bar.+", "foobaz", false)

	// Mixed ".+" and ".*"
	f(".+foo.*", "foo", false)
	f(".+foo.*", "afoo", true)
	f(".+foo.*", "afoobar", true)

	// Non-literal substring
	f(".*(a|b).*", "a", true)
	f(".*(a|b).*", "ax", true)
	f(".*(a|b).*", "xa", true)
	f(".*(a|b).*", "xay", true)
	f(".*(a|b).*", "xzy", false)
}

View file

@ -0,0 +1,102 @@
package regexutil
import (
"fmt"
"regexp"
"testing"
)
// BenchmarkPromRegexMatchString runs benchmarkPromRegexMatchString
// for every regex shape listed below, with both a matching and a mismatching input.
func BenchmarkPromRegexMatchString(b *testing.B) {
	benchmarks := []struct {
		name           string
		expr           string
		s              string
		resultExpected bool
	}{
		{"unoptimized-noprefix-match", "xbar.*|baz", "xbarz", true},
		{"unoptimized-noprefix-mismatch", "xbar.*|baz", "zfoobarz", false},
		{"unoptimized-prefix-match", "foo(bar.*|baz)", "foobarz", true},
		{"unoptimized-prefix-mismatch", "foo(bar.*|baz)", "zfoobarz", false},
		{"literal-match", "foo", "foo", true},
		{"literal-mismatch", "foo", "bar", false},
		{"prefix-dot-star-match", "foo.*", "foobar", true},
		{"prefix-dot-star-mismatch", "foo.*", "afoobar", false},
		{"prefix-dot-plus-match", "foo.+", "foobar", true},
		{"prefix-dot-plus-mismatch", "foo.+", "afoobar", false},
		{"or-values-match", "foo|bar|baz", "baz", true},
		{"or-values-mismatch", "foo|bar|baz", "abaz", false},
		{"prefix-or-values-match", "x(foo|bar|baz)", "xbaz", true},
		{"prefix-or-values-mismatch", "x(foo|bar|baz)", "abaz", false},
		{"substring-dot-star-match", ".*foo.*", "afoobar", true},
		{"substring-dot-star-mismatch", ".*foo.*", "abarbaz", false},
		{"substring-dot-plus-match", ".+foo.+", "afoobar", true},
		{"substring-dot-plus-mismatch", ".+foo.+", "abarbaz", false},
		{"prefix-substring-dot-star-match", "a.*foo.*", "afoobar", true},
		{"prefix-substring-dot-star-mismatch", "a.*foo.*", "abarbaz", false},
		{"prefix-substring-dot-plus-match", "a.+foo.+", "abfoobar", true},
		{"prefix-substring-dot-plus-mismatch", "a.+foo.+", "abarbaz", false},
	}
	for _, bm := range benchmarks {
		// Copy the fields into locals, so the closure doesn't capture the loop variable.
		expr, s, resultExpected := bm.expr, bm.s, bm.resultExpected
		b.Run(bm.name, func(b *testing.B) {
			benchmarkPromRegexMatchString(b, expr, s, resultExpected)
		})
	}
}
// benchmarkPromRegexMatchString measures matching of s against expr in two
// sub-benchmarks: "PromRegex" uses PromRegex.MatchString, while "StandardRegex"
// uses the standard regexp compiled from the anchored expr.
//
// It panics if expr cannot be parsed or if a match result differs from resultExpected.
func benchmarkPromRegexMatchString(b *testing.B, expr, s string, resultExpected bool) {
	pr, err := NewPromRegex(expr)
	if err != nil {
		panic(fmt.Errorf("unexpected error: %s", err))
	}
	re := regexp.MustCompile("^(?:" + expr + ")$")

	run := func(b *testing.B, matchString func(s string) bool) {
		b.SetBytes(1)
		b.ReportAllocs()
		b.RunParallel(func(pb *testing.PB) {
			for pb.Next() {
				if result := matchString(s); result != resultExpected {
					panic(fmt.Errorf("unexpected result when matching %s against %s; got %v; want %v", s, expr, result, resultExpected))
				}
			}
		})
	}

	matchers := []struct {
		name        string
		matchString func(s string) bool
	}{
		{"PromRegex", pr.MatchString},
		{"StandardRegex", re.MatchString},
	}
	for _, m := range matchers {
		matchString := m.matchString
		b.Run(m.name, func(b *testing.B) {
			run(b, matchString)
		})
	}
}