lib/promrelabel: optimize matching for commonly used regex patterns in if option

The following regex patterns are optimized: literal string match (e.g. "foo"); prefix match (e.g. "foo.*" and "foo.+"); substring match (e.g. ".*foo.*" and ".+foo.+"); alternate values match (e.g. "foo|bar|baz").
2024-11-21 14:44:00 +00:00 · 2022-08-26 14:53:02 +03:00 · 2022-08-26 14:53:02 +03:00 · 7afe8450fc
commit 7afe8450fc
parent 0ad3bbadd3
5 changed files with 318 additions and 10 deletions
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -23,7 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs.
 * FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922).
-* FEATURE: improve the performance for `action: keep`, `action: drop`, `action: labelkeep` and `action: labeldrop` relabeling rules for `regex` containing the list of matching values. For example, `regex: "foo|bar|baz"`.
+* FEATURE: improve the performance for relabeling rules with commonly used regular expressions in `regex` and `if` fields such as `some_string`, `prefix.*`, `prefix.+`, `foo|bar|baz`, `.*foo.*` and `.+foo.+`.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert/<tenantID>/opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015).
 * FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details.
 * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`.
--- a/lib/promrelabel/if_expression.go
+++ b/lib/promrelabel/if_expression.go
@ -3,10 +3,10 @@ package promrelabel
 import (
 	"encoding/json"
 	"fmt"
 	"regexp"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
 	"github.com/VictoriaMetrics/metricsql"
 )
@ -105,7 +105,7 @@ type labelFilter struct {
 	value string
 	// re contains compiled regexp for `=~` and `!~` op.
-	re *regexp.Regexp
+	re *regexutil.PromRegex
 }
 func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
@ -115,10 +115,7 @@ func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
 		value: mlf.Value,
 	}
 	if lf.op == "=~" || lf.op == "!~" {
-		// PromQL regexps are anchored by default.
+		re, err := regexutil.NewPromRegex(lf.value)
 		// See https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors
 		reString := "^(?:" + lf.value + ")$"
 		re, err := regexp.Compile(reString)
 		if err != nil {
 			return nil, fmt.Errorf("cannot parse regexp for %s: %w", mlf.AppendString(nil), err)
 		}
@ -134,9 +131,9 @@ func (lf *labelFilter) match(labels []prompbmarshal.Label) bool {
 	case "!=":
 		return !lf.equalValue(labels)
 	case "=~":
-		return lf.equalRegexp(labels)
+		return lf.matchRegexp(labels)
 	case "!~":
-		return !lf.equalRegexp(labels)
+		return !lf.matchRegexp(labels)
 	default:
 		logger.Panicf("BUG: unexpected operation for label filter: %s", lf.op)
 	}
@ -161,7 +158,7 @@ func (lf *labelFilter) equalValue(labels []prompbmarshal.Label) bool {
 	return false
 }
-func (lf *labelFilter) equalRegexp(labels []prompbmarshal.Label) bool {
+func (lf *labelFilter) matchRegexp(labels []prompbmarshal.Label) bool {
 	labelNameMatches := 0
 	for _, label := range labels {
 		if toCanonicalLabelName(label.Name) != lf.label {
--- a/lib/regexutil/promregex.go
+++ b/lib/regexutil/promregex.go
@ -0,0 +1,119 @@
 package regexutil
 import (
 	"regexp"
 	"regexp/syntax"
 	"strings"
 )
 // PromRegex is a matcher for Prometheus-like regular expressions.
 //
 // It avoids running the regexp engine for the following commonly used patterns:
 //
 // - a plain string, e.g. "foobar"
 // - a list of alternate strings, e.g. "foo|bar|baz"
 // - a literal prefix followed by anything, e.g. "foo.*" or "foo.+"
 // - a literal substring surrounded by anything, e.g. ".*foo.*" or ".+bar.+"
 type PromRegex struct {
 	// prefix is the literal text the regex starts with.
 	// For example, prefix="foo" for regex="foo(a|b)"
 	prefix string
 	// suffix is the part of the regex that remains after the prefix is removed.
 	// For example, suffix="a|b" for regex="foo(a|b)"
 	suffix string
 	// substrDotStar is the literal string when suffix has the form ".*string.*".
 	// It is empty for any other suffix.
 	substrDotStar string
 	// substrDotPlus is the literal string when suffix has the form ".+string.+".
 	// It is empty for any other suffix.
 	substrDotPlus string
 	// orValues lists the alternate values of the suffix regex.
 	// For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz"
 	orValues []string
 	// reSuffix is the suffix compiled as an anchored regexp:
 	// "^(?:suffix)$"
 	reSuffix *regexp.Regexp
 }
 // NewPromRegex builds a PromRegex for the given expr.
 //
 // A nil PromRegex and a non-nil error are returned if expr cannot be compiled
 // as a regular expression.
 func NewPromRegex(expr string) (*PromRegex, error) {
 	// Validate expr up front, so the caller gets an error instead of a panic below.
 	_, err := regexp.Compile(expr)
 	if err != nil {
 		return nil, err
 	}
 	// Split expr into a literal prefix and the remaining regexp,
 	// then detect the shapes of the remaining regexp which have fast paths in MatchString.
 	literalPrefix, rest := Simplify(expr)
 	alternatives := GetOrValues(rest)
 	pr := &PromRegex{
 		prefix:        literalPrefix,
 		suffix:        rest,
 		substrDotStar: getSubstringLiteral(rest, ".*"),
 		substrDotPlus: getSubstringLiteral(rest, ".+"),
 		orValues:      alternatives,
 		// Simplify is expected to return a valid regexp in the suffix, hence MustCompile.
 		// The suffix is anchored to the beginning and the end of the matching string.
 		reSuffix: regexp.MustCompile("^(?:" + rest + ")$"),
 	}
 	return pr, nil
 }
 // MatchString returns true if s matches pr.
 //
 // The pr is automatically anchored to the beginning and to the end
 // of the matching string with '^' and '$'.
 //
 // NOTE(review): the fast paths below do not treat '\n' specially, while '.'
 // in the regexp fallback does not match '\n' unless the `s` flag is set.
 // Confirm whether matched strings may contain newlines.
 func (pr *PromRegex) MatchString(s string) bool {
 	if !strings.HasPrefix(s, pr.prefix) {
 		// Fast path - s has another prefix than pr.
 		return false
 	}
 	// The literal prefix matched; the rest of s must match the suffix.
 	s = s[len(pr.prefix):]
 	if len(pr.orValues) > 0 {
 		// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
 		for _, v := range pr.orValues {
 			if s == v {
 				return true
 			}
 		}
 		return false
 	}
 	if pr.substrDotStar != "" {
 		// Fast path - pr contains ".*someText.*"
 		return strings.Contains(s, pr.substrDotStar)
 	}
 	if pr.substrDotPlus != "" {
 		// Fast path - pr contains ".+someText.+"
 		//
 		// someText must occur with at least one char before it and at least one char after it.
 		// Any occurrence qualifies, not only the first one: ".+foo.+" matches "fooafoob"
 		// via the second "foo". So search s without its first and last bytes.
 		if len(s) < 2 {
 			return false
 		}
 		return strings.Contains(s[1:len(s)-1], pr.substrDotPlus)
 	}
 	switch pr.suffix {
 	case ".*":
 		// Fast path - the pr contains "prefix.*"
 		return true
 	case ".+":
 		// Fast path - the pr contains "prefix.+"
 		return len(s) > 0
 	}
 	// Fall back to slow path by matching the anchored suffix regexp.
 	return pr.reSuffix.MatchString(s)
 }
 // getSubstringLiteral returns the literal string from expr if expr has the form
 // prefixSuffix + literal + prefixSuffix, where prefixSuffix is a wildcard such as ".*" or ".+".
 //
 // An empty string is returned if expr has any other form.
 //
 // The decision is made on the parsed syntax tree of expr instead of its text,
 // since cutting prefixSuffix from the text is unsound. For example, in ".+;|;.+"
 // the leading and the trailing ".+" belong to distinct alternation branches,
 // while in `.*foo\.*` the trailing ".*" is an escaped dot followed by '*'.
 func getSubstringLiteral(expr, prefixSuffix string) string {
 	sre, err := syntax.Parse(expr, syntax.Perl)
 	if err != nil {
 		return ""
 	}
 	// The expr must be a concatenation of exactly three nodes: wildcard, literal, wildcard.
 	if sre.Op != syntax.OpConcat || len(sre.Sub) != 3 {
 		return ""
 	}
 	wildcard, err := syntax.Parse(prefixSuffix, syntax.Perl)
 	if err != nil {
 		return ""
 	}
 	first, middle, last := sre.Sub[0], sre.Sub[1], sre.Sub[2]
 	if !wildcard.Equal(first) || !wildcard.Equal(last) {
 		return ""
 	}
 	// Case-insensitive literals cannot be matched with a plain substring search.
 	if middle.Op != syntax.OpLiteral || middle.Flags&syntax.FoldCase != 0 {
 		return ""
 	}
 	return string(middle.Rune)
 }
--- a/lib/regexutil/promregex_test.go
+++ b/lib/regexutil/promregex_test.go
@ -0,0 +1,90 @@
 package regexutil
 import (
 	"regexp"
 	"testing"
 )
 // TestPromRegexParseFailure verifies that NewPromRegex returns a nil PromRegex
 // and a non-nil error for expressions which are invalid regexps.
 func TestPromRegexParseFailure(t *testing.T) {
 	invalidExprs := []string{
 		"fo[bar",
 		"foo(bar",
 	}
 	for _, expr := range invalidExprs {
 		pr, err := NewPromRegex(expr)
 		switch {
 		case err == nil:
 			t.Fatalf("expecting non-nil error for expr=%s", expr)
 		case pr != nil:
 			t.Fatalf("expecting nil pr for expr=%s", expr)
 		}
 	}
 }
 // TestPromRegex verifies PromRegex.MatchString results for a set of expressions
 // and cross-checks every expected result against the standard regexp anchored with "^(?:...)$".
 func TestPromRegex(t *testing.T) {
 	testCases := []struct {
 		expr string
 		s    string
 		want bool
 	}{
 		{"", "", true},
 		{"", "foo", false},
 		{"foo", "", false},
 		{".*", "", true},
 		{".*", "foo", true},
 		{".+", "", false},
 		{".+", "foo", true},
 		{"foo.*", "bar", false},
 		{"foo.*", "foo", true},
 		{"foo.*", "foobar", true},
 		{"foo.+", "bar", false},
 		{"foo.+", "foo", false},
 		{"foo.+", "foobar", true},
 		{"foo|bar", "", false},
 		{"foo|bar", "a", false},
 		{"foo|bar", "foo", true},
 		{"foo|bar", "bar", true},
 		{"foo|bar", "foobar", false},
 		{"foo(bar|baz)", "a", false},
 		{"foo(bar|baz)", "foobar", true},
 		{"foo(bar|baz)", "foobaz", true},
 		{"foo(bar|baz)", "foobaza", false},
 		{"foo(bar|baz)", "foobal", false},
 		{"^foo|b(ar)$", "foo", true},
 		{"^foo|b(ar)$", "bar", true},
 		{"^foo|b(ar)$", "ar", false},
 		{".*foo.*", "foo", true},
 		{".*foo.*", "afoobar", true},
 		{".*foo.*", "abc", false},
 		{"foo.*bar.*", "foobar", true},
 		{"foo.*bar.*", "foo_bar_", true},
 		{"foo.*bar.*", "foobaz", false},
 		{".+foo.+", "foo", false},
 		{".+foo.+", "afoobar", true},
 		{".+foo.+", "afoo", false},
 		{".+foo.+", "abc", false},
 		{"foo.+bar.+", "foobar", false},
 		{"foo.+bar.+", "foo_bar_", true},
 		{"foo.+bar.+", "foobaz", false},
 		{".+foo.*", "foo", false},
 		{".+foo.*", "afoo", true},
 		{".+foo.*", "afoobar", true},
 		{".*(a|b).*", "a", true},
 		{".*(a|b).*", "ax", true},
 		{".*(a|b).*", "xa", true},
 		{".*(a|b).*", "xay", true},
 		{".*(a|b).*", "xzy", false},
 	}
 	for _, tc := range testCases {
 		pr, err := NewPromRegex(tc.expr)
 		if err != nil {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		if got := pr.MatchString(tc.s); got != tc.want {
 			t.Fatalf("unexpected result when matching %s against %s; got %v; want %v", tc.expr, tc.s, got, tc.want)
 		}
 		// The expected result must also hold for the regular anchored regexp.
 		anchored := "^(?:" + tc.expr + ")$"
 		if got := regexp.MustCompile(anchored).MatchString(tc.s); got != tc.want {
 			t.Fatalf("unexpected result when matching %s against %s during sanity check; got %v; want %v", anchored, tc.s, got, tc.want)
 		}
 	}
 }
--- a/lib/regexutil/promregex_timing_test.go
+++ b/lib/regexutil/promregex_timing_test.go
@ -0,0 +1,102 @@
 package regexutil
 import (
 	"fmt"
 	"regexp"
 	"testing"
 )
 // BenchmarkPromRegexMatchString runs benchmarkPromRegexMatchString as a named
 // sub-benchmark for every (expr, s) pair listed below, covering both matching
 // and non-matching input strings.
 func BenchmarkPromRegexMatchString(b *testing.B) {
 	benchCases := []struct {
 		name string
 		expr string
 		s    string
 		want bool
 	}{
 		{"unpotimized-noprefix-match", "xbar.*|baz", "xbarz", true},
 		{"unpotimized-noprefix-mismatch", "xbar.*|baz", "zfoobarz", false},
 		{"unpotimized-prefix-match", "foo(bar.*|baz)", "foobarz", true},
 		{"unpotimized-prefix-mismatch", "foo(bar.*|baz)", "zfoobarz", false},
 		{"literal-match", "foo", "foo", true},
 		{"literal-mismatch", "foo", "bar", false},
 		{"prefix-dot-star-match", "foo.*", "foobar", true},
 		{"prefix-dot-star-mismatch", "foo.*", "afoobar", false},
 		{"prefix-dot-plus-match", "foo.+", "foobar", true},
 		{"prefix-dot-plus-mismatch", "foo.+", "afoobar", false},
 		{"or-values-match", "foo|bar|baz", "baz", true},
 		{"or-values-mismatch", "foo|bar|baz", "abaz", false},
 		{"prefix-or-values-match", "x(foo|bar|baz)", "xbaz", true},
 		{"prefix-or-values-mismatch", "x(foo|bar|baz)", "abaz", false},
 		{"substring-dot-star-match", ".*foo.*", "afoobar", true},
 		{"substring-dot-star-mismatch", ".*foo.*", "abarbaz", false},
 		{"substring-dot-plus-match", ".+foo.+", "afoobar", true},
 		{"substring-dot-plus-mismatch", ".+foo.+", "abarbaz", false},
 		{"prefix-substring-dot-star-match", "a.*foo.*", "afoobar", true},
 		{"prefix-substring-dot-star-mismatch", "a.*foo.*", "abarbaz", false},
 		{"prefix-substring-dot-plus-match", "a.+foo.+", "abfoobar", true},
 		{"prefix-substring-dot-plus-mismatch", "a.+foo.+", "abarbaz", false},
 	}
 	for _, bc := range benchCases {
 		bc := bc
 		b.Run(bc.name, func(b *testing.B) {
 			benchmarkPromRegexMatchString(b, bc.expr, bc.s, bc.want)
 		})
 	}
 }
 // benchmarkPromRegexMatchString measures matching of s against expr with two matchers:
 // PromRegex and the standard regexp anchored with "^(?:...)$".
 //
 // Each matcher runs in its own sub-benchmark. The benchmark panics if expr
 // cannot be parsed or if a matcher returns a result other than resultExpected.
 func benchmarkPromRegexMatchString(b *testing.B, expr, s string, resultExpected bool) {
 	pr, err := NewPromRegex(expr)
 	if err != nil {
 		panic(fmt.Errorf("unexpected error: %s", err))
 	}
 	re := regexp.MustCompile("^(?:" + expr + ")$")
 	matchers := []struct {
 		name  string
 		match func(s string) bool
 	}{
 		{"PromRegex", pr.MatchString},
 		{"StandardRegex", re.MatchString},
 	}
 	for _, m := range matchers {
 		match := m.match
 		b.Run(m.name, func(b *testing.B) {
 			b.SetBytes(1)
 			b.ReportAllocs()
 			b.RunParallel(func(pb *testing.PB) {
 				for pb.Next() {
 					if got := match(s); got != resultExpected {
 						panic(fmt.Errorf("unexpected result when matching %s against %s; got %v; want %v", s, expr, got, resultExpected))
 					}
 				}
 			})
 		})
 	}
 }