From 0d4ea03a73470fac08bf2ec5babc738a8eeacf51 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 24 Aug 2022 17:54:26 +0300 Subject: [PATCH] lib/promrelabel: optimize `action: {labeldrop,labelkeep,keep,drop}` with `regex` containing alternate values For example, the following relabeling rule must work much faster now: - action: labeldrop regex: "foo|bar|baz" --- docs/CHANGELOG.md | 1 + lib/promrelabel/config.go | 4 + lib/promrelabel/relabel.go | 9 +++ lib/promrelabel/relabel_test.go | 58 ++++++++++++++ lib/promrelabel/relabel_timing_test.go | 79 ++++++++++++++++++ lib/regexutil/regexutil.go | 107 +++++++++++++++++++++++++ lib/regexutil/regexutil_test.go | 48 +++++++++++ lib/storage/tag_filters.go | 90 +-------------------- lib/storage/tag_filters_test.go | 40 --------- 9 files changed, 308 insertions(+), 128 deletions(-) create mode 100644 lib/regexutil/regexutil.go create mode 100644 lib/regexutil/regexutil_test.go diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2d19773dda..0bb4376b96 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -23,6 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs. * FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922). 
+* FEATURE: improve the performance of `action: keep`, `action: drop`, `action: labelkeep` and `action: labeldrop` relabeling rules whose `regex` is a list of alternative values, for example `regex: "foo|bar|baz"`. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert/<tenantID>/opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015). * FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details. * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`. 
diff --git a/lib/promrelabel/config.go b/lib/promrelabel/config.go index bd31e99b17..a7881dbbf5 100644 --- a/lib/promrelabel/config.go +++ b/lib/promrelabel/config.go @@ -8,6 +8,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate" "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil" "gopkg.in/yaml.v2" ) @@ -199,6 +200,7 @@ func parseRelabelConfig(rc *RelabelConfig) (*parsedRelabelConfig, error) { targetLabel := rc.TargetLabel regexCompiled := defaultRegexForRelabelConfig regexOriginalCompiled := defaultOriginalRegexForRelabelConfig + var regexOrValues []string if rc.Regex != nil { regex := rc.Regex.S regexOrig := regex @@ -215,6 +217,7 @@ func parseRelabelConfig(rc *RelabelConfig) (*parsedRelabelConfig, error) { return nil, fmt.Errorf("cannot parse `regex` %q: %w", regexOrig, err) } regexOriginalCompiled = reOriginal + regexOrValues = regexutil.GetOrValues(regexOrig) } modulus := rc.Modulus replacement := "$1" @@ -344,6 +347,7 @@ func parseRelabelConfig(rc *RelabelConfig) (*parsedRelabelConfig, error) { graphiteLabelRules: graphiteLabelRules, regexOriginal: regexOriginalCompiled, + regexOrValues: regexOrValues, hasCaptureGroupInTargetLabel: strings.Contains(targetLabel, "$"), hasCaptureGroupInReplacement: strings.Contains(replacement, "$"), diff --git a/lib/promrelabel/relabel.go b/lib/promrelabel/relabel.go index 1d6a926229..01d2a88e22 100644 --- a/lib/promrelabel/relabel.go +++ b/lib/promrelabel/relabel.go @@ -29,6 +29,7 @@ type parsedRelabelConfig struct { graphiteLabelRules []graphiteLabelRule regexOriginal *regexp.Regexp + regexOrValues []string hasCaptureGroupInTargetLabel bool hasCaptureGroupInReplacement bool @@ -413,6 +414,14 @@ func (prc *parsedRelabelConfig) replaceStringSubmatches(s, replacement string, h } func (prc *parsedRelabelConfig) matchString(s string) bool { + if len(prc.regexOrValues) > 0 { + for _, orValue := range prc.regexOrValues { + if s == orValue { + 
return true + } + } + return false + } prefix, complete := prc.regexOriginal.LiteralPrefix() if complete { return prefix == s diff --git a/lib/promrelabel/relabel_test.go b/lib/promrelabel/relabel_test.go index 52b501bc5b..64f90bd846 100644 --- a/lib/promrelabel/relabel_test.go +++ b/lib/promrelabel/relabel_test.go @@ -726,3 +726,61 @@ func TestFillLabelReferences(t *testing.T) { f(`{{bar}}-aa`, `foo{bar="baz"}`, `baz-aa`) f(`{{bar}}-aa{{__name__}}.{{bar}}{{non-existing-label}}`, `foo{bar="baz"}`, `baz-aafoo.baz`) } + +func TestRegexpMatchStringSuccess(t *testing.T) { + f := func(pattern, s string) { + t.Helper() + rc := &RelabelConfig{ + Action: "labeldrop", + Regex: &MultiLineRegex{ + S: pattern, + }, + } + prc, err := parseRelabelConfig(rc) + if err != nil { + t.Fatalf("unexpected error in parseRelabelConfig: %s", err) + } + if !prc.matchString(s) { + t.Fatalf("unexpected matchString(%q) result; got false; want true", s) + } + } + f("", "") + f("foo", "foo") + f(".*", "") + f(".*", "foo") + f("foo.*", "foobar") + f("foo.+", "foobar") + f("f.+o", "foo") + f("foo|bar", "bar") + f("^(foo|bar)$", "foo") + f("foo.+", "foobar") + f("^foo$", "foo") +} + +func TestRegexpMatchStringFailure(t *testing.T) { + f := func(pattern, s string) { + t.Helper() + rc := &RelabelConfig{ + Action: "labeldrop", + Regex: &MultiLineRegex{ + S: pattern, + }, + } + prc, err := parseRelabelConfig(rc) + if err != nil { + t.Fatalf("unexpected error in parseRelabelConfig: %s", err) + } + if prc.matchString(s) { + t.Fatalf("unexpected matchString(%q) result; got true; want false", s) + } + } + f("", "foo") + f("foo", "") + f("foo.*", "foa") + f("foo.+", "foo") + f("f.+o", "foor") + f("foo|bar", "barz") + f("^(foo|bar)$", "xfoo") + f("foo.+", "foo") + f("^foo$", "foobar") +} diff --git a/lib/promrelabel/relabel_timing_test.go b/lib/promrelabel/relabel_timing_test.go index cddb84e12f..3bf61babd7 100644 --- a/lib/promrelabel/relabel_timing_test.go +++ b/lib/promrelabel/relabel_timing_test.go @@ 
-2,11 +2,90 @@ package promrelabel import ( "fmt" + "regexp" "testing" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" ) +func BenchmarkMatchRegexOrValuesMatchOptimized(b *testing.B) { + const pattern = "foo|bar|baz|abc" + const s = "foo" + rc := &RelabelConfig{ + Action: "labeldrop", + Regex: &MultiLineRegex{ + S: pattern, + }, + } + prc, err := parseRelabelConfig(rc) + if err != nil { + panic(fmt.Errorf("unexpected error in parseRelabelConfig: %s", err)) + } + b.ReportAllocs() + b.SetBytes(1) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if !prc.matchString(s) { + panic(fmt.Errorf("unexpected string mismatch for pattern=%q, s=%q", pattern, s)) + } + } + }) +} + +func BenchmarkMatchRegexOrValuesMismatchOptimized(b *testing.B) { + const pattern = "foo|bar|baz|abc" + const s = "qwert" + rc := &RelabelConfig{ + Action: "labeldrop", + Regex: &MultiLineRegex{ + S: pattern, + }, + } + prc, err := parseRelabelConfig(rc) + if err != nil { + panic(fmt.Errorf("unexpected error in parseRelabelConfig: %s", err)) + } + b.ReportAllocs() + b.SetBytes(1) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if prc.matchString(s) { + panic(fmt.Errorf("unexpected string match for pattern=%q, s=%q", pattern, s)) + } + } + }) +} + +func BenchmarkMatchRegexOrValuesMatchUnoptimized(b *testing.B) { + const pattern = "foo|bar|baz|abc" + const s = "foo" + re := regexp.MustCompile(pattern) + b.ReportAllocs() + b.SetBytes(1) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if !re.MatchString(s) { + panic(fmt.Errorf("unexpected string mismatch for pattern=%q, s=%q", pattern, s)) + } + } + }) +} + +func BenchmarkMatchRegexOrValuesMismatchUnoptimized(b *testing.B) { + const pattern = "foo|bar|baz|abc" + const s = "qwert" + re := regexp.MustCompile(pattern) + b.ReportAllocs() + b.SetBytes(1) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if re.MatchString(s) { + panic(fmt.Errorf("unexpected string match for pattern=%q, s=%q", pattern, s)) + 
} + } + }) +} + func BenchmarkApplyRelabelConfigs(b *testing.B) { b.Run("replace-label-copy", func(b *testing.B) { pcs := mustParseRelabelConfigs(` diff --git a/lib/regexutil/regexutil.go b/lib/regexutil/regexutil.go new file mode 100644 index 0000000000..0589b16a6d --- /dev/null +++ b/lib/regexutil/regexutil.go @@ -0,0 +1,107 @@ +package regexutil + +import ( + "regexp/syntax" + "sort" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" +) + +// GetOrValues returns "or" values from the given regexp expr. +// +// E.g. it returns ["foo", "bar"] for "foo|bar" regexp. +// It returns an empty list if it is impossible to extract "or" values from the regexp. +// It returns [""] for "" regexp. +func GetOrValues(expr string) []string { + sre, err := syntax.Parse(expr, syntax.Perl) + if err != nil { + logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err) + } + orValues := getOrValuesExt(sre) + + // Sort orValues for faster index seek later + sort.Strings(orValues) + + return orValues +} + +func getOrValuesExt(sre *syntax.Regexp) []string { + switch sre.Op { + case syntax.OpCapture: + return getOrValuesExt(sre.Sub[0]) + case syntax.OpLiteral: + if !isLiteral(sre) { + return nil + } + return []string{string(sre.Rune)} + case syntax.OpEmptyMatch: + return []string{""} + case syntax.OpBeginText, syntax.OpEndText: + return []string{""} + case syntax.OpAlternate: + a := make([]string, 0, len(sre.Sub)) + for _, reSub := range sre.Sub { + ca := getOrValuesExt(reSub) + if len(ca) == 0 { + return nil + } + a = append(a, ca...) + if len(a) > maxOrValues { + // It is cheaper to use regexp here. + return nil + } + } + return a + case syntax.OpCharClass: + a := make([]string, 0, len(sre.Rune)/2) + for i := 0; i < len(sre.Rune); i += 2 { + start := sre.Rune[i] + end := sre.Rune[i+1] + for start <= end { + a = append(a, string(start)) + start++ + if len(a) > maxOrValues { + // It is cheaper to use regexp here. 
+ return nil + } + } + } + return a + case syntax.OpConcat: + if len(sre.Sub) < 1 { + return []string{""} + } + prefixes := getOrValuesExt(sre.Sub[0]) + if len(prefixes) == 0 { + return nil + } + sre.Sub = sre.Sub[1:] + suffixes := getOrValuesExt(sre) + if len(suffixes) == 0 { + return nil + } + if len(prefixes)*len(suffixes) > maxOrValues { + // It is cheaper to use regexp here. + return nil + } + a := make([]string, 0, len(prefixes)*len(suffixes)) + for _, prefix := range prefixes { + for _, suffix := range suffixes { + s := prefix + suffix + a = append(a, s) + } + } + return a + default: + return nil + } +} + +func isLiteral(sre *syntax.Regexp) bool { + if sre.Op == syntax.OpCapture { + return isLiteral(sre.Sub[0]) + } + return sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 +} + +const maxOrValues = 100 diff --git a/lib/regexutil/regexutil_test.go b/lib/regexutil/regexutil_test.go new file mode 100644 index 0000000000..99a91acb33 --- /dev/null +++ b/lib/regexutil/regexutil_test.go @@ -0,0 +1,48 @@ +package regexutil + +import ( + "reflect" + "testing" +) + +func TestGetOrValues(t *testing.T) { + f := func(s string, valuesExpected []string) { + t.Helper() + values := GetOrValues(s) + if !reflect.DeepEqual(values, valuesExpected) { + t.Fatalf("unexpected values for s=%q; got %q; want %q", s, values, valuesExpected) + } + } + + f("", []string{""}) + f("foo", []string{"foo"}) + f("^foo$", []string{"foo"}) + f("|foo", []string{"", "foo"}) + f("|foo|", []string{"", "", "foo"}) + f("foo.+", nil) + f("foo.*", nil) + f(".*", nil) + f("foo|.*", nil) + f("foobar", []string{"foobar"}) + f("z|x|c", []string{"c", "x", "z"}) + f("foo|bar", []string{"bar", "foo"}) + f("(foo|bar)", []string{"bar", "foo"}) + f("(foo|bar)baz", []string{"barbaz", "foobaz"}) + f("[a-z][a-z]", nil) + f("[a-d]", []string{"a", "b", "c", "d"}) + f("x[a-d]we", []string{"xawe", "xbwe", "xcwe", "xdwe"}) + f("foo(bar|baz)", []string{"foobar", "foobaz"}) + f("foo(ba[rz]|(xx|o))", 
[]string{"foobar", "foobaz", "fooo", "fooxx"}) + f("foo(?:bar|baz)x(qwe|rt)", []string{"foobarxqwe", "foobarxrt", "foobazxqwe", "foobazxrt"}) + f("foo(bar||baz)", []string{"foo", "foobar", "foobaz"}) + f("(a|b|c)(d|e|f|0|1|2)(g|h|k|x|y|z)", nil) + f("(?i)foo", nil) + f("(?i)(foo|bar)", nil) + f("^foo|bar$", []string{"bar", "foo"}) + f("^(foo|bar)$", []string{"bar", "foo"}) + f("^a(foo|b(?:a|r))$", []string{"aba", "abr", "afoo"}) + // This is incorrect conversion, because the regexp matches nothing. + // It is OK for now, since such regexps are uncommon in practice. + // TODO: properly handle this case. + f("^a(^foo|bar$)z$", []string{"abarz", "afooz"}) +} diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index cf064b3b4d..7b547ae256 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -15,6 +15,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/lrucache" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil" ) // convertToCompositeTagFilterss converts tfss to composite filters. 
@@ -523,7 +524,7 @@ func getRegexpFromCache(expr []byte) (*regexpCacheValue, error) { } sExpr := string(expr) - orValues := getOrValues(sExpr) + orValues := regexutil.GetOrValues(sExpr) var reMatch func(b []byte) bool var reCost uint64 var literalSuffix string @@ -787,93 +788,6 @@ func isLiteral(sre *syntax.Regexp) bool { return sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 } -func getOrValues(expr string) []string { - sre, err := syntax.Parse(expr, syntax.Perl) - if err != nil { - logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err) - } - orValues := getOrValuesExt(sre) - - // Sort orValues for faster index seek later - sort.Strings(orValues) - - return orValues -} - -func getOrValuesExt(sre *syntax.Regexp) []string { - switch sre.Op { - case syntax.OpCapture: - return getOrValuesExt(sre.Sub[0]) - case syntax.OpLiteral: - if !isLiteral(sre) { - return nil - } - return []string{string(sre.Rune)} - case syntax.OpEmptyMatch: - return []string{""} - case syntax.OpBeginText, syntax.OpEndText: - return []string{""} - case syntax.OpAlternate: - a := make([]string, 0, len(sre.Sub)) - for _, reSub := range sre.Sub { - ca := getOrValuesExt(reSub) - if len(ca) == 0 { - return nil - } - a = append(a, ca...) - if len(a) > maxOrValues { - // It is cheaper to use regexp here. - return nil - } - } - return a - case syntax.OpCharClass: - a := make([]string, 0, len(sre.Rune)/2) - for i := 0; i < len(sre.Rune); i += 2 { - start := sre.Rune[i] - end := sre.Rune[i+1] - for start <= end { - a = append(a, string(start)) - start++ - if len(a) > maxOrValues { - // It is cheaper to use regexp here. 
- return nil - } - } - } - return a - case syntax.OpConcat: - if len(sre.Sub) < 1 { - return []string{""} - } - prefixes := getOrValuesExt(sre.Sub[0]) - if len(prefixes) == 0 { - return nil - } - sre.Sub = sre.Sub[1:] - suffixes := getOrValuesExt(sre) - if len(suffixes) == 0 { - return nil - } - if len(prefixes)*len(suffixes) > maxOrValues { - // It is cheaper to use regexp here. - return nil - } - a := make([]string, 0, len(prefixes)*len(suffixes)) - for _, prefix := range prefixes { - for _, suffix := range suffixes { - s := prefix + suffix - a = append(a, s) - } - } - return a - default: - return nil - } -} - -const maxOrValues = 100 - var tagCharsRegexpEscaper = strings.NewReplacer( "\\x00", "\\x000", // escapeChar "\x00", "\\x000", // escapeChar diff --git a/lib/storage/tag_filters_test.go b/lib/storage/tag_filters_test.go index 3d692c8713..b9fedec63e 100644 --- a/lib/storage/tag_filters_test.go +++ b/lib/storage/tag_filters_test.go @@ -1145,46 +1145,6 @@ func TestTagFilterMatchSuffix(t *testing.T) { }) } -func TestGetOrValues(t *testing.T) { - f := func(s string, valuesExpected []string) { - t.Helper() - values := getOrValues(s) - if !reflect.DeepEqual(values, valuesExpected) { - t.Fatalf("unexpected values for s=%q; got %q; want %q", s, values, valuesExpected) - } - } - - f("", []string{""}) - f("|foo", []string{"", "foo"}) - f("|foo|", []string{"", "", "foo"}) - f("foo.+", nil) - f("foo.*", nil) - f(".*", nil) - f("foo|.*", nil) - f("foobar", []string{"foobar"}) - f("z|x|c", []string{"c", "x", "z"}) - f("foo|bar", []string{"bar", "foo"}) - f("(foo|bar)", []string{"bar", "foo"}) - f("(foo|bar)baz", []string{"barbaz", "foobaz"}) - f("[a-z][a-z]", nil) - f("[a-d]", []string{"a", "b", "c", "d"}) - f("x[a-d]we", []string{"xawe", "xbwe", "xcwe", "xdwe"}) - f("foo(bar|baz)", []string{"foobar", "foobaz"}) - f("foo(ba[rz]|(xx|o))", []string{"foobar", "foobaz", "fooo", "fooxx"}) - f("foo(?:bar|baz)x(qwe|rt)", []string{"foobarxqwe", "foobarxrt", "foobazxqwe", 
"foobazxrt"}) - f("foo(bar||baz)", []string{"foo", "foobar", "foobaz"}) - f("(a|b|c)(d|e|f|0|1|2)(g|h|k|x|y|z)", nil) - f("(?i)foo", nil) - f("(?i)(foo|bar)", nil) - f("^foo|bar$", []string{"bar", "foo"}) - f("^(foo|bar)$", []string{"bar", "foo"}) - f("^a(foo|b(?:a|r))$", []string{"aba", "abr", "afoo"}) - // This is incorrect conversion, because the regexp matches nothing. - // It is OK for now, since such regexps are uncommon in practice. - // TODO: properly handle this case. - f("^a(^foo|bar$)z$", []string{"abarz", "afooz"}) -} - func TestGetRegexpPrefix(t *testing.T) { f := func(t *testing.T, s, expectedPrefix, expectedSuffix string) { t.Helper()