From 91b006f0a72bfb6deb548afb0927d9f409b9cf2e Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 23 May 2024 17:32:42 +0200 Subject: [PATCH] wip --- lib/regexutil/promregex.go | 91 +++++++----------- lib/regexutil/regex.go | 165 ++++++++++++++++++++++++++++++++ lib/regexutil/regex_test.go | 125 ++++++++++++++++++++++++ lib/regexutil/regexutil.go | 119 ++++++++++++++++++----- lib/regexutil/regexutil_test.go | 20 ++-- 5 files changed, 429 insertions(+), 91 deletions(-) create mode 100644 lib/regexutil/regex.go create mode 100644 lib/regexutil/regex_test.go diff --git a/lib/regexutil/promregex.go b/lib/regexutil/promregex.go index 60ffe3f2f..eeb2f9667 100644 --- a/lib/regexutil/promregex.go +++ b/lib/regexutil/promregex.go @@ -23,9 +23,11 @@ type PromRegex struct { // For example, prefix="foo" for regex="foo(a|b)" prefix string - // Suffix contains regex suffix left after removing the prefix. - // For example, suffix="a|b" for regex="foo(a|b)" - suffix string + // isSuffixDotStar is set to true if suffix is ".*" + isSuffixDotStar bool + + // isSuffixDotPlus is set to true if suffix is ".+" + isSuffixDotPlus bool // substrDotStar contains literal string for regex suffix=".*string.*" substrDotStar string @@ -48,8 +50,10 @@ func NewPromRegex(expr string) (*PromRegex, error) { } prefix, suffix := SimplifyPromRegex(expr) orValues := GetOrValuesPromRegex(suffix) - substrDotStar := getSubstringLiteral(suffix, ".*") - substrDotPlus := getSubstringLiteral(suffix, ".+") + isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar) + isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus) + substrDotStar := getSubstringLiteral(suffix, syntax.OpStar) + substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus) // It is expected that Optimize returns valid regexp in suffix, so use MustCompile here. // Anchor suffix to the beginning and the end of the matching string. 
suffixExpr := "^(?:" + suffix + ")$" @@ -57,7 +61,8 @@ func NewPromRegex(expr string) (*PromRegex, error) { reSuffixMatcher := bytesutil.NewFastStringMatcher(reSuffix.MatchString) pr := &PromRegex{ prefix: prefix, - suffix: suffix, + isSuffixDotStar: isSuffixDotStar, + isSuffixDotPlus: isSuffixDotPlus, substrDotStar: substrDotStar, substrDotPlus: substrDotPlus, orValues: orValues, @@ -71,19 +76,21 @@ func NewPromRegex(expr string) (*PromRegex, error) { // The pr is automatically anchored to the beginning and to the end // of the matching string with '^' and '$'. func (pr *PromRegex) MatchString(s string) bool { - if !strings.HasPrefix(s, pr.prefix) { - // Fast path - s has another prefix than pr. - return false - } - s = s[len(pr.prefix):] - if len(pr.orValues) > 0 { - // Fast path - pr contains only alternate strings such as 'foo|bar|baz' - for _, v := range pr.orValues { - if s == v { - return true - } + if len(pr.prefix) > 0 { + if !strings.HasPrefix(s, pr.prefix) { + // Fast path - s has another prefix than pr. + return false } - return false + s = s[len(pr.prefix):] + } + + if pr.isSuffixDotStar { + // Fast path - the pr contains "prefix.*" + return true + } + if pr.isSuffixDotPlus { + // Fast path - the pr contains "prefix.+" + return len(s) > 0 } if pr.substrDotStar != "" { // Fast path - pr contains ".*someText.*" @@ -94,45 +101,17 @@ func (pr *PromRegex) MatchString(s string) bool { n := strings.Index(s, pr.substrDotPlus) return n > 0 && n+len(pr.substrDotPlus) < len(s) } - switch pr.suffix { - case ".*": - // Fast path - the pr contains "prefix.*" - return true - case ".+": - // Fast path - the pr contains "prefix.+" - return len(s) > 0 + + if len(pr.orValues) > 0 { + // Fast path - pr contains only alternate strings such as 'foo|bar|baz' + for _, v := range pr.orValues { + if s == v { + return true + } + } + return false } + // Fall back to slow path by matching the original regexp. 
return pr.reSuffixMatcher.Match(s) } - -// getSubstringLiteral returns regex part from expr surrounded by prefixSuffix. -// -// For example, if expr=".+foo.+" and prefixSuffix=".+", then the function returns "foo". -// -// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix -// or if the regex part surrounded by prefixSuffix contains alternate regexps. -func getSubstringLiteral(expr, prefixSuffix string) string { - // Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix. - sre, err := syntax.Parse(expr, syntax.Perl) - if err != nil { - return "" - } - if sre.Op == syntax.OpAlternate { - return "" - } - - if !strings.HasPrefix(expr, prefixSuffix) { - return "" - } - expr = expr[len(prefixSuffix):] - if !strings.HasSuffix(expr, prefixSuffix) { - return "" - } - expr = expr[:len(expr)-len(prefixSuffix)] - prefix, suffix := SimplifyPromRegex(expr) - if suffix != "" { - return "" - } - return prefix -} diff --git a/lib/regexutil/regex.go b/lib/regexutil/regex.go new file mode 100644 index 000000000..9ae668174 --- /dev/null +++ b/lib/regexutil/regex.go @@ -0,0 +1,165 @@ +package regexutil + +import ( + "regexp" + "regexp/syntax" + "strings" +) + +// Regex implements an optimized string matching for Go regex. +// +// The following regexs are optimized: +// +// - plain string such as "foobar" +// - alternate strings such as "foo|bar|baz" +// - prefix match such as "foo.*" or "foo.+" +// - substring match such as ".*foo.*" or ".+bar.+" +type Regex struct { + // prefix contains literal prefix for regex. 
+ // For example, prefix="foo" for regex="foo(a|b)" + prefix string + + // isSuffixDotStar is set to true if suffix is ".*" + isSuffixDotStar bool + + // isSuffixDotPlus is set to true if suffix is ".+" + isSuffixDotPlus bool + + // substrDotStar contains literal string for regex suffix=".*string.*" + substrDotStar string + + // substrDotPlus contains literal string for regex suffix=".+string.+" + substrDotPlus string + + // orValues contains or values for the suffix regex. + // For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz" + orValues []string + + // re is the regexp compiled from the suffix; it is nil when a fast path covers the suffix. + re *regexp.Regexp +} + +// NewRegex returns Regex for the given expr. +func NewRegex(expr string) (*Regex, error) { + if _, err := regexp.Compile(expr); err != nil { + return nil, err + } + prefix, suffix := SimplifyRegex(expr) + orValues := GetOrValuesRegex(suffix) + isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar) + isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus) + substrDotStar := getSubstringLiteral(suffix, syntax.OpStar) + substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus) + + var re *regexp.Regexp + if len(orValues) == 0 && substrDotStar == "" && substrDotPlus == "" && suffix != ".*" && suffix != ".+" { + suffixAnchored := suffix + if len(prefix) > 0 { + suffixAnchored = "^(?:" + suffix + ")" + } + // The suffixAnchored must be properly compiled, since it has been already checked above. + // Otherwise it is a bug, which must be fixed. + re = regexp.MustCompile(suffixAnchored) + } + r := &Regex{ + prefix: prefix, + isSuffixDotStar: isSuffixDotStar, + isSuffixDotPlus: isSuffixDotPlus, + substrDotStar: substrDotStar, + substrDotPlus: substrDotPlus, + orValues: orValues, + re: re, + } + return r, nil +} + +// MatchString returns true if s matches r.
+func (r *Regex) MatchString(s string) bool { + if len(r.prefix) == 0 { + return r.matchStringNoPrefix(s) + } + return r.matchStringWithPrefix(s) +} + +func (r *Regex) matchStringNoPrefix(s string) bool { + if r.isSuffixDotStar { + return true + } + if r.isSuffixDotPlus { + return len(s) > 0 + } + if r.substrDotStar != "" { + // Fast path - r contains ".*someText.*" + return strings.Contains(s, r.substrDotStar) + } + if r.substrDotPlus != "" { + // Fast path - r contains ".+someText.+" + n := strings.Index(s, r.substrDotPlus) + return n > 0 && n+len(r.substrDotPlus) < len(s) + } + + if len(r.orValues) == 0 { + // Fall back to slow path by matching the original regexp. + return r.re.MatchString(s) + } + + // Fast path - check whether s contains any of r.orValues + for _, v := range r.orValues { + if strings.Contains(s, v) { + return true + } + } + return false +} + +func (r *Regex) matchStringWithPrefix(s string) bool { + n := strings.Index(s, r.prefix) + if n < 0 { + // Fast path - s doesn't contain the needed prefix + return false + } + sNext := s[n+1:] + s = s[n+len(r.prefix):] + + if r.isSuffixDotStar { + return true + } + if r.isSuffixDotPlus { + return len(s) > 0 + } + if r.substrDotStar != "" { + // Fast path - r contains ".*someText.*" + return strings.Contains(s, r.substrDotStar) + } + if r.substrDotPlus != "" { + // Fast path - r contains ".+someText.+" + n := strings.Index(s, r.substrDotPlus) + return n > 0 && n+len(r.substrDotPlus) < len(s) + } + + for { + if len(r.orValues) == 0 { + // Fall back to slow path by matching the original regexp. + if r.re.MatchString(s) { + return true + } + } else { + // Fast path - check whether s starts with any of r.orValues + for _, v := range r.orValues { + if strings.HasPrefix(s, v) { + return true + } + } + } + + // Mismatch. Try again starting from the next char.
+ s = sNext + n := strings.Index(s, r.prefix) + if n < 0 { + // Fast path - s doesn't contain the needed prefix + return false + } + sNext = s[n+1:] + s = s[n+len(r.prefix):] + } +} diff --git a/lib/regexutil/regex_test.go b/lib/regexutil/regex_test.go new file mode 100644 index 000000000..a2f0c79ad --- /dev/null +++ b/lib/regexutil/regex_test.go @@ -0,0 +1,125 @@ +package regexutil + +import ( + "testing" +) + +func TestRegexMatchString(t *testing.T) { + f := func(regex, s string, resultExpected bool) { + t.Helper() + + re, err := NewRegex(regex) + if err != nil { + t.Fatalf("cannot parse %q: %s", regex, err) + } + result := re.MatchString(s) + if result != resultExpected { + t.Fatalf("unexpected result when matching %q against regex=%q; got %v; want %v", s, regex, result, resultExpected) + } + } + + f("", "", true) + f("", "foo", true) + f("foo", "", false) + f(".*", "", true) + f(".*", "foo", true) + f(".+", "", false) + f(".+", "foo", true) + f("foo.*", "bar", false) + f("foo.*", "foo", true) + f("foo.*", "a foo", true) + f("foo.*", "a foo a", true) + f("foo.*", "foobar", true) + f("foo.*", "a foobar", true) + f("foo.+", "bar", false) + f("foo.+", "foo", false) + f("foo.+", "a foo", false) + f("foo.+", "foobar", true) + f("foo.+", "a foobar", true) + f("foo|bar", "", false) + f("foo|bar", "a", false) + f("foo|bar", "foo", true) + f("foo|bar", "a foo", true) + f("foo|bar", "foo a", true) + f("foo|bar", "a foo a", true) + f("foo|bar", "bar", true) + f("foo|bar", "foobar", true) + f("foo(bar|baz)", "a", false) + f("foo(bar|baz)", "foobar", true) + f("foo(bar|baz)", "foobaz", true) + f("foo(bar|baz)", "foobaza", true) + f("foo(bar|baz)", "a foobaz a", true) + f("foo(bar|baz)", "foobal", false) + f("^foo|b(ar)$", "foo", true) + f("^foo|b(ar)$", "foo a", true) + f("^foo|b(ar)$", "a foo", false) + f("^foo|b(ar)$", "bar", true) + f("^foo|b(ar)$", "a bar", true) + f("^foo|b(ar)$", "barz", false) + f("^foo|b(ar)$", "ar", false) + f(".*foo.*", "foo", true) + f(".*foo.*", 
"afoobar", true) + f(".*foo.*", "abc", false) + f("foo.*bar.*", "foobar", true) + f("foo.*bar.*", "foo_bar_", true) + f("foo.*bar.*", "a foo bar baz", true) + f("foo.*bar.*", "foobaz", false) + f("foo.*bar.*", "baz foo", false) + f(".+foo.+", "foo", false) + f(".+foo.+", "afoobar", true) + f(".+foo.+", "afoo", false) + f(".+foo.+", "abc", false) + f("foo.+bar.+", "foobar", false) + f("foo.+bar.+", "foo_bar_", true) + f("foo.+bar.+", "a foo_bar_", true) + f("foo.+bar.+", "foobaz", false) + f("foo.+bar.+", "abc", false) + f(".+foo.*", "foo", false) + f(".+foo.*", "afoo", true) + f(".+foo.*", "afoobar", true) + f(".*(a|b).*", "a", true) + f(".*(a|b).*", "ax", true) + f(".*(a|b).*", "xa", true) + f(".*(a|b).*", "xay", true) + f(".*(a|b).*", "xzy", false) + f("^(?:true)$", "true", true) + f("^(?:true)$", "false", false) + + f(".+;|;.+", ";", false) + f(".+;|;.+", "foo", false) + f(".+;|;.+", "foo;bar", true) + f(".+;|;.+", "foo;", true) + f(".+;|;.+", ";foo", true) + f(".+foo|bar|baz.+", "foo", false) + f(".+foo|bar|baz.+", "afoo", true) + f(".+foo|bar|baz.+", "fooa", false) + f(".+foo|bar|baz.+", "afooa", true) + f(".+foo|bar|baz.+", "bar", true) + f(".+foo|bar|baz.+", "abar", true) + f(".+foo|bar|baz.+", "abara", true) + f(".+foo|bar|baz.+", "bara", true) + f(".+foo|bar|baz.+", "baz", false) + f(".+foo|bar|baz.+", "baza", true) + f(".+foo|bar|baz.+", "abaz", false) + f(".+foo|bar|baz.+", "abaza", true) + f(".+foo|bar|baz.+", "afoo|bar|baza", true) + f(".+(foo|bar|baz).+", "bar", false) + f(".+(foo|bar|baz).+", "bara", false) + f(".+(foo|bar|baz).+", "abar", false) + f(".+(foo|bar|baz).+", "abara", true) + f(".+(foo|bar|baz).+", "afooa", true) + f(".+(foo|bar|baz).+", "abaza", true) + + f(".*;|;.*", ";", true) + f(".*;|;.*", "foo", false) + f(".*;|;.*", "foo;bar", true) + f(".*;|;.*", "foo;", true) + f(".*;|;.*", ";foo", true) + + f("^bar", "foobarbaz", false) + f("^foo", "foobarbaz", true) + f("bar$", "foobarbaz", false) + f("baz$", "foobarbaz", true) + 
f("(bar$|^foo)", "foobarbaz", true) + f("(bar$^boo)", "foobarbaz", false) +} diff --git a/lib/regexutil/regexutil.go b/lib/regexutil/regexutil.go index 64480c2d7..5aa641279 100644 --- a/lib/regexutil/regexutil.go +++ b/lib/regexutil/regexutil.go @@ -18,6 +18,16 @@ func RemoveStartEndAnchors(expr string) string { return expr } +// GetOrValuesRegex returns "or" values from the given regexp expr. +// +// It returns ["foo", "bar"] for "foo|bar" regexp. +// It returns ["foo"] for "foo" regexp. +// It returns [""] for "" regexp. +// It returns an empty list if it is impossible to extract "or" values from the regexp. +func GetOrValuesRegex(expr string) []string { + return getOrValuesRegex(expr, true) +} + // GetOrValuesPromRegex returns "or" values from the given Prometheus-like regexp expr. // // It ignores start and end anchors ('^') and ('$') at the start and the end of expr. @@ -27,15 +37,19 @@ func RemoveStartEndAnchors(expr string) string { // It returns an empty list if it is impossible to extract "or" values from the regexp. 
func GetOrValuesPromRegex(expr string) []string { expr = RemoveStartEndAnchors(expr) - prefix, tailExpr := SimplifyPromRegex(expr) + return getOrValuesRegex(expr, false) +} + +func getOrValuesRegex(expr string, keepAnchors bool) []string { + prefix, tailExpr := simplifyRegex(expr, keepAnchors) if tailExpr == "" { return []string{prefix} } - sre, err := syntax.Parse(tailExpr, syntax.Perl) + sre, err := syntax.Parse(tailExpr, regexParseFlags) if err != nil { panic(fmt.Errorf("BUG: unexpected error when parsing verified tailExpr=%q: %w", tailExpr, err)) } - orValues := getOrValuesExt(sre) + orValues := getOrValues(sre) // Sort orValues for faster index seek later sort.Strings(orValues) @@ -50,10 +64,10 @@ func GetOrValuesPromRegex(expr string) []string { return orValues } -func getOrValuesExt(sre *syntax.Regexp) []string { +func getOrValues(sre *syntax.Regexp) []string { switch sre.Op { case syntax.OpCapture: - return getOrValuesExt(sre.Sub[0]) + return getOrValues(sre.Sub[0]) case syntax.OpLiteral: if !isLiteral(sre) { return nil @@ -64,7 +78,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string { case syntax.OpAlternate: a := make([]string, 0, len(sre.Sub)) for _, reSub := range sre.Sub { - ca := getOrValuesExt(reSub) + ca := getOrValues(reSub) if len(ca) == 0 { return nil } @@ -94,7 +108,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string { if len(sre.Sub) < 1 { return []string{""} } - prefixes := getOrValuesExt(sre.Sub[0]) + prefixes := getOrValues(sre.Sub[0]) if len(prefixes) == 0 { return nil } @@ -102,7 +116,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string { return prefixes } sre.Sub = sre.Sub[1:] - suffixes := getOrValuesExt(sre) + suffixes := getOrValues(sre) if len(suffixes) == 0 { return nil } @@ -132,21 +146,33 @@ func isLiteral(sre *syntax.Regexp) bool { const maxOrValues = 100 +// SimplifyRegex simplifies the given regexp expr. +// +// It returns plaintext prefix and the remaining regular expression +// without capturing parens.
+func SimplifyRegex(expr string) (string, string) { + return simplifyRegex(expr, true) +} + // SimplifyPromRegex simplifies the given Prometheus-like expr. // // It returns plaintext prefix and the remaining regular expression -// with dropped '^' and '$' anchors at the beginning and the end +// with dropped '^' and '$' anchors at the beginning and at the end // of the regular expression. // // The function removes capturing parens from the expr, // so it cannot be used when capturing parens are necessary. func SimplifyPromRegex(expr string) (string, string) { - sre, err := syntax.Parse(expr, syntax.Perl) + return simplifyRegex(expr, false) +} + +func simplifyRegex(expr string, keepAnchors bool) (string, string) { + sre, err := syntax.Parse(expr, regexParseFlags) if err != nil { // Cannot parse the regexp. Return it all as prefix. return expr, "" } - sre = simplifyRegexp(sre, false) + sre = simplifyRegexp(sre, keepAnchors, keepAnchors) if sre == emptyRegexp { return "", "" } @@ -162,7 +188,7 @@ func SimplifyPromRegex(expr string) (string, string) { if len(sre.Sub) == 0 { return prefix, "" } - sre = simplifyRegexp(sre, true) + sre = simplifyRegexp(sre, true, keepAnchors) } } if _, err := syntax.Compile(sre); err != nil { @@ -171,17 +197,19 @@ func SimplifyPromRegex(expr string) (string, string) { } s := sre.String() s = strings.ReplaceAll(s, "(?:)", "") - s = strings.ReplaceAll(s, "(?-s:.)", ".") - s = strings.ReplaceAll(s, "(?-m:$)", "$") + s = strings.ReplaceAll(s, "(?s:.)", ".") + s = strings.ReplaceAll(s, "(?m:$)", "$") return prefix, s } -func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp { +func simplifyRegexp(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp { s := sre.String() for { - sre = simplifyRegexpExt(sre, hasPrefix, false) + sre = simplifyRegexpExt(sre, keepBeginOp, keepEndOp) sre = sre.Simplify() - if sre.Op == syntax.OpBeginText || sre.Op == syntax.OpEndText { + if !keepBeginOp && sre.Op == syntax.OpBeginText { + 
sre = emptyRegexp + } else if !keepEndOp && sre.Op == syntax.OpEndText { sre = emptyRegexp } sNew := sre.String() @@ -189,7 +217,7 @@ func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp { return sre } var err error - sre, err = syntax.Parse(sNew, syntax.Perl) + sre, err = syntax.Parse(sNew, regexParseFlags) if err != nil { panic(fmt.Errorf("BUG: cannot parse simplified regexp %q: %w", sNew, err)) } @@ -197,18 +225,18 @@ func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp { } } -func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Regexp { +func simplifyRegexpExt(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp { switch sre.Op { case syntax.OpCapture: // Substitute all the capture regexps with non-capture regexps. sre.Op = syntax.OpAlternate - sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix) + sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp) if sre.Sub[0] == emptyRegexp { return emptyRegexp } return sre case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat: - sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix) + sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp) if sre.Sub[0] == emptyRegexp { return emptyRegexp } @@ -216,13 +244,13 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re case syntax.OpAlternate: // Do not remove empty captures from OpAlternate, since this may break regexp. 
for i, sub := range sre.Sub { - sre.Sub[i] = simplifyRegexpExt(sub, hasPrefix, hasSuffix) + sre.Sub[i] = simplifyRegexpExt(sub, keepBeginOp, keepEndOp) } return sre case syntax.OpConcat: subs := sre.Sub[:0] for i, sub := range sre.Sub { - sub = simplifyRegexpExt(sub, hasPrefix || len(subs) > 0, hasSuffix || i+1 < len(sre.Sub)) + sub = simplifyRegexpExt(sub, keepBeginOp || len(subs) > 0, keepEndOp || i+1 < len(sre.Sub)) if sub != emptyRegexp { subs = append(subs, sub) } @@ -230,12 +258,12 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re sre.Sub = subs // Remove anchros from the beginning and the end of regexp, since they // will be added later. - if !hasPrefix { + if !keepBeginOp { for len(sre.Sub) > 0 && sre.Sub[0].Op == syntax.OpBeginText { sre.Sub = sre.Sub[1:] } } - if !hasSuffix { + if !keepEndOp { for len(sre.Sub) > 0 && sre.Sub[len(sre.Sub)-1].Op == syntax.OpEndText { sre.Sub = sre.Sub[:len(sre.Sub)-1] } @@ -254,6 +282,47 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re } } +// getSubstringLiteral returns regex part from expr surrounded by .+ or .* depending on the prefixSuffixOp. +// +// For example, if expr=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo". +// +// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix +// or if the regex part surrounded by prefixSuffix contains alternate regexps. +func getSubstringLiteral(expr string, prefixSuffixOp syntax.Op) string { + // Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix. 
+ sre, err := syntax.Parse(expr, regexParseFlags) + if err != nil { + return "" + } + if sre.Op != syntax.OpConcat { + return "" + } + if len(sre.Sub) != 3 { + return "" + } + if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) || !isLiteral(sre.Sub[1]) { + return "" + } + return string(sre.Sub[1].Rune) +} + +func isDotOpRegexp(expr string, op syntax.Op) bool { + sre, err := syntax.Parse(expr, regexParseFlags) + if err != nil { + return false + } + return isDotOp(sre, op) +} + +func isDotOp(sre *syntax.Regexp, op syntax.Op) bool { + if sre.Op != op { + return false + } + return sre.Sub[0].Op == syntax.OpAnyChar +} + var emptyRegexp = &syntax.Regexp{ Op: syntax.OpEmptyMatch, } + +const regexParseFlags = syntax.Perl | syntax.DotNL diff --git a/lib/regexutil/regexutil_test.go b/lib/regexutil/regexutil_test.go index 51ca2a481..95ad50a5b 100644 --- a/lib/regexutil/regexutil_test.go +++ b/lib/regexutil/regexutil_test.go @@ -77,7 +77,7 @@ func TestSimplifyPromRegex(t *testing.T) { f("^foobar|foobaz", "fooba", "[rz]") f("^foobar|^foobaz$", "fooba", "[rz]") f("foobar|foobaz", "fooba", "[rz]") - f("(?:^foobar|^foobaz)aa.*", "fooba", "(?-s:[rz]aa.*)") + f("(?:^foobar|^foobaz)aa.*", "fooba", "(?s:[rz]aa.*)") f("foo[bar]+", "foo", "[abr]+") f("foo[a-z]+", "foo", "[a-z]+") f("foo[bar]*", "foo", "[abr]*") @@ -88,12 +88,12 @@ func TestSimplifyPromRegex(t *testing.T) { f("foo[^x]*", "foo", "[^x]*") f("foo[x]*bar", "foo", "x*bar") f("fo\\Bo[x]*bar?", "fo", "\\Box*bar?") - f("foo.+bar", "foo", "(?-s:.+bar)") - f("a(b|c.*).+", "a", "(?-s:(?:b|c.*).+)") + f("foo.+bar", "foo", "(?s:.+bar)") + f("a(b|c.*).+", "a", "(?s:(?:b|c.*).+)") f("ab|ac", "a", "[bc]") f("(?i)xyz", "", "(?i:XYZ)") f("(?i)foo|bar", "", "(?i:FOO|BAR)") - f("(?i)up.+x", "", "(?i-s:UP.+X)") + f("(?i)up.+x", "", "(?is:UP.+X)") f("(?smi)xy.*z$", "", "(?ims:XY.*Z$)") // test invalid regexps @@ -111,12 +111,12 @@ func TestSimplifyPromRegex(t *testing.T) { f("(foo|bar$)x*", "", 
"(?-m:(?:foo|bar$)x*)") // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297 - f(".+;|;.+", "", "(?-s:.+;|;.+)") - f("^(.+);|;(.+)$", "", "(?-s:.+;|;.+)") - f("^(.+);$|^;(.+)$", "", "(?-s:.+;|;.+)") - f(".*;|;.*", "", "(?-s:.*;|;.*)") - f("^(.*);|;(.*)$", "", "(?-s:.*;|;.*)") - f("^(.*);$|^;(.*)$", "", "(?-s:.*;|;.*)") + f(".+;|;.+", "", "(?s:.+;|;.+)") + f("^(.+);|;(.+)$", "", "(?s:.+;|;.+)") + f("^(.+);$|^;(.+)$", "", "(?s:.+;|;.+)") + f(".*;|;.*", "", "(?s:.*;|;.*)") + f("^(.*);|;(.*)$", "", "(?s:.*;|;.*)") + f("^(.*);$|^;(.*)$", "", "(?s:.*;|;.*)") } func TestRemoveStartEndAnchors(t *testing.T) {