From b0afef1e2ba25295baf3d013a689a26f2f9372f9 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 23 May 2024 21:24:08 +0200 Subject: [PATCH] wip --- lib/regexutil/promregex.go | 20 ++++++--- lib/regexutil/promregex_test.go | 4 ++ lib/regexutil/regex.go | 35 +++++++++------ lib/regexutil/regex_test.go | 17 +++++++ lib/regexutil/regexutil.go | 79 ++++++++++++++++----------------- 5 files changed, 96 insertions(+), 59 deletions(-) diff --git a/lib/regexutil/promregex.go b/lib/regexutil/promregex.go index eeb2f9667..ebf82f309 100644 --- a/lib/regexutil/promregex.go +++ b/lib/regexutil/promregex.go @@ -23,6 +23,9 @@ type PromRegex struct { // For example, prefix="foo" for regex="foo(a|b)" prefix string + // isOnlyPrefix is set to true if the regex contains only the prefix. + isOnlyPrefix bool + // isSuffixDotStar is set to true if suffix is ".*" isSuffixDotStar bool @@ -49,11 +52,13 @@ func NewPromRegex(expr string) (*PromRegex, error) { return nil, err } prefix, suffix := SimplifyPromRegex(expr) - orValues := GetOrValuesPromRegex(suffix) - isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar) - isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus) - substrDotStar := getSubstringLiteral(suffix, syntax.OpStar) - substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus) + sre := mustParseRegexp(suffix) + orValues := getOrValues(sre) + isOnlyPrefix := len(orValues) == 1 && orValues[0] == "" + isSuffixDotStar := isDotOp(sre, syntax.OpStar) + isSuffixDotPlus := isDotOp(sre, syntax.OpPlus) + substrDotStar := getSubstringLiteral(sre, syntax.OpStar) + substrDotPlus := getSubstringLiteral(sre, syntax.OpPlus) // It is expected that Optimize returns valid regexp in suffix, so use MustCompile here. // Anchor suffix to the beginning and the end of the matching string. suffixExpr := "^(?:" + suffix + ")$" @@ -61,6 +66,7 @@ func NewPromRegex(expr string) (*PromRegex, error) { reSuffixMatcher := bytesutil.NewFastStringMatcher(reSuffix.MatchString) pr := &PromRegex{ prefix: prefix, + isOnlyPrefix: isOnlyPrefix, isSuffixDotStar: isSuffixDotStar, isSuffixDotPlus: isSuffixDotPlus, substrDotStar: substrDotStar, @@ -76,6 +82,10 @@ func NewPromRegex(expr string) (*PromRegex, error) { // The pr is automatically anchored to the beginning and to the end // of the matching string with '^' and '$'. func (pr *PromRegex) MatchString(s string) bool { + if pr.isOnlyPrefix { + return s == pr.prefix + } + if len(pr.prefix) > 0 { if !strings.HasPrefix(s, pr.prefix) { // Fast path - s has another prefix than pr. diff --git a/lib/regexutil/promregex_test.go b/lib/regexutil/promregex_test.go index 83cea682f..24315a187 100644 --- a/lib/regexutil/promregex_test.go +++ b/lib/regexutil/promregex_test.go @@ -118,4 +118,8 @@ func TestPromRegex(t *testing.T) { f(".*;|;.*", "foo;bar", false) f(".*;|;.*", "foo;", true) f(".*;|;.*", ";foo", true) + + f(".*foo(bar|baz)", "fooxfoobaz", true) + f(".*foo(bar|baz)", "fooxfooban", false) + f(".*foo(bar|baz)", "fooxfooban foobar", true) } diff --git a/lib/regexutil/regex.go b/lib/regexutil/regex.go index 9ae668174..63483041f 100644 --- a/lib/regexutil/regex.go +++ b/lib/regexutil/regex.go @@ -19,6 +19,9 @@ type Regex struct { // For example, prefix="foo" for regex="foo(a|b)" prefix string + // isOnlyPrefix is set to true if the regex contains only the prefix. + isOnlyPrefix bool + // isSuffixDotStar is set to true if suffix is ".*" isSuffixDotStar bool @@ -44,25 +47,28 @@ func NewRegex(expr string) (*Regex, error) { if _, err := regexp.Compile(expr); err != nil { return nil, err } + prefix, suffix := SimplifyRegex(expr) - orValues := GetOrValuesRegex(suffix) - isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar) - isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus) - substrDotStar := getSubstringLiteral(suffix, syntax.OpStar) - substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus) + sre := mustParseRegexp(suffix) + orValues := getOrValues(sre) + isOnlyPrefix := len(orValues) == 1 && orValues[0] == "" + isSuffixDotStar := isDotOp(sre, syntax.OpStar) + isSuffixDotPlus := isDotOp(sre, syntax.OpPlus) + substrDotStar := getSubstringLiteral(sre, syntax.OpStar) + substrDotPlus := getSubstringLiteral(sre, syntax.OpPlus) var re *regexp.Regexp - if len(orValues) == 0 && substrDotStar == "" && substrDotPlus == "" && suffix != ".*" && suffix != ".+" { - suffixAnchored := suffix - if len(prefix) > 0 { - suffixAnchored = "^(?:" + suffix + ")" - } - // The suffixAnchored must be properly compiled, since it has been already checked above. - // Otherwise it is a bug, which must be fixed. - re = regexp.MustCompile(suffixAnchored) + suffixAnchored := suffix + if len(prefix) > 0 { + suffixAnchored = "^(?:" + suffix + ")" } + // The suffixAnchored must be properly compiled, since it has been already checked above. + // Otherwise it is a bug, which must be fixed. + re = regexp.MustCompile(suffixAnchored) + r := &Regex{ prefix: prefix, + isOnlyPrefix: isOnlyPrefix, isSuffixDotStar: isSuffixDotStar, isSuffixDotPlus: isSuffixDotPlus, substrDotStar: substrDotStar, @@ -75,6 +81,9 @@ func NewRegex(expr string) (*Regex, error) { // MatchString returns true if s matches pr. func (r *Regex) MatchString(s string) bool { + if r.isOnlyPrefix { + return strings.Contains(s, r.prefix) + } if len(r.prefix) == 0 { return r.matchStringNoPrefix(s) } diff --git a/lib/regexutil/regex_test.go b/lib/regexutil/regex_test.go index a2f0c79ad..72ab66dac 100644 --- a/lib/regexutil/regex_test.go +++ b/lib/regexutil/regex_test.go @@ -4,6 +4,20 @@ import ( "testing" ) +func TestNewRegexFailure(t *testing.T) { + f := func(regex string) { + t.Helper() + + re, err := NewRegex(regex) + if err == nil { + t.Fatalf("expecting non-nil error when parsing %q; got %q", regex, re.re) + } + } + + f("[foo") + f("(foo") +} + func TestRegexMatchString(t *testing.T) { f := func(regex, s string, resultExpected bool) { t.Helper() @@ -122,4 +136,7 @@ func TestRegexMatchString(t *testing.T) { f("baz$", "foobarbaz", true) f("(bar$|^foo)", "foobarbaz", true) f("(bar$^boo)", "foobarbaz", false) + f("foo(bar|baz)", "a fooxfoobaz a", true) + f("foo(bar|baz)", "a fooxfooban a", false) + f("foo(bar|baz)", "a fooxfooban foobar a", true) } diff --git a/lib/regexutil/regexutil.go b/lib/regexutil/regexutil.go index 5aa641279..5807e148a 100644 --- a/lib/regexutil/regexutil.go +++ b/lib/regexutil/regexutil.go @@ -45,9 +45,9 @@ func getOrValuesRegex(expr string, keepAnchors bool) []string { if tailExpr == "" { return []string{prefix} } - sre, err := syntax.Parse(tailExpr, regexParseFlags) + sre, err := parseRegexp(tailExpr) if err != nil { - panic(fmt.Errorf("BUG: unexpected error when parsing verified tailExpr=%q: %w", tailExpr, err)) + return nil } orValues := getOrValues(sre) @@ -69,10 +69,11 @@ func getOrValues(sre *syntax.Regexp) []string { case syntax.OpCapture: return getOrValues(sre.Sub[0]) case syntax.OpLiteral: - if !isLiteral(sre) { + v, ok := getLiteral(sre) + if !ok { return nil } - return []string{string(sre.Rune)} + return []string{v} case syntax.OpEmptyMatch: return []string{""} case syntax.OpAlternate: @@ -137,11 +138,14 @@ func getOrValues(sre *syntax.Regexp) []string { } } -func isLiteral(sre *syntax.Regexp) bool { +func getLiteral(sre *syntax.Regexp) (string, bool) { if sre.Op == syntax.OpCapture { - return isLiteral(sre.Sub[0]) + return getLiteral(sre.Sub[0]) } - return sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 + if sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 { + return string(sre.Rune), true + } + return "", false } const maxOrValues = 100 @@ -167,7 +171,7 @@ func SimplifyPromRegex(expr string) (string, string) { } func simplifyRegex(expr string, keepAnchors bool) (string, string) { - sre, err := syntax.Parse(expr, regexParseFlags) + sre, err := parseRegexp(expr) if err != nil { // Cannot parse the regexp. Return it all as prefix. return expr, "" @@ -176,14 +180,14 @@ func simplifyRegex(expr string, keepAnchors bool) (string, string) { if sre == emptyRegexp { return "", "" } - if isLiteral(sre) { - return string(sre.Rune), "" + v, ok := getLiteral(sre) + if ok { + return v, "" } var prefix string if sre.Op == syntax.OpConcat { - sub0 := sre.Sub[0] - if isLiteral(sub0) { - prefix = string(sub0.Rune) + prefix, ok = getLiteral(sre.Sub[0]) + if ok { sre.Sub = sre.Sub[1:] if len(sre.Sub) == 0 { return prefix, "" @@ -216,11 +220,7 @@ func simplifyRegexp(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Reg if sNew == s { return sre } - var err error - sre, err = syntax.Parse(sNew, regexParseFlags) - if err != nil { - panic(fmt.Errorf("BUG: cannot parse simplified regexp %q: %w", sNew, err)) - } + sre = mustParseRegexp(sNew) s = sNew } } @@ -282,36 +282,23 @@ func simplifyRegexpExt(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax. } } -// getSubstringLiteral returns regex part from expr surrounded by .+ or .* depending on the prefixSuffixOp. +// getSubstringLiteral returns regex part from sre surrounded by .+ or .* depending on the prefixSuffixOp. // -// For example, if expr=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo". +// For example, if sre=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo". // -// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix -// or if the regex part surrounded by prefixSuffix contains alternate regexps. -func getSubstringLiteral(expr string, prefixSuffixOp syntax.Op) string { - // Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix. - sre, err := syntax.Parse(expr, regexParseFlags) - if err != nil { +// An empty string is returned if sre doesn't contain the given prefixSuffixOp prefix and suffix. +func getSubstringLiteral(sre *syntax.Regexp, prefixSuffixOp syntax.Op) string { + if sre.Op != syntax.OpConcat || len(sre.Sub) != 3 { return "" } - if sre.Op != syntax.OpConcat { + if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) { return "" } - if len(sre.Sub) != 3 { + v, ok := getLiteral(sre.Sub[1]) + if !ok { return "" } - if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) || !isLiteral(sre.Sub[1]) { - return "" - } - return string(sre.Sub[1].Rune) -} - -func isDotOpRegexp(expr string, op syntax.Op) bool { - sre, err := syntax.Parse(expr, regexParseFlags) - if err != nil { - return false - } - return isDotOp(sre, op) + return v } func isDotOp(sre *syntax.Regexp, op syntax.Op) bool { @@ -325,4 +312,14 @@ var emptyRegexp = &syntax.Regexp{ Op: syntax.OpEmptyMatch, } -const regexParseFlags = syntax.Perl | syntax.DotNL +func parseRegexp(expr string) (*syntax.Regexp, error) { + return syntax.Parse(expr, syntax.Perl|syntax.DotNL) +} + +func mustParseRegexp(expr string) *syntax.Regexp { + sre, err := parseRegexp(expr) + if err != nil { + panic(fmt.Errorf("BUG: cannot parse already verified regexp %q: %w", expr, err)) + } + return sre +}