This commit is contained in:
Aliaksandr Valialkin 2024-05-23 21:24:08 +02:00
parent 32e96050f9
commit b0afef1e2b
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
5 changed files with 96 additions and 59 deletions

View file

@ -23,6 +23,9 @@ type PromRegex struct {
// For example, prefix="foo" for regex="foo(a|b)"
prefix string
// isOnlyPrefix is set to true if the regex contains only the prefix.
isOnlyPrefix bool
// isSuffixDotStar is set to true if suffix is ".*"
isSuffixDotStar bool
@ -49,11 +52,13 @@ func NewPromRegex(expr string) (*PromRegex, error) {
return nil, err
}
prefix, suffix := SimplifyPromRegex(expr)
orValues := GetOrValuesPromRegex(suffix)
isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar)
isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus)
substrDotStar := getSubstringLiteral(suffix, syntax.OpStar)
substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus)
sre := mustParseRegexp(suffix)
orValues := getOrValues(sre)
isOnlyPrefix := len(orValues) == 1 && orValues[0] == ""
isSuffixDotStar := isDotOp(sre, syntax.OpStar)
isSuffixDotPlus := isDotOp(sre, syntax.OpPlus)
substrDotStar := getSubstringLiteral(sre, syntax.OpStar)
substrDotPlus := getSubstringLiteral(sre, syntax.OpPlus)
// It is expected that Optimize returns valid regexp in suffix, so use MustCompile here.
// Anchor suffix to the beginning and the end of the matching string.
suffixExpr := "^(?:" + suffix + ")$"
@ -61,6 +66,7 @@ func NewPromRegex(expr string) (*PromRegex, error) {
reSuffixMatcher := bytesutil.NewFastStringMatcher(reSuffix.MatchString)
pr := &PromRegex{
prefix: prefix,
isOnlyPrefix: isOnlyPrefix,
isSuffixDotStar: isSuffixDotStar,
isSuffixDotPlus: isSuffixDotPlus,
substrDotStar: substrDotStar,
@ -76,6 +82,10 @@ func NewPromRegex(expr string) (*PromRegex, error) {
// The pr is automatically anchored to the beginning and to the end
// of the matching string with '^' and '$'.
func (pr *PromRegex) MatchString(s string) bool {
if pr.isOnlyPrefix {
return s == pr.prefix
}
if len(pr.prefix) > 0 {
if !strings.HasPrefix(s, pr.prefix) {
// Fast path - s has another prefix than pr.

View file

@ -118,4 +118,8 @@ func TestPromRegex(t *testing.T) {
f(".*;|;.*", "foo;bar", false)
f(".*;|;.*", "foo;", true)
f(".*;|;.*", ";foo", true)
f(".*foo(bar|baz)", "fooxfoobaz", true)
f(".*foo(bar|baz)", "fooxfooban", false)
f(".*foo(bar|baz)", "fooxfooban foobar", true)
}

View file

@ -19,6 +19,9 @@ type Regex struct {
// For example, prefix="foo" for regex="foo(a|b)"
prefix string
// isOnlyPrefix is set to true if the regex contains only the prefix.
isOnlyPrefix bool
// isSuffixDotStar is set to true if suffix is ".*"
isSuffixDotStar bool
@ -44,25 +47,28 @@ func NewRegex(expr string) (*Regex, error) {
if _, err := regexp.Compile(expr); err != nil {
return nil, err
}
prefix, suffix := SimplifyRegex(expr)
orValues := GetOrValuesRegex(suffix)
isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar)
isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus)
substrDotStar := getSubstringLiteral(suffix, syntax.OpStar)
substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus)
sre := mustParseRegexp(suffix)
orValues := getOrValues(sre)
isOnlyPrefix := len(orValues) == 1 && orValues[0] == ""
isSuffixDotStar := isDotOp(sre, syntax.OpStar)
isSuffixDotPlus := isDotOp(sre, syntax.OpPlus)
substrDotStar := getSubstringLiteral(sre, syntax.OpStar)
substrDotPlus := getSubstringLiteral(sre, syntax.OpPlus)
var re *regexp.Regexp
if len(orValues) == 0 && substrDotStar == "" && substrDotPlus == "" && suffix != ".*" && suffix != ".+" {
suffixAnchored := suffix
if len(prefix) > 0 {
suffixAnchored = "^(?:" + suffix + ")"
}
// The suffixAnchored must be properly compiled, since it has been already checked above.
// Otherwise it is a bug, which must be fixed.
re = regexp.MustCompile(suffixAnchored)
suffixAnchored := suffix
if len(prefix) > 0 {
suffixAnchored = "^(?:" + suffix + ")"
}
// The suffixAnchored must be properly compiled, since it has been already checked above.
// Otherwise it is a bug, which must be fixed.
re = regexp.MustCompile(suffixAnchored)
r := &Regex{
prefix: prefix,
isOnlyPrefix: isOnlyPrefix,
isSuffixDotStar: isSuffixDotStar,
isSuffixDotPlus: isSuffixDotPlus,
substrDotStar: substrDotStar,
@ -75,6 +81,9 @@ func NewRegex(expr string) (*Regex, error) {
// MatchString returns true if s matches r.
func (r *Regex) MatchString(s string) bool {
if r.isOnlyPrefix {
return strings.Contains(s, r.prefix)
}
if len(r.prefix) == 0 {
return r.matchStringNoPrefix(s)
}

View file

@ -4,6 +4,20 @@ import (
"testing"
)
// TestNewRegexFailure verifies that NewRegex returns a non-nil error
// for syntactically invalid regular expressions.
func TestNewRegexFailure(t *testing.T) {
	mustFail := func(expr string) {
		t.Helper()
		re, err := NewRegex(expr)
		if err != nil {
			// Expected outcome - the invalid expression has been rejected.
			return
		}
		t.Fatalf("expecting non-nil error when parsing %q; got %q", expr, re.re)
	}

	// Unclosed character class.
	mustFail("[foo")
	// Unclosed group.
	mustFail("(foo")
}
func TestRegexMatchString(t *testing.T) {
f := func(regex, s string, resultExpected bool) {
t.Helper()
@ -122,4 +136,7 @@ func TestRegexMatchString(t *testing.T) {
f("baz$", "foobarbaz", true)
f("(bar$|^foo)", "foobarbaz", true)
f("(bar$^boo)", "foobarbaz", false)
f("foo(bar|baz)", "a fooxfoobaz a", true)
f("foo(bar|baz)", "a fooxfooban a", false)
f("foo(bar|baz)", "a fooxfooban foobar a", true)
}

View file

@ -45,9 +45,9 @@ func getOrValuesRegex(expr string, keepAnchors bool) []string {
if tailExpr == "" {
return []string{prefix}
}
sre, err := syntax.Parse(tailExpr, regexParseFlags)
sre, err := parseRegexp(tailExpr)
if err != nil {
panic(fmt.Errorf("BUG: unexpected error when parsing verified tailExpr=%q: %w", tailExpr, err))
return nil
}
orValues := getOrValues(sre)
@ -69,10 +69,11 @@ func getOrValues(sre *syntax.Regexp) []string {
case syntax.OpCapture:
return getOrValues(sre.Sub[0])
case syntax.OpLiteral:
if !isLiteral(sre) {
v, ok := getLiteral(sre)
if !ok {
return nil
}
return []string{string(sre.Rune)}
return []string{v}
case syntax.OpEmptyMatch:
return []string{""}
case syntax.OpAlternate:
@ -137,11 +138,14 @@ func getOrValues(sre *syntax.Regexp) []string {
}
}
func isLiteral(sre *syntax.Regexp) bool {
func getLiteral(sre *syntax.Regexp) (string, bool) {
if sre.Op == syntax.OpCapture {
return isLiteral(sre.Sub[0])
return getLiteral(sre.Sub[0])
}
return sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0
if sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 {
return string(sre.Rune), true
}
return "", false
}
const maxOrValues = 100
@ -167,7 +171,7 @@ func SimplifyPromRegex(expr string) (string, string) {
}
func simplifyRegex(expr string, keepAnchors bool) (string, string) {
sre, err := syntax.Parse(expr, regexParseFlags)
sre, err := parseRegexp(expr)
if err != nil {
// Cannot parse the regexp. Return it all as prefix.
return expr, ""
@ -176,14 +180,14 @@ func simplifyRegex(expr string, keepAnchors bool) (string, string) {
if sre == emptyRegexp {
return "", ""
}
if isLiteral(sre) {
return string(sre.Rune), ""
v, ok := getLiteral(sre)
if ok {
return v, ""
}
var prefix string
if sre.Op == syntax.OpConcat {
sub0 := sre.Sub[0]
if isLiteral(sub0) {
prefix = string(sub0.Rune)
prefix, ok = getLiteral(sre.Sub[0])
if ok {
sre.Sub = sre.Sub[1:]
if len(sre.Sub) == 0 {
return prefix, ""
@ -216,11 +220,7 @@ func simplifyRegexp(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Reg
if sNew == s {
return sre
}
var err error
sre, err = syntax.Parse(sNew, regexParseFlags)
if err != nil {
panic(fmt.Errorf("BUG: cannot parse simplified regexp %q: %w", sNew, err))
}
sre = mustParseRegexp(sNew)
s = sNew
}
}
@ -282,36 +282,23 @@ func simplifyRegexpExt(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.
}
}
// getSubstringLiteral returns regex part from expr surrounded by .+ or .* depending on the prefixSuffixOp.
// getSubstringLiteral returns regex part from sre surrounded by .+ or .* depending on the prefixSuffixOp.
//
// For example, if expr=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo".
// For example, if sre=".+foo.+" and prefixSuffixOp=syntax.OpPlus, then the function returns "foo".
//
// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix
// or if the regex part surrounded by prefixSuffix contains alternate regexps.
func getSubstringLiteral(expr string, prefixSuffixOp syntax.Op) string {
// Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix.
sre, err := syntax.Parse(expr, regexParseFlags)
if err != nil {
// An empty string is returned if sre doesn't contain the given prefixSuffixOp prefix and suffix.
func getSubstringLiteral(sre *syntax.Regexp, prefixSuffixOp syntax.Op) string {
if sre.Op != syntax.OpConcat || len(sre.Sub) != 3 {
return ""
}
if sre.Op != syntax.OpConcat {
if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) {
return ""
}
if len(sre.Sub) != 3 {
v, ok := getLiteral(sre.Sub[1])
if !ok {
return ""
}
if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) || !isLiteral(sre.Sub[1]) {
return ""
}
return string(sre.Sub[1].Rune)
}
func isDotOpRegexp(expr string, op syntax.Op) bool {
sre, err := syntax.Parse(expr, regexParseFlags)
if err != nil {
return false
}
return isDotOp(sre, op)
return v
}
func isDotOp(sre *syntax.Regexp, op syntax.Op) bool {
@ -325,4 +312,14 @@ var emptyRegexp = &syntax.Regexp{
Op: syntax.OpEmptyMatch,
}
const regexParseFlags = syntax.Perl | syntax.DotNL
// parseRegexp parses expr into a regexp syntax tree.
//
// The expression is parsed with Perl syntax and with the DotNL flag set,
// so `.` is allowed to match newline.
//
// The error from syntax.Parse is returned unchanged if expr cannot be parsed.
func parseRegexp(expr string) (*syntax.Regexp, error) {
	const flags = syntax.Perl | syntax.DotNL
	sre, err := syntax.Parse(expr, flags)
	return sre, err
}
// mustParseRegexp parses expr with parseRegexp and returns the resulting syntax tree.
//
// It panics if expr cannot be parsed, so it must be called only for expressions,
// which have been already verified by the caller.
func mustParseRegexp(expr string) *syntax.Regexp {
	sre, err := parseRegexp(expr)
	if err == nil {
		return sre
	}
	panic(fmt.Errorf("BUG: cannot parse already verified regexp %q: %w", expr, err))
}