mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-31 15:06:26 +00:00
wip
This commit is contained in:
parent
ceae8a7e08
commit
91b006f0a7
5 changed files with 429 additions and 91 deletions
|
@ -23,9 +23,11 @@ type PromRegex struct {
|
|||
// For example, prefix="foo" for regex="foo(a|b)"
|
||||
prefix string
|
||||
|
||||
// Suffix contains regex suffix left after removing the prefix.
|
||||
// For example, suffix="a|b" for regex="foo(a|b)"
|
||||
suffix string
|
||||
// isSuffixDotStar is set to true if suffix is ".*"
|
||||
isSuffixDotStar bool
|
||||
|
||||
// isSuffixDotPlus is set to true if suffix is ".+"
|
||||
isSuffixDotPlus bool
|
||||
|
||||
// substrDotStar contains literal string for regex suffix=".*string.*"
|
||||
substrDotStar string
|
||||
|
@ -48,8 +50,10 @@ func NewPromRegex(expr string) (*PromRegex, error) {
|
|||
}
|
||||
prefix, suffix := SimplifyPromRegex(expr)
|
||||
orValues := GetOrValuesPromRegex(suffix)
|
||||
substrDotStar := getSubstringLiteral(suffix, ".*")
|
||||
substrDotPlus := getSubstringLiteral(suffix, ".+")
|
||||
isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar)
|
||||
isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus)
|
||||
substrDotStar := getSubstringLiteral(suffix, syntax.OpStar)
|
||||
substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus)
|
||||
// It is expected that Optimize returns valid regexp in suffix, so use MustCompile here.
|
||||
// Anchor suffix to the beginning and the end of the matching string.
|
||||
suffixExpr := "^(?:" + suffix + ")$"
|
||||
|
@ -57,7 +61,8 @@ func NewPromRegex(expr string) (*PromRegex, error) {
|
|||
reSuffixMatcher := bytesutil.NewFastStringMatcher(reSuffix.MatchString)
|
||||
pr := &PromRegex{
|
||||
prefix: prefix,
|
||||
suffix: suffix,
|
||||
isSuffixDotStar: isSuffixDotStar,
|
||||
isSuffixDotPlus: isSuffixDotPlus,
|
||||
substrDotStar: substrDotStar,
|
||||
substrDotPlus: substrDotPlus,
|
||||
orValues: orValues,
|
||||
|
@ -71,19 +76,21 @@ func NewPromRegex(expr string) (*PromRegex, error) {
|
|||
// The pr is automatically anchored to the beginning and to the end
|
||||
// of the matching string with '^' and '$'.
|
||||
func (pr *PromRegex) MatchString(s string) bool {
|
||||
if !strings.HasPrefix(s, pr.prefix) {
|
||||
// Fast path - s has another prefix than pr.
|
||||
return false
|
||||
}
|
||||
s = s[len(pr.prefix):]
|
||||
if len(pr.orValues) > 0 {
|
||||
// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
|
||||
for _, v := range pr.orValues {
|
||||
if s == v {
|
||||
return true
|
||||
}
|
||||
if len(pr.prefix) > 0 {
|
||||
if !strings.HasPrefix(s, pr.prefix) {
|
||||
// Fast path - s has another prefix than pr.
|
||||
return false
|
||||
}
|
||||
return false
|
||||
s = s[len(pr.prefix):]
|
||||
}
|
||||
|
||||
if pr.isSuffixDotStar {
|
||||
// Fast path - the pr contains "prefix.*"
|
||||
return true
|
||||
}
|
||||
if pr.isSuffixDotPlus {
|
||||
// Fast path - the pr contains "prefix.+"
|
||||
return len(s) > 0
|
||||
}
|
||||
if pr.substrDotStar != "" {
|
||||
// Fast path - pr contains ".*someText.*"
|
||||
|
@ -94,45 +101,17 @@ func (pr *PromRegex) MatchString(s string) bool {
|
|||
n := strings.Index(s, pr.substrDotPlus)
|
||||
return n > 0 && n+len(pr.substrDotPlus) < len(s)
|
||||
}
|
||||
switch pr.suffix {
|
||||
case ".*":
|
||||
// Fast path - the pr contains "prefix.*"
|
||||
return true
|
||||
case ".+":
|
||||
// Fast path - the pr contains "prefix.+"
|
||||
return len(s) > 0
|
||||
|
||||
if len(pr.orValues) > 0 {
|
||||
// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
|
||||
for _, v := range pr.orValues {
|
||||
if s == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Fall back to slow path by matching the original regexp.
|
||||
return pr.reSuffixMatcher.Match(s)
|
||||
}
|
||||
|
||||
// getSubstringLiteral returns regex part from expr surrounded by prefixSuffix.
|
||||
//
|
||||
// For example, if expr=".+foo.+" and prefixSuffix=".+", then the function returns "foo".
|
||||
//
|
||||
// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix
|
||||
// or if the regex part surrounded by prefixSuffix contains alternate regexps.
|
||||
func getSubstringLiteral(expr, prefixSuffix string) string {
|
||||
// Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix.
|
||||
sre, err := syntax.Parse(expr, syntax.Perl)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
if sre.Op == syntax.OpAlternate {
|
||||
return ""
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(expr, prefixSuffix) {
|
||||
return ""
|
||||
}
|
||||
expr = expr[len(prefixSuffix):]
|
||||
if !strings.HasSuffix(expr, prefixSuffix) {
|
||||
return ""
|
||||
}
|
||||
expr = expr[:len(expr)-len(prefixSuffix)]
|
||||
prefix, suffix := SimplifyPromRegex(expr)
|
||||
if suffix != "" {
|
||||
return ""
|
||||
}
|
||||
return prefix
|
||||
}
|
||||
|
|
165
lib/regexutil/regex.go
Normal file
165
lib/regexutil/regex.go
Normal file
|
@ -0,0 +1,165 @@
|
|||
package regexutil
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Regex implements an optimized string matching for Go regex.
//
// The following regexs are optimized:
//
//   - plain string such as "foobar"
//   - alternate strings such as "foo|bar|baz"
//   - prefix match such as "foo.*" or "foo.+"
//   - substring match such as ".*foo.*" or ".+bar.+"
//
// A Regex is built by NewRegex and queried with MatchString.
type Regex struct {
	// prefix contains literal prefix for regex.
	// For example, prefix="foo" for regex="foo(a|b)"
	prefix string

	// isSuffixDotStar is set to true if suffix is ".*"
	isSuffixDotStar bool

	// isSuffixDotPlus is set to true if suffix is ".+"
	isSuffixDotPlus bool

	// substrDotStar contains literal string for regex suffix=".*string.*"
	substrDotStar string

	// substrDotPlus contains literal string for regex suffix=".+string.+"
	substrDotPlus string

	// orValues contains or values for the suffix regex.
	// For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz"
	orValues []string

	// re is the compiled regexp for the suffix left after removing the prefix.
	// NewRegex anchors it to the start of the text when prefix is non-empty
	// and leaves it nil when one of the fast paths above covers the suffix.
	re *regexp.Regexp
}
|
||||
|
||||
// NewRegex returns Regex for the given expr.
|
||||
func NewRegex(expr string) (*Regex, error) {
|
||||
if _, err := regexp.Compile(expr); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
prefix, suffix := SimplifyRegex(expr)
|
||||
orValues := GetOrValuesRegex(suffix)
|
||||
isSuffixDotStar := isDotOpRegexp(suffix, syntax.OpStar)
|
||||
isSuffixDotPlus := isDotOpRegexp(suffix, syntax.OpPlus)
|
||||
substrDotStar := getSubstringLiteral(suffix, syntax.OpStar)
|
||||
substrDotPlus := getSubstringLiteral(suffix, syntax.OpPlus)
|
||||
|
||||
var re *regexp.Regexp
|
||||
if len(orValues) == 0 && substrDotStar == "" && substrDotPlus == "" && suffix != ".*" && suffix != ".+" {
|
||||
suffixAnchored := suffix
|
||||
if len(prefix) > 0 {
|
||||
suffixAnchored = "^(?:" + suffix + ")"
|
||||
}
|
||||
// The suffixAnchored must be properly compiled, since it has been already checked above.
|
||||
// Otherwise it is a bug, which must be fixed.
|
||||
re = regexp.MustCompile(suffixAnchored)
|
||||
}
|
||||
r := &Regex{
|
||||
prefix: prefix,
|
||||
isSuffixDotStar: isSuffixDotStar,
|
||||
isSuffixDotPlus: isSuffixDotPlus,
|
||||
substrDotStar: substrDotStar,
|
||||
substrDotPlus: substrDotPlus,
|
||||
orValues: orValues,
|
||||
re: re,
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// MatchString returns true if s matches pr.
|
||||
func (r *Regex) MatchString(s string) bool {
|
||||
if len(r.prefix) == 0 {
|
||||
return r.matchStringNoPrefix(s)
|
||||
}
|
||||
return r.matchStringWithPrefix(s)
|
||||
}
|
||||
|
||||
func (r *Regex) matchStringNoPrefix(s string) bool {
|
||||
if r.isSuffixDotStar {
|
||||
return true
|
||||
}
|
||||
if r.isSuffixDotPlus {
|
||||
return len(s) > 0
|
||||
}
|
||||
if r.substrDotStar != "" {
|
||||
// Fast path - r contains ".*someText.*"
|
||||
return strings.Contains(s, r.substrDotStar)
|
||||
}
|
||||
if r.substrDotPlus != "" {
|
||||
// Fast path - r contains ".+someText.+"
|
||||
n := strings.Index(s, r.substrDotPlus)
|
||||
return n > 0 && n+len(r.substrDotPlus) < len(s)
|
||||
}
|
||||
|
||||
if len(r.orValues) == 0 {
|
||||
// Fall back to slow path by matching the original regexp.
|
||||
return r.re.MatchString(s)
|
||||
}
|
||||
|
||||
// Fast path - compare s to pr.orValues
|
||||
for _, v := range r.orValues {
|
||||
if strings.Contains(s, v) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (r *Regex) matchStringWithPrefix(s string) bool {
|
||||
n := strings.Index(s, r.prefix)
|
||||
if n < 0 {
|
||||
// Fast path - s doesn't contain the needed prefix
|
||||
return false
|
||||
}
|
||||
sNext := s[n+1:]
|
||||
s = s[n+len(r.prefix):]
|
||||
|
||||
if r.isSuffixDotStar {
|
||||
return true
|
||||
}
|
||||
if r.isSuffixDotPlus {
|
||||
return len(s) > 0
|
||||
}
|
||||
if r.substrDotStar != "" {
|
||||
// Fast path - r contains ".*someText.*"
|
||||
return strings.Contains(s, r.substrDotStar)
|
||||
}
|
||||
if r.substrDotPlus != "" {
|
||||
// Fast path - r contains ".+someText.+"
|
||||
n := strings.Index(s, r.substrDotPlus)
|
||||
return n > 0 && n+len(r.substrDotPlus) < len(s)
|
||||
}
|
||||
|
||||
for {
|
||||
if len(r.orValues) == 0 {
|
||||
// Fall back to slow path by matching the original regexp.
|
||||
if r.re.MatchString(s) {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
// Fast path - compare s to pr.orValues
|
||||
for _, v := range r.orValues {
|
||||
if strings.HasPrefix(s, v) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mismatch. Try again starting from the next char.
|
||||
s = sNext
|
||||
n := strings.Index(s, r.prefix)
|
||||
if n < 0 {
|
||||
// Fast path - s doesn't contain the needed prefix
|
||||
return false
|
||||
}
|
||||
sNext = s[n+1:]
|
||||
s = s[n+len(r.prefix):]
|
||||
}
|
||||
}
|
125
lib/regexutil/regex_test.go
Normal file
125
lib/regexutil/regex_test.go
Normal file
|
@ -0,0 +1,125 @@
|
|||
package regexutil
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRegexMatchString verifies that Regex.MatchString returns the expected result
// for the given regex and input string pairs.
func TestRegexMatchString(t *testing.T) {
	// f builds Regex from regex and checks that matching s returns resultExpected.
	f := func(regex, s string, resultExpected bool) {
		t.Helper()

		re, err := NewRegex(regex)
		if err != nil {
			t.Fatalf("cannot parse %q: %s", regex, err)
		}
		result := re.MatchString(s)
		if result != resultExpected {
			t.Fatalf("unexpected result when matching %q against regex=%q; got %v; want %v", s, regex, result, resultExpected)
		}
	}

	// Empty regex, plain literal, ".*" and ".+"
	f("", "", true)
	f("", "foo", true)
	f("foo", "", false)
	f(".*", "", true)
	f(".*", "foo", true)
	f(".+", "", false)
	f(".+", "foo", true)
	// Literal prefix followed by ".*" or ".+"
	f("foo.*", "bar", false)
	f("foo.*", "foo", true)
	f("foo.*", "a foo", true)
	f("foo.*", "a foo a", true)
	f("foo.*", "foobar", true)
	f("foo.*", "a foobar", true)
	f("foo.+", "bar", false)
	f("foo.+", "foo", false)
	f("foo.+", "a foo", false)
	f("foo.+", "foobar", true)
	f("foo.+", "a foobar", true)
	// Alternate literals, with and without a common prefix or anchors
	f("foo|bar", "", false)
	f("foo|bar", "a", false)
	f("foo|bar", "foo", true)
	f("foo|bar", "a foo", true)
	f("foo|bar", "foo a", true)
	f("foo|bar", "a foo a", true)
	f("foo|bar", "bar", true)
	f("foo|bar", "foobar", true)
	f("foo(bar|baz)", "a", false)
	f("foo(bar|baz)", "foobar", true)
	f("foo(bar|baz)", "foobaz", true)
	f("foo(bar|baz)", "foobaza", true)
	f("foo(bar|baz)", "a foobaz a", true)
	f("foo(bar|baz)", "foobal", false)
	f("^foo|b(ar)$", "foo", true)
	f("^foo|b(ar)$", "foo a", true)
	f("^foo|b(ar)$", "a foo", false)
	f("^foo|b(ar)$", "bar", true)
	f("^foo|b(ar)$", "a bar", true)
	f("^foo|b(ar)$", "barz", false)
	f("^foo|b(ar)$", "ar", false)
	// Literals surrounded by ".*" and/or ".+"
	f(".*foo.*", "foo", true)
	f(".*foo.*", "afoobar", true)
	f(".*foo.*", "abc", false)
	f("foo.*bar.*", "foobar", true)
	f("foo.*bar.*", "foo_bar_", true)
	f("foo.*bar.*", "a foo bar baz", true)
	f("foo.*bar.*", "foobaz", false)
	f("foo.*bar.*", "baz foo", false)
	f(".+foo.+", "foo", false)
	f(".+foo.+", "afoobar", true)
	f(".+foo.+", "afoo", false)
	f(".+foo.+", "abc", false)
	f("foo.+bar.+", "foobar", false)
	f("foo.+bar.+", "foo_bar_", true)
	f("foo.+bar.+", "a foo_bar_", true)
	f("foo.+bar.+", "foobaz", false)
	f("foo.+bar.+", "abc", false)
	f(".+foo.*", "foo", false)
	f(".+foo.*", "afoo", true)
	f(".+foo.*", "afoobar", true)
	f(".*(a|b).*", "a", true)
	f(".*(a|b).*", "ax", true)
	f(".*(a|b).*", "xa", true)
	f(".*(a|b).*", "xay", true)
	f(".*(a|b).*", "xzy", false)
	f("^(?:true)$", "true", true)
	f("^(?:true)$", "false", false)

	// Alternations mixed with ".+"
	f(".+;|;.+", ";", false)
	f(".+;|;.+", "foo", false)
	f(".+;|;.+", "foo;bar", true)
	f(".+;|;.+", "foo;", true)
	f(".+;|;.+", ";foo", true)
	f(".+foo|bar|baz.+", "foo", false)
	f(".+foo|bar|baz.+", "afoo", true)
	f(".+foo|bar|baz.+", "fooa", false)
	f(".+foo|bar|baz.+", "afooa", true)
	f(".+foo|bar|baz.+", "bar", true)
	f(".+foo|bar|baz.+", "abar", true)
	f(".+foo|bar|baz.+", "abara", true)
	f(".+foo|bar|baz.+", "bara", true)
	f(".+foo|bar|baz.+", "baz", false)
	f(".+foo|bar|baz.+", "baza", true)
	f(".+foo|bar|baz.+", "abaz", false)
	f(".+foo|bar|baz.+", "abaza", true)
	f(".+foo|bar|baz.+", "afoo|bar|baza", true)
	f(".+(foo|bar|baz).+", "bar", false)
	f(".+(foo|bar|baz).+", "bara", false)
	f(".+(foo|bar|baz).+", "abar", false)
	f(".+(foo|bar|baz).+", "abara", true)
	f(".+(foo|bar|baz).+", "afooa", true)
	f(".+(foo|bar|baz).+", "abaza", true)

	// Alternations mixed with ".*"
	f(".*;|;.*", ";", true)
	f(".*;|;.*", "foo", false)
	f(".*;|;.*", "foo;bar", true)
	f(".*;|;.*", "foo;", true)
	f(".*;|;.*", ";foo", true)

	// Explicit start and end anchors
	f("^bar", "foobarbaz", false)
	f("^foo", "foobarbaz", true)
	f("bar$", "foobarbaz", false)
	f("baz$", "foobarbaz", true)
	f("(bar$|^foo)", "foobarbaz", true)
	f("(bar$^boo)", "foobarbaz", false)
}
|
|
@ -18,6 +18,16 @@ func RemoveStartEndAnchors(expr string) string {
|
|||
return expr
|
||||
}
|
||||
|
||||
// GetOrValuesRegex returns "or" values from the given regexp expr.
//
// It returns ["foo", "bar"] for "foo|bar" regexp.
// It returns ["foo"] for "foo" regexp.
// It returns [""] for "" regexp.
// It returns an empty list if it is impossible to extract "or" values from the regexp.
//
// The work is delegated to getOrValuesRegex with keepAnchors=true.
func GetOrValuesRegex(expr string) []string {
	return getOrValuesRegex(expr, true)
}
|
||||
|
||||
// GetOrValuesPromRegex returns "or" values from the given Prometheus-like regexp expr.
|
||||
//
|
||||
// It ignores start and end anchors ('^') and ('$') at the start and the end of expr.
|
||||
|
@ -27,15 +37,19 @@ func RemoveStartEndAnchors(expr string) string {
|
|||
// It returns an empty list if it is impossible to extract "or" values from the regexp.
|
||||
func GetOrValuesPromRegex(expr string) []string {
|
||||
expr = RemoveStartEndAnchors(expr)
|
||||
prefix, tailExpr := SimplifyPromRegex(expr)
|
||||
return getOrValuesRegex(expr, false)
|
||||
}
|
||||
|
||||
func getOrValuesRegex(expr string, keepAnchors bool) []string {
|
||||
prefix, tailExpr := simplifyRegex(expr, keepAnchors)
|
||||
if tailExpr == "" {
|
||||
return []string{prefix}
|
||||
}
|
||||
sre, err := syntax.Parse(tailExpr, syntax.Perl)
|
||||
sre, err := syntax.Parse(tailExpr, regexParseFlags)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("BUG: unexpected error when parsing verified tailExpr=%q: %w", tailExpr, err))
|
||||
}
|
||||
orValues := getOrValuesExt(sre)
|
||||
orValues := getOrValues(sre)
|
||||
|
||||
// Sort orValues for faster index seek later
|
||||
sort.Strings(orValues)
|
||||
|
@ -50,10 +64,10 @@ func GetOrValuesPromRegex(expr string) []string {
|
|||
return orValues
|
||||
}
|
||||
|
||||
func getOrValuesExt(sre *syntax.Regexp) []string {
|
||||
func getOrValues(sre *syntax.Regexp) []string {
|
||||
switch sre.Op {
|
||||
case syntax.OpCapture:
|
||||
return getOrValuesExt(sre.Sub[0])
|
||||
return getOrValues(sre.Sub[0])
|
||||
case syntax.OpLiteral:
|
||||
if !isLiteral(sre) {
|
||||
return nil
|
||||
|
@ -64,7 +78,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string {
|
|||
case syntax.OpAlternate:
|
||||
a := make([]string, 0, len(sre.Sub))
|
||||
for _, reSub := range sre.Sub {
|
||||
ca := getOrValuesExt(reSub)
|
||||
ca := getOrValues(reSub)
|
||||
if len(ca) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
@ -94,7 +108,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string {
|
|||
if len(sre.Sub) < 1 {
|
||||
return []string{""}
|
||||
}
|
||||
prefixes := getOrValuesExt(sre.Sub[0])
|
||||
prefixes := getOrValues(sre.Sub[0])
|
||||
if len(prefixes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
@ -102,7 +116,7 @@ func getOrValuesExt(sre *syntax.Regexp) []string {
|
|||
return prefixes
|
||||
}
|
||||
sre.Sub = sre.Sub[1:]
|
||||
suffixes := getOrValuesExt(sre)
|
||||
suffixes := getOrValues(sre)
|
||||
if len(suffixes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
@ -132,21 +146,33 @@ func isLiteral(sre *syntax.Regexp) bool {
|
|||
|
||||
const maxOrValues = 100
|
||||
|
||||
// SimplifyRegex simplifies the given regexp expr.
//
// It returns plaintext prefix and the remaining regular expression
// without capturing parens.
//
// The work is delegated to simplifyRegex with keepAnchors=true.
func SimplifyRegex(expr string) (string, string) {
	return simplifyRegex(expr, true)
}
|
||||
|
||||
// SimplifyPromRegex simplifies the given Prometheus-like expr.
|
||||
//
|
||||
// It returns plaintext prefix and the remaining regular expression
|
||||
// with dropped '^' and '$' anchors at the beginning and the end
|
||||
// with dropped '^' and '$' anchors at the beginning and at the end
|
||||
// of the regular expression.
|
||||
//
|
||||
// The function removes capturing parens from the expr,
|
||||
// so it cannot be used when capturing parens are necessary.
|
||||
func SimplifyPromRegex(expr string) (string, string) {
|
||||
sre, err := syntax.Parse(expr, syntax.Perl)
|
||||
return simplifyRegex(expr, false)
|
||||
}
|
||||
|
||||
func simplifyRegex(expr string, keepAnchors bool) (string, string) {
|
||||
sre, err := syntax.Parse(expr, regexParseFlags)
|
||||
if err != nil {
|
||||
// Cannot parse the regexp. Return it all as prefix.
|
||||
return expr, ""
|
||||
}
|
||||
sre = simplifyRegexp(sre, false)
|
||||
sre = simplifyRegexp(sre, keepAnchors, keepAnchors)
|
||||
if sre == emptyRegexp {
|
||||
return "", ""
|
||||
}
|
||||
|
@ -162,7 +188,7 @@ func SimplifyPromRegex(expr string) (string, string) {
|
|||
if len(sre.Sub) == 0 {
|
||||
return prefix, ""
|
||||
}
|
||||
sre = simplifyRegexp(sre, true)
|
||||
sre = simplifyRegexp(sre, true, keepAnchors)
|
||||
}
|
||||
}
|
||||
if _, err := syntax.Compile(sre); err != nil {
|
||||
|
@ -171,17 +197,19 @@ func SimplifyPromRegex(expr string) (string, string) {
|
|||
}
|
||||
s := sre.String()
|
||||
s = strings.ReplaceAll(s, "(?:)", "")
|
||||
s = strings.ReplaceAll(s, "(?-s:.)", ".")
|
||||
s = strings.ReplaceAll(s, "(?-m:$)", "$")
|
||||
s = strings.ReplaceAll(s, "(?s:.)", ".")
|
||||
s = strings.ReplaceAll(s, "(?m:$)", "$")
|
||||
return prefix, s
|
||||
}
|
||||
|
||||
func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp {
|
||||
func simplifyRegexp(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp {
|
||||
s := sre.String()
|
||||
for {
|
||||
sre = simplifyRegexpExt(sre, hasPrefix, false)
|
||||
sre = simplifyRegexpExt(sre, keepBeginOp, keepEndOp)
|
||||
sre = sre.Simplify()
|
||||
if sre.Op == syntax.OpBeginText || sre.Op == syntax.OpEndText {
|
||||
if !keepBeginOp && sre.Op == syntax.OpBeginText {
|
||||
sre = emptyRegexp
|
||||
} else if !keepEndOp && sre.Op == syntax.OpEndText {
|
||||
sre = emptyRegexp
|
||||
}
|
||||
sNew := sre.String()
|
||||
|
@ -189,7 +217,7 @@ func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp {
|
|||
return sre
|
||||
}
|
||||
var err error
|
||||
sre, err = syntax.Parse(sNew, syntax.Perl)
|
||||
sre, err = syntax.Parse(sNew, regexParseFlags)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("BUG: cannot parse simplified regexp %q: %w", sNew, err))
|
||||
}
|
||||
|
@ -197,18 +225,18 @@ func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp {
|
|||
}
|
||||
}
|
||||
|
||||
func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Regexp {
|
||||
func simplifyRegexpExt(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp {
|
||||
switch sre.Op {
|
||||
case syntax.OpCapture:
|
||||
// Substitute all the capture regexps with non-capture regexps.
|
||||
sre.Op = syntax.OpAlternate
|
||||
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix)
|
||||
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp)
|
||||
if sre.Sub[0] == emptyRegexp {
|
||||
return emptyRegexp
|
||||
}
|
||||
return sre
|
||||
case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
|
||||
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix)
|
||||
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp)
|
||||
if sre.Sub[0] == emptyRegexp {
|
||||
return emptyRegexp
|
||||
}
|
||||
|
@ -216,13 +244,13 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re
|
|||
case syntax.OpAlternate:
|
||||
// Do not remove empty captures from OpAlternate, since this may break regexp.
|
||||
for i, sub := range sre.Sub {
|
||||
sre.Sub[i] = simplifyRegexpExt(sub, hasPrefix, hasSuffix)
|
||||
sre.Sub[i] = simplifyRegexpExt(sub, keepBeginOp, keepEndOp)
|
||||
}
|
||||
return sre
|
||||
case syntax.OpConcat:
|
||||
subs := sre.Sub[:0]
|
||||
for i, sub := range sre.Sub {
|
||||
sub = simplifyRegexpExt(sub, hasPrefix || len(subs) > 0, hasSuffix || i+1 < len(sre.Sub))
|
||||
sub = simplifyRegexpExt(sub, keepBeginOp || len(subs) > 0, keepEndOp || i+1 < len(sre.Sub))
|
||||
if sub != emptyRegexp {
|
||||
subs = append(subs, sub)
|
||||
}
|
||||
|
@ -230,12 +258,12 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re
|
|||
sre.Sub = subs
|
||||
// Remove anchors from the beginning and the end of regexp, since they
|
||||
// will be added later.
|
||||
if !hasPrefix {
|
||||
if !keepBeginOp {
|
||||
for len(sre.Sub) > 0 && sre.Sub[0].Op == syntax.OpBeginText {
|
||||
sre.Sub = sre.Sub[1:]
|
||||
}
|
||||
}
|
||||
if !hasSuffix {
|
||||
if !keepEndOp {
|
||||
for len(sre.Sub) > 0 && sre.Sub[len(sre.Sub)-1].Op == syntax.OpEndText {
|
||||
sre.Sub = sre.Sub[:len(sre.Sub)-1]
|
||||
}
|
||||
|
@ -254,6 +282,47 @@ func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Re
|
|||
}
|
||||
}
|
||||
|
||||
// getSubstringLiteral returns regex part from expr surrounded by .+ or .* depending on the prefixSuffixOp.
|
||||
//
|
||||
// For example, if expr=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo".
|
||||
//
|
||||
// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix
|
||||
// or if the regex part surrounded by prefixSuffix contains alternate regexps.
|
||||
func getSubstringLiteral(expr string, prefixSuffixOp syntax.Op) string {
|
||||
// Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix.
|
||||
sre, err := syntax.Parse(expr, regexParseFlags)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
if sre.Op != syntax.OpConcat {
|
||||
return ""
|
||||
}
|
||||
if len(sre.Sub) != 3 {
|
||||
return ""
|
||||
}
|
||||
if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) || !isLiteral(sre.Sub[1]) {
|
||||
return ""
|
||||
}
|
||||
return string(sre.Sub[1].Rune)
|
||||
}
|
||||
|
||||
// isDotOpRegexp returns true if expr consists solely of the "any char" op
// repeated with the given op, e.g. ".*" for syntax.OpStar or ".+" for syntax.OpPlus.
//
// It returns false if expr cannot be parsed.
func isDotOpRegexp(expr string, op syntax.Op) bool {
	sre, err := syntax.Parse(expr, regexParseFlags)
	if err != nil {
		return false
	}
	return isDotOp(sre, op)
}

// isDotOp returns true if sre has the given op applied to a single "any char" sub-expression.
//
// It returns false for nil sre and for nodes without exactly one sub-expression,
// so it is safe to call with ops such as syntax.OpLiteral, which have no sub-expressions.
func isDotOp(sre *syntax.Regexp, op syntax.Op) bool {
	if sre == nil || sre.Op != op || len(sre.Sub) != 1 {
		return false
	}
	return sre.Sub[0].Op == syntax.OpAnyChar
}

// emptyRegexp is the shared node for a regexp matching an empty string.
var emptyRegexp = &syntax.Regexp{
	Op: syntax.OpEmptyMatch,
}

// regexParseFlags are the flags for parsing regexps in this package:
// Perl syntax plus syntax.DotNL, which makes '.' match newline.
const regexParseFlags = syntax.Perl | syntax.DotNL
|
||||
|
|
|
@ -77,7 +77,7 @@ func TestSimplifyPromRegex(t *testing.T) {
|
|||
f("^foobar|foobaz", "fooba", "[rz]")
|
||||
f("^foobar|^foobaz$", "fooba", "[rz]")
|
||||
f("foobar|foobaz", "fooba", "[rz]")
|
||||
f("(?:^foobar|^foobaz)aa.*", "fooba", "(?-s:[rz]aa.*)")
|
||||
f("(?:^foobar|^foobaz)aa.*", "fooba", "(?s:[rz]aa.*)")
|
||||
f("foo[bar]+", "foo", "[abr]+")
|
||||
f("foo[a-z]+", "foo", "[a-z]+")
|
||||
f("foo[bar]*", "foo", "[abr]*")
|
||||
|
@ -88,12 +88,12 @@ func TestSimplifyPromRegex(t *testing.T) {
|
|||
f("foo[^x]*", "foo", "[^x]*")
|
||||
f("foo[x]*bar", "foo", "x*bar")
|
||||
f("fo\\Bo[x]*bar?", "fo", "\\Box*bar?")
|
||||
f("foo.+bar", "foo", "(?-s:.+bar)")
|
||||
f("a(b|c.*).+", "a", "(?-s:(?:b|c.*).+)")
|
||||
f("foo.+bar", "foo", "(?s:.+bar)")
|
||||
f("a(b|c.*).+", "a", "(?s:(?:b|c.*).+)")
|
||||
f("ab|ac", "a", "[bc]")
|
||||
f("(?i)xyz", "", "(?i:XYZ)")
|
||||
f("(?i)foo|bar", "", "(?i:FOO|BAR)")
|
||||
f("(?i)up.+x", "", "(?i-s:UP.+X)")
|
||||
f("(?i)up.+x", "", "(?is:UP.+X)")
|
||||
f("(?smi)xy.*z$", "", "(?ims:XY.*Z$)")
|
||||
|
||||
// test invalid regexps
|
||||
|
@ -111,12 +111,12 @@ func TestSimplifyPromRegex(t *testing.T) {
|
|||
f("(foo|bar$)x*", "", "(?-m:(?:foo|bar$)x*)")
|
||||
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297
|
||||
f(".+;|;.+", "", "(?-s:.+;|;.+)")
|
||||
f("^(.+);|;(.+)$", "", "(?-s:.+;|;.+)")
|
||||
f("^(.+);$|^;(.+)$", "", "(?-s:.+;|;.+)")
|
||||
f(".*;|;.*", "", "(?-s:.*;|;.*)")
|
||||
f("^(.*);|;(.*)$", "", "(?-s:.*;|;.*)")
|
||||
f("^(.*);$|^;(.*)$", "", "(?-s:.*;|;.*)")
|
||||
f(".+;|;.+", "", "(?s:.+;|;.+)")
|
||||
f("^(.+);|;(.+)$", "", "(?s:.+;|;.+)")
|
||||
f("^(.+);$|^;(.+)$", "", "(?s:.+;|;.+)")
|
||||
f(".*;|;.*", "", "(?s:.*;|;.*)")
|
||||
f("^(.*);|;(.*)$", "", "(?s:.*;|;.*)")
|
||||
f("^(.*);$|^;(.*)$", "", "(?s:.*;|;.*)")
|
||||
}
|
||||
|
||||
func TestRemoveStartEndAnchors(t *testing.T) {
|
||||
|
|
Loading…
Reference in a new issue