VictoriaMetrics/lib/regexutil/regex.go

210 lines
4.8 KiB
Go
Raw Normal View History

2024-05-23 15:32:42 +00:00
package regexutil
import (
"regexp"
"regexp/syntax"
"strings"
)
// Regex implements an optimized string matching for Go regex.
//
// The following regexs are optimized:
//
//   - plain string such as "foobar"
//   - alternate strings such as "foo|bar|baz"
//   - prefix match such as "foo.*" or "foo.+"
//   - substring match such as ".*foo.*" or ".+bar.+"
//
// The regex is split into a literal prefix and the remaining suffix.
// The fields below describe the shape of the suffix, so matching can use
// plain string operations where possible instead of running suffixRe.
type Regex struct {
	// exprStr is the original expression.
	exprStr string

	// prefix contains literal prefix for regex.
	// For example, prefix="foo" for regex="foo(a|b)"
	prefix string

	// isOnlyPrefix is set to true if the regex contains only the prefix.
	isOnlyPrefix bool

	// isSuffixDotStar is set to true if suffix is ".*"
	isSuffixDotStar bool

	// isSuffixDotPlus is set to true if suffix is ".+"
	isSuffixDotPlus bool

	// substrDotStar contains literal string for regex suffix=".*string.*"
	// It is empty if the suffix doesn't have this form.
	substrDotStar string

	// substrDotPlus contains literal string for regex suffix=".+string.+"
	// It is empty if the suffix doesn't have this form.
	substrDotPlus string

	// orValues contains or values for the suffix regex.
	// For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz"
	orValues []string

	// suffixRe is the regexp for suffix.
	// It is the slow-path fallback used when none of the fields above applies.
	suffixRe *regexp.Regexp
}
// NewRegex returns Regex for the given expr.
func NewRegex(expr string) (*Regex, error) {
if _, err := regexp.Compile(expr); err != nil {
return nil, err
}
2024-05-23 19:24:08 +00:00
2024-05-23 15:32:42 +00:00
prefix, suffix := SimplifyRegex(expr)
2024-05-23 19:24:08 +00:00
sre := mustParseRegexp(suffix)
orValues := getOrValues(sre)
isOnlyPrefix := len(orValues) == 1 && orValues[0] == ""
isSuffixDotStar := isDotOp(sre, syntax.OpStar)
isSuffixDotPlus := isDotOp(sre, syntax.OpPlus)
substrDotStar := getSubstringLiteral(sre, syntax.OpStar)
substrDotPlus := getSubstringLiteral(sre, syntax.OpPlus)
2024-05-23 15:32:42 +00:00
2024-05-23 19:24:08 +00:00
suffixAnchored := suffix
if len(prefix) > 0 {
suffixAnchored = "^(?:" + suffix + ")"
2024-05-23 15:32:42 +00:00
}
2024-05-23 19:24:08 +00:00
// The suffixAnchored must be properly compiled, since it has been already checked above.
// Otherwise it is a bug, which must be fixed.
2024-05-23 19:47:21 +00:00
suffixRe := regexp.MustCompile(suffixAnchored)
2024-05-23 19:24:08 +00:00
2024-05-23 15:32:42 +00:00
r := &Regex{
2024-05-23 19:47:21 +00:00
exprStr: expr,
2024-05-23 15:32:42 +00:00
prefix: prefix,
2024-05-23 19:24:08 +00:00
isOnlyPrefix: isOnlyPrefix,
2024-05-23 15:32:42 +00:00
isSuffixDotStar: isSuffixDotStar,
isSuffixDotPlus: isSuffixDotPlus,
substrDotStar: substrDotStar,
substrDotPlus: substrDotPlus,
orValues: orValues,
2024-05-23 19:47:21 +00:00
suffixRe: suffixRe,
2024-05-23 15:32:42 +00:00
}
return r, nil
}
2024-05-23 19:47:21 +00:00
// MatchString returns true if s matches r.
2024-05-23 15:32:42 +00:00
func (r *Regex) MatchString(s string) bool {
2024-05-23 19:24:08 +00:00
if r.isOnlyPrefix {
return strings.Contains(s, r.prefix)
}
2024-05-23 19:47:21 +00:00
2024-05-23 15:32:42 +00:00
if len(r.prefix) == 0 {
return r.matchStringNoPrefix(s)
}
return r.matchStringWithPrefix(s)
}
2024-05-23 20:54:21 +00:00
// GetLiterals returns literals for r.
//
// The original expression is parsed again and enclosing capture groups are
// stripped. If the remaining node is a literal according to getLiteral,
// it is returned as a single-item slice. If the node is a concatenation,
// the literals among its direct sub-expressions are returned in order.
// In all other cases nil is returned.
func (r *Regex) GetLiterals() []string {
	root := mustParseRegexp(r.exprStr)
	// Peel off capture groups wrapping the whole expression.
	for root.Op == syntax.OpCapture {
		root = root.Sub[0]
	}

	if lit, isLiteral := getLiteral(root); isLiteral {
		return []string{lit}
	}
	if root.Op != syntax.OpConcat {
		return nil
	}

	var literals []string
	for i := range root.Sub {
		lit, isLiteral := getLiteral(root.Sub[i])
		if !isLiteral {
			continue
		}
		literals = append(literals, lit)
	}
	return literals
}
2024-05-23 19:47:21 +00:00
// String returns the string representation of r,
// which is the original expression stored in exprStr.
func (r *Regex) String() string {
return r.exprStr
}
2024-05-23 15:32:42 +00:00
func (r *Regex) matchStringNoPrefix(s string) bool {
if r.isSuffixDotStar {
return true
}
if r.isSuffixDotPlus {
return len(s) > 0
}
if r.substrDotStar != "" {
// Fast path - r contains ".*someText.*"
return strings.Contains(s, r.substrDotStar)
}
if r.substrDotPlus != "" {
// Fast path - r contains ".+someText.+"
n := strings.Index(s, r.substrDotPlus)
return n > 0 && n+len(r.substrDotPlus) < len(s)
}
if len(r.orValues) == 0 {
2024-05-23 19:47:21 +00:00
// Fall back to slow path by matching the suffix regexp.
return r.suffixRe.MatchString(s)
2024-05-23 15:32:42 +00:00
}
2024-05-23 19:47:21 +00:00
// Fast path - compare s to r.orValues
2024-05-23 15:32:42 +00:00
for _, v := range r.orValues {
if strings.Contains(s, v) {
return true
}
}
return false
}
func (r *Regex) matchStringWithPrefix(s string) bool {
n := strings.Index(s, r.prefix)
if n < 0 {
// Fast path - s doesn't contain the needed prefix
return false
}
sNext := s[n+1:]
s = s[n+len(r.prefix):]
if r.isSuffixDotStar {
return true
}
if r.isSuffixDotPlus {
return len(s) > 0
}
if r.substrDotStar != "" {
// Fast path - r contains ".*someText.*"
return strings.Contains(s, r.substrDotStar)
}
if r.substrDotPlus != "" {
// Fast path - r contains ".+someText.+"
n := strings.Index(s, r.substrDotPlus)
return n > 0 && n+len(r.substrDotPlus) < len(s)
}
for {
if len(r.orValues) == 0 {
2024-05-23 19:47:21 +00:00
// Fall back to slow path by matching the suffix regexp.
if r.suffixRe.MatchString(s) {
2024-05-23 15:32:42 +00:00
return true
}
} else {
2024-05-23 19:47:21 +00:00
// Fast path - compare s to r.orValues
2024-05-23 15:32:42 +00:00
for _, v := range r.orValues {
if strings.HasPrefix(s, v) {
return true
}
}
}
// Mismatch. Try again starting from the next char.
s = sNext
n := strings.Index(s, r.prefix)
if n < 0 {
// Fast path - s doesn't contain the needed prefix
return false
}
sNext = s[n+1:]
s = s[n+len(r.prefix):]
}
}