From 44db8c9f46c1e60ed202a860007a4504b2219ca4 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Mon, 13 Nov 2023 18:23:36 +0100
Subject: [PATCH] lib/regexutil: properly handle alternate regexps surrounded
 by .+ or .*

Previously the following regexps were improperly handled:

  .+foo|bar.+
  .*foo|bar.*

This could lead to unexpected regexp match results.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297

Thanks to @Haleygo for the initial attempt to fix the issue at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5308
---
 docs/CHANGELOG.md               |  1 +
 lib/regexutil/promregex.go      | 16 ++++++++++++++++
 lib/regexutil/promregex_test.go | 33 +++++++++++++++++++++++++++++++--
 lib/regexutil/regexutil_test.go |  8 ++++++++
 4 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 554736e949..5bff1da290 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -15,6 +15,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 
 ## v1.87.x long-time support release (LTS)
 
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly apply [relabeling](https://docs.victoriametrics.com/vmagent.html#relabeling) with `regex`, which start and end with `.+` or `.*` and which contain alternate sub-regexps. For example, `.+;|;.+` or `.*foo|bar|baz.*`. Previously such regexps were improperly parsed, which could result in undexpected relabeling results. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297).
 * BUGFIX: [vmstorage](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): log warning about switching to ReadOnly mode only on state change. Before, vmstorage would log this warning every 1s. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5159) for details.
 
 ## [v1.87.10](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.10)
diff --git a/lib/regexutil/promregex.go b/lib/regexutil/promregex.go
index cf196b8cd1..786dfd9b3e 100644
--- a/lib/regexutil/promregex.go
+++ b/lib/regexutil/promregex.go
@@ -2,6 +2,7 @@ package regexutil
 
 import (
 	"regexp"
+	"regexp/syntax"
 	"strings"
 
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
@@ -105,7 +106,22 @@ func (pr *PromRegex) MatchString(s string) bool {
 	return pr.reSuffixMatcher.Match(s)
 }
 
+// getSubstringLiteral returns regex part from expr surrounded by prefixSuffix.
+//
+// For example, if expr=".+foo.+" and prefixSuffix=".+", then the function returns "foo".
+//
+// An empty string is returned if expr doesn't contain the given prefixSuffix prefix and suffix
+// or if the regex part surrounded by prefixSuffix contains alternate regexps.
 func getSubstringLiteral(expr, prefixSuffix string) string {
+	// Verify that the expr doesn't contain alternate regexps. In this case it is unsafe removing prefix and suffix.
+	sre, err := syntax.Parse(expr, syntax.Perl)
+	if err != nil {
+		return ""
+	}
+	if sre.Op == syntax.OpAlternate {
+		return ""
+	}
+
 	if !strings.HasPrefix(expr, prefixSuffix) {
 		return ""
 	}
diff --git a/lib/regexutil/promregex_test.go b/lib/regexutil/promregex_test.go
index 20a04312ad..83cea682fa 100644
--- a/lib/regexutil/promregex_test.go
+++ b/lib/regexutil/promregex_test.go
@@ -29,7 +29,7 @@ func TestPromRegex(t *testing.T) {
 		}
 		result := pr.MatchString(s)
 		if result != resultExpected {
-			t.Fatalf("unexpected result when matching %s against %s; got %v; want %v", expr, s, result, resultExpected)
+			t.Fatalf("unexpected result when matching %q against %q; got %v; want %v", expr, s, result, resultExpected)
 		}
 
 		// Make sure the result is the same for regular regexp
@@ -37,7 +37,7 @@ func TestPromRegex(t *testing.T) {
 		re := regexp.MustCompile(exprAnchored)
 		result = re.MatchString(s)
 		if result != resultExpected {
-			t.Fatalf("unexpected result when matching %s against %s during sanity check; got %v; want %v", exprAnchored, s, result, resultExpected)
+			t.Fatalf("unexpected result when matching %q against %q during sanity check; got %v; want %v", exprAnchored, s, result, resultExpected)
 		}
 	}
 	f("", "", true)
@@ -89,4 +89,33 @@ func TestPromRegex(t *testing.T) {
 	f(".*(a|b).*", "xzy", false)
 	f("^(?:true)$", "true", true)
 	f("^(?:true)$", "false", false)
+
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297
+	f(".+;|;.+", ";", false)
+	f(".+;|;.+", "foo", false)
+	f(".+;|;.+", "foo;bar", false)
+	f(".+;|;.+", "foo;", true)
+	f(".+;|;.+", ";foo", true)
+	f(".+foo|bar|baz.+", "foo", false)
+	f(".+foo|bar|baz.+", "afoo", true)
+	f(".+foo|bar|baz.+", "fooa", false)
+	f(".+foo|bar|baz.+", "afooa", false)
+	f(".+foo|bar|baz.+", "bar", true)
+	f(".+foo|bar|baz.+", "abar", false)
+	f(".+foo|bar|baz.+", "abara", false)
+	f(".+foo|bar|baz.+", "bara", false)
+	f(".+foo|bar|baz.+", "baz", false)
+	f(".+foo|bar|baz.+", "baza", true)
+	f(".+foo|bar|baz.+", "abaz", false)
+	f(".+foo|bar|baz.+", "abaza", false)
+	f(".+foo|bar|baz.+", "afoo|bar|baza", false)
+	f(".+(foo|bar|baz).+", "abara", true)
+	f(".+(foo|bar|baz).+", "afooa", true)
+	f(".+(foo|bar|baz).+", "abaza", true)
+
+	f(".*;|;.*", ";", true)
+	f(".*;|;.*", "foo", false)
+	f(".*;|;.*", "foo;bar", false)
+	f(".*;|;.*", "foo;", true)
+	f(".*;|;.*", ";foo", true)
 }
diff --git a/lib/regexutil/regexutil_test.go b/lib/regexutil/regexutil_test.go
index e45781d25e..7357e115f4 100644
--- a/lib/regexutil/regexutil_test.go
+++ b/lib/regexutil/regexutil_test.go
@@ -109,6 +109,14 @@ func TestSimplify(t *testing.T) {
 
 	// The transformed regexp mustn't match barx
 	f("(foo|bar$)x*", "", "(?:foo|bar$)x*")
+
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5297
+	f(".+;|;.+", "", ".+;|;.+")
+	f("^(.+);|;(.+)$", "", ".+;|;.+")
+	f("^(.+);$|^;(.+)$", "", ".+;|;.+")
+	f(".*;|;.*", "", ".*;|;.*")
+	f("^(.*);|;(.*)$", "", ".*;|;.*")
+	f("^(.*);$|^;(.*)$", "", ".*;|;.*")
 }
 
 func TestRemoveStartEndAnchors(t *testing.T) {