From e3cbf97bdd210639b26ce7363160d620b383c8fe Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Thu, 23 May 2024 12:24:09 +0200
Subject: [PATCH] wip

---
 docs/VictoriaLogs/CHANGELOG.md      |  2 ++
 docs/VictoriaLogs/LogsQL.md         |  9 ++++++++-
 lib/logstorage/logfmt_parser.go     |  2 +-
 lib/logstorage/pattern.go           | 20 ++++++++++++--------
 lib/logstorage/pattern_test.go      |  6 +++---
 lib/logstorage/pipe_extract_test.go | 24 ++++++++++++++++++++++++
 lib/logstorage/storage_search.go    |  2 +-
 7 files changed, 51 insertions(+), 14 deletions(-)
diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md
index 2c0d1903a..e5556e9b5 100644
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta
 
 ## tip
 
+* FEATURE: allow disabling automatic unquoting of the matched placeholders in [`extract` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe). See [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#format-for-extract-pipe-pattern).
+
 * BUGFIX: properly parse `!` in front of [exact filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-filter), [exact-prefix filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-prefix-filter) and [regexp filter](https://docs.victoriametrics.com/victorialogs/logsql/#regexp-filter). For example, `!~"some regexp"` is properly parsed as `not ="some regexp"`. Previously it was incorrectly parsed as `'~="some regexp"'` [phrase filter](https://docs.victoriametrics.com/victorialogs/logsql/#phrase-filter).
 
 ## [v0.9.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.9.1-victorialogs)
diff --git a/docs/VictoriaLogs/LogsQL.md b/docs/VictoriaLogs/LogsQL.md
index af3dff3cc..830bc9632 100644
--- a/docs/VictoriaLogs/LogsQL.md
+++ b/docs/VictoriaLogs/LogsQL.md
@@ -1184,7 +1184,7 @@ Placeholders can be anonymous and named. Anonymous placeholders are written as `
 must be skipped until the next `textX`. Named palceholders are written as `<some_name>`, where `some_name` is the name of the log field to store
 the corresponding matching substring to.
 
-The matching starts from the first occurence of the `text1` in the input text. If the `pattern` starts with `<field1>` and doesn't contain `text1`,
+Matching starts from the first occurrence of the `text1` in the input text. If the `pattern` starts with `<field1>` and doesn't contain `text1`,
 then the matching starts from the beginning of the input text. Matching is performed sequentially according to the `pattern`. If some `textX` isn't found
 in the remaining input text, then the remaining named placeholders receive empty string values and the matching finishes prematurely.
 
@@ -1219,6 +1219,13 @@ This is useful for extracting JSON strings. For example, the following `pattern`
 "message":<msg>
 ```
 
+Automatic string unquoting can be disabled, if needed, by adding the `plain:` prefix in front of the field name. For example, if a JSON array of string values must be captured
+into the `json_array` field without unquoting its items, then the following `pattern` can be used:
+
+```
+some json string array: [<plain:json_array>]
+```
+
 If some special chars such as `<` must be matched by the `pattern`, then they can be [html-escaped](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
 For example, the following `pattern` properly matches `a < b` text by extracting `a` into `left` field and `b` into `right` field:
 
diff --git a/lib/logstorage/logfmt_parser.go b/lib/logstorage/logfmt_parser.go
index 5eede8bbb..31901cd3c 100644
--- a/lib/logstorage/logfmt_parser.go
+++ b/lib/logstorage/logfmt_parser.go
@@ -38,7 +38,7 @@ func (p *logfmtParser) parse(s string) {
 		}
 
 		// Search for field value
-		value, nOffset := tryUnquoteString(s)
+		value, nOffset := tryUnquoteString(s, "")
 		if nOffset >= 0 {
 			p.addField(name, value)
 			s = s[nOffset:]
diff --git a/lib/logstorage/pattern.go b/lib/logstorage/pattern.go
index ed1f7758f..416cb4c69 100644
--- a/lib/logstorage/pattern.go
+++ b/lib/logstorage/pattern.go
@@ -109,7 +109,7 @@ func (ptn *pattern) apply(s string) {
 			nextPrefix = steps[i+1].prefix
 		}
 
-		us, nOffset := tryUnquoteString(s)
+		us, nOffset := tryUnquoteString(s, steps[i].opt)
 		if nOffset >= 0 {
 			// Matched quoted string
 			matches[i] = us
@@ -136,20 +136,23 @@ func (ptn *pattern) apply(s string) {
 	}
 }
 
-func tryUnquoteString(s string) (string, int) {
+func tryUnquoteString(s, opt string) (string, int) {
+	if opt == "plain" {
+		return "", -1
+	}
 	if len(s) == 0 {
-		return s, -1
+		return "", -1
 	}
 	if s[0] != '"' && s[0] != '`' {
-		return s, -1
+		return "", -1
 	}
 	qp, err := strconv.QuotedPrefix(s)
 	if err != nil {
-		return s, -1
+		return "", -1
 	}
 	us, err := strconv.Unquote(qp)
 	if err != nil {
-		return s, -1
+		return "", -1
 	}
 	return us, len(qp)
 }
@@ -171,9 +174,10 @@ func parsePatternSteps(s string) ([]patternStep, error) {
 		step := &steps[i]
 		field := step.field
 		if n := strings.IndexByte(field, ':'); n >= 0 {
-			step.opt = field[:n]
-			step.field = field[n+1:]
+			step.opt = strings.TrimSpace(field[:n])
+			field = field[n+1:]
 		}
+		step.field = strings.TrimSpace(field)
 	}
 
 	return steps, nil
diff --git a/lib/logstorage/pattern_test.go b/lib/logstorage/pattern_test.go
index eee5a1a26..3dff47543 100644
--- a/lib/logstorage/pattern_test.go
+++ b/lib/logstorage/pattern_test.go
@@ -196,7 +196,7 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 			prefix: "<&>",
 		},
 	})
-	f("&lt;<foo>&amp;gt;", []patternStep{
+	f("&lt;< foo >&amp;gt;", []patternStep{
 		{
 			prefix: "<",
 			field:  "foo",
@@ -205,7 +205,7 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 			prefix: "&gt;",
 		},
 	})
-	f("<q:foo>bar<abc:baz:c:y>f<:foo:bar:baz>", []patternStep{
+	f("< q : foo >bar<plain : baz:c:y>f<:foo:bar:baz>", []patternStep{
 		{
 			field: "foo",
 			opt:   "q",
@@ -213,7 +213,7 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 		{
 			prefix: "bar",
 			field:  "baz:c:y",
-			opt:    "abc",
+			opt:    "plain",
 		},
 		{
 			prefix: "f",
diff --git a/lib/logstorage/pipe_extract_test.go b/lib/logstorage/pipe_extract_test.go
index 7a2d39f99..a80803a28 100644
--- a/lib/logstorage/pipe_extract_test.go
+++ b/lib/logstorage/pipe_extract_test.go
@@ -99,6 +99,30 @@ func TestPipeExtract(t *testing.T) {
 		},
 	})
 
+	// single row, disable unquoting
+	f(`extract 'foo=[< plain : bar >]' from x`, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+		},
+	}, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+			{"bar", `"bc","de"`},
+		},
+	})
+
+	// single row, default unquoting
+	f(`extract 'foo=[< bar >]' from x`, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+		},
+	}, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+			{"bar", `bc`},
+		},
+	})
+
 	// single row, overwirte existing column
 	f(`extract "foo=<bar> baz=<xx>" from x`, [][]Field{
 		{
diff --git a/lib/logstorage/storage_search.go b/lib/logstorage/storage_search.go
index 69be30c46..f3395f4f1 100644
--- a/lib/logstorage/storage_search.go
+++ b/lib/logstorage/storage_search.go
@@ -1042,7 +1042,7 @@ func parseStreamLabels(dst []Field, s string) ([]Field, error) {
 		name := s[:n]
 		s = s[n+1:]
 
-		value, nOffset := tryUnquoteString(s)
+		value, nOffset := tryUnquoteString(s, "")
 		if nOffset < 0 {
 			return dst, fmt.Errorf("cannot find parse label value in double quotes at [%s]", s)
 		}