wip

2024-12-31 15:06:26 +00:00 · 2024-05-23 12:24:09 +02:00 · 2024-05-23 12:24:09 +02:00 · e3cbf97bdd
commit e3cbf97bdd
parent a4337149a2
7 changed files with 51 additions and 14 deletions
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta

 ## tip

+* FEATURE: allow disabling automatic unquoting of the matched placeholders in [`extract` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe). See [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#format-for-extract-pipe-pattern).
+
 * BUGFIX: properly parse `!` in front of [exact filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-filter), [exact-prefix filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-prefix-filter) and [regexp filter](https://docs.victoriametrics.com/victorialogs/logsql/#regexp-filter). For example, `!~"some regexp"` is properly parsed as `not ~"some regexp"`. Previously it was incorrectly parsed as `'~="some regexp"'` [phrase filter](https://docs.victoriametrics.com/victorialogs/logsql/#phrase-filter).

 ## [v0.9.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.9.1-victorialogs)
--- a/docs/VictoriaLogs/LogsQL.md
+++ b/docs/VictoriaLogs/LogsQL.md
@ -1184,7 +1184,7 @@ Placeholders can be anonymous and named. Anonymous placeholders are written as `
 must be skipped until the next `textX`. Named placeholders are written as `<some_name>`, where `some_name` is the name of the log field to store
 the corresponding matching substring to.

-The matching starts from the first occurence of the `text1` in the input text. If the `pattern` starts with `<field1>` and doesn't contain `text1`,
+Matching starts from the first occurrence of the `text1` in the input text. If the `pattern` starts with `<field1>` and doesn't contain `text1`,
 then the matching starts from the beginning of the input text. Matching is performed sequentially according to the `pattern`. If some `textX` isn't found
 in the remaining input text, then the remaining named placeholders receive empty string values and the matching finishes prematurely.

@ -1219,6 +1219,13 @@ This is useful for extracting JSON strings. For example, the following `pattern`
 "message":<msg>
 ```

+The automatic string unquoting can be disabled if needed by adding the `plain:` prefix in front of the field name. For example, if some JSON array of string values must be captured
+into `json_array` field, then the following `pattern` can be used:
+
+```
+some json string array: [<plain:json_array>]
+```
+
 If some special chars such as `<` must be matched by the `pattern`, then they can be [html-escaped](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
 For example, the following `pattern` properly matches `a < b` text by extracting `a` into `left` field and `b` into `right` field:

--- a/lib/logstorage/logfmt_parser.go
+++ b/lib/logstorage/logfmt_parser.go
@ -38,7 +38,7 @@ func (p *logfmtParser) parse(s string) {
 		}

 		// Search for field value
-		value, nOffset := tryUnquoteString(s)
+		value, nOffset := tryUnquoteString(s, "")
 		if nOffset >= 0 {
 			p.addField(name, value)
 			s = s[nOffset:]
--- a/lib/logstorage/pattern.go
+++ b/lib/logstorage/pattern.go
@ -109,7 +109,7 @@ func (ptn *pattern) apply(s string) {
 			nextPrefix = steps[i+1].prefix
 		}

-		us, nOffset := tryUnquoteString(s)
+		us, nOffset := tryUnquoteString(s, steps[i].opt)
 		if nOffset >= 0 {
 			// Matched quoted string
 			matches[i] = us
@ -136,20 +136,23 @@ func (ptn *pattern) apply(s string) {
 	}
 }

-func tryUnquoteString(s string) (string, int) {
+func tryUnquoteString(s, opt string) (string, int) {
+	if opt == "plain" {
+		return "", -1
+	}
 	if len(s) == 0 {
-		return s, -1
+		return "", -1
 	}
 	if s[0] != '"' && s[0] != '`' {
-		return s, -1
+		return "", -1
 	}
 	qp, err := strconv.QuotedPrefix(s)
 	if err != nil {
-		return s, -1
+		return "", -1
 	}
 	us, err := strconv.Unquote(qp)
 	if err != nil {
-		return s, -1
+		return "", -1
 	}
 	return us, len(qp)
 }
@ -171,9 +174,10 @@ func parsePatternSteps(s string) ([]patternStep, error) {
 		step := &steps[i]
 		field := step.field
 		if n := strings.IndexByte(field, ':'); n >= 0 {
-			step.opt = field[:n]
-			step.field = field[n+1:]
+			step.opt = strings.TrimSpace(field[:n])
+			field = field[n+1:]
 		}
+		step.field = strings.TrimSpace(field)
 	}

 	return steps, nil
--- a/lib/logstorage/pattern_test.go
+++ b/lib/logstorage/pattern_test.go
@ -205,7 +205,7 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 			prefix: "&gt;",
 		},
 	})
-	f("<q:foo>bar<abc:baz:c:y>f<:foo:bar:baz>", []patternStep{
+	f("< q : foo >bar<plain : baz:c:y>f<:foo:bar:baz>", []patternStep{
 		{
 			field: "foo",
 			opt:   "q",
@ -213,7 +213,7 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 		{
 			prefix: "bar",
 			field:  "baz:c:y",
-			opt:    "abc",
+			opt:    "plain",
 		},
 		{
 			prefix: "f",
--- a/lib/logstorage/pipe_extract_test.go
+++ b/lib/logstorage/pipe_extract_test.go
@ -99,6 +99,30 @@ func TestPipeExtract(t *testing.T) {
 		},
 	})

+	// single row, disable unquoting
+	f(`extract 'foo=[< plain : bar >]' from x`, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+		},
+	}, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+			{"bar", `"bc","de"`},
+		},
+	})
+
+	// single row, default unquoting
+	f(`extract 'foo=[< bar >]' from x`, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+		},
+	}, [][]Field{
+		{
+			{"x", `a foo=["bc","de"]`},
+			{"bar", `bc`},
+		},
+	})
+
 	// single row, overwrite existing column
 	f(`extract "foo=<bar> baz=<xx>" from x`, [][]Field{
 		{
--- a/lib/logstorage/storage_search.go
+++ b/lib/logstorage/storage_search.go
@ -1042,7 +1042,7 @@ func parseStreamLabels(dst []Field, s string) ([]Field, error) {
 		name := s[:n]
 		s = s[n+1:]

-		value, nOffset := tryUnquoteString(s)
+		value, nOffset := tryUnquoteString(s, "")
 		if nOffset < 0 {
 			return dst, fmt.Errorf("cannot find parse label value in double quotes at [%s]", s)
 		}