From e3cbf97bdd210639b26ce7363160d620b383c8fe Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 23 May 2024 12:24:09 +0200 Subject: [PATCH] wip --- docs/VictoriaLogs/CHANGELOG.md | 2 ++ docs/VictoriaLogs/LogsQL.md | 9 ++++++++- lib/logstorage/logfmt_parser.go | 2 +- lib/logstorage/pattern.go | 20 ++++++++++++-------- lib/logstorage/pattern_test.go | 6 +++--- lib/logstorage/pipe_extract_test.go | 24 ++++++++++++++++++++++++ lib/logstorage/storage_search.go | 2 +- 7 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md index 2c0d1903a..e5556e9b5 100644 --- a/docs/VictoriaLogs/CHANGELOG.md +++ b/docs/VictoriaLogs/CHANGELOG.md @@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta ## tip +* FEATURE: allow disabling automatic unquoting of the matched placeholders in [`extract` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe). See [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#format-for-extract-pipe-pattern). + * BUGFIX: properly parse `!` in front of [exact filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-filter), [exact-prefix filter](https://docs.victoriametrics.com/victorialogs/logsql/#exact-prefix-filter) and [regexp filter](https://docs.victoriametrics.com/victorialogs/logsql/#regexp-filter). For example, `!~"some regexp"` is properly parsed as `not ="some regexp"`. Previously it was incorrectly parsed as `'~="some regexp"'` [phrase filter](https://docs.victoriametrics.com/victorialogs/logsql/#phrase-filter). ## [v0.9.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.9.1-victorialogs) diff --git a/docs/VictoriaLogs/LogsQL.md b/docs/VictoriaLogs/LogsQL.md index af3dff3cc..830bc9632 100644 --- a/docs/VictoriaLogs/LogsQL.md +++ b/docs/VictoriaLogs/LogsQL.md @@ -1184,7 +1184,7 @@ Placeholders can be anonymous and named. 
Anonymous placeholders are written as ` must be skipped until the next `textX`. Named palceholders are written as `<some_name>`, where `some_name` is the name of the log field to store the corresponding matching substring to. -The matching starts from the first occurence of the `text1` in the input text. If the `pattern` starts with `` and doesn't contain `text1`, +Matching starts from the first occurrence of the `text1` in the input text. If the `pattern` starts with `` and doesn't contain `text1`, then the matching starts from the beginning of the input text. Matching is performed sequentially according to the `pattern`. If some `textX` isn't found in the remaining input text, then the remaining named placeholders receive empty string values and the matching finishes prematurely. @@ -1219,6 +1219,13 @@ This is useful for extracting JSON strings. For example, the following `pattern` "message": ``` +The automatic string unquoting can be disabled if needed by adding the `plain:` prefix in front of the field name. For example, if some JSON array of string values must be captured +into the `json_array` field, then the following `pattern` can be used: + +``` +some json string array: [<plain:json_array>] +``` + If some special chars such as `<` must be matched by the `pattern`, then they can be [html-escaped](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references). 
For example, the following `pattern` properly matches `a < b` text by extracting `a` into `left` field and `b` into `right` field: diff --git a/lib/logstorage/logfmt_parser.go b/lib/logstorage/logfmt_parser.go index 5eede8bbb..31901cd3c 100644 --- a/lib/logstorage/logfmt_parser.go +++ b/lib/logstorage/logfmt_parser.go @@ -38,7 +38,7 @@ func (p *logfmtParser) parse(s string) { } // Search for field value - value, nOffset := tryUnquoteString(s) + value, nOffset := tryUnquoteString(s, "") if nOffset >= 0 { p.addField(name, value) s = s[nOffset:] diff --git a/lib/logstorage/pattern.go b/lib/logstorage/pattern.go index ed1f7758f..416cb4c69 100644 --- a/lib/logstorage/pattern.go +++ b/lib/logstorage/pattern.go @@ -109,7 +109,7 @@ func (ptn *pattern) apply(s string) { nextPrefix = steps[i+1].prefix } - us, nOffset := tryUnquoteString(s) + us, nOffset := tryUnquoteString(s, steps[i].opt) if nOffset >= 0 { // Matched quoted string matches[i] = us @@ -136,20 +136,23 @@ func (ptn *pattern) apply(s string) { } } -func tryUnquoteString(s string) (string, int) { +func tryUnquoteString(s, opt string) (string, int) { + if opt == "plain" { + return "", -1 + } if len(s) == 0 { - return s, -1 + return "", -1 } if s[0] != '"' && s[0] != '`' { - return s, -1 + return "", -1 } qp, err := strconv.QuotedPrefix(s) if err != nil { - return s, -1 + return "", -1 } us, err := strconv.Unquote(qp) if err != nil { - return s, -1 + return "", -1 } return us, len(qp) } @@ -171,9 +174,10 @@ func parsePatternSteps(s string) ([]patternStep, error) { step := &steps[i] field := step.field if n := strings.IndexByte(field, ':'); n >= 0 { - step.opt = field[:n] - step.field = field[n+1:] + step.opt = strings.TrimSpace(field[:n]) + field = field[n+1:] } + step.field = strings.TrimSpace(field) } return steps, nil diff --git a/lib/logstorage/pattern_test.go b/lib/logstorage/pattern_test.go index eee5a1a26..3dff47543 100644 --- a/lib/logstorage/pattern_test.go +++ b/lib/logstorage/pattern_test.go @@ -196,7 
+196,7 @@ func TestParsePatternStepsSuccess(t *testing.T) { prefix: "<&>", }, }) - f("<&gt;", []patternStep{ + f("<< foo >&gt;", []patternStep{ { prefix: "<", field: "foo", @@ -205,7 +205,7 @@ func TestParsePatternStepsSuccess(t *testing.T) { prefix: ">", }, }) - f("barf<:foo:bar:baz>", []patternStep{ + f("< q : foo >barf<:foo:bar:baz>", []patternStep{ { field: "foo", opt: "q", @@ -213,7 +213,7 @@ func TestParsePatternStepsSuccess(t *testing.T) { { prefix: "bar", field: "baz:c:y", - opt: "abc", + opt: "plain", }, { prefix: "f", diff --git a/lib/logstorage/pipe_extract_test.go b/lib/logstorage/pipe_extract_test.go index 7a2d39f99..a80803a28 100644 --- a/lib/logstorage/pipe_extract_test.go +++ b/lib/logstorage/pipe_extract_test.go @@ -99,6 +99,30 @@ func TestPipeExtract(t *testing.T) { }, }) + // single row, disable unquoting + f(`extract 'foo=[< plain : bar >]' from x`, [][]Field{ + { + {"x", `a foo=["bc","de"]`}, + }, + }, [][]Field{ + { + {"x", `a foo=["bc","de"]`}, + {"bar", `"bc","de"`}, + }, + }) + + // single row, default unquoting + f(`extract 'foo=[< bar >]' from x`, [][]Field{ + { + {"x", `a foo=["bc","de"]`}, + }, + }, [][]Field{ + { + {"x", `a foo=["bc","de"]`}, + {"bar", `bc`}, + }, + }) + // single row, overwirte existing column f(`extract "foo= baz=" from x`, [][]Field{ { diff --git a/lib/logstorage/storage_search.go b/lib/logstorage/storage_search.go index 69be30c46..f3395f4f1 100644 --- a/lib/logstorage/storage_search.go +++ b/lib/logstorage/storage_search.go @@ -1042,7 +1042,7 @@ func parseStreamLabels(dst []Field, s string) ([]Field, error) { name := s[:n] s = s[n+1:] - value, nOffset := tryUnquoteString(s) + value, nOffset := tryUnquoteString(s, "") if nOffset < 0 { return dst, fmt.Errorf("cannot find parse label value in double quotes at [%s]", s) }