From 5b0b7d509f82b3018105d75e503427910626decb Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Fri, 8 Nov 2024 19:57:22 +0100
Subject: [PATCH] lib/logstorage: support for `[label1=value1 ...
 labelN=valueN]` syntax inside syslog messages for adding arbitrary labels
 (fields) to log entries

---
 docs/VictoriaLogs/CHANGELOG.md       |  1 +
 lib/logstorage/syslog_parser.go      | 31 +++++++++++++++++++++++-----
 lib/logstorage/syslog_parser_test.go |  4 ++--
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md
index c96e5f1b06..6a7f96756e 100644
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@@ -19,6 +19,7 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta
 * FEATURE: [`_time` filter](https://docs.victoriametrics.com/victorialogs/logsql/#time-filter): allow specifying offset without time range. For example, `_time:offset 1d` matches all the logs until `now-1d` in the [`_time` field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field). This is useful when building graphs for time ranges with some offset in the past.
 * FEATURE: [`/select/logsql/tail` HTTP endpoint](): support for `offset` query arg, which can be used for delayed emission of matching logs during live tailing. Thanks to @Fusl for the initial idea and implementation in [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7428).
 * FEATURE: [vlogscli](https://docs.victoriametrics.com/victorialogs/querying/vlogscli/): allow enabling and disabling wrapping of long lines, which do not fit screen width, with `\wrap_long_lines` command.
+* FEATURE: [syslog data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/): allow adding arbitrary [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) via `[label1=value1 ... labelN=valueN]` syntax inside syslog messages. For example, the message `<165>1 2024-06-03T17:42:00.000Z example.com appname 12345 ID47 [field1=value1 field2=value2] some message` is stored with the additional `field1=value1` and `field2=value2` fields.
 
 * BUGFIX: [HTTP querying APIs](https://docs.victoriametrics.com/victorialogs/querying/#http-api): properly take into account the `end` query arg when calculating time range for [`_time:duration` filter](https://docs.victoriametrics.com/victorialogs/logsql/#time-filter). Previously the `_time:duration` filter was treated as `_time:[now-duration, now)`, while it should be treated as `_time:[end-duration, end)`.
 
diff --git a/lib/logstorage/syslog_parser.go b/lib/logstorage/syslog_parser.go
index 25c5b49610..e432870929 100644
--- a/lib/logstorage/syslog_parser.go
+++ b/lib/logstorage/syslog_parser.go
@@ -240,6 +240,12 @@ func (p *SyslogParser) parseRFC5424SDLine(s string) (string, bool) {
 	sdID := s[:n]
 	s = s[n:]
 
+	if n := strings.IndexByte(sdID, '='); n >= 0 {
+		// Special case: sdID has the `key=value` form, so store it as a field and clear sdID in order to add the remaining fields without a prefix
+		p.addField(sdID[:n], sdID[n+1:])
+		sdID = ""
+	}
+
 	// Parse structured data
 	i := 0
 	for i < len(s) && s[i] != ']' {
@@ -257,11 +263,19 @@ func (p *SyslogParser) parseRFC5424SDLine(s string) (string, bool) {
 		i += n + 1
 
 		// Parse value
-		qp, err := strconv.QuotedPrefix(s[i:])
-		if err != nil {
-			return s, false
+		if strings.HasPrefix(s[i:], `"`) {
+			qp, err := strconv.QuotedPrefix(s[i:])
+			if err != nil {
+				return s, false
+			}
+			i += len(qp)
+		} else {
+			n := strings.IndexAny(s[i:], " ]")
+			if n < 0 {
+				return s, false
+			}
+			i += n
 		}
-		i += len(qp)
 	}
 	if i == len(s) {
 		return s, false
@@ -272,9 +286,16 @@ func (p *SyslogParser) parseRFC5424SDLine(s string) (string, bool) {
 	p.sdParser.parse(sdValue)
 	if len(p.sdParser.fields) == 0 {
 		// Special case when structured data doesn't contain any fields
-		p.addField(sdID, "")
+		if sdID != "" {
+			p.addField(sdID, "")
+		}
 	} else {
 		for _, f := range p.sdParser.fields {
+			if sdID == "" {
+				p.addField(f.Name, f.Value)
+				continue
+			}
+
 			bufLen := len(p.buf)
 			p.buf = append(p.buf, sdID...)
 			p.buf = append(p.buf, '.')
diff --git a/lib/logstorage/syslog_parser_test.go b/lib/logstorage/syslog_parser_test.go
index ed14a607a3..a5fd6a303e 100644
--- a/lib/logstorage/syslog_parser_test.go
+++ b/lib/logstorage/syslog_parser_test.go
@@ -39,8 +39,8 @@ func TestSyslogParser(t *testing.T) {
 		`format=rfc5424 timestamp=2023-06-03T17:42:32.123456789Z hostname=mymachine.example.com app_name=appname proc_id=12345 msg_id=ID47 message="This is a test message with structured data."`)
 	f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [exampleSDID@32473 iut="3" eventSource="Application 123 = ] 56" eventID="11211"] This is a test message with structured data.`, time.UTC,
 		`priority=165 facility=20 severity=5 format=rfc5424 timestamp=2023-06-03T17:42:00.000Z hostname=mymachine.example.com app_name=appname proc_id=12345 msg_id=ID47 exampleSDID@32473.iut=3 exampleSDID@32473.eventSource="Application 123 = ] 56" exampleSDID@32473.eventID=11211 message="This is a test message with structured data."`)
-	f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [foo@123 iut="3"][bar@456 eventID="11211"] This is a test message with structured data.`, time.UTC,
-		`priority=165 facility=20 severity=5 format=rfc5424 timestamp=2023-06-03T17:42:00.000Z hostname=mymachine.example.com app_name=appname proc_id=12345 msg_id=ID47 foo@123.iut=3 bar@456.eventID=11211 message="This is a test message with structured data."`)
+	f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [foo@123 iut="3"][bar@456 eventID="11211"][abc=def][x=y z=a q="]= "] This is a test message with structured data.`, time.UTC,
+		`priority=165 facility=20 severity=5 format=rfc5424 timestamp=2023-06-03T17:42:00.000Z hostname=mymachine.example.com app_name=appname proc_id=12345 msg_id=ID47 foo@123.iut=3 bar@456.eventID=11211 abc=def x=y z=a q="]= " message="This is a test message with structured data."`)
 
 	// Incomplete RFC 3164
 	f("", time.UTC, ``)