From 3eacd43fffcd8734be44f699609c7ba075d57fca Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 25 Jun 2024 14:52:43 +0200 Subject: [PATCH] lib/logstorage: parse syslog structured data into separate fields in order to simplify further querying of this data --- app/vlinsert/syslog/syslog_test.go | 2 +- docs/VictoriaLogs/CHANGELOG.md | 2 ++ docs/VictoriaLogs/LogsQL.md | 4 +-- docs/VictoriaLogs/data-ingestion/syslog.md | 3 +++ lib/logstorage/syslog_parser.go | 29 ++++++++++++++++++++-- lib/logstorage/syslog_parser_test.go | 4 +-- 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/app/vlinsert/syslog/syslog_test.go b/app/vlinsert/syslog/syslog_test.go index b0069e9ec0..e984d197a8 100644 --- a/app/vlinsert/syslog/syslog_test.go +++ b/app/vlinsert/syslog/syslog_test.go @@ -94,7 +94,7 @@ func TestProcessStreamInternal_Success(t *testing.T) { timestampsExpected := []int64{1685794113000000000, 1685880513000000000, 1685814132345000000} resultExpected := `{"format":"rfc3164","timestamp":"","hostname":"abcd","app_name":"systemd","_msg":"Starting Update the local ESM caches..."} {"priority":"165","facility":"20","severity":"5","format":"rfc3164","timestamp":"","hostname":"abcd","app_name":"systemd","proc_id":"345","_msg":"abc defg"} -{"priority":"123","facility":"15","severity":"3","format":"rfc5424","timestamp":"","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473":"iut=\"3\" eventSource=\"Application 123 = ] 56\" eventID=\"11211\"","_msg":"This is a test message with structured data."}` +{"priority":"123","facility":"15","severity":"3","format":"rfc5424","timestamp":"","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473.iut":"3","exampleSDID@32473.eventSource":"Application 123 = ] 56","exampleSDID@32473.eventID":"11211","_msg":"This is a test message with structured data."}` f(data, currentYear, rowsExpected, timestampsExpected, resultExpected) } diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md index 5cc2ab5560..4d6d0a7ee7 100644 --- a/docs/VictoriaLogs/CHANGELOG.md +++ b/docs/VictoriaLogs/CHANGELOG.md @@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta ## tip +* FEATURE: [syslog data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/): parse [STRUCTURED-DATA](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3) into `SD-ID.field1=value1`, `SD-ID.field2=value2`, ..., `SD-ID.fieldN=valueN` [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). Previously the `STRUCTURED-DATA` was parsed into a single log field with the `SD-ID` name and `field1=value1 field2=value2 ... fieldN=valueN` value. This could complicate querying of such data. + * BUGFIX: properly parse timestamps with timezones during [data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/) and [querying](https://docs.victoriametrics.com/victorialogs/querying/). This has been broken in [v0.20.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.20.0-victorialogs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6508). ## [v0.22.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.22.0-victorialogs) diff --git a/docs/VictoriaLogs/LogsQL.md b/docs/VictoriaLogs/LogsQL.md index b2bff69961..a3460251f1 100644 --- a/docs/VictoriaLogs/LogsQL.md +++ b/docs/VictoriaLogs/LogsQL.md @@ -2571,8 +2571,8 @@ The following fields are unpacked: The `` part is optional. If it is missing, then `priority`, `facility` and `severity` fields aren't set. -The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID` name and `param1="value1" ... paramN="valueN"` value -according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3). The value then can be parsed to separate fields with [`unpack_logfmt` pipe](#unpack_logfmt-pipe). +The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID.param1`, `SD-ID.param2`, ..., `SD-ID.paramN` names and the corresponding values +according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3). For example, the following query unpacks [syslog](https://en.wikipedia.org/wiki/Syslog) message from the [`_msg` field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field) across logs for the last 5 minutes: diff --git a/docs/VictoriaLogs/data-ingestion/syslog.md b/docs/VictoriaLogs/data-ingestion/syslog.md index 86f4de3b09..cd3c4c1a0e 100644 --- a/docs/VictoriaLogs/data-ingestion/syslog.md +++ b/docs/VictoriaLogs/data-ingestion/syslog.md @@ -50,6 +50,9 @@ from the received Syslog lines: - `format` - this field is set to either `rfc3164` or `rfc5424` depending on the format of the parsed syslog line - `msg_id` - `MSGID` field from log line in `RFC5424` format. +The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID.param1`, `SD-ID.param2`, ..., `SD-ID.paramN` names and the corresponding values +according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3). + By default local timezone is used when parsing timestamps in `rfc3164` lines. This can be changed to any desired timezone via `-syslog.timezone` command-line flag. See [the list of supported timezone identifiers](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). For example, the following command starts VictoriaLogs, which parses syslog timestamps in `rfc3164` using `Europe/Berlin` timezone: diff --git a/lib/logstorage/syslog_parser.go b/lib/logstorage/syslog_parser.go index b86ccdb681..b60a242dfa 100644 --- a/lib/logstorage/syslog_parser.go +++ b/lib/logstorage/syslog_parser.go @@ -52,10 +52,18 @@ type SyslogParser struct { // Fields contains parsed fields after Parse call. Fields []Field + // buf contains temporary data used in Fields. buf []byte + // sdParser is used for structured data parsing in rfc5424. + // See https://datatracker.ietf.org/doc/html/rfc5424#section-6.3 + sdParser logfmtParser + + // currentYear is used as the current year for rfc3164 messages. currentYear int - timezone *time.Location + + // timezeon is used as the current timezeon for rfc3164 messages. + timezone *time.Location } func (p *SyslogParser) reset() { @@ -69,6 +77,7 @@ func (p *SyslogParser) resetFields() { p.Fields = p.Fields[:0] p.buf = p.buf[:0] + p.sdParser.reset() } func (p *SyslogParser) addField(name, value string) { @@ -259,7 +268,23 @@ func (p *SyslogParser) parseRFC5424SDLine(s string) (string, bool) { } sdValue := strings.TrimSpace(s[:i]) - p.addField(sdID, sdValue) + + p.sdParser.parse(sdValue) + if len(p.sdParser.fields) == 0 { + // Special case when structured data doesn't contain any fields + p.addField(sdID, "") + } else { + for _, f := range p.sdParser.fields { + bufLen := len(p.buf) + p.buf = append(p.buf, sdID...) + p.buf = append(p.buf, '.') + p.buf = append(p.buf, f.Name...) + + fieldName := bytesutil.ToUnsafeString(p.buf[bufLen:]) + p.addField(fieldName, f.Value) + } + } + s = s[i+1:] return s, true } diff --git a/lib/logstorage/syslog_parser_test.go b/lib/logstorage/syslog_parser_test.go index 7f5f3d7e10..5bc07939ac 100644 --- a/lib/logstorage/syslog_parser_test.go +++ b/lib/logstorage/syslog_parser_test.go @@ -38,9 +38,9 @@ func TestSyslogParser(t *testing.T) { f(`1 2023-06-03T17:42:32.123456789Z mymachine.example.com appname 12345 ID47 - This is a test message with structured data.`, time.UTC, `{"format":"rfc5424","timestamp":"2023-06-03T17:42:32.123456789Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","message":"This is a test message with structured data."}`) f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [exampleSDID@32473 iut="3" eventSource="Application 123 = ] 56" eventID="11211"] This is a test message with structured data.`, time.UTC, - `{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473":"iut=\"3\" eventSource=\"Application 123 = ] 56\" eventID=\"11211\"","message":"This is a test message with structured data."}`) + `{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473.iut":"3","exampleSDID@32473.eventSource":"Application 123 = ] 56","exampleSDID@32473.eventID":"11211","message":"This is a test message with structured data."}`) f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [foo@123 iut="3"][bar@456 eventID="11211"] This is a test message with structured data.`, time.UTC, - `{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","foo@123":"iut=\"3\"","bar@456":"eventID=\"11211\"","message":"This is a test message with structured data."}`) + `{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","foo@123.iut":"3","bar@456.eventID":"11211","message":"This is a test message with structured data."}`) // Incomplete RFC 3164 f("", time.UTC, `{}`)