lib/logstorage: parse syslog structured data into separate fields in order to simplify further querying of this data

This commit is contained in:
Aliaksandr Valialkin 2024-06-25 14:52:43 +02:00
parent 9e1c037249
commit 3eacd43fff
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
6 changed files with 37 additions and 7 deletions

View file

@ -94,7 +94,7 @@ func TestProcessStreamInternal_Success(t *testing.T) {
timestampsExpected := []int64{1685794113000000000, 1685880513000000000, 1685814132345000000}
resultExpected := `{"format":"rfc3164","timestamp":"","hostname":"abcd","app_name":"systemd","_msg":"Starting Update the local ESM caches..."}
{"priority":"165","facility":"20","severity":"5","format":"rfc3164","timestamp":"","hostname":"abcd","app_name":"systemd","proc_id":"345","_msg":"abc defg"}
{"priority":"123","facility":"15","severity":"3","format":"rfc5424","timestamp":"","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473":"iut=\"3\" eventSource=\"Application 123 = ] 56\" eventID=\"11211\"","_msg":"This is a test message with structured data."}`
{"priority":"123","facility":"15","severity":"3","format":"rfc5424","timestamp":"","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473.iut":"3","exampleSDID@32473.eventSource":"Application 123 = ] 56","exampleSDID@32473.eventID":"11211","_msg":"This is a test message with structured data."}`
f(data, currentYear, rowsExpected, timestampsExpected, resultExpected)
}

View file

@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta
## tip
* FEATURE: [syslog data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/): parse [STRUCTURED-DATA](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3) into `SD-ID.field1=value1`, `SD-ID.field2=value2`, ..., `SD-ID.fieldN=valueN` [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). Previously the `STRUCTURED-DATA` was parsed into a single log field with the `SD-ID` name and `field1=value1 field2=value2 ... fieldN=valueN` value. This could complicate querying of such data.
* BUGFIX: properly parse timestamps with timezones during [data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/) and [querying](https://docs.victoriametrics.com/victorialogs/querying/). This has been broken in [v0.20.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.20.0-victorialogs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6508).
## [v0.22.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.22.0-victorialogs)

View file

@ -2571,8 +2571,8 @@ The following fields are unpacked:
The `<PRI>` part is optional. If it is missing, then `priority`, `facility` and `severity` fields aren't set.
The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID` name and `param1="value1" ... paramN="valueN"` value
according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3). The value then can be parsed to separate fields with [`unpack_logfmt` pipe](#unpack_logfmt-pipe).
The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID.param1`, `SD-ID.param2`, ..., `SD-ID.paramN` names and the corresponding values
according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3).
For example, the following query unpacks [syslog](https://en.wikipedia.org/wiki/Syslog) message from the [`_msg` field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field)
across logs for the last 5 minutes:

View file

@ -50,6 +50,9 @@ from the received Syslog lines:
- `format` - this field is set to either `rfc3164` or `rfc5424` depending on the format of the parsed syslog line
- `msg_id` - `MSGID` field from log line in `RFC5424` format.
The `[STRUCTURED-DATA]` is parsed into fields with the `SD-ID.param1`, `SD-ID.param2`, ..., `SD-ID.paramN` names and the corresponding values
according to [the specification](https://datatracker.ietf.org/doc/html/rfc5424#section-6.3).
By default local timezone is used when parsing timestamps in `rfc3164` lines. This can be changed to any desired timezone via `-syslog.timezone` command-line flag.
See [the list of supported timezone identifiers](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). For example, the following command starts VictoriaLogs,
which parses syslog timestamps in `rfc3164` using `Europe/Berlin` timezone:

View file

@ -52,10 +52,18 @@ type SyslogParser struct {
// Fields contains parsed fields after Parse call.
Fields []Field
// buf contains temporary data used in Fields.
buf []byte
// sdParser is used for structured data parsing in rfc5424.
// See https://datatracker.ietf.org/doc/html/rfc5424#section-6.3
sdParser logfmtParser
// currentYear is used as the current year for rfc3164 messages.
currentYear int
timezone *time.Location
// timezeon is used as the current timezeon for rfc3164 messages.
timezone *time.Location
}
func (p *SyslogParser) reset() {
@ -69,6 +77,7 @@ func (p *SyslogParser) resetFields() {
p.Fields = p.Fields[:0]
p.buf = p.buf[:0]
p.sdParser.reset()
}
func (p *SyslogParser) addField(name, value string) {
@ -259,7 +268,23 @@ func (p *SyslogParser) parseRFC5424SDLine(s string) (string, bool) {
}
sdValue := strings.TrimSpace(s[:i])
p.addField(sdID, sdValue)
p.sdParser.parse(sdValue)
if len(p.sdParser.fields) == 0 {
// Special case when structured data doesn't contain any fields
p.addField(sdID, "")
} else {
for _, f := range p.sdParser.fields {
bufLen := len(p.buf)
p.buf = append(p.buf, sdID...)
p.buf = append(p.buf, '.')
p.buf = append(p.buf, f.Name...)
fieldName := bytesutil.ToUnsafeString(p.buf[bufLen:])
p.addField(fieldName, f.Value)
}
}
s = s[i+1:]
return s, true
}

View file

@ -38,9 +38,9 @@ func TestSyslogParser(t *testing.T) {
f(`1 2023-06-03T17:42:32.123456789Z mymachine.example.com appname 12345 ID47 - This is a test message with structured data.`, time.UTC,
`{"format":"rfc5424","timestamp":"2023-06-03T17:42:32.123456789Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","message":"This is a test message with structured data."}`)
f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [exampleSDID@32473 iut="3" eventSource="Application 123 = ] 56" eventID="11211"] This is a test message with structured data.`, time.UTC,
`{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473":"iut=\"3\" eventSource=\"Application 123 = ] 56\" eventID=\"11211\"","message":"This is a test message with structured data."}`)
`{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","exampleSDID@32473.iut":"3","exampleSDID@32473.eventSource":"Application 123 = ] 56","exampleSDID@32473.eventID":"11211","message":"This is a test message with structured data."}`)
f(`<165>1 2023-06-03T17:42:00.000Z mymachine.example.com appname 12345 ID47 [foo@123 iut="3"][bar@456 eventID="11211"] This is a test message with structured data.`, time.UTC,
`{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","foo@123":"iut=\"3\"","bar@456":"eventID=\"11211\"","message":"This is a test message with structured data."}`)
`{"priority":"165","facility":"20","severity":"5","format":"rfc5424","timestamp":"2023-06-03T17:42:00.000Z","hostname":"mymachine.example.com","app_name":"appname","proc_id":"12345","msg_id":"ID47","foo@123.iut":"3","bar@456.eventID":"11211","message":"This is a test message with structured data."}`)
// Incomplete RFC 3164
f("", time.UTC, `{}`)