From 9f8dd1ef31159cb42d71dae867368418f859bcc2 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Sat, 27 Apr 2024 00:52:15 +0200 Subject: [PATCH] wip --- docs/VictoriaLogs/CHANGELOG.md | 4 ++- docs/VictoriaLogs/LogsQL.md | 19 ++++++++----- lib/logstorage/parser_test.go | 4 +-- lib/logstorage/pipes.go | 49 +++++++++++++++++++++++++++++----- 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/docs/VictoriaLogs/CHANGELOG.md b/docs/VictoriaLogs/CHANGELOG.md index 3516dfac6..bebd88932 100644 --- a/docs/VictoriaLogs/CHANGELOG.md +++ b/docs/VictoriaLogs/CHANGELOG.md @@ -19,8 +19,10 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta ## tip +* FEATURE: return all the log fields by default in query results. Previously only [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields), [`_time`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field) and [`_msg`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field) fields were returned by default. +* FEATURE: add support for returning only the requested log [fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). See [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#querying-specific-fields). +* FEATURE: add support for calculating the number of matching logs and the number of logs with non-empty [fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). Grouping by an arbitrary set of [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) is supported. See [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#stats) for details. 
* FEATURE: optimize performance for [LogsQL query](https://docs.victoriametrics.com/victorialogs/logsql/), which contains multiple filters for [words](https://docs.victoriametrics.com/victorialogs/logsql/#word-filter) or [phrases](https://docs.victoriametrics.com/victorialogs/logsql/#phrase-filter) delimited with [`AND` operator](https://docs.victoriametrics.com/victorialogs/logsql/#logical-filter). For example, `foo AND bar` query must find [log messages](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field) with `foo` and `bar` words at faster speed. -* FEATURE: return all the log fields by default in query results. Previously only [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields), [`_time`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field) and [`_msg`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field) fields were returned by default. If only some fields must be returned, then they can be listed in `| fields ...` section as described in [these docs](https://docs.victoriametrics.com/victorialogs/logsql/#querying-specific-fields). * BUGFIX: prevent from additional CPU usage for up to a few seconds after canceling the query. * BUGFIX: prevent from returning log entries with emtpy `_stream` field in the form `"_stream":""` in [search query results](https://docs.victoriametrics.com/victorialogs/querying/). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6042). diff --git a/docs/VictoriaLogs/LogsQL.md b/docs/VictoriaLogs/LogsQL.md index 7c37c9507..b5dd58ae9 100644 --- a/docs/VictoriaLogs/LogsQL.md +++ b/docs/VictoriaLogs/LogsQL.md @@ -1050,14 +1050,18 @@ See the [Roadmap](https://docs.victoriametrics.com/VictoriaLogs/Roadmap.html) fo ## Stats -It is possible to perform stats calculations on the [selected log entries](#filters) at client side with `sort`, `uniq`, etc. 
Unix commands -according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/querying/#command-line). +LogsQL supports calculating the following stats: -LogsQL will support calculating the following stats based on the [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) +- The number of matching log entries. Examples: + - `error | stats count() as errors_total` returns the number of log messages containing the `error` [word](#word). + - `error | stats by (_stream) count() as errors_by_stream` returns the number of log messages containing the `error` [word](#word) + grouped by [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields). + - `error | stats by (datacenter, namespace) count(trace_id, user_id) as errors_with_trace_and_user` returns the number of log messages containing the `error` [word](#word), + which contain non-empty `trace_id` or `user_id` [fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model), grouped by `datacenter` and `namespace` fields. + +LogsQL will support calculating the following additional stats based on the [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) and fields created by [transformations](#transformations): -- The number of selected logs via `query | stats count() as total` syntax. -- The number of non-empty values for the given field. - The number of unique values for the given field. - The min, max, avg, and sum for the given field. - The median and [percentile](https://en.wikipedia.org/wiki/Percentile) for the given field. @@ -1068,6 +1072,9 @@ For example, `sumIf(response_size, is_admin:true)` calculates the total response It will be possible to group stats by the specified [fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) and by the specified time buckets. 
+It is possible to perform stats calculations on the [selected log entries](#filters) on the client side with Unix commands such as `sort` and `uniq`, +according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/querying/#command-line). + See the [Roadmap](https://docs.victoriametrics.com/VictoriaLogs/Roadmap.html) for details. ## Sorting @@ -1097,7 +1104,7 @@ See the [Roadmap](https://docs.victoriametrics.com/VictoriaLogs/Roadmap.html) fo By default VictoriaLogs query response contains all the [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model). -If you want selecting some specific fields, then add `| fields field1, field2, ... fieldN` to the end of the query. +If you want to select only specific fields, then add `| fields field1, field2, ... fieldN` to the query. For example, the following query returns only [`_time`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field), [`_stream`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#stream-fields), `host` and [`_msg`](https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field) fields: diff --git a/lib/logstorage/parser_test.go b/lib/logstorage/parser_test.go index 3deaeede9..369b8afd6 100644 --- a/lib/logstorage/parser_test.go +++ b/lib/logstorage/parser_test.go @@ -815,8 +815,8 @@ func TestParseQuerySuccess(t *testing.T) { // stats count pipe f(`* | Stats count() AS foo`, `* | stats count() as foo`) - f(`* | STATS bY (foo, b.a/r, "b az") count(*) as XYz`, `* | stats by (foo, "b.a/r", "b az") count() as XYz`) - f(`* | stats by() count(x, 'a).b,c|d') as qwert`, `* | stats count(x, "a).b,c|d") as qwert`) + f(`* | STATS bY (foo, b.a/r, "b az") count(*) as XYz`, `* | stats by (foo, "b.a/r", "b az") count(*) as XYz`) + f(`* | stats by() COUNT(x, 'a).b,c|d') as qwert`, `* | stats count(x, "a).b,c|d") as qwert`) } func TestParseQueryFailure(t *testing.T) { diff --git a/lib/logstorage/pipes.go b/lib/logstorage/pipes.go index 22b8652bd..79f597bb7 
100644 --- a/lib/logstorage/pipes.go +++ b/lib/logstorage/pipes.go @@ -474,8 +474,7 @@ type statsFuncCount struct { } func (sfc *statsFuncCount) String() string { - fields := getFieldsIgnoreStar(sfc.fields) - return "count(" + fieldNamesString(fields) + ") as " + quoteTokenIfNeeded(sfc.resultName) + return "count(" + fieldNamesString(sfc.fields) + ") as " + quoteTokenIfNeeded(sfc.resultName) } func (sfc *statsFuncCount) newStatsFuncProcessor() statsFuncProcessor { @@ -493,12 +492,50 @@ type statsFuncCountProcessor struct { rowsCount uint64 } -func (sfcp *statsFuncCountProcessor) updateStatsForAllRows(timestamps []int64, _ []BlockColumn) { - sfcp.rowsCount += uint64(len(timestamps)) +func (sfcp *statsFuncCountProcessor) updateStatsForAllRows(timestamps []int64, columns []BlockColumn) { + fields := sfcp.sfc.fields + if len(fields) == 0 || slices.Contains(fields, "*") { + // Fast path - count all the rows. + sfcp.rowsCount += uint64(len(timestamps)) + return + } + + // Slow path - count rows containing at least a single non-empty value for the fields enumerated inside count(). 
+ bm := getFilterBitmap(len(timestamps)) + bm.setBits() + for _, f := range fields { + if idx := getBlockColumnIndex(columns, f); idx >= 0 { + values := columns[idx].Values + bm.forEachSetBit(func(i int) bool { + return values[i] == "" + }) + } + } + + emptyValues := 0 + bm.forEachSetBit(func(i int) bool { + emptyValues++ + return true + }) + + sfcp.rowsCount += uint64(len(timestamps) - emptyValues) } -func (sfcp *statsFuncCountProcessor) updateStatsForRow(_ []int64, _ []BlockColumn, _ int) { - sfcp.rowsCount++ +func (sfcp *statsFuncCountProcessor) updateStatsForRow(_ []int64, columns []BlockColumn, rowIdx int) { + fields := sfcp.sfc.fields + if len(fields) == 0 || slices.Contains(fields, "*") { + // Fast path - count the given row + sfcp.rowsCount++ + return + } + + // Slow path - count the row at rowIdx if at least a single field enumerated inside count() is non-empty + for _, f := range fields { + if idx := getBlockColumnIndex(columns, f); idx >= 0 && columns[idx].Values[rowIdx] != "" { + sfcp.rowsCount++ + return + } + } } func (sfcp *statsFuncCountProcessor) mergeState(sfp statsFuncProcessor) {