wip

2024-12-31 15:06:26 +00:00 · 2024-05-30 19:45:10 +02:00 · 2024-05-30 19:45:10 +02:00 · 68b3bd370e
commit 68b3bd370e
parent b21c39a871
5 changed files with 343 additions and 0 deletions
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@ -19,6 +19,8 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta
 ## tip
 * FEATURE: add [`drop_empty_fields` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#drop_empty_fields-pipe) for dropping [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) with empty values.
 ## [v0.15.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v0.15.0-victorialogs)
 Released at 2024-05-30
--- a/docs/VictoriaLogs/LogsQL.md
+++ b/docs/VictoriaLogs/LogsQL.md
@ -1155,6 +1155,7 @@ LogsQL supports the following pipes:
 - [`copy`](#copy-pipe) copies [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model).
 - [`delete`](#delete-pipe) deletes [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model).
 - [`drop_empty_fields`](#drop_empty_fields-pipe) drops [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) with empty values.
 - [`extract`](#extract-pipe) extracts the specified text into the given log fields.
 - [`extract_regexp`](#extract_regexp-pipe) extracts the specified text into the given log fields via [RE2 regular expressions](https://github.com/google/re2/wiki/Syntax).
 - [`field_names`](#field_names-pipe) returns all the names of [log fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model).
@ -1219,6 +1220,22 @@ See also:
 - [`rename` pipe](#rename-pipe)
 - [`fields` pipe](#fields-pipe)
 ### drop_empty_fields pipe
 `| drop_empty_fields` pipe drops [fields](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) with empty values. It also skips log entries with zero non-empty fields.
 For example, the following query drops the possibly empty `email` field generated by the [`extract` pipe](#extract-pipe) if the `foo` field doesn't contain an email:
 ```logsql
 _time:5m | extract 'email: <email>,' from foo | drop_empty_fields
 ```
 See also:
 - [`filter` pipe](#filter-pipe)
 - [`extract` pipe](#extract-pipe)
 ### extract pipe
 `| extract "pattern" from field_name` [pipe](#pipes) allows extracting arbitrary text into output fields according to the [`pattern`](#format-for-extract-pipe-pattern) from the given
@ -1295,6 +1312,7 @@ the corresponding matching substring to.
 Matching starts from the first occurrence of the `text1` in the input text. If the `pattern` starts with `<field1>` and doesn't contain `text1`,
 then the matching starts from the beginning of the input text. Matching is performed sequentially according to the `pattern`. If some `textX` isn't found
 in the remaining input text, then the remaining named placeholders receive empty string values and the matching finishes prematurely.
 The empty string values can be dropped with [`drop_empty_fields` pipe](#drop_empty_fields-pipe).
 Matching finishes successfully when `textN+1` is found in the input text.
 If the `pattern` ends with `<fieldN>` and doesn't contain `textN+1`, then the `<fieldN>` matches the remaining input text.
--- a/lib/logstorage/pipe.go
+++ b/lib/logstorage/pipe.go
@ -106,6 +106,12 @@ func parsePipe(lex *lexer) (pipe, error) {
 			return nil, fmt.Errorf("cannot parse 'delete' pipe: %w", err)
 		}
 		return pd, nil
 	case lex.isKeyword("drop_empty_fields"):
 		pd, err := parsePipeDropEmptyFields(lex)
 		if err != nil {
 			return nil, fmt.Errorf("cannot parse 'drop_empty_fields' pipe: %w", err)
 		}
 		return pd, nil
 	case lex.isKeyword("extract"):
 		pe, err := parsePipeExtract(lex)
 		if err != nil {
--- a/lib/logstorage/pipe_drop_empty_fields.go
+++ b/lib/logstorage/pipe_drop_empty_fields.go
@ -0,0 +1,223 @@
 package logstorage
 import (
 	"fmt"
 	"unsafe"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/slicesutil"
 )
 // pipeDropEmptyFields implements the '| drop_empty_fields' pipe.
 //
 // The pipe is written as a bare keyword, so the struct carries no state.
 //
 // See https://docs.victoriametrics.com/victorialogs/logsql/#drop_empty_fields-pipe
 type pipeDropEmptyFields struct{}
 // String returns the textual form of the pipe, which is always the bare keyword.
 func (pd *pipeDropEmptyFields) String() string {
 	const pipeName = "drop_empty_fields"
 	return pipeName
 }
 // optimize is a no-op for this pipe.
 func (pd *pipeDropEmptyFields) optimize() {
 	// intentionally empty: the pipe holds no state that could be optimized
 }
 // hasFilterInWithQuery unconditionally reports false for this pipe.
 func (pd *pipeDropEmptyFields) hasFilterInWithQuery() bool {
 	return false
 }
 // initFilterInValues ignores both arguments and returns the receiver itself
 // together with a nil error.
 func (pd *pipeDropEmptyFields) initFilterInValues(_ map[string][]string, _ getFieldValuesFunc) (pipe, error) {
 	var self pipe = pd
 	return self, nil
 }
 // updateNeededFields leaves both field sets exactly as it received them.
 func (pd *pipeDropEmptyFields) updateNeededFields(_, _ fieldsSet) {
 	// intentionally empty: neither set is modified by this pipe
 }
 // newPipeProcessor creates a processor that forwards its output to ppNext.
 //
 // It allocates workersCount shards; writeBlock picks the shard by worker ID.
 // The channel and callback arguments are ignored.
 func (pd *pipeDropEmptyFields) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppNext pipeProcessor) pipeProcessor {
 	pdp := &pipeDropEmptyFieldsProcessor{
 		ppNext: ppNext,
 	}
 	pdp.shards = make([]pipeDropEmptyFieldsProcessorShard, workersCount)
 	return pdp
 }
 // pipeDropEmptyFieldsProcessor is the processor created by pipeDropEmptyFields.newPipeProcessor.
 type pipeDropEmptyFieldsProcessor struct {
 	// ppNext receives the blocks produced by this processor.
 	ppNext pipeProcessor
 	// shards holds per-worker scratch state; it is indexed by workerID in writeBlock.
 	shards []pipeDropEmptyFieldsProcessorShard
 }
 // pipeDropEmptyFieldsProcessorShard is a single element of pipeDropEmptyFieldsProcessor.shards.
 //
 // It embeds the actual state (pipeDropEmptyFieldsProcessorShardNopad) and appends
 // padding bytes after it.
 type pipeDropEmptyFieldsProcessorShard struct {
 	pipeDropEmptyFieldsProcessorShardNopad
 	// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .
 	//
 	// The pad length is 128 minus (size of the Nopad struct modulo 128), so it is
 	// always between 1 and 128 bytes; a full 128 bytes are added when the Nopad
 	// size is already a multiple of 128.
 	_ [128 - unsafe.Sizeof(pipeDropEmptyFieldsProcessorShardNopad{})%128]byte
 }
 // pipeDropEmptyFieldsProcessorShardNopad holds the scratch state that writeBlock reuses between calls.
 type pipeDropEmptyFieldsProcessorShardNopad struct {
 	// columnValues is a reusable buffer with the values of every column of the block being processed.
 	columnValues [][]string

 	// fields is a reusable buffer for the non-empty fields of the row being processed.
 	fields []Field

 	// wctx accumulates the rewritten rows and sends them to the next processor.
 	wctx pipeDropEmptyFieldsWriteContext
 }
 // writeBlock drops empty-valued fields from every row of br and passes the result to ppNext.
 //
 // Blocks without timestamps are ignored. Blocks without any empty value are
 // forwarded as is. Otherwise every row is rebuilt from its non-empty fields
 // and handed to the shard's write context; rows that end up with no fields
 // are skipped by writeRow.
 func (pdp *pipeDropEmptyFieldsProcessor) writeBlock(workerID uint, br *blockResult) {
 	if len(br.timestamps) == 0 {
 		return
 	}
 	shard := &pdp.shards[workerID]
 	columns := br.getColumns()
 	// Collect the values of all columns into the reusable per-shard buffer.
 	valuesPerColumn := slicesutil.SetLength(shard.columnValues, len(columns))
 	shard.columnValues = valuesPerColumn
 	for colIdx, col := range columns {
 		valuesPerColumn[colIdx] = col.getValues(br)
 	}
 	if !hasEmptyValues(valuesPerColumn) {
 		// Nothing to drop - pass the block through untouched.
 		pdp.ppNext.writeBlock(workerID, br)
 		return
 	}
 	// At least one value is empty - rebuild the rows one by one.
 	wctx := &shard.wctx
 	wctx.init(workerID, pdp.ppNext)
 	rowFields := shard.fields
 	for rowIdx := range br.timestamps {
 		rowFields = rowFields[:0]
 		for colIdx, values := range valuesPerColumn {
 			if value := values[rowIdx]; value != "" {
 				rowFields = append(rowFields, Field{
 					Name:  columns[colIdx].name,
 					Value: value,
 				})
 			}
 		}
 		wctx.writeRow(rowFields)
 	}
 	// Keep the (possibly grown) buffer for the next call.
 	shard.fields = rowFields
 	wctx.flush()
 }
 // flush has nothing to do and always returns nil: writeBlock already flushes
 // the shard's write context before it returns.
 func (pdp *pipeDropEmptyFieldsProcessor) flush() error {
 	return nil
 }
 // pipeDropEmptyFieldsWriteContext batches rows that share the same set of
 // column names into blocks and sends these blocks to ppNext.
 type pipeDropEmptyFieldsWriteContext struct {
 	// workerID is passed to ppNext.writeBlock on every flush.
 	workerID uint

 	// ppNext receives the flushed blocks.
 	ppNext pipeProcessor

 	// rcs are the columns of the block being built.
 	rcs []resultColumn

 	// br is the reusable block handed to ppNext on flush.
 	br blockResult

 	// rowsCount is the number of rows in the current block
 	rowsCount int

 	// valuesLen is the total length of values in the current block
 	valuesLen int
 }
 // reset returns wctx to its initial state while keeping the allocated rcs
 // capacity for reuse.
 //
 // wctx.br is not touched here; flush resets it after every write.
 func (wctx *pipeDropEmptyFieldsWriteContext) reset() {
 	for i := range wctx.rcs {
 		wctx.rcs[i].reset()
 	}
 	wctx.rcs = wctx.rcs[:0]
 	wctx.workerID = 0
 	wctx.ppNext = nil
 	wctx.rowsCount = 0
 	wctx.valuesLen = 0
 }
 // init resets wctx and then binds it to the given worker and downstream processor.
 func (wctx *pipeDropEmptyFieldsWriteContext) init(workerID uint, ppNext pipeProcessor) {
 	wctx.reset()
 	wctx.ppNext, wctx.workerID = ppNext, workerID
 }
 // writeRow appends a row made of the given fields to the current block.
 //
 // Rows without fields are ignored. If the field names differ from the columns
 // of the current block (in count, name or order), the current block is flushed
 // and a new one is started with columns named after fields. The block is also
 // flushed once the accumulated length of its values reaches 1_000_000 bytes.
 func (wctx *pipeDropEmptyFieldsWriteContext) writeRow(fields []Field) {
 	if len(fields) == 0 {
 		// a row with no non-empty fields produces no output
 		return
 	}
 	rcs := wctx.rcs
 	// Check whether the row fits the columns of the block under construction.
 	sameColumns := len(rcs) == len(fields)
 	for i := 0; sameColumns && i < len(fields); i++ {
 		sameColumns = rcs[i].name == fields[i].Name
 	}
 	if !sameColumns {
 		// The column set has changed: emit what has been collected so far
 		// and start a block with the new columns.
 		wctx.flush()
 		rcs = wctx.rcs[:0]
 		for i := range fields {
 			rcs = appendResultColumnWithName(rcs, fields[i].Name)
 		}
 		wctx.rcs = rcs
 	}
 	for i := range fields {
 		value := fields[i].Value
 		rcs[i].addValue(value)
 		wctx.valuesLen += len(value)
 	}
 	wctx.rowsCount++
 	if wctx.valuesLen < 1_000_000 {
 		return
 	}
 	// The block has grown big enough - send it downstream.
 	wctx.flush()
 }
 // flush sends the rows accumulated in wctx.rcs to ppNext as a single block
 // and prepares wctx for collecting the next block.
 //
 // The column names in rcs are kept; only their values and the row/length
 // counters are cleared.
 func (wctx *pipeDropEmptyFieldsWriteContext) flush() {
 	rcs := wctx.rcs
 	rowsCount := wctx.rowsCount
 	// Clear the counters up front; the block below is built from the captured values.
 	wctx.rowsCount = 0
 	wctx.valuesLen = 0
 	// Hand the collected columns over to ppNext via the reusable block.
 	wctx.br.setResultColumns(rcs, rowsCount)
 	wctx.ppNext.writeBlock(wctx.workerID, &wctx.br)
 	wctx.br.reset()
 	// Drop the values only after ppNext has consumed the block.
 	for i := range rcs {
 		rcs[i].resetValues()
 	}
 }
 // parsePipeDropEmptyFields parses the 'drop_empty_fields' pipe at the current lexer position.
 //
 // It returns an error if the current token isn't the `drop_empty_fields` keyword.
 // On success the keyword is consumed and a new pipe is returned.
 func parsePipeDropEmptyFields(lex *lexer) (*pipeDropEmptyFields, error) {
 	const keyword = "drop_empty_fields"
 	if !lex.isKeyword(keyword) {
 		return nil, fmt.Errorf("unexpected token: %q; want %q", lex.token, keyword)
 	}
 	lex.nextToken()
 	return &pipeDropEmptyFields{}, nil
 }
 // hasEmptyValues reports whether at least one of the given columns contains an empty string.
 //
 // It returns false when columnValues is empty or when all the columns are empty.
 func hasEmptyValues(columnValues [][]string) bool {
 	for colIdx := range columnValues {
 		column := columnValues[colIdx]
 		for rowIdx := range column {
 			if len(column[rowIdx]) == 0 {
 				return true
 			}
 		}
 	}
 	return false
 }
--- a/lib/logstorage/pipe_drop_empty_fields_test.go
+++ b/lib/logstorage/pipe_drop_empty_fields_test.go
@ -0,0 +1,94 @@
 package logstorage
 import (
 	"testing"
 )
 // TestParsePipeDropEmptyFieldsSuccess verifies that the bare keyword is parsed successfully.
 func TestParsePipeDropEmptyFieldsSuccess(t *testing.T) {
 	mustParse := func(pipeStr string) {
 		t.Helper()
 		expectParsePipeSuccess(t, pipeStr)
 	}
 	mustParse(`drop_empty_fields`)
 }
 // TestParsePipeDropEmptyFieldsFailure verifies that the keyword followed by an extra token is rejected.
 func TestParsePipeDropEmptyFieldsFailure(t *testing.T) {
 	mustFail := func(pipeStr string) {
 		t.Helper()
 		expectParsePipeFailure(t, pipeStr)
 	}
 	mustFail(`drop_empty_fields foo`)
 }
 // TestPipeDropEmptyFields checks the rows produced by the pipe for rows with
 // and without empty field values.
 func TestPipeDropEmptyFields(t *testing.T) {
 	check := func(pipeStr string, rows, rowsExpected [][]Field) {
 		t.Helper()
 		expectPipeResults(t, pipeStr, rows, rowsExpected)
 	}
 	// A row without empty values passes through unchanged.
 	fullRow := [][]Field{
 		{
 			{Name: "a", Value: "foo"},
 			{Name: "b", Value: "bar"},
 			{Name: "c", Value: "baz"},
 		},
 	}
 	check(`drop_empty_fields`, fullRow, [][]Field{
 		{
 			{Name: "a", Value: "foo"},
 			{Name: "b", Value: "bar"},
 			{Name: "c", Value: "baz"},
 		},
 	})
 	// Empty values are removed; the row consisting solely of empty values disappears.
 	mixedRows := [][]Field{
 		{
 			{Name: "a", Value: "foo"},
 			{Name: "b", Value: "bar"},
 			{Name: "c", Value: "baz"},
 		},
 		{
 			{Name: "a", Value: "foo1"},
 			{Name: "b", Value: ""},
 			{Name: "c", Value: "baz1"},
 		},
 		{
 			{Name: "a", Value: ""},
 			{Name: "b", Value: "bar2"},
 		},
 		{
 			{Name: "a", Value: ""},
 			{Name: "b", Value: ""},
 			{Name: "c", Value: ""},
 		},
 	}
 	mixedRowsExpected := [][]Field{
 		{
 			{Name: "a", Value: "foo"},
 			{Name: "b", Value: "bar"},
 			{Name: "c", Value: "baz"},
 		},
 		{
 			{Name: "a", Value: "foo1"},
 			{Name: "c", Value: "baz1"},
 		},
 		{
 			{Name: "b", Value: "bar2"},
 		},
 	}
 	check(`drop_empty_fields`, mixedRows, mixedRowsExpected)
 }
 // TestPipeDropEmptyFieldsUpdateNeededFields verifies that the pipe leaves the
 // needed and unneeded field sets unchanged.
 func TestPipeDropEmptyFieldsUpdateNeededFields(t *testing.T) {
 	check := func(s string, neededFields, unneededFields, neededFieldsExpected, unneededFieldsExpected string) {
 		t.Helper()
 		expectPipeNeededFields(t, s, neededFields, unneededFields, neededFieldsExpected, unneededFieldsExpected)
 	}
 	// every field is needed
 	check(`drop_empty_fields`, "*", "", "*", "")
 	// every field is needed except the listed unneeded ones
 	check(`drop_empty_fields`, "*", "f1,f2", "*", "f1,f2")
 	// only the listed fields are needed
 	check(`drop_empty_fields`, "x,y", "", "x,y", "")
 }