wip

2024-12-31 15:06:26 +00:00 · 2024-05-22 14:05:32 +02:00 · 2024-05-22 14:05:32 +02:00 · 6458b5c138
commit 6458b5c138
parent fb251af08a
5 changed files with 323 additions and 77 deletions
--- a/lib/logstorage/pattern.go
+++ b/lib/logstorage/pattern.go
@ -31,11 +31,47 @@ type patternStep struct {
 	field  string
 }

-func newPattern(steps []patternStep) *pattern {
-	if len(steps) == 0 {
-		logger.Panicf("BUG: steps cannot be empty")
+// clone returns a new pattern built from the same steps as ptn.
+//
+// The returned pattern shares the steps slice with ptn, while its fields and matches
+// are freshly created by newFieldsAndMatchesFromPatternSteps, so apply() calls
+// on the clone do not overwrite the matches of ptn.
+func (ptn *pattern) clone() *pattern {
+	fields, matches := newFieldsAndMatchesFromPatternSteps(ptn.steps)
+	if len(fields) == 0 {
+		logger.Panicf("BUG: fields cannot be empty for steps=%v", ptn.steps)
+	}
+
+	dst := &pattern{
+		steps:   ptn.steps,
+		matches: matches,
+		fields:  fields,
+	}
+	return dst
+}
+
+// parsePattern parses s into a pattern, which is ready for apply() calls.
+//
+// It returns an error if s cannot be split into steps, if two adjacent steps
+// aren't separated by a non-empty delimiter, or if s contains no named fields.
+func parsePattern(s string) (*pattern, error) {
+	steps, err := parsePatternSteps(s)
+	if err != nil {
+		return nil, err
 	}

+	// Only the first step may have an empty prefix; every subsequent step must start
+	// with a non-empty delimiter (presumably because the next prefix is what marks
+	// the end of the preceding field during matching).
+	for i := range steps {
+		if i == 0 || steps[i].prefix != "" {
+			continue
+		}
+		prev, cur := steps[i-1].field, steps[i].field
+		return nil, fmt.Errorf("missing delimiter between <%s> and <%s>", prev, cur)
+	}
+
+	// A pattern without named fields extracts nothing, so it is rejected.
+	fields, matches := newFieldsAndMatchesFromPatternSteps(steps)
+	if len(fields) == 0 {
+		return nil, fmt.Errorf("pattern %q must contain at least a single named field in the form <field_name>", s)
+	}
+
+	return &pattern{
+		steps:   steps,
+		matches: matches,
+		fields:  fields,
+	}, nil
+}
+
+func newFieldsAndMatchesFromPatternSteps(steps []patternStep) ([]patternField, []string) {
 	matches := make([]string, len(steps))

 	var fields []patternField
@ -47,22 +83,14 @@ func newPattern(steps []patternStep) *pattern {
 			})
 		}
 	}
-	if len(fields) == 0 {
-		logger.Panicf("BUG: fields cannot be empty")
-	}

-	ef := &pattern{
-		steps:   steps,
-		matches: matches,
-		fields:  fields,
-	}
-	return ef
+	return fields, matches
 }

-func (ef *pattern) apply(s string) {
-	clear(ef.matches)
+func (ptn *pattern) apply(s string) {
+	clear(ptn.matches)

-	steps := ef.steps
+	steps := ptn.steps

 	if prefix := steps[0].prefix; prefix != "" {
 		n := strings.Index(s, prefix)
@ -73,7 +101,7 @@ func (ef *pattern) apply(s string) {
 		s = s[n+len(prefix):]
 	}

-	matches := ef.matches
+	matches := ptn.matches
 	for i := range steps {
 		nextPrefix := ""
 		if i+1 < len(steps) {
@ -126,13 +154,18 @@ func tryUnquoteString(s string) (string, int) {
 }

 func parsePatternSteps(s string) ([]patternStep, error) {
-	var steps []patternStep
+	if len(s) == 0 {
+		return nil, nil
+	}

-	hasNamedField := false
+	var steps []patternStep

 	n := strings.IndexByte(s, '<')
 	if n < 0 {
-		return nil, fmt.Errorf("missing <...> fields")
+		steps = append(steps, patternStep{
+			prefix: s,
+		})
+		return steps, nil
 	}
 	prefix := s[:n]
 	s = s[n+1:]
@ -151,9 +184,6 @@ func parsePatternSteps(s string) ([]patternStep, error) {
 			prefix: prefix,
 			field:  field,
 		})
-		if !hasNamedField && field != "" {
-			hasNamedField = true
-		}
 		if len(s) == 0 {
 			break
 		}
@ -165,17 +195,10 @@ func parsePatternSteps(s string) ([]patternStep, error) {
 			})
 			break
 		}
-		if n == 0 {
-			return nil, fmt.Errorf("missing delimiter after <%s>", field)
-		}
 		prefix = s[:n]
 		s = s[n+1:]
 	}

-	if !hasNamedField {
-		return nil, fmt.Errorf("missing named fields like <name>")
-	}
-
 	for i := range steps {
 		step := &steps[i]
 		step.prefix = html.UnescapeString(step.prefix)
--- a/lib/logstorage/pattern_test.go
+++ b/lib/logstorage/pattern_test.go
@ -6,26 +6,34 @@ import (
 )

 func TestPatternApply(t *testing.T) {
-	f := func(pattern, s string, resultsExpected []string) {
+	f := func(patternStr, s string, resultsExpected []string) {
 		t.Helper()

-		steps, err := parsePatternSteps(pattern)
-		if err != nil {
-			t.Fatalf("unexpected error: %s", err)
+		// checkFields verifies that the values extracted by ptn match resultsExpected.
+		checkFields := func(ptn *pattern) {
+			t.Helper()
+			if len(ptn.fields) != len(resultsExpected) {
+				t.Fatalf("unexpected number of results; got %d; want %d", len(ptn.fields), len(resultsExpected))
 			}
-		ef := newPattern(steps)
-		ef.apply(s)
-
-		if len(ef.fields) != len(resultsExpected) {
-			t.Fatalf("unexpected number of results; got %d; want %d", len(ef.fields), len(resultsExpected))
-		}
-		for i, f := range ef.fields {
+			for i, f := range ptn.fields {
 				if v := *f.value; v != resultsExpected[i] {
 					t.Fatalf("unexpected value for field %q; got %q; want %q", f.name, v, resultsExpected[i])
 				}
 			}
 		}

+		ptn, err := parsePattern(patternStr)
+		if err != nil {
+			t.Fatalf("cannot parse %q: %s", patternStr, err)
+		}
+		ptn.apply(s)
+		checkFields(ptn)
+
+		// Clone the pattern and verify the values extracted by the clone itself.
+		ptnCopy := ptn.clone()
+		ptnCopy.apply(s)
+		checkFields(ptnCopy)
+	}
+
 	f("<foo>", "", []string{""})
 	f("<foo>", "abc", []string{"abc"})
 	f("<foo>bar", "", []string{""})
@ -57,6 +65,30 @@ func TestPatternApply(t *testing.T) {
 	f(`<foo>,"bar`, `"foo,\"bar"`, []string{`foo,"bar`})
 }

+// TestParsePatternFailure verifies that parsePattern returns an error for invalid patterns.
+func TestParsePatternFailure(t *testing.T) {
+	invalidPatterns := []string{
+		// Missing named fields
+		"",
+		"foobar",
+		"<>",
+		"<>foo<>bar",
+
+		// Missing delimiter between fields
+		"<foo><bar>",
+		"abc<foo><bar>def",
+		"abc<foo><bar>",
+		"abc<foo><_>",
+		"abc<_><_>",
+	}
+
+	for _, patternStr := range invalidPatterns {
+		ptn, err := parsePattern(patternStr)
+		if err == nil {
+			t.Fatalf("expecting error when parsing %q; got %v", patternStr, ptn)
+		}
+	}
+}
+
 func TestParsePatternStepsSuccess(t *testing.T) {
 	f := func(s string, stepsExpected []patternStep) {
 		t.Helper()
@ -70,6 +102,33 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 		}
 	}

+	f("", nil)
+
+	f("foobar", []patternStep{
+		{
+			prefix: "foobar",
+		},
+	})
+
+	f("<>", []patternStep{
+		{},
+	})
+
+	f("foo<>", []patternStep{
+		{
+			prefix: "foo",
+		},
+	})
+
+	f("<foo><bar>", []patternStep{
+		{
+			field: "foo",
+		},
+		{
+			field: "bar",
+		},
+	})
+
 	f("<foo>", []patternStep{
 		{
 			field: "foo",
@ -141,38 +200,19 @@ func TestParsePatternStepsSuccess(t *testing.T) {
 			prefix: "&gt;",
 		},
 	})
+
 }

 func TestParsePatternStepsFailure(t *testing.T) {
 	f := func(s string) {
 		t.Helper()

-		_, err := parsePatternSteps(s)
+		steps, err := parsePatternSteps(s)
 		if err == nil {
-			t.Fatalf("expecting non-nil error when parsing %q", s)
+			t.Fatalf("expecting non-nil error when parsing %q; got steps: %v", s, steps)
 		}
 	}

-	// empty string
-	f("")
-
-	// zero fields
-	f("foobar")
-
-	// Zero named fields
-	f("<>")
-	f("foo<>")
-	f("<>foo")
-	f("foo<_>bar<*>baz<>xxx")
-
-	// missing delimiter between fields
-	f("<foo><bar>")
-	f("<><bar>")
-	f("<foo><>")
-	f("bb<foo><><bar>aa")
-	f("aa<foo><bar>")
-	f("aa<foo><bar>bb")
-
 	// missing >
 	f("<foo")
 	f("foo<bar")
--- a/lib/logstorage/pattern_timing_test.go
+++ b/lib/logstorage/pattern_timing_test.go
@ -51,10 +51,10 @@ func BenchmarkPatternApply(b *testing.B) {
 	})
 }

-func benchmarkPatternApply(b *testing.B, pattern string, a []string) {
-	steps, err := parsePatternSteps(pattern)
+func benchmarkPatternApply(b *testing.B, patternStr string, a []string) {
+	ptnMain, err := parsePattern(patternStr)
 	if err != nil {
-		b.Fatalf("unexpected error: %s", err)
+		b.Fatalf("cannot parse pattern %q: %s", patternStr, err)
 	}

 	n := 0
@ -65,12 +65,12 @@ func benchmarkPatternApply(b *testing.B, pattern string, a []string) {
 	b.ReportAllocs()
 	b.SetBytes(int64(n))
 	b.RunParallel(func(pb *testing.PB) {
+		ptn := ptnMain.clone()
 		sink := 0
-		ef := newPattern(steps)
 		for pb.Next() {
 			for _, s := range a {
-				ef.apply(s)
-				for _, v := range ef.matches {
+				ptn.apply(s)
+				for _, v := range ptn.matches {
 					sink += len(v)
 				}
 			}
--- a/lib/logstorage/pipe_extract.go
+++ b/lib/logstorage/pipe_extract.go
@ -9,7 +9,7 @@ import (
 // See https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe
 type pipeExtract struct {
 	fromField string
-	steps     []patternStep
+	ptn       *pattern

 	patternStr string

@ -33,7 +33,7 @@ func (pe *pipeExtract) updateNeededFields(neededFields, unneededFields fieldsSet
 	if neededFields.contains("*") {
 		unneededFieldsOrig := unneededFields.clone()
 		needFromField := false
-		for _, step := range pe.steps {
+		for _, step := range pe.ptn.steps {
 			if step.field != "" {
 				if !unneededFieldsOrig.contains(step.field) {
 					needFromField = true
@ -52,7 +52,7 @@ func (pe *pipeExtract) updateNeededFields(neededFields, unneededFields fieldsSet
 	} else {
 		neededFieldsOrig := neededFields.clone()
 		needFromField := false
-		for _, step := range pe.steps {
+		for _, step := range pe.ptn.steps {
 			if step.field != "" && neededFieldsOrig.contains(step.field) {
 				needFromField = true
 				neededFields.remove(step.field)
@ -70,7 +70,7 @@ func (pe *pipeExtract) updateNeededFields(neededFields, unneededFields fieldsSet
 func (pe *pipeExtract) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {
 	patterns := make([]*pattern, workersCount)
 	for i := range patterns {
-		patterns[i] = newPattern(pe.steps)
+		patterns[i] = pe.ptn.clone()
 	}

 	unpackFunc := func(uctx *fieldsUnpackerContext, s string) {
@ -105,14 +105,14 @@ func parsePipeExtract(lex *lexer) (*pipeExtract, error) {
 	if err != nil {
 		return nil, fmt.Errorf("cannot read 'pattern': %w", err)
 	}
-	steps, err := parsePatternSteps(patternStr)
+	ptn, err := parsePattern(patternStr)
 	if err != nil {
 		return nil, fmt.Errorf("cannot parse 'pattern' %q: %w", patternStr, err)
 	}

 	pe := &pipeExtract{
 		fromField:  fromField,
-		steps:      steps,
+		ptn:        ptn,
 		patternStr: patternStr,
 	}

--- a/lib/logstorage/pipe_format.go
+++ b/lib/logstorage/pipe_format.go
@ -0,0 +1,183 @@
+package logstorage
+
+import (
+	"fmt"
+	"unsafe"
+
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
+)
+
+// pipeFormat processes '| format ...' pipe.
+//
+// See https://docs.victoriametrics.com/victorialogs/logsql/#format-pipe
+type pipeFormat struct {
+	// formatStr is the format string as it was read from the query.
+	formatStr string
+
+	// steps is formatStr parsed by parsePatternSteps.
+	// For every step the prefix is written first, followed by the value of the field, if it is set.
+	steps     []patternStep
+
+	// resultField is the name of the field, which receives the formatted string.
+	resultField string
+
+	// iff is an optional filter for skipping the format func
+	iff *ifFilter
+}
+
+// String returns the query representation of pf.
+//
+// The optional filter is emitted after the result field, since this is the only
+// position where parsePipeFormat accepts it.
+func (pf *pipeFormat) String() string {
+	// The space after the `format` keyword is required - otherwise the keyword
+	// is glued to the unquoted format string.
+	s := "format " + quoteTokenIfNeeded(pf.formatStr)
+	s += " as " + quoteTokenIfNeeded(pf.resultField)
+	if pf.iff != nil {
+		s += " " + pf.iff.String()
+	}
+	return s
+}
+
+// updateNeededFields adjusts neededFields and unneededFields for the pipes preceding pf.
+//
+// If the result field is needed downstream, then it is produced by pf, so it is no longer
+// requested from the preceding pipes, while all the fields referred by the format steps
+// are requested instead. If the result field isn't needed, then nothing is changed.
+//
+// NOTE(review): fields used by the optional pf.iff filter aren't registered here - confirm
+// whether they must be marked as needed.
+func (pf *pipeFormat) updateNeededFields(neededFields, unneededFields fieldsSet) {
+	if !neededFields.contains("*") {
+		if !neededFields.contains(pf.resultField) {
+			return
+		}
+		neededFields.remove(pf.resultField)
+		for i := range pf.steps {
+			if name := pf.steps[i].field; name != "" {
+				neededFields.add(name)
+			}
+		}
+		return
+	}
+
+	if unneededFields.contains(pf.resultField) {
+		return
+	}
+	// The result field is added to unneededFields before the step fields are removed,
+	// so a step field with the same name as the result field remains needed.
+	unneededFields.add(pf.resultField)
+	for i := range pf.steps {
+		if name := pf.steps[i].field; name != "" {
+			unneededFields.remove(name)
+		}
+	}
+}
+
+// newPipeProcessor returns a processor for pf, which passes the results to ppBase.
+//
+// A distinct shard is allocated per each of the workersCount workers.
+func (pf *pipeFormat) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {
+	shards := make([]pipeFormatProcessorShard, workersCount)
+	pfp := &pipeFormatProcessor{
+		pf:     pf,
+		ppBase: ppBase,
+
+		shards: shards,
+	}
+	return pfp
+}
+
+// pipeFormatProcessor applies pf to the incoming blocks and passes the results to ppBase.
+type pipeFormatProcessor struct {
+	pf     *pipeFormat
+	ppBase pipeProcessor
+
+	// shards holds per-worker state; it is indexed by workerID in writeBlock.
+	shards []pipeFormatProcessorShard
+}
+
+// pipeFormatProcessorShard is per-worker state of pipeFormatProcessor,
+// padded to a multiple of 128 bytes.
+type pipeFormatProcessorShard struct {
+	pipeFormatProcessorShardNopad
+
+	// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .
+	_ [128 - unsafe.Sizeof(pipeFormatProcessorShardNopad{})%128]byte
+}
+
+// pipeFormatProcessorShardNopad contains the payload of pipeFormatProcessorShard without the padding.
+type pipeFormatProcessorShardNopad struct {
+	// bm marks the rows of the current block, which must be formatted.
+	bm bitmap
+
+	// uctx holds the result field for the row being formatted.
+	uctx fieldsUnpackerContext
+
+	// wctx writes the rows of the current block to the next pipe processor.
+	wctx pipeUnpackWriteContext
+}
+
+// writeBlock formats the rows of br, which match the optional filter, and writes
+// all the rows of br to the next pipe processor.
+//
+// Rows rejected by the filter are written without the formatted result field.
+func (pfp *pipeFormatProcessor) writeBlock(workerID uint, br *blockResult) {
+	if len(br.timestamps) == 0 {
+		return
+	}
+
+	shard := &pfp.shards[workerID]
+	pf := pfp.pf
+
+	bm := &shard.bm
+	bm.init(len(br.timestamps))
+	bm.setBits()
+	if iff := pf.iff; iff != nil {
+		iff.f.applyToBlockResult(br, bm)
+		if bm.isZero() {
+			// Fast path - no rows match the filter, so pass the block through as is.
+			pfp.ppBase.writeBlock(workerID, br)
+			return
+		}
+	}
+
+	// Initialize the per-shard contexts only after the fast path above,
+	// so every init is paired with the reset at the end of the function.
+	shard.wctx.init(workerID, pfp.ppBase, br)
+	shard.uctx.init(workerID, "")
+
+	for rowIdx := range br.timestamps {
+		if bm.isSetBit(rowIdx) {
+			shard.formatRow(pf, br, rowIdx)
+			shard.wctx.writeRow(rowIdx, shard.uctx.fields)
+		} else {
+			shard.wctx.writeRow(rowIdx, nil)
+		}
+	}
+
+	shard.wctx.flush()
+	shard.wctx.reset()
+	shard.uctx.reset()
+}
+
+// flush always returns nil; writeBlock already flushes its write context for every block.
+func (pfp *pipeFormatProcessor) flush() error {
+	return nil
+}
+
+// formatRow builds the formatted string for the row rowIdx at br according to pf.steps
+// and stores it in shard.uctx as the only field with the name pf.resultField.
+func (shard *pipeFormatProcessorShard) formatRow(pf *pipeFormat, br *blockResult, rowIdx int) {
+	bb := bbPool.Get()
+	buf := bb.B
+	for i := range pf.steps {
+		step := &pf.steps[i]
+		buf = append(buf, step.prefix...)
+		if step.field == "" {
+			continue
+		}
+		c := br.getColumnByName(step.field)
+		buf = append(buf, c.getValueAtRow(br, rowIdx)...)
+	}
+	bb.B = buf
+
+	shard.uctx.resetFields()
+	// NOTE(review): the unsafe string points to the pooled buffer, which is returned
+	// to the pool below - this presumably relies on addField copying the value; confirm.
+	shard.uctx.addField(pf.resultField, bytesutil.ToUnsafeString(buf))
+	bbPool.Put(bb)
+}
+
+// parsePipeFormat parses the format pipe from lex.
+//
+// The expected form is:
+//
+//	format <formatStr> as <resultField> [if (...)]
+//
+// The lexer must be positioned at the `format` keyword.
+func parsePipeFormat(lex *lexer) (*pipeFormat, error) {
+	if !lex.isKeyword("format") {
+		return nil, fmt.Errorf("unexpected token: %q; want %q", lex.token, "format")
+	}
+	lex.nextToken()
+
+	// parse format
+	formatStr, err := getCompoundToken(lex)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read 'format': %w", err)
+	}
+	steps, err := parsePatternSteps(formatStr)
+	if err != nil {
+		// Refer to the value as 'format' for consistency with the error above.
+		return nil, fmt.Errorf("cannot parse 'format' %q: %w", formatStr, err)
+	}
+
+	// parse the mandatory 'as <resultField>'
+	if !lex.isKeyword("as") {
+		return nil, fmt.Errorf("missing 'as' keyword after 'format %q'", formatStr)
+	}
+	lex.nextToken()
+
+	resultField, err := parseFieldName(lex)
+	if err != nil {
+		return nil, fmt.Errorf("cannot parse result field after 'format %q as': %w", formatStr, err)
+	}
+
+	pf := &pipeFormat{
+		formatStr:   formatStr,
+		steps:       steps,
+		resultField: resultField,
+	}
+
+	// parse optional if (...)
+	if lex.isKeyword("if") {
+		iff, err := parseIfFilter(lex)
+		if err != nil {
+			return nil, err
+		}
+		pf.iff = iff
+	}
+
+	return pf, nil
+}