VictoriaMetrics/lib/logstorage/pipes.go

838 lines
22 KiB
Go
Raw Normal View History

2024-04-25 22:19:58 +00:00
package logstorage
import (
"fmt"
2024-04-26 21:47:50 +00:00
"slices"
2024-04-25 22:19:58 +00:00
"strings"
2024-04-27 00:50:19 +00:00
"sync/atomic"
2024-04-26 21:47:50 +00:00
"unsafe"
2024-04-25 22:19:58 +00:00
2024-04-26 21:47:50 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
2024-04-25 22:19:58 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
2024-04-28 10:52:21 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
2024-04-25 22:19:58 +00:00
)
type pipe interface {
2024-04-26 21:47:50 +00:00
// String returns string representation of the pipe.
2024-04-25 22:19:58 +00:00
String() string
2024-04-26 21:47:50 +00:00
// newPipeProcessor must return new pipeProcessor for the given ppBase.
//
// workersCount is the number of goroutine workers, which will call writeBlock() method.
//
// If stopCh is closed, the returned pipeProcessor must stop performing CPU-intensive tasks which take more than a few milliseconds.
// It is OK to continue processing pipeProcessor calls if they take less than a few milliseconds.
//
2024-04-27 20:08:03 +00:00
// The returned pipeProcessor may call cancel() at any time in order to notify worker goroutines to stop sending new data to pipeProcessor.
2024-04-26 21:47:50 +00:00
newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppBase pipeProcessor) pipeProcessor
}
// pipeProcessor must process a single pipe.
type pipeProcessor interface {
// writeBlock must write the given block of data to the given pipeProcessor.
//
2024-04-27 20:08:03 +00:00
// writeBlock is called concurrently from worker goroutines.
// The workerID is the id of the worker goroutine, which calls the writeBlock.
2024-04-26 21:47:50 +00:00
// It is in the range 0 ... workersCount-1 .
//
// It is forbidden to hold references to columns after returning from writeBlock, since the caller re-uses columns.
2024-04-27 20:08:03 +00:00
//
// If any error occurs at writeBlock, then cancel() must be called by pipeProcessor in order to notify worker goroutines
// to stop sending new data. The occurred error must be returned from flush().
//
// cancel() may be called also when the pipeProcessor decides to stop accepting new data, even if there is no any error.
2024-04-26 21:47:50 +00:00
writeBlock(workerID uint, timestamps []int64, columns []BlockColumn)
// flush must flush all the data accumulated in the pipeProcessor to the base pipeProcessor.
//
2024-04-27 20:08:03 +00:00
// flush is called after all the worker goroutines are stopped.
//
// It is guaranteed that flush() is called for every pipeProcessor returned from pipe.newPipeProcessor().
flush() error
2024-04-26 21:47:50 +00:00
}
type defaultPipeProcessor func(workerID uint, timestamps []int64, columns []BlockColumn)
func newDefaultPipeProcessor(writeBlock func(workerID uint, timestamps []int64, columns []BlockColumn)) pipeProcessor {
return defaultPipeProcessor(writeBlock)
}
func (dpp defaultPipeProcessor) writeBlock(workerID uint, timestamps []int64, columns []BlockColumn) {
dpp(workerID, timestamps, columns)
}
2024-04-27 20:08:03 +00:00
func (dpp defaultPipeProcessor) flush() error {
return nil
2024-04-25 22:19:58 +00:00
}
func parsePipes(lex *lexer) ([]pipe, error) {
var pipes []pipe
2024-04-26 23:53:32 +00:00
for !lex.isKeyword(")", "") {
2024-04-25 22:19:58 +00:00
if !lex.isKeyword("|") {
return nil, fmt.Errorf("expecting '|'")
}
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing token after '|'")
}
switch {
case lex.isKeyword("fields"):
2024-04-29 01:27:46 +00:00
pf, err := parsePipeFields(lex)
2024-04-25 22:19:58 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'fields' pipe: %w", err)
}
2024-04-29 01:27:46 +00:00
pipes = append(pipes, pf)
2024-04-25 22:19:58 +00:00
case lex.isKeyword("stats"):
2024-04-29 01:30:25 +00:00
ps, err := parseStatsPipe(lex)
2024-04-25 22:19:58 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'stats' pipe: %w", err)
}
2024-04-29 01:30:25 +00:00
pipes = append(pipes, ps)
2024-04-27 00:50:19 +00:00
case lex.isKeyword("head"):
hp, err := parseHeadPipe(lex)
if err != nil {
return nil, fmt.Errorf("cannot parse 'head' pipe: %w", err)
}
pipes = append(pipes, hp)
2024-04-27 01:14:00 +00:00
case lex.isKeyword("skip"):
2024-04-29 01:30:25 +00:00
ps, err := parseSkipPipe(lex)
2024-04-27 01:14:00 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'skip' pipe: %w", err)
}
2024-04-29 01:30:25 +00:00
pipes = append(pipes, ps)
2024-04-25 22:19:58 +00:00
default:
return nil, fmt.Errorf("unexpected pipe %q", lex.token)
}
}
return pipes, nil
}
2024-04-29 01:27:46 +00:00
type pipeFields struct {
2024-04-25 22:19:58 +00:00
// fields contains list of fields to fetch
fields []string
2024-04-28 20:52:15 +00:00
// whether fields contains star
containsStar bool
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:27:46 +00:00
func (pf *pipeFields) String() string {
if len(pf.fields) == 0 {
logger.Panicf("BUG: pipeFields must contain at least a single field")
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:27:46 +00:00
return "fields " + fieldNamesString(pf.fields)
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:27:46 +00:00
func (pf *pipeFields) newPipeProcessor(_ int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {
return &pipeFieldsProcessor{
pf: pf,
2024-04-26 21:47:50 +00:00
ppBase: ppBase,
}
}
2024-04-29 01:27:46 +00:00
type pipeFieldsProcessor struct {
pf *pipeFields
2024-04-26 21:47:50 +00:00
ppBase pipeProcessor
}
2024-04-29 01:27:46 +00:00
func (fpp *pipeFieldsProcessor) writeBlock(workerID uint, timestamps []int64, columns []BlockColumn) {
if fpp.pf.containsStar || areSameBlockColumns(columns, fpp.pf.fields) {
2024-04-26 21:47:50 +00:00
// Fast path - there is no need in additional transformations before writing the block to ppBase.
fpp.ppBase.writeBlock(workerID, timestamps, columns)
return
}
2024-04-29 01:27:46 +00:00
// Slow path - construct columns for fpp.pf.fields before writing them to ppBase.
2024-04-26 21:47:50 +00:00
brs := getBlockRows()
cs := brs.cs
2024-04-29 01:27:46 +00:00
for _, f := range fpp.pf.fields {
2024-04-29 01:22:27 +00:00
values := getBlockColumnValues(columns, f, len(timestamps))
2024-04-26 21:47:50 +00:00
cs = append(cs, BlockColumn{
Name: f,
Values: values,
})
}
fpp.ppBase.writeBlock(workerID, timestamps, cs)
brs.cs = cs
putBlockRows(brs)
}
2024-04-29 01:27:46 +00:00
func (fpp *pipeFieldsProcessor) flush() error {
2024-04-27 20:08:03 +00:00
return nil
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:27:46 +00:00
func parsePipeFields(lex *lexer) (*pipeFields, error) {
2024-04-25 22:19:58 +00:00
var fields []string
for {
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing field name")
}
if lex.isKeyword(",") {
return nil, fmt.Errorf("unexpected ','; expecting field name")
}
2024-04-27 00:50:19 +00:00
field, err := parseFieldName(lex)
if err != nil {
return nil, fmt.Errorf("cannot parse field name: %w", err)
}
2024-04-25 22:19:58 +00:00
fields = append(fields, field)
switch {
2024-04-26 23:53:32 +00:00
case lex.isKeyword("|", ")", ""):
2024-04-29 01:27:46 +00:00
pf := &pipeFields{
2024-04-28 20:52:15 +00:00
fields: fields,
containsStar: slices.Contains(fields, "*"),
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:27:46 +00:00
return pf, nil
2024-04-25 22:19:58 +00:00
case lex.isKeyword(","):
default:
2024-04-26 23:53:32 +00:00
return nil, fmt.Errorf("unexpected token: %q; expecting ',', '|' or ')'", lex.token)
2024-04-25 22:19:58 +00:00
}
}
}
2024-04-29 01:30:25 +00:00
type pipeStats struct {
2024-04-25 22:19:58 +00:00
byFields []string
funcs []statsFunc
}
type statsFunc interface {
// String returns string representation of statsFunc
String() string
// neededFields returns the needed fields for calculating the given stats
neededFields() []string
2024-04-26 21:47:50 +00:00
2024-04-29 01:23:41 +00:00
// newStatsProcessor must create new statsProcessor for calculating stats for the given statsFunc.
2024-04-28 10:52:21 +00:00
//
2024-04-29 01:23:41 +00:00
// It also must return the size in bytes of the returned statsProcessor.
newStatsProcessor() (statsProcessor, int)
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:23:41 +00:00
// statsProcessor must process stats for some statsFunc.
2024-04-26 21:47:50 +00:00
//
2024-04-29 01:23:41 +00:00
// All the statsProcessor methods are called from a single goroutine at a time,
2024-04-26 21:47:50 +00:00
// so there is no need in the internal synchronization.
2024-04-29 01:23:41 +00:00
type statsProcessor interface {
// updateStatsForAllRows must update statsProcessor stats from all the rows.
2024-04-28 10:52:21 +00:00
//
2024-04-29 01:23:41 +00:00
// It must return the increase of internal state size in bytes for the statsProcessor.
2024-04-28 10:52:21 +00:00
updateStatsForAllRows(timestamps []int64, columns []BlockColumn) int
2024-04-26 21:47:50 +00:00
2024-04-29 01:23:41 +00:00
// updateStatsForRow must update statsProcessor stats from the row at rowIndex.
2024-04-28 10:52:21 +00:00
//
2024-04-29 01:23:41 +00:00
// It must return the increase of internal state size in bytes for the statsProcessor.
2024-04-28 10:52:21 +00:00
updateStatsForRow(timestamps []int64, columns []BlockColumn, rowIndex int) int
2024-04-26 21:47:50 +00:00
2024-04-29 01:23:41 +00:00
// mergeState must merge sfp state into statsProcessor state.
mergeState(sfp statsProcessor)
2024-04-26 21:47:50 +00:00
2024-04-29 01:23:41 +00:00
// finalizeStats must return the collected stats from statsProcessor.
2024-04-26 21:47:50 +00:00
finalizeStats() (name, value string)
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:30:25 +00:00
func (ps *pipeStats) String() string {
2024-04-25 22:19:58 +00:00
s := "stats "
2024-04-29 01:30:25 +00:00
if len(ps.byFields) > 0 {
s += "by (" + fieldNamesString(ps.byFields) + ") "
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:30:25 +00:00
if len(ps.funcs) == 0 {
logger.Panicf("BUG: pipeStats must contain at least a single statsFunc")
2024-04-25 22:19:58 +00:00
}
2024-04-29 01:30:25 +00:00
a := make([]string, len(ps.funcs))
for i, f := range ps.funcs {
2024-04-25 22:19:58 +00:00
a[i] = f.String()
}
s += strings.Join(a, ", ")
return s
}
2024-04-28 10:52:21 +00:00
// stateSizeBudgetChunk is the size in bytes of the state size budget chunk,
// which a shard takes from the global budget at once.
const stateSizeBudgetChunk = 1 << 20
2024-04-29 01:30:25 +00:00
func (ps *pipeStats) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppBase pipeProcessor) pipeProcessor {
2024-04-28 10:52:21 +00:00
maxStateSize := int64(float64(memory.Allowed()) * 0.3)
2024-04-29 01:30:25 +00:00
shards := make([]pipeStatsProcessorShard, workersCount)
2024-04-26 21:47:50 +00:00
for i := range shards {
shard := &shards[i]
2024-04-29 01:30:25 +00:00
shard.ps = ps
shard.m = make(map[string]*pipeStatsGroup)
2024-04-28 10:52:21 +00:00
shard.stateSizeBudget = stateSizeBudgetChunk
maxStateSize -= stateSizeBudgetChunk
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
spp := &pipeStatsProcessor{
ps: ps,
2024-04-26 21:47:50 +00:00
stopCh: stopCh,
cancel: cancel,
ppBase: ppBase,
shards: shards,
2024-04-28 10:52:21 +00:00
maxStateSize: maxStateSize,
2024-04-26 21:47:50 +00:00
}
2024-04-28 10:52:21 +00:00
spp.stateSizeBudget.Store(maxStateSize)
return spp
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
type pipeStatsProcessor struct {
ps *pipeStats
2024-04-26 21:47:50 +00:00
stopCh <-chan struct{}
cancel func()
ppBase pipeProcessor
2024-04-29 01:30:25 +00:00
shards []pipeStatsProcessorShard
2024-04-28 10:52:21 +00:00
maxStateSize int64
stateSizeBudget atomic.Int64
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
type pipeStatsProcessorShard struct {
pipeStatsProcessorShardNopad
2024-04-26 21:47:50 +00:00
2024-04-26 23:53:32 +00:00
// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .
2024-04-29 01:30:25 +00:00
_ [128 - unsafe.Sizeof(pipeStatsProcessorShardNopad{})%128]byte
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
type pipeStatsProcessorShardNopad struct {
ps *pipeStats
m map[string]*pipeStatsGroup
2024-04-26 21:47:50 +00:00
2024-04-28 21:19:40 +00:00
columnValues [][]string
keyBuf []byte
2024-04-28 10:52:21 +00:00
stateSizeBudget int
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
func (shard *pipeStatsProcessorShard) getStatsProcessors(key []byte) []statsProcessor {
2024-04-28 20:15:27 +00:00
spg := shard.m[string(key)]
if spg == nil {
2024-04-29 01:30:25 +00:00
sfps := make([]statsProcessor, len(shard.ps.funcs))
for i, f := range shard.ps.funcs {
2024-04-29 01:23:41 +00:00
sfp, stateSize := f.newStatsProcessor()
2024-04-28 20:15:27 +00:00
sfps[i] = sfp
shard.stateSizeBudget -= stateSize
}
2024-04-29 01:30:25 +00:00
spg = &pipeStatsGroup{
2024-04-28 20:15:27 +00:00
sfps: sfps,
}
shard.m[string(key)] = spg
shard.stateSizeBudget -= len(key) + int(unsafe.Sizeof("")+unsafe.Sizeof(spg)+unsafe.Sizeof(sfps[0])*uintptr(len(sfps)))
2024-04-26 21:47:50 +00:00
}
2024-04-28 22:35:16 +00:00
return spg.sfps
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
type pipeStatsGroup struct {
2024-04-29 01:23:41 +00:00
sfps []statsProcessor
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
func (spp *pipeStatsProcessor) writeBlock(workerID uint, timestamps []int64, columns []BlockColumn) {
2024-04-26 21:47:50 +00:00
shard := &spp.shards[workerID]
2024-04-28 10:52:21 +00:00
for shard.stateSizeBudget < 0 {
// steal some budget for the state size from the global budget.
remaining := spp.stateSizeBudget.Add(-stateSizeBudgetChunk)
if remaining < 0 {
// The state size is too big. Stop processing data in order to avoid OOM crash.
if remaining+stateSizeBudgetChunk >= 0 {
// Notify worker goroutines to stop calling writeBlock() in order to save CPU time.
spp.cancel()
}
return
}
shard.stateSizeBudget += stateSizeBudgetChunk
}
2024-04-29 01:30:25 +00:00
byFields := spp.ps.byFields
2024-04-28 22:35:16 +00:00
if len(byFields) == 0 {
2024-04-28 21:19:40 +00:00
// Fast path - pass all the rows to a single group with empty key.
2024-04-29 01:23:41 +00:00
for _, sfp := range shard.getStatsProcessors(nil) {
2024-04-28 10:52:21 +00:00
shard.stateSizeBudget -= sfp.updateStatsForAllRows(timestamps, columns)
2024-04-26 21:47:50 +00:00
}
return
}
2024-04-28 21:19:40 +00:00
if len(byFields) == 1 {
// Special case for grouping by a single column.
2024-04-29 01:22:27 +00:00
values := getBlockColumnValues(columns, byFields[0], len(timestamps))
2024-04-28 22:54:32 +00:00
if isConstValue(values) {
// Fast path for column with constant value.
shard.keyBuf = encoding.MarshalBytes(shard.keyBuf[:0], bytesutil.ToUnsafeBytes(values[0]))
2024-04-29 01:23:41 +00:00
for _, sfp := range shard.getStatsProcessors(shard.keyBuf) {
2024-04-28 22:54:32 +00:00
shard.stateSizeBudget -= sfp.updateStatsForAllRows(timestamps, columns)
}
return
}
// Slower path for column with different values.
2024-04-29 01:23:41 +00:00
var sfps []statsProcessor
2024-04-28 21:19:40 +00:00
keyBuf := shard.keyBuf
for i := range timestamps {
if i <= 0 || values[i-1] != values[i] {
keyBuf = encoding.MarshalBytes(keyBuf[:0], bytesutil.ToUnsafeBytes(values[i]))
2024-04-29 01:23:41 +00:00
sfps = shard.getStatsProcessors(keyBuf)
2024-04-28 21:19:40 +00:00
}
2024-04-28 22:35:16 +00:00
for _, sfp := range sfps {
2024-04-28 21:19:40 +00:00
shard.stateSizeBudget -= sfp.updateStatsForRow(timestamps, columns, i)
}
}
shard.keyBuf = keyBuf
return
}
// Pre-calculate column values for byFields in order to speed up building group key in the loop below.
2024-04-29 01:30:25 +00:00
shard.columnValues = appendBlockColumnValues(shard.columnValues[:0], columns, byFields, len(timestamps))
2024-04-28 21:19:40 +00:00
columnValues := shard.columnValues
2024-04-26 21:47:50 +00:00
2024-04-28 22:54:32 +00:00
if areConstValues(columnValues) {
// Fast path for columns with constant values.
keyBuf := shard.keyBuf[:0]
for _, values := range columnValues {
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(values[0]))
}
2024-04-29 01:23:41 +00:00
for _, sfp := range shard.getStatsProcessors(keyBuf) {
2024-04-28 22:54:32 +00:00
shard.stateSizeBudget -= sfp.updateStatsForAllRows(timestamps, columns)
}
2024-04-29 00:04:42 +00:00
shard.keyBuf = keyBuf
2024-04-28 22:54:32 +00:00
return
}
// The slowest path - group by multiple columns.
2024-04-29 01:23:41 +00:00
var sfps []statsProcessor
2024-04-28 21:19:40 +00:00
keyBuf := shard.keyBuf
2024-04-26 21:47:50 +00:00
for i := range timestamps {
2024-04-28 20:42:50 +00:00
// verify whether the key for 'by (...)' fields equals the previous key
2024-04-28 22:35:16 +00:00
sameValue := sfps != nil
2024-04-28 21:19:40 +00:00
for _, values := range columnValues {
2024-04-28 20:42:50 +00:00
if i <= 0 || values[i-1] != values[i] {
sameValue = false
break
2024-04-26 21:47:50 +00:00
}
}
2024-04-28 20:42:50 +00:00
if !sameValue {
// Construct new key for the 'by (...)' fields
keyBuf = keyBuf[:0]
2024-04-28 21:19:40 +00:00
for _, values := range columnValues {
2024-04-28 22:54:32 +00:00
keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(values[i]))
2024-04-28 20:42:50 +00:00
}
2024-04-29 01:23:41 +00:00
sfps = shard.getStatsProcessors(keyBuf)
2024-04-28 20:42:50 +00:00
}
2024-04-28 22:35:16 +00:00
for _, sfp := range sfps {
2024-04-28 10:52:21 +00:00
shard.stateSizeBudget -= sfp.updateStatsForRow(timestamps, columns, i)
2024-04-26 21:47:50 +00:00
}
}
shard.keyBuf = keyBuf
}
2024-04-28 22:54:32 +00:00
// areConstValues returns true if every item in valuess consists of identical values.
//
// It returns true for empty valuess and false if any item is empty.
func areConstValues(valuess [][]string) bool {
	for _, values := range valuess {
		if !isConstValue(values) {
			return false
		}
	}
	return true
}

// isConstValue returns true if values is non-empty and all its items are equal.
func isConstValue(values []string) bool {
	if len(values) == 0 {
		// Return false, since it is impossible to get values[0] value from empty values.
		return false
	}
	vFirst := values[0]
	for _, v := range values[1:] {
		if v != vFirst {
			return false
		}
	}
	return true
}
2024-04-29 01:30:25 +00:00
func (spp *pipeStatsProcessor) flush() error {
2024-04-28 10:52:21 +00:00
if n := spp.stateSizeBudget.Load(); n <= 0 {
2024-04-29 01:30:25 +00:00
return fmt.Errorf("cannot calculate [%s], since it requires more than %dMB of memory", spp.ps.String(), spp.maxStateSize/(1<<20))
2024-04-28 10:52:21 +00:00
}
2024-04-26 21:47:50 +00:00
// Merge states across shards
shards := spp.shards
m := shards[0].m
shards = shards[1:]
for i := range shards {
shard := &shards[i]
for key, spg := range shard.m {
// shard.m may be quite big, so this loop can take a lot of time and CPU.
// Stop processing data as soon as stopCh is closed without wasting CPU time.
select {
case <-spp.stopCh:
2024-04-27 20:08:03 +00:00
return nil
2024-04-26 21:47:50 +00:00
default:
}
spgBase := m[key]
if spgBase == nil {
m[key] = spg
} else {
for i, sfp := range spgBase.sfps {
sfp.mergeState(spg.sfps[i])
}
}
}
}
// Write per-group states to ppBase
2024-04-29 01:30:25 +00:00
byFields := spp.ps.byFields
2024-04-27 01:31:19 +00:00
if len(byFields) == 0 && len(m) == 0 {
// Special case - zero matching rows.
2024-04-29 01:23:41 +00:00
_ = shards[0].getStatsProcessors(nil)
2024-04-27 01:31:19 +00:00
m = shards[0].m
}
2024-04-26 21:47:50 +00:00
var values []string
var columns []BlockColumn
for key, spg := range m {
// m may be quite big, so this loop can take a lot of time and CPU.
// Stop processing data as soon as stopCh is closed without wasting CPU time.
select {
case <-spp.stopCh:
2024-04-27 20:08:03 +00:00
return nil
2024-04-26 21:47:50 +00:00
default:
}
// Unmarshal values for byFields from key.
values = values[:0]
keyBuf := bytesutil.ToUnsafeBytes(key)
for len(keyBuf) > 0 {
tail, v, err := encoding.UnmarshalBytes(keyBuf)
if err != nil {
logger.Panicf("BUG: cannot unmarshal value from keyBuf=%q: %w", keyBuf, err)
}
values = append(values, bytesutil.ToUnsafeString(v))
keyBuf = tail
}
if len(values) != len(byFields) {
logger.Panicf("BUG: unexpected number of values decoded from keyBuf; got %d; want %d", len(values), len(byFields))
}
// construct columns for byFields
columns = columns[:0]
for i, f := range byFields {
columns = append(columns, BlockColumn{
Name: f,
Values: values[i : i+1],
})
}
// construct columns for stats functions
for _, sfp := range spg.sfps {
name, value := sfp.finalizeStats()
columns = append(columns, BlockColumn{
Name: name,
Values: []string{value},
})
}
spp.ppBase.writeBlock(0, []int64{0}, columns)
}
2024-04-27 20:08:03 +00:00
return nil
2024-04-26 21:47:50 +00:00
}
2024-04-29 01:30:25 +00:00
func (ps *pipeStats) neededFields() []string {
2024-04-25 22:19:58 +00:00
var neededFields []string
m := make(map[string]struct{})
updateNeededFields := func(fields []string) {
for _, field := range fields {
if _, ok := m[field]; !ok {
m[field] = struct{}{}
neededFields = append(neededFields, field)
}
}
}
2024-04-29 01:30:25 +00:00
updateNeededFields(ps.byFields)
2024-04-25 22:19:58 +00:00
2024-04-29 01:30:25 +00:00
for _, f := range ps.funcs {
2024-04-25 22:19:58 +00:00
fields := f.neededFields()
updateNeededFields(fields)
}
return neededFields
}
2024-04-29 01:30:25 +00:00
func parseStatsPipe(lex *lexer) (*pipeStats, error) {
2024-04-25 22:19:58 +00:00
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing stats config")
}
2024-04-29 01:30:25 +00:00
var ps pipeStats
2024-04-25 22:19:58 +00:00
if lex.isKeyword("by") {
lex.nextToken()
fields, err := parseFieldNamesInParens(lex)
if err != nil {
return nil, fmt.Errorf("cannot parse 'by': %w", err)
}
2024-04-29 01:30:25 +00:00
ps.byFields = fields
2024-04-25 22:19:58 +00:00
}
var funcs []statsFunc
for {
sf, err := parseStatsFunc(lex)
if err != nil {
return nil, err
}
funcs = append(funcs, sf)
2024-04-26 23:53:32 +00:00
if lex.isKeyword("|", ")", "") {
2024-04-29 01:30:25 +00:00
ps.funcs = funcs
return &ps, nil
2024-04-25 22:19:58 +00:00
}
if !lex.isKeyword(",") {
2024-04-26 23:53:32 +00:00
return nil, fmt.Errorf("unexpected token %q; want ',', '|' or ')'", lex.token)
2024-04-25 22:19:58 +00:00
}
lex.nextToken()
}
}
func parseStatsFunc(lex *lexer) (statsFunc, error) {
switch {
case lex.isKeyword("count"):
2024-04-29 01:20:43 +00:00
sfc, err := parseStatsCount(lex)
2024-04-25 22:19:58 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'count' func: %w", err)
}
return sfc, nil
2024-04-27 02:26:15 +00:00
case lex.isKeyword("uniq"):
2024-04-29 01:20:43 +00:00
sfu, err := parseStatsUniq(lex)
2024-04-27 02:26:15 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'uniq' func: %w", err)
}
return sfu, nil
2024-04-29 01:08:35 +00:00
case lex.isKeyword("sum"):
2024-04-29 01:20:43 +00:00
sfs, err := parseStatsSum(lex)
2024-04-29 01:08:35 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse 'sum' func: %w", err)
}
return sfs, nil
2024-04-25 22:19:58 +00:00
default:
return nil, fmt.Errorf("unknown stats func %q", lex.token)
}
}
2024-04-27 02:26:15 +00:00
// parseResultName parses the name of the result field at the current lex token.
//
// The name may be preceded by the optional `as` keyword, which is skipped.
func parseResultName(lex *lexer) (string, error) {
	// Skip the optional 'as' keyword; a token must follow it.
	if lex.isKeyword("as") && !lex.mustNextToken() {
		return "", fmt.Errorf("missing token after 'as' keyword")
	}

	name, err := parseFieldName(lex)
	if err != nil {
		return "", fmt.Errorf("cannot parse 'as' field name: %w", err)
	}
	return name, nil
}
2024-04-27 00:50:19 +00:00
type headPipe struct {
n uint64
}
func (hp *headPipe) String() string {
return fmt.Sprintf("head %d", hp.n)
}
func (hp *headPipe) newPipeProcessor(_ int, _ <-chan struct{}, cancel func(), ppBase pipeProcessor) pipeProcessor {
2024-04-27 02:43:38 +00:00
if hp.n == 0 {
// Special case - notify the caller to stop writing data to the returned headPipeProcessor
cancel()
}
2024-04-27 00:50:19 +00:00
return &headPipeProcessor{
hp: hp,
cancel: cancel,
ppBase: ppBase,
}
}
type headPipeProcessor struct {
hp *headPipe
cancel func()
ppBase pipeProcessor
2024-04-28 10:52:21 +00:00
rowsProcessed atomic.Uint64
2024-04-27 00:50:19 +00:00
}
func (hpp *headPipeProcessor) writeBlock(workerID uint, timestamps []int64, columns []BlockColumn) {
2024-04-28 10:52:21 +00:00
rowsProcessed := hpp.rowsProcessed.Add(uint64(len(timestamps)))
if rowsProcessed <= hpp.hp.n {
2024-04-27 00:50:19 +00:00
// Fast path - write all the rows to ppBase.
hpp.ppBase.writeBlock(workerID, timestamps, columns)
return
}
// Slow path - overflow. Write the remaining rows if needed.
2024-04-28 10:52:21 +00:00
rowsProcessed -= uint64(len(timestamps))
if rowsProcessed >= hpp.hp.n {
2024-04-27 00:50:19 +00:00
// Nothing to write. There is no need in cancel() call, since it has been called by another goroutine.
return
}
// Write remaining rows.
2024-04-28 10:52:21 +00:00
rowsRemaining := hpp.hp.n - rowsProcessed
2024-04-27 00:50:19 +00:00
cs := make([]BlockColumn, len(columns))
for i, c := range columns {
cDst := &cs[i]
cDst.Name = c.Name
cDst.Values = c.Values[:rowsRemaining]
}
timestamps = timestamps[:rowsRemaining]
hpp.ppBase.writeBlock(workerID, timestamps, cs)
// Notify the caller that it should stop passing more data to writeBlock().
hpp.cancel()
}
2024-04-27 20:08:03 +00:00
func (hpp *headPipeProcessor) flush() error {
return nil
2024-04-27 00:50:19 +00:00
}
func parseHeadPipe(lex *lexer) (*headPipe, error) {
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing the number of head rows to return")
}
2024-04-27 19:15:56 +00:00
n, err := parseUint(lex.token)
2024-04-27 00:50:19 +00:00
if err != nil {
2024-04-27 01:14:00 +00:00
return nil, fmt.Errorf("cannot parse the number of head rows to return %q: %w", lex.token, err)
2024-04-27 00:50:19 +00:00
}
lex.nextToken()
hp := &headPipe{
n: n,
}
return hp, nil
}
2024-04-27 01:14:00 +00:00
type skipPipe struct {
n uint64
}
2024-04-29 01:30:25 +00:00
func (ps *skipPipe) String() string {
return fmt.Sprintf("skip %d", ps.n)
2024-04-27 01:14:00 +00:00
}
2024-04-29 01:30:25 +00:00
func (ps *skipPipe) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {
2024-04-27 01:14:00 +00:00
return &skipPipeProcessor{
2024-04-29 01:30:25 +00:00
ps: ps,
2024-04-27 01:14:00 +00:00
ppBase: ppBase,
}
}
type skipPipeProcessor struct {
2024-04-29 01:30:25 +00:00
ps *skipPipe
2024-04-27 01:14:00 +00:00
ppBase pipeProcessor
2024-04-28 10:52:21 +00:00
rowsProcessed atomic.Uint64
2024-04-27 01:14:00 +00:00
}
func (spp *skipPipeProcessor) writeBlock(workerID uint, timestamps []int64, columns []BlockColumn) {
2024-04-28 10:52:21 +00:00
rowsProcessed := spp.rowsProcessed.Add(uint64(len(timestamps)))
2024-04-29 01:30:25 +00:00
if rowsProcessed <= spp.ps.n {
2024-04-27 01:14:00 +00:00
return
}
2024-04-28 10:52:21 +00:00
rowsProcessed -= uint64(len(timestamps))
2024-04-29 01:30:25 +00:00
if rowsProcessed >= spp.ps.n {
2024-04-27 01:14:00 +00:00
spp.ppBase.writeBlock(workerID, timestamps, columns)
return
}
2024-04-29 01:30:25 +00:00
rowsRemaining := spp.ps.n - rowsProcessed
2024-04-27 01:14:00 +00:00
cs := make([]BlockColumn, len(columns))
for i, c := range columns {
cDst := &cs[i]
cDst.Name = c.Name
cDst.Values = c.Values[rowsRemaining:]
}
timestamps = timestamps[rowsRemaining:]
spp.ppBase.writeBlock(workerID, timestamps, cs)
}
2024-04-27 20:08:03 +00:00
func (spp *skipPipeProcessor) flush() error {
return nil
2024-04-27 01:14:00 +00:00
}
func parseSkipPipe(lex *lexer) (*skipPipe, error) {
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing the number of rows to skip")
}
2024-04-27 19:15:56 +00:00
n, err := parseUint(lex.token)
2024-04-27 01:14:00 +00:00
if err != nil {
return nil, fmt.Errorf("cannot parse the number of rows to skip %q: %w", lex.token, err)
}
lex.nextToken()
2024-04-29 01:30:25 +00:00
ps := &skipPipe{
2024-04-27 01:14:00 +00:00
n: n,
}
2024-04-29 01:30:25 +00:00
return ps, nil
2024-04-27 01:14:00 +00:00
}
2024-04-25 22:19:58 +00:00
func parseFieldNamesInParens(lex *lexer) ([]string, error) {
if !lex.isKeyword("(") {
return nil, fmt.Errorf("missing `(`")
}
var fields []string
for {
if !lex.mustNextToken() {
return nil, fmt.Errorf("missing field name or ')'")
}
if lex.isKeyword(")") {
lex.nextToken()
return fields, nil
}
if lex.isKeyword(",") {
return nil, fmt.Errorf("unexpected `,`")
}
2024-04-27 00:50:19 +00:00
field, err := parseFieldName(lex)
if err != nil {
return nil, fmt.Errorf("cannot parse field name: %w", err)
}
2024-04-25 22:19:58 +00:00
fields = append(fields, field)
switch {
case lex.isKeyword(")"):
lex.nextToken()
return fields, nil
case lex.isKeyword(","):
default:
return nil, fmt.Errorf("unexpected token: %q; expecting ',' or ')'", lex.token)
}
}
}
2024-04-27 00:50:19 +00:00
func parseFieldName(lex *lexer) (string, error) {
if lex.isKeyword(",", "(", ")", "[", "]", "|", "") {
return "", fmt.Errorf("unexpected token: %q", lex.token)
2024-04-25 22:19:58 +00:00
}
2024-04-27 00:50:19 +00:00
token := getCompoundToken(lex)
return token, nil
2024-04-25 22:19:58 +00:00
}
// fieldNamesString returns comma-separated list of the given fields.
//
// Every field except of "*" is passed through quoteTokenIfNeeded.
func fieldNamesString(fields []string) string {
	names := make([]string, 0, len(fields))
	for _, name := range fields {
		if name == "*" {
			// The star is written as is.
			names = append(names, name)
			continue
		}
		names = append(names, quoteTokenIfNeeded(name))
	}
	return strings.Join(names, ", ")
}
// getFieldsIgnoreStar returns a copy of fields without "*" entries.
//
// It returns nil if fields contains no other entries.
func getFieldsIgnoreStar(fields []string) []string {
	var kept []string
	for i := range fields {
		name := fields[i]
		if name == "*" {
			continue
		}
		kept = append(kept, name)
	}
	return kept
}
2024-04-27 02:26:15 +00:00
2024-04-28 23:32:11 +00:00
func appendBlockColumnValues(dst [][]string, columns []BlockColumn, fields []string, rowsCount int) [][]string {
2024-04-27 02:26:15 +00:00
for _, f := range fields {
2024-04-29 01:22:27 +00:00
values := getBlockColumnValues(columns, f, rowsCount)
2024-04-28 21:19:40 +00:00
dst = append(dst, values)
2024-04-27 02:26:15 +00:00
}
return dst
}