VictoriaMetrics/lib/protoparser/influx/streamparser.go
Aliaksandr Valialkin 124f78857b app/{vminsert,vmagent}: improve data ingestion speed over a single connection
Process data obtianed from a single connection on all the available CPU cores.
2020-09-28 04:13:08 +03:00

216 lines
4.9 KiB
Go

package influx
import (
"bufio"
"flag"
"fmt"
"io"
"runtime"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/metrics"
)
var (
trimTimestamp = flag.Duration("influxTrimTimestamp", time.Millisecond, "Trim timestamps for Influx line protocol data to this duration. "+
"Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data")
)
// ParseStream parses r with the given args and calls callback for the parsed rows.
//
// The callback can be called multiple times for streamed data from r.
//
// callback shouldn't hold rows after returning.
func ParseStream(r io.Reader, isGzipped bool, precision, db string, callback func(db string, rows []Row) error) error {
if isGzipped {
zr, err := common.GetGzipReader(r)
if err != nil {
return fmt.Errorf("cannot read gzipped influx line protocol data: %w", err)
}
defer common.PutGzipReader(zr)
r = zr
}
// Default precision is 'ns'. See https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/#timestamp
tsMultiplier := int64(1e6)
switch precision {
case "ns":
tsMultiplier = 1e6
case "u", "us", "µ":
tsMultiplier = 1e3
case "ms":
tsMultiplier = 1
case "s":
tsMultiplier = -1e3
case "m":
tsMultiplier = -1e3 * 60
case "h":
tsMultiplier = -1e3 * 3600
}
ctx := getStreamContext(r)
defer putStreamContext(ctx)
for ctx.Read() {
uw := getUnmarshalWork()
uw.callback = callback
uw.db = db
uw.tsMultiplier = tsMultiplier
uw.reqBuf = append(uw.reqBuf[:0], ctx.reqBuf...)
common.ScheduleUnmarshalWork(uw)
}
return ctx.Error()
}
func (ctx *streamContext) Read() bool {
readCalls.Inc()
if ctx.err != nil {
return false
}
ctx.reqBuf, ctx.tailBuf, ctx.err = common.ReadLinesBlock(ctx.br, ctx.reqBuf, ctx.tailBuf)
if ctx.err != nil {
if ctx.err != io.EOF {
readErrors.Inc()
ctx.err = fmt.Errorf("cannot read influx line protocol data: %w", ctx.err)
}
return false
}
return true
}
var (
readCalls = metrics.NewCounter(`vm_protoparser_read_calls_total{type="influx"}`)
readErrors = metrics.NewCounter(`vm_protoparser_read_errors_total{type="influx"}`)
rowsRead = metrics.NewCounter(`vm_protoparser_rows_read_total{type="influx"}`)
)
type streamContext struct {
br *bufio.Reader
reqBuf []byte
tailBuf []byte
err error
}
func (ctx *streamContext) Error() error {
if ctx.err == io.EOF {
return nil
}
return ctx.err
}
func (ctx *streamContext) reset() {
ctx.br.Reset(nil)
ctx.reqBuf = ctx.reqBuf[:0]
ctx.tailBuf = ctx.tailBuf[:0]
ctx.err = nil
}
func getStreamContext(r io.Reader) *streamContext {
select {
case ctx := <-streamContextPoolCh:
ctx.br.Reset(r)
return ctx
default:
if v := streamContextPool.Get(); v != nil {
ctx := v.(*streamContext)
ctx.br.Reset(r)
return ctx
}
return &streamContext{
br: bufio.NewReaderSize(r, 64*1024),
}
}
}
func putStreamContext(ctx *streamContext) {
ctx.reset()
select {
case streamContextPoolCh <- ctx:
default:
streamContextPool.Put(ctx)
}
}
var streamContextPool sync.Pool
var streamContextPoolCh = make(chan *streamContext, runtime.GOMAXPROCS(-1))
type unmarshalWork struct {
rows Rows
callback func(db string, rows []Row) error
db string
tsMultiplier int64
reqBuf []byte
}
func (uw *unmarshalWork) reset() {
uw.rows.Reset()
uw.callback = nil
uw.db = ""
uw.tsMultiplier = 0
uw.reqBuf = uw.reqBuf[:0]
}
// Unmarshal implements common.UnmarshalWork
func (uw *unmarshalWork) Unmarshal() {
uw.rows.Unmarshal(bytesutil.ToUnsafeString(uw.reqBuf))
rows := uw.rows.Rows
rowsRead.Add(len(rows))
// Adjust timestamps according to uw.tsMultiplier
currentTs := time.Now().UnixNano() / 1e6
tsMultiplier := uw.tsMultiplier
if tsMultiplier >= 1 {
for i := range rows {
row := &rows[i]
if row.Timestamp == 0 {
row.Timestamp = currentTs
} else {
row.Timestamp /= tsMultiplier
}
}
} else if tsMultiplier < 0 {
tsMultiplier = -tsMultiplier
currentTs -= currentTs % tsMultiplier
for i := range rows {
row := &rows[i]
if row.Timestamp == 0 {
row.Timestamp = currentTs
} else {
row.Timestamp *= tsMultiplier
}
}
}
// Trim timestamps if required.
if tsTrim := trimTimestamp.Milliseconds(); tsTrim > 1 {
for i := range rows {
row := &rows[i]
row.Timestamp -= row.Timestamp % tsTrim
}
}
if err := uw.callback(uw.db, rows); err != nil {
logger.Errorf("error when processing imported data: %s", err)
putUnmarshalWork(uw)
return
}
putUnmarshalWork(uw)
}
func getUnmarshalWork() *unmarshalWork {
v := unmarshalWorkPool.Get()
if v == nil {
return &unmarshalWork{}
}
return v.(*unmarshalWork)
}
func putUnmarshalWork(uw *unmarshalWork) {
uw.reset()
unmarshalWorkPool.Put(uw)
}
var unmarshalWorkPool sync.Pool