app/victoria-logs: initial code release

Aliaksandr Valialkin 2023-06-19 22:55:12 -07:00
parent aeac39cfd1
commit 87b66db47d
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
82 changed files with 31486 additions and 1 deletion


@@ -21,6 +21,7 @@ include package/release/Makefile
all: \
victoria-metrics-prod \
victoria-logs-prod \
vmagent-prod \
vmalert-prod \
vmauth-prod \
@@ -33,6 +34,7 @@ clean:
publish: docker-scan \
publish-victoria-metrics \
publish-victoria-logs \
publish-vmagent \
publish-vmalert \
publish-vmauth \
@@ -42,6 +44,7 @@ publish: docker-scan \
package: \
package-victoria-metrics \
package-victoria-logs \
package-vmagent \
package-vmalert \
package-vmauth \
@@ -178,6 +181,7 @@ publish-release:
release: \
release-victoria-metrics \
release-victoria-logs \
release-vmutils
release-victoria-metrics: \
@@ -191,7 +195,6 @@ release-victoria-metrics: \
release-victoria-metrics-openbsd-amd64 \
release-victoria-metrics-windows-amd64
# adds i386 arch
release-victoria-metrics-linux-386:
GOOS=linux GOARCH=386 $(MAKE) release-victoria-metrics-goos-goarch
@@ -238,6 +241,63 @@ release-victoria-metrics-windows-goarch: victoria-metrics-windows-$(GOARCH)-prod
cd bin && rm -rf \
victoria-metrics-windows-$(GOARCH)-prod.exe
release-victoria-logs: \
release-victoria-logs-linux-386 \
release-victoria-logs-linux-amd64 \
release-victoria-logs-linux-arm \
release-victoria-logs-linux-arm64 \
release-victoria-logs-darwin-amd64 \
release-victoria-logs-darwin-arm64 \
release-victoria-logs-freebsd-amd64 \
release-victoria-logs-openbsd-amd64 \
release-victoria-logs-windows-amd64
release-victoria-logs-linux-386:
GOOS=linux GOARCH=386 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-linux-amd64:
GOOS=linux GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-linux-arm:
GOOS=linux GOARCH=arm $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-linux-arm64:
GOOS=linux GOARCH=arm64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-darwin-amd64:
GOOS=darwin GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-darwin-arm64:
GOOS=darwin GOARCH=arm64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-freebsd-amd64:
GOOS=freebsd GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-openbsd-amd64:
GOOS=openbsd GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch
release-victoria-logs-windows-amd64:
GOARCH=amd64 $(MAKE) release-victoria-logs-windows-goarch
release-victoria-logs-goos-goarch: victoria-logs-$(GOOS)-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
victoria-logs-$(GOOS)-$(GOARCH)-prod \
&& sha256sum victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
victoria-logs-$(GOOS)-$(GOARCH)-prod \
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf victoria-logs-$(GOOS)-$(GOARCH)-prod
release-victoria-logs-windows-goarch: victoria-logs-windows-$(GOARCH)-prod
cd bin && \
zip victoria-logs-windows-$(GOARCH)-$(PKG_TAG).zip \
victoria-logs-windows-$(GOARCH)-prod.exe \
&& sha256sum victoria-logs-windows-$(GOARCH)-$(PKG_TAG).zip \
victoria-logs-windows-$(GOARCH)-prod.exe \
> victoria-logs-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
victoria-logs-windows-$(GOARCH)-prod.exe
release-vmutils: \
release-vmutils-linux-386 \
release-vmutils-linux-amd64 \

app/victoria-logs/Makefile (new file, 103 lines)

@@ -0,0 +1,103 @@
# All these commands must run from repository root.
victoria-logs:
APP_NAME=victoria-logs $(MAKE) app-local
victoria-logs-race:
APP_NAME=victoria-logs RACE=-race $(MAKE) app-local
victoria-logs-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker
victoria-logs-pure-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-pure
victoria-logs-linux-amd64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-amd64
victoria-logs-linux-arm-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-arm
victoria-logs-linux-arm64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-arm64
victoria-logs-linux-ppc64le-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-ppc64le
victoria-logs-linux-386-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-386
victoria-logs-darwin-amd64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-darwin-amd64
victoria-logs-darwin-arm64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-darwin-arm64
victoria-logs-freebsd-amd64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-freebsd-amd64
victoria-logs-openbsd-amd64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-openbsd-amd64
victoria-logs-windows-amd64-prod:
APP_NAME=victoria-logs $(MAKE) app-via-docker-windows-amd64
package-victoria-logs:
APP_NAME=victoria-logs $(MAKE) package-via-docker
package-victoria-logs-pure:
APP_NAME=victoria-logs $(MAKE) package-via-docker-pure
package-victoria-logs-amd64:
APP_NAME=victoria-logs $(MAKE) package-via-docker-amd64
package-victoria-logs-arm:
APP_NAME=victoria-logs $(MAKE) package-via-docker-arm
package-victoria-logs-arm64:
APP_NAME=victoria-logs $(MAKE) package-via-docker-arm64
package-victoria-logs-ppc64le:
APP_NAME=victoria-logs $(MAKE) package-via-docker-ppc64le
package-victoria-logs-386:
APP_NAME=victoria-logs $(MAKE) package-via-docker-386
publish-victoria-logs:
APP_NAME=victoria-logs $(MAKE) publish-via-docker
victoria-logs-linux-amd64:
APP_NAME=victoria-logs CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
victoria-logs-linux-arm:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
victoria-logs-linux-arm64:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
victoria-logs-linux-ppc64le:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
victoria-logs-linux-s390x:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
victoria-logs-linux-386:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
victoria-logs-darwin-amd64:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
victoria-logs-darwin-arm64:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
victoria-logs-freebsd-amd64:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
victoria-logs-openbsd-amd64:
APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
victoria-logs-windows-amd64:
GOARCH=amd64 APP_NAME=victoria-logs $(MAKE) app-local-windows-goarch
victoria-logs-pure:
APP_NAME=victoria-logs $(MAKE) app-local-pure


@@ -0,0 +1,8 @@
ARG base_image
FROM $base_image
EXPOSE 9428
ENTRYPOINT ["/victoria-logs-prod"]
ARG src_binary
COPY $src_binary ./victoria-logs-prod

app/victoria-logs/main.go (new file, 102 lines)

@@ -0,0 +1,102 @@
package main
import (
"flag"
"fmt"
"net/http"
"os"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
)
var (
httpListenAddr = flag.String("httpListenAddr", ":9428", "TCP address to listen for http connections. See also -httpListenAddr.useProxyProtocol")
useProxyProtocol = flag.Bool("httpListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted at -httpListenAddr . "+
"See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt . "+
"With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing")
gogc = flag.Int("gogc", 100, "GOGC to use. See https://tip.golang.org/doc/gc-guide")
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
cgroup.SetGOGC(*gogc)
buildinfo.Init()
logger.Init()
pushmetrics.Init()
logger.Infof("starting VictoriaLogs at %q...", *httpListenAddr)
startTime := time.Now()
vlstorage.Init()
vlselect.Init()
vlinsert.Init()
go httpserver.Serve(*httpListenAddr, *useProxyProtocol, requestHandler)
logger.Infof("started VictoriaLogs in %.3f seconds; see https://docs.victoriametrics.com/VictoriaLogs/", time.Since(startTime).Seconds())
sig := procutil.WaitForSigterm()
logger.Infof("received signal %s", sig)
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
}
logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
vlinsert.Stop()
vlselect.Stop()
vlstorage.Stop()
fs.MustStopDirRemover()
logger.Infof("the VictoriaLogs has been stopped in %.3f seconds", time.Since(startTime).Seconds())
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
if r.Method != http.MethodGet {
return false
}
w.Header().Add("Content-Type", "text/html; charset=utf-8")
fmt.Fprintf(w, "<h2>Single-node VictoriaLogs</h2></br>")
fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/VictoriaLogs/'>https://docs.victoriametrics.com/VictoriaLogs/</a></br>")
fmt.Fprintf(w, "Useful endpoints:</br>")
httpserver.WriteAPIHelp(w, [][2]string{
{"metrics", "available service metrics"},
{"flags", "command-line flags"},
})
return true
}
if vlinsert.RequestHandler(w, r) {
return true
}
if vlselect.RequestHandler(w, r) {
return true
}
return false
}
func usage() {
const s = `
victoria-logs is a log management and analytics service.
See the docs at https://docs.victoriametrics.com/VictoriaLogs/
`
flagutil.Usage(s)
}


@@ -0,0 +1,12 @@
# See https://medium.com/on-docker/use-multi-stage-builds-to-inject-ca-certs-ad1e8f01de1b
ARG certs_image
ARG root_image
FROM $certs_image as certs
RUN apk update && apk upgrade && apk --update --no-cache add ca-certificates
FROM $root_image
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
EXPOSE 9428
ENTRYPOINT ["/victoria-logs-prod"]
ARG TARGETARCH
COPY victoria-logs-linux-${TARGETARCH}-prod ./victoria-logs-prod


@@ -0,0 +1,20 @@
{% stripspace %}
{% func BulkResponse(n int, tookMs int64) %}
{
"took":{%dl tookMs %},
"errors":false,
"items":[
{% for i := 0; i < n; i++ %}
{
"create":{
"status":201
}
}
{% if i+1 < n %},{% endif %}
{% endfor %}
]
}
{% endfunc %}
{% endstripspace %}


@@ -0,0 +1,69 @@
// Code generated by qtc from "bulk_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
package elasticsearch
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
func StreamBulkResponse(qw422016 *qt422016.Writer, n int, tookMs int64) {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
qw422016.N().S(`{"took":`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:5
qw422016.N().DL(tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:5
qw422016.N().S(`,"errors":false,"items":[`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:8
for i := 0; i < n; i++ {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:8
qw422016.N().S(`{"create":{"status":201}}`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
if i+1 < n {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
qw422016.N().S(`,`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
}
//line app/vlinsert/elasticsearch/bulk_response.qtpl:15
}
//line app/vlinsert/elasticsearch/bulk_response.qtpl:15
qw422016.N().S(`]}`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
func WriteBulkResponse(qq422016 qtio422016.Writer, n int, tookMs int64) {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
StreamBulkResponse(qw422016, n, tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
qt422016.ReleaseWriter(qw422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
func BulkResponse(n int, tookMs int64) string {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
qb422016 := qt422016.AcquireByteBuffer()
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
WriteBulkResponse(qb422016, n, tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
qs422016 := string(qb422016.B)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
qt422016.ReleaseByteBuffer(qb422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
return qs422016
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}


@@ -0,0 +1,410 @@
package elasticsearch
import (
"bufio"
"errors"
"fmt"
"io"
"math"
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fastjson"
)
var (
maxLineSizeBytes = flagutil.NewBytes("insert.maxLineSizeBytes", 256*1024, "The maximum size of a single line, which can be read by /insert/* handlers")
)
// RequestHandler processes ElasticSearch insert requests
func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
w.Header().Add("Content-Type", "application/json")
// This header is needed for Logstash
w.Header().Set("X-Elastic-Product", "Elasticsearch")
if strings.HasPrefix(path, "/_ilm/policy") {
// Return fake response for ElasticSearch ilm request.
fmt.Fprintf(w, `{}`)
return true
}
if strings.HasPrefix(path, "/_index_template") {
// Return fake response for ElasticSearch index template request.
fmt.Fprintf(w, `{}`)
return true
}
if strings.HasPrefix(path, "/_ingest") {
// Return fake response for ElasticSearch ingest pipeline request.
// See: https://www.elastic.co/guide/en/elasticsearch/reference/8.8/put-pipeline-api.html
fmt.Fprintf(w, `{}`)
return true
}
if strings.HasPrefix(path, "/_nodes") {
// Return fake response for ElasticSearch nodes discovery request.
// See: https://www.elastic.co/guide/en/elasticsearch/reference/8.8/cluster.html
fmt.Fprintf(w, `{}`)
return true
}
switch path {
case "/":
switch r.Method {
case http.MethodGet:
// Return fake response for ElasticSearch ping request.
// See the latest available version for ElasticSearch at https://github.com/elastic/elasticsearch/releases
fmt.Fprintf(w, `{
"version": {
"number": "8.8.0"
}
}`)
case http.MethodHead:
// Return empty response for Logstash ping request.
}
return true
case "/_license":
// Return fake response for ElasticSearch license request.
fmt.Fprintf(w, `{
"license": {
"uid": "cbff45e7-c553-41f7-ae4f-9205eabd80xx",
"type": "oss",
"status": "active",
"expiry_date_in_millis" : 4000000000000
}
}`)
return true
case "/_bulk":
startTime := time.Now()
bulkRequestsTotal.Inc()
// Extract tenantID
tenantID, err := logstorage.GetTenantIDFromRequest(r)
if err != nil {
httpserver.Errorf(w, r, "%s", err)
return true
}
// Extract time field name from _time_field query arg
var timeField = "_time"
if tf := r.FormValue("_time_field"); tf != "" {
timeField = tf
}
// Extract message field name from _msg_field query arg
var msgField = ""
if msgf := r.FormValue("_msg_field"); msgf != "" {
msgField = msgf
}
// Extract stream field names from _stream_fields query arg
var streamFields []string
if sfs := r.FormValue("_stream_fields"); sfs != "" {
streamFields = strings.Split(sfs, ",")
}
// Extract field names, which must be ignored
var ignoreFields []string
if ifs := r.FormValue("ignore_fields"); ifs != "" {
ignoreFields = strings.Split(ifs, ",")
}
lr := logstorage.GetLogRows(streamFields, ignoreFields)
processLogMessage := func(timestamp int64, fields []logstorage.Field) {
lr.MustAdd(tenantID, timestamp, fields)
if lr.NeedFlush() {
vlstorage.MustAddRows(lr)
lr.Reset()
}
}
isGzip := r.Header.Get("Content-Encoding") == "gzip"
n, err := readBulkRequest(r.Body, isGzip, timeField, msgField, processLogMessage)
if err != nil {
logger.Warnf("cannot decode log message #%d in /_bulk request: %s", n, err)
return true
}
vlstorage.MustAddRows(lr)
logstorage.PutLogRows(lr)
tookMs := time.Since(startTime).Milliseconds()
bw := bufferedwriter.Get(w)
defer bufferedwriter.Put(bw)
WriteBulkResponse(bw, n, tookMs)
_ = bw.Flush()
return true
default:
return false
}
}
var bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
processLogMessage func(timestamp int64, fields []logstorage.Field),
) (int, error) {
// See https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
if isGzip {
zr, err := common.GetGzipReader(r)
if err != nil {
return 0, fmt.Errorf("cannot read gzipped _bulk request: %w", err)
}
defer common.PutGzipReader(zr)
r = zr
}
wcr := writeconcurrencylimiter.GetReader(r)
defer writeconcurrencylimiter.PutReader(wcr)
lb := lineBufferPool.Get()
defer lineBufferPool.Put(lb)
lb.B = bytesutil.ResizeNoCopyNoOverallocate(lb.B, maxLineSizeBytes.IntN())
sc := bufio.NewScanner(wcr)
sc.Buffer(lb.B, len(lb.B))
n := 0
nCheckpoint := 0
for {
ok, err := readBulkLine(sc, timeField, msgField, processLogMessage)
wcr.DecConcurrency()
if err != nil || !ok {
rowsIngestedTotal.Add(n - nCheckpoint)
return n, err
}
n++
if batchSize := n - nCheckpoint; n >= 1000 {
rowsIngestedTotal.Add(batchSize)
nCheckpoint = n
}
}
}
var lineBufferPool bytesutil.ByteBufferPool
var rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
processLogMessage func(timestamp int64, fields []logstorage.Field),
) (bool, error) {
// Decode command, must be "create" or "index"
if !sc.Scan() {
if err := sc.Err(); err != nil {
if errors.Is(err, bufio.ErrTooLong) {
return false, fmt.Errorf(`cannot read "create" or "index" command, since its size exceeds -insert.maxLineSizeBytes=%d`, maxLineSizeBytes.IntN())
}
return false, err
}
return false, nil
}
line := sc.Bytes()
p := parserPool.Get()
v, err := p.ParseBytes(line)
if err != nil {
return false, fmt.Errorf(`cannot parse "create" or "index" command: %w`, err)
}
if v.GetObject("create") == nil && v.GetObject("index") == nil {
return false, fmt.Errorf(`unexpected command %q; expected "create" or "index"`, v)
}
parserPool.Put(p)
// Decode log message
if !sc.Scan() {
if err := sc.Err(); err != nil {
if errors.Is(err, bufio.ErrTooLong) {
return false, fmt.Errorf("cannot read log message, since its size exceeds -insert.maxLineSizeBytes=%d", maxLineSizeBytes.IntN())
}
return false, err
}
return false, fmt.Errorf(`missing log message after the "create" or "index" command`)
}
line = sc.Bytes()
pctx := getParserCtx()
if err := pctx.parseLogMessage(line); err != nil {
invalidJSONLineLogger.Warnf("cannot parse json-encoded log entry: %s", err)
return true, nil
}
timestamp, err := extractTimestampFromFields(timeField, pctx.fields)
if err != nil {
invalidTimestampLogger.Warnf("skipping the log entry because cannot parse timestamp: %s", err)
return true, nil
}
updateMessageFieldName(msgField, pctx.fields)
processLogMessage(timestamp, pctx.fields)
putParserCtx(pctx)
return true, nil
}
var parserPool fastjson.ParserPool
var (
invalidTimestampLogger = logger.WithThrottler("invalidTimestampLogger", 5*time.Second)
invalidJSONLineLogger = logger.WithThrottler("invalidJSONLineLogger", 5*time.Second)
)
func extractTimestampFromFields(timeField string, fields []logstorage.Field) (int64, error) {
for i := range fields {
f := &fields[i]
if f.Name != timeField {
continue
}
timestamp, err := parseElasticsearchTimestamp(f.Value)
if err != nil {
return 0, err
}
f.Value = ""
return timestamp, nil
}
return time.Now().UnixNano(), nil
}
func updateMessageFieldName(msgField string, fields []logstorage.Field) {
if msgField == "" {
return
}
for i := range fields {
f := &fields[i]
if f.Name == msgField {
f.Name = "_msg"
return
}
}
}
type parserCtx struct {
p fastjson.Parser
buf []byte
prefixBuf []byte
fields []logstorage.Field
}
func (pctx *parserCtx) reset() {
pctx.buf = pctx.buf[:0]
pctx.prefixBuf = pctx.prefixBuf[:0]
fields := pctx.fields
for i := range fields {
lf := &fields[i]
lf.Name = ""
lf.Value = ""
}
pctx.fields = fields[:0]
}
func getParserCtx() *parserCtx {
v := parserCtxPool.Get()
if v == nil {
return &parserCtx{}
}
return v.(*parserCtx)
}
func putParserCtx(pctx *parserCtx) {
pctx.reset()
parserCtxPool.Put(pctx)
}
var parserCtxPool sync.Pool
func (pctx *parserCtx) parseLogMessage(msg []byte) error {
s := bytesutil.ToUnsafeString(msg)
v, err := pctx.p.Parse(s)
if err != nil {
return fmt.Errorf("cannot parse json: %w", err)
}
if t := v.Type(); t != fastjson.TypeObject {
return fmt.Errorf("expecting json dictionary; got %s", t)
}
pctx.reset()
pctx.fields, pctx.buf, pctx.prefixBuf = appendLogFields(pctx.fields, pctx.buf, pctx.prefixBuf, v)
return nil
}
func appendLogFields(dst []logstorage.Field, dstBuf, prefixBuf []byte, v *fastjson.Value) ([]logstorage.Field, []byte, []byte) {
o := v.GetObject()
o.Visit(func(k []byte, v *fastjson.Value) {
t := v.Type()
switch t {
case fastjson.TypeNull:
// Skip nulls
case fastjson.TypeObject:
// Flatten nested JSON objects.
// For example, {"foo":{"bar":"baz"}} is converted to {"foo.bar":"baz"}
prefixLen := len(prefixBuf)
prefixBuf = append(prefixBuf, k...)
prefixBuf = append(prefixBuf, '.')
dst, dstBuf, prefixBuf = appendLogFields(dst, dstBuf, prefixBuf, v)
prefixBuf = prefixBuf[:prefixLen]
case fastjson.TypeArray, fastjson.TypeNumber, fastjson.TypeTrue, fastjson.TypeFalse:
// Convert JSON arrays, numbers, true and false values to their string representation
dstBufLen := len(dstBuf)
dstBuf = v.MarshalTo(dstBuf)
value := dstBuf[dstBufLen:]
dst, dstBuf = appendLogField(dst, dstBuf, prefixBuf, k, value)
case fastjson.TypeString:
// Decode JSON strings
dstBufLen := len(dstBuf)
dstBuf = append(dstBuf, v.GetStringBytes()...)
value := dstBuf[dstBufLen:]
dst, dstBuf = appendLogField(dst, dstBuf, prefixBuf, k, value)
default:
logger.Panicf("BUG: unexpected JSON type: %s", t)
}
})
return dst, dstBuf, prefixBuf
}
func appendLogField(dst []logstorage.Field, dstBuf, prefixBuf, k, value []byte) ([]logstorage.Field, []byte) {
dstBufLen := len(dstBuf)
dstBuf = append(dstBuf, prefixBuf...)
dstBuf = append(dstBuf, k...)
name := dstBuf[dstBufLen:]
dst = append(dst, logstorage.Field{
Name: bytesutil.ToUnsafeString(name),
Value: bytesutil.ToUnsafeString(value),
})
return dst, dstBuf
}
func parseElasticsearchTimestamp(s string) (int64, error) {
if len(s) < len("YYYY-MM-DD") || s[len("YYYY")] != '-' {
// Try parsing timestamp in milliseconds
n, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return 0, fmt.Errorf("cannot parse timestamp in milliseconds from %q: %w", s, err)
}
if n > int64(math.MaxInt64)/1e6 {
return 0, fmt.Errorf("too big timestamp in milliseconds: %d; mustn't exceed %d", n, int64(math.MaxInt64)/1e6)
}
if n < int64(math.MinInt64)/1e6 {
return 0, fmt.Errorf("too small timestamp in milliseconds: %d; must be bigger than %d", n, int64(math.MinInt64)/1e6)
}
n *= 1e6
return n, nil
}
if len(s) == len("YYYY-MM-DD") {
t, err := time.Parse("2006-01-02", s)
if err != nil {
return 0, fmt.Errorf("cannot parse date %q: %w", s, err)
}
return t.UnixNano(), nil
}
t, err := time.Parse(time.RFC3339, s)
if err != nil {
return 0, fmt.Errorf("cannot parse timestamp %q: %w", s, err)
}
return t.UnixNano(), nil
}


@@ -0,0 +1,97 @@
package elasticsearch
import (
"bytes"
"compress/gzip"
"fmt"
"reflect"
"strings"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)
func TestReadBulkRequest(t *testing.T) {
f := func(data, timeField, msgField string, rowsExpected int, timestampsExpected []int64, resultExpected string) {
t.Helper()
var timestamps []int64
var result string
processLogMessage := func(timestamp int64, fields []logstorage.Field) {
timestamps = append(timestamps, timestamp)
a := make([]string, len(fields))
for i, f := range fields {
a[i] = fmt.Sprintf("%q:%q", f.Name, f.Value)
}
s := "{" + strings.Join(a, ",") + "}\n"
result += s
}
// Read the request without compression
r := bytes.NewBufferString(data)
rows, err := readBulkRequest(r, false, timeField, msgField, processLogMessage)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
if rows != rowsExpected {
t.Fatalf("unexpected rows read; got %d; want %d", rows, rowsExpected)
}
if !reflect.DeepEqual(timestamps, timestampsExpected) {
t.Fatalf("unexpected timestamps;\ngot\n%d\nwant\n%d", timestamps, timestampsExpected)
}
if result != resultExpected {
t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
}
// Read the request with compression
timestamps = nil
result = ""
compressedData := compressData(data)
r = bytes.NewBufferString(compressedData)
rows, err = readBulkRequest(r, true, timeField, msgField, processLogMessage)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
if rows != rowsExpected {
t.Fatalf("unexpected rows read; got %d; want %d", rows, rowsExpected)
}
if !reflect.DeepEqual(timestamps, timestampsExpected) {
t.Fatalf("unexpected timestamps;\ngot\n%d\nwant\n%d", timestamps, timestampsExpected)
}
if result != resultExpected {
t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
}
}
data := `{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:11.735Z","log":{"offset":71770,"file":{"path":"/var/log/auth.log"}},"message":"foobar"}
{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:12.735Z","message":"baz"}
{"create":{"_index":"filebeat-8.8.0"}}
{"message":"xyz","@timestamp":"2023-06-06T04:48:13.735Z","x":"y"}
`
timeField := "@timestamp"
msgField := "message"
rowsExpected := 3
timestampsExpected := []int64{1686026891735000000, 1686026892735000000, 1686026893735000000}
resultExpected := `{"@timestamp":"","log.offset":"71770","log.file.path":"/var/log/auth.log","_msg":"foobar"}
{"@timestamp":"","_msg":"baz"}
{"_msg":"xyz","@timestamp":"","x":"y"}
`
f(data, timeField, msgField, rowsExpected, timestampsExpected, resultExpected)
}
func compressData(s string) string {
var bb bytes.Buffer
zw := gzip.NewWriter(&bb)
if _, err := zw.Write([]byte(s)); err != nil {
panic(fmt.Errorf("unexpected error when compressing data: %s", err))
}
if err := zw.Close(); err != nil {
panic(fmt.Errorf("unexpected error when closing gzip writer: %s", err))
}
return bb.String()
}


@@ -0,0 +1,50 @@
package elasticsearch
import (
"bytes"
"fmt"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)
func BenchmarkReadBulkRequest(b *testing.B) {
b.Run("gzip:off", func(b *testing.B) {
benchmarkReadBulkRequest(b, false)
})
b.Run("gzip:on", func(b *testing.B) {
benchmarkReadBulkRequest(b, true)
})
}
func benchmarkReadBulkRequest(b *testing.B, isGzip bool) {
data := `{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:11.735Z","log":{"offset":71770,"file":{"path":"/var/log/auth.log"}},"message":"foobar"}
{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:12.735Z","message":"baz"}
{"create":{"_index":"filebeat-8.8.0"}}
{"message":"xyz","@timestamp":"2023-06-06T04:48:13.735Z","x":"y"}
`
if isGzip {
data = compressData(data)
}
dataBytes := bytesutil.ToUnsafeBytes(data)
timeField := "@timestamp"
msgField := "message"
processLogMessage := func(timestamp int64, fields []logstorage.Field) {}
b.ReportAllocs()
b.SetBytes(int64(len(data)))
b.RunParallel(func(pb *testing.PB) {
r := &bytes.Reader{}
for pb.Next() {
r.Reset(dataBytes)
_, err := readBulkRequest(r, isGzip, timeField, msgField, processLogMessage)
if err != nil {
panic(fmt.Errorf("unexpected error: %s", err))
}
}
})
}

app/vlinsert/main.go (new file, 34 lines)

@@ -0,0 +1,34 @@
package vlinsert
import (
"net/http"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/elasticsearch"
)
// Init initializes vlinsert
func Init() {
}
// Stop stops vlinsert
func Stop() {
}
// RequestHandler handles insert requests for VictoriaLogs
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
path := r.URL.Path
if !strings.HasPrefix(path, "/insert/") {
return false
}
path = strings.TrimPrefix(path, "/insert")
path = strings.ReplaceAll(path, "//", "/")
switch {
case strings.HasPrefix(path, "/elasticsearch/"):
path = strings.TrimPrefix(path, "/elasticsearch")
return elasticsearch.RequestHandler(path, w, r)
default:
return false
}
}


@@ -0,0 +1,53 @@
package logsql
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)
// ProcessQueryRequest handles /select/logsql/query request
func ProcessQueryRequest(w http.ResponseWriter, r *http.Request, stopCh <-chan struct{}) {
// Extract tenantID
tenantID, err := logstorage.GetTenantIDFromRequest(r)
if err != nil {
httpserver.Errorf(w, r, "%s", err)
return
}
qStr := r.FormValue("query")
q, err := logstorage.ParseQuery(qStr)
if err != nil {
httpserver.Errorf(w, r, "cannot parse query [%s]: %s", qStr, err)
return
}
w.Header().Set("Content-Type", "application/stream+json; charset=utf-8")
bw := bufferedwriter.Get(w)
defer bufferedwriter.Put(bw)
tenantIDs := []logstorage.TenantID{tenantID}
vlstorage.RunQuery(tenantIDs, q, stopCh, func(columns []logstorage.BlockColumn) {
if len(columns) == 0 {
return
}
rowsCount := len(columns[0].Values)
bb := blockResultPool.Get()
for rowIdx := 0; rowIdx < rowsCount; rowIdx++ {
WriteJSONRow(bb, columns, rowIdx)
}
// Do not check for error here, since the only valid error is when the client
// closes the connection during Write() call. There is no need in logging this error,
// since it may be too verbose and it doesn't give any actionable info.
_, _ = bw.Write(bb.B)
blockResultPool.Put(bb)
})
_ = bw.Flush()
}
var blockResultPool bytesutil.ByteBufferPool


@@ -0,0 +1,20 @@
{% import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
) %}
{% stripspace %}
// JSONRow creates JSON row from the given fields.
{% func JSONRow(columns []logstorage.BlockColumn, rowIdx int) %}
{
{% code c := &columns[0] %}
{%q= c.Name %}:{%q= c.Values[rowIdx] %}
{% code columns = columns[1:] %}
{% for colIdx := range columns %}
{% code c := &columns[colIdx] %}
,{%q= c.Name %}:{%q= c.Values[rowIdx] %}
{% endfor %}
}{% newline %}
{% endfunc %}
{% endstripspace %}


@@ -0,0 +1,90 @@
// Code generated by qtc from "query_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vlselect/logsql/query_response.qtpl:1
package logsql
//line app/vlselect/logsql/query_response.qtpl:1
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)
// JSONRow creates JSON row from the given fields.
//line app/vlselect/logsql/query_response.qtpl:8
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vlselect/logsql/query_response.qtpl:8
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vlselect/logsql/query_response.qtpl:8
func StreamJSONRow(qw422016 *qt422016.Writer, columns []logstorage.BlockColumn, rowIdx int) {
//line app/vlselect/logsql/query_response.qtpl:8
qw422016.N().S(`{`)
//line app/vlselect/logsql/query_response.qtpl:10
c := &columns[0]
//line app/vlselect/logsql/query_response.qtpl:11
qw422016.N().Q(c.Name)
//line app/vlselect/logsql/query_response.qtpl:11
qw422016.N().S(`:`)
//line app/vlselect/logsql/query_response.qtpl:11
qw422016.N().Q(c.Values[rowIdx])
//line app/vlselect/logsql/query_response.qtpl:12
columns = columns[1:]
//line app/vlselect/logsql/query_response.qtpl:13
for colIdx := range columns {
//line app/vlselect/logsql/query_response.qtpl:14
c := &columns[colIdx]
//line app/vlselect/logsql/query_response.qtpl:14
qw422016.N().S(`,`)
//line app/vlselect/logsql/query_response.qtpl:15
qw422016.N().Q(c.Name)
//line app/vlselect/logsql/query_response.qtpl:15
qw422016.N().S(`:`)
//line app/vlselect/logsql/query_response.qtpl:15
qw422016.N().Q(c.Values[rowIdx])
//line app/vlselect/logsql/query_response.qtpl:16
}
//line app/vlselect/logsql/query_response.qtpl:16
qw422016.N().S(`}`)
//line app/vlselect/logsql/query_response.qtpl:17
qw422016.N().S(`
`)
//line app/vlselect/logsql/query_response.qtpl:18
}
//line app/vlselect/logsql/query_response.qtpl:18
func WriteJSONRow(qq422016 qtio422016.Writer, columns []logstorage.BlockColumn, rowIdx int) {
//line app/vlselect/logsql/query_response.qtpl:18
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vlselect/logsql/query_response.qtpl:18
StreamJSONRow(qw422016, columns, rowIdx)
//line app/vlselect/logsql/query_response.qtpl:18
qt422016.ReleaseWriter(qw422016)
//line app/vlselect/logsql/query_response.qtpl:18
}
//line app/vlselect/logsql/query_response.qtpl:18
func JSONRow(columns []logstorage.BlockColumn, rowIdx int) string {
//line app/vlselect/logsql/query_response.qtpl:18
qb422016 := qt422016.AcquireByteBuffer()
//line app/vlselect/logsql/query_response.qtpl:18
WriteJSONRow(qb422016, columns, rowIdx)
//line app/vlselect/logsql/query_response.qtpl:18
qs422016 := string(qb422016.B)
//line app/vlselect/logsql/query_response.qtpl:18
qt422016.ReleaseByteBuffer(qb422016)
//line app/vlselect/logsql/query_response.qtpl:18
return qs422016
//line app/vlselect/logsql/query_response.qtpl:18
}

app/vlselect/main.go (new file, 140 lines)

@@ -0,0 +1,140 @@
package vlselect
import (
"flag"
"fmt"
"net/http"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vlselect/logsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
"github.com/VictoriaMetrics/metrics"
)
var (
maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+
"It shouldn't be high, since a single request can saturate all the CPU cores, while many concurrently executed requests may require high amounts of memory. "+
"See also -search.maxQueueDuration")
maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the search request waits for execution when -search.maxConcurrentRequests "+
"limit is reached; see also -search.maxQueryDuration")
maxQueryDuration = flag.Duration("search.maxQueryDuration", time.Second*30, "The maximum duration for query execution")
)
func getDefaultMaxConcurrentRequests() int {
n := cgroup.AvailableCPUs()
if n <= 4 {
n *= 2
}
if n > 16 {
// A single request can saturate all the CPU cores, so there is no sense
// in allowing higher number of concurrent requests - they will just contend
// for unavailable CPU time.
n = 16
}
return n
}
// Init initializes vlselect
func Init() {
concurrencyLimitCh = make(chan struct{}, *maxConcurrentRequests)
}
// Stop stops vlselect
func Stop() {
}
var concurrencyLimitCh chan struct{}
var (
concurrencyLimitReached = metrics.NewCounter(`vl_concurrent_select_limit_reached_total`)
concurrencyLimitTimeout = metrics.NewCounter(`vl_concurrent_select_limit_timeout_total`)
_ = metrics.NewGauge(`vl_concurrent_select_capacity`, func() float64 {
return float64(cap(concurrencyLimitCh))
})
_ = metrics.NewGauge(`vl_concurrent_select_current`, func() float64 {
return float64(len(concurrencyLimitCh))
})
)
// RequestHandler handles select requests for VictoriaLogs
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
path := r.URL.Path
if !strings.HasPrefix(path, "/select/") {
return false
}
path = strings.TrimPrefix(path, "/select")
path = strings.ReplaceAll(path, "//", "/")
// Limit the number of concurrent queries.
startTime := time.Now()
stopCh := r.Context().Done()
select {
case concurrencyLimitCh <- struct{}{}:
defer func() { <-concurrencyLimitCh }()
default:
// Sleep for a while until giving up. This should resolve short bursts in requests.
concurrencyLimitReached.Inc()
d := getMaxQueryDuration(r)
if d > *maxQueueDuration {
d = *maxQueueDuration
}
t := timerpool.Get(d)
select {
case concurrencyLimitCh <- struct{}{}:
timerpool.Put(t)
defer func() { <-concurrencyLimitCh }()
case <-stopCh:
timerpool.Put(t)
remoteAddr := httpserver.GetQuotedRemoteAddr(r)
requestURI := httpserver.GetRequestURI(r)
logger.Infof("client has cancelled the request after %.3f seconds: remoteAddr=%s, requestURI: %q",
time.Since(startTime).Seconds(), remoteAddr, requestURI)
return true
case <-t.C:
timerpool.Put(t)
concurrencyLimitTimeout.Inc()
err := &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("couldn't start executing the request in %.3f seconds, since -search.maxConcurrentRequests=%d concurrent requests "+
"are executed. Possible solutions: to reduce query load; to add more compute resources to the server; "+
"to increase -search.maxQueueDuration=%s; to increase -search.maxQueryDuration; to increase -search.maxConcurrentRequests",
d.Seconds(), *maxConcurrentRequests, maxQueueDuration),
StatusCode: http.StatusServiceUnavailable,
}
httpserver.Errorf(w, r, "%s", err)
return true
}
}
switch {
case path == "/logsql/query":
logsqlQueryRequests.Inc()
httpserver.EnableCORS(w, r)
logsql.ProcessQueryRequest(w, r, stopCh)
return true
default:
return false
}
}
// getMaxQueryDuration returns the maximum duration for query from r.
func getMaxQueryDuration(r *http.Request) time.Duration {
dms, err := httputils.GetDuration(r, "timeout", 0)
if err != nil {
dms = 0
}
d := time.Duration(dms) * time.Millisecond
if d <= 0 || d > *maxQueryDuration {
d = *maxQueryDuration
}
return d
}
var (
logsqlQueryRequests = metrics.NewCounter(`vl_http_requests_total{path="/select/logsql/query"}`)
)

app/vlstorage/main.go (new file, 149 lines)

@@ -0,0 +1,149 @@
package vlstorage
import (
"flag"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
"github.com/VictoriaMetrics/metrics"
)
var (
retentionPeriod = flagutil.NewDuration("retentionPeriod", "7d", "Log entries with timestamps older than now-retentionPeriod are automatically deleted; "+
"log entries with timestamps outside the retention are also rejected during data ingestion; the minimum supported retention is 1d (one day); "+
"see https://docs.victoriametrics.com/VictoriaLogs/#retention")
futureRetention = flagutil.NewDuration("futureRetention", "2d", "Log entries with timestamps bigger than now+futureRetention are rejected during data ingestion; "+
"see https://docs.victoriametrics.com/VictoriaLogs/#retention")
storageDataPath = flag.String("storageDataPath", "victoria-logs-data", "Path to directory with the VictoriaLogs data; "+
"see https://docs.victoriametrics.com/VictoriaLogs/#storage")
inmemoryDataFlushInterval = flag.Duration("inmemoryDataFlushInterval", 5*time.Second, "The interval for guaranteed saving of in-memory data to disk. "+
"The saved data survives unclean shutdown such as OOM crash, hardware reset, SIGKILL, etc. "+
"Bigger intervals may help increasing lifetime of flash storage with limited write cycles (e.g. Raspberry PI). "+
"Smaller intervals increase disk IO load. Minimum supported value is 1s")
logNewStreams = flag.Bool("logNewStreams", false, "Whether to log creation of new streams; this can be useful for debugging of high cardinality issues with log streams; "+
"see https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields ; see also -logIngestedRows")
logIngestedRows = flag.Bool("logIngestedRows", false, "Whether to log all the ingested log entries; this can be useful for debugging of data ingestion; "+
"see https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion ; see also -logNewStreams")
)
// Init initializes vlstorage.
//
// Stop must be called when vlstorage is no longer needed
func Init() {
if strg != nil {
logger.Panicf("BUG: Init() has been already called")
}
if retentionPeriod.Msecs < 24*3600*1000 {
logger.Fatalf("-retentionPeriod cannot be smaller than a day; got %s", retentionPeriod)
}
cfg := &logstorage.StorageConfig{
Retention: time.Millisecond * time.Duration(retentionPeriod.Msecs),
FlushInterval: *inmemoryDataFlushInterval,
FutureRetention: time.Millisecond * time.Duration(futureRetention.Msecs),
LogNewStreams: *logNewStreams,
LogIngestedRows: *logIngestedRows,
}
strg = logstorage.MustOpenStorage(*storageDataPath, cfg)
storageMetrics = initStorageMetrics(strg)
metrics.RegisterSet(storageMetrics)
}
// Stop stops vlstorage.
func Stop() {
metrics.UnregisterSet(storageMetrics)
storageMetrics = nil
strg.MustClose()
strg = nil
}
var strg *logstorage.Storage
var storageMetrics *metrics.Set
// MustAddRows adds lr to vlstorage
func MustAddRows(lr *logstorage.LogRows) {
strg.MustAddRows(lr)
}
// RunQuery runs the given q and calls processBlock for the returned data blocks
func RunQuery(tenantIDs []logstorage.TenantID, q *logstorage.Query, stopCh <-chan struct{}, processBlock func(columns []logstorage.BlockColumn)) {
strg.RunQuery(tenantIDs, q, stopCh, processBlock)
}
func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
ssCache := &logstorage.StorageStats{}
var ssCacheLock sync.Mutex
var lastUpdateTime time.Time
m := func() *logstorage.StorageStats {
ssCacheLock.Lock()
defer ssCacheLock.Unlock()
if time.Since(lastUpdateTime) < time.Second {
return ssCache
}
var ss logstorage.StorageStats
strg.UpdateStats(&ss)
ssCache = &ss
lastUpdateTime = time.Now()
return ssCache
}
ms := metrics.NewSet()
ms.NewGauge(fmt.Sprintf(`vl_free_disk_space_bytes{path=%q}`, *storageDataPath), func() float64 {
return float64(fs.MustGetFreeSpace(*storageDataPath))
})
ms.NewGauge(`vl_rows{type="inmemory"}`, func() float64 {
return float64(m().InmemoryRowsCount)
})
ms.NewGauge(`vl_rows{type="file"}`, func() float64 {
return float64(m().FileRowsCount)
})
ms.NewGauge(`vl_parts{type="inmemory"}`, func() float64 {
return float64(m().InmemoryParts)
})
ms.NewGauge(`vl_parts{type="file"}`, func() float64 {
return float64(m().FileParts)
})
ms.NewGauge(`vl_blocks{type="inmemory"}`, func() float64 {
return float64(m().InmemoryBlocks)
})
ms.NewGauge(`vl_blocks{type="file"}`, func() float64 {
return float64(m().FileBlocks)
})
ms.NewGauge(`vl_partitions`, func() float64 {
return float64(m().PartitionsCount)
})
ms.NewGauge(`vl_streams_created_total`, func() float64 {
return float64(m().StreamsCreatedTotal)
})
ms.NewGauge(`vl_compressed_data_size_bytes{type="inmemory"}`, func() float64 {
return float64(m().CompressedInmemorySize)
})
ms.NewGauge(`vl_compressed_data_size_bytes{type="file"}`, func() float64 {
return float64(m().CompressedFileSize)
})
ms.NewGauge(`vl_uncompressed_data_size_bytes{type="inmemory"}`, func() float64 {
return float64(m().UncompressedInmemorySize)
})
ms.NewGauge(`vl_uncompressed_data_size_bytes{type="file"}`, func() float64 {
return float64(m().UncompressedFileSize)
})
ms.NewGauge(`vlinsert_rows_dropped_total{reason="too_big_timestamp"}`, func() float64 {
return float64(m().RowsDroppedTooBigTimestamp)
})
ms.NewGauge(`vlinsert_rows_dropped_total{reason="too_small_timestamp"}`, func() float64 {
return float64(m().RowsDroppedTooSmallTimestamp)
})
return ms
}

docs/VictoriaLogs/LogsQL.md (new file, 1087 lines)

File diff suppressed because it is too large.

docs/VictoriaLogs/README.md (new file, 481 lines)

@@ -0,0 +1,481 @@
# VictoriaLogs
VictoriaLogs is a log management and log analytics system from [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/).
It provides the following key features:
- VictoriaLogs can accept logs from popular log collectors that support the
[ElasticSearch data ingestion format](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html). See [these docs](#data-ingestion).
The [Grafana Loki data ingestion format](https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki) will be supported in the near future -
see [the Roadmap](https://docs.victoriametrics.com/VictoriaLogs/Roadmap.html).
- VictoriaLogs is much easier to set up and operate compared to ElasticSearch and Grafana Loki. See [these docs](#operation).
- VictoriaLogs provides an easy yet powerful query language with full-text search capabilities across
all the [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) -
see [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html).
- VictoriaLogs can be seamlessly combined with good old Unix tools for log analysis such as `grep`, `less`, `sort`, `jq`, etc.
See [these docs](#querying-via-command-line) for details.
- VictoriaLogs' capacity and performance scale linearly with the available resources (CPU, RAM, disk IO, disk space).
It runs smoothly on both a Raspberry Pi and a beefy server with hundreds of CPU cores and terabytes of RAM.
- VictoriaLogs can handle much bigger data volumes than ElasticSearch and Grafana Loki when running on comparable hardware.
A single-node VictoriaLogs instance can substitute a large ElasticSearch cluster.
## Operation
### How to run VictoriaLogs
Check out the VictoriaLogs source code. It is located in the VictoriaMetrics repository:
```bash
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
cd VictoriaMetrics
```
Then build VictoriaLogs. The build command requires [Go 1.20](https://golang.org/doc/install).
```bash
make victoria-logs
```
Then run the built binary:
```bash
bin/victoria-logs
```
VictoriaLogs is now ready to [receive logs](#data-ingestion) and [serve queries](#querying) at TCP port `9428`!
It has no external dependencies, so it can run in various environments without additional setup and configuration.
VictoriaLogs automatically adapts to the available CPU and RAM resources. It also automatically sets up and creates
the needed indexes during [data ingestion](#data-ingestion).
It is possible to change the TCP port via the `-httpListenAddr` command-line flag. For example, the following command
starts VictoriaLogs, which accepts incoming requests at port `9200` (the ElasticSearch HTTP API port):
```bash
/path/to/victoria-logs -httpListenAddr=:9200
```
VictoriaLogs stores the ingested data in the `victoria-logs-data` directory by default. The directory can be changed
via the `-storageDataPath` command-line flag. See [these docs](#storage) for details.
By default VictoriaLogs stores log entries with timestamps in the time range `[now-7d, now]`, while dropping logs outside this time range.
That is, the default retention is 7 days. Read [these docs](#retention) on how to control the retention of the [ingested](#data-ingestion) logs.
It is recommended to set up monitoring of VictoriaLogs according to [these docs](#monitoring).
### Data ingestion
VictoriaLogs supports the following data ingestion techniques:
- Via [Filebeat](https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-overview.html). See [these docs](#filebeat-setup).
- Via [Logstash](https://www.elastic.co/guide/en/logstash/current/introduction.html). See [these docs](#logstash-setup).
The ingested log entries can be queried according to [these docs](#querying).
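Log collectors are the usual way in, but any HTTP client can push logs over the same ElasticSearch-compatible bulk endpoint. The sketch below is not part of this commit; it is a minimal Go illustration, assuming a VictoriaLogs instance at `localhost:9428`, with made-up field names and messages, and using the `_msg_field`, `_time_field` and `_stream_fields` query args described in the Filebeat and Logstash sections below:
```go
package main

import (
	"bytes"
	"fmt"
	"log"
	"net/http"
	"time"
)

func main() {
	// Build an ElasticSearch-compatible bulk payload: every log entry is a
	// `{"create":{}}` command line followed by a JSON-encoded log line.
	var body bytes.Buffer
	for _, msg := range []string{"error: disk is full", "user logged in"} { // sample messages
		body.WriteString(`{"create":{}}` + "\n")
		fmt.Fprintf(&body, `{"@timestamp":%q,"host.name":"host123","message":%q}`+"\n",
			time.Now().UTC().Format(time.RFC3339), msg)
	}

	// The query args below tell VictoriaLogs which fields hold the message,
	// the timestamp and the stream identifiers; the address and field names
	// are illustrative.
	reqURL := "http://localhost:9428/insert/elasticsearch/_bulk" +
		"?_msg_field=message&_time_field=@timestamp&_stream_fields=host.name"
	resp, err := http.Post(reqURL, "application/json", &body)
	if err != nil {
		log.Fatalf("cannot send _bulk request: %s", err)
	}
	defer resp.Body.Close()
	fmt.Println("ingestion status:", resp.Status)
}
```
The payload format matches what Filebeat and Logstash produce: a `{"create":{}}` command line followed by the JSON-encoded log entry, one pair per log line.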
#### Data ingestion troubleshooting
VictoriaLogs provides the following command-line flags, which can help debug data ingestion issues:
- `-logNewStreams` - if this flag is passed to VictoriaLogs, then it logs all the newly
registered [log streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields).
This may help debugging [high cardinality issues](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality).
- `-logIngestedRows` - if this flag is passed to VictoriaLogs, then it logs all the ingested
[log entries](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model).
VictoriaLogs exposes various [metrics](#monitoring), which may help debug data ingestion issues:
- `vl_rows_ingested_total` - the number of ingested [log entries](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model)
since the last VictoriaLogs restart. If this number increases over time, then logs are successfully ingested into VictoriaLogs.
The ingested logs can be inspected in the VictoriaLogs logs by passing the `-logIngestedRows` command-line flag.
- `vl_streams_created_total` - the number of created [log streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields)
since the last VictoriaLogs restart. If this metric grows rapidly over extended periods of time, then this may lead
to [high cardinality issues](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality).
The newly created log streams can be inspected in the VictoriaLogs logs by passing the `-logNewStreams` command-line flag.
#### Filebeat setup
Specify the [`output.elasticsearch`](https://www.elastic.co/guide/en/beats/filebeat/current/elasticsearch-output.html) section in the `filebeat.yml` file
for sending the collected logs to VictoriaLogs:
```yml
output.elasticsearch:
hosts: ["http://localhost:9428/insert/elasticsearch/"]
parameters:
_msg_field: "message"
_time_field: "@timestamp"
_stream_fields: "host.hostname,log.file.path"
```
Substitute the `localhost:9428` address inside the `hosts` section with the real TCP address of VictoriaLogs.
The `_msg_field` parameter must contain the name of the field with the log message generated by Filebeat. This is usually the `message` field.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) for details.
The `_time_field` parameter must contain the name of the field with the log timestamp generated by Filebeat. This is usually the `@timestamp` field.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field) for details.
It is recommended to specify a comma-separated list of field names that uniquely identify every log stream collected by Filebeat in the `_stream_fields` parameter.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) for details.
If some [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) aren't needed,
then VictoriaLogs can be instructed to ignore them during data ingestion - just pass the `ignore_fields` parameter with a comma-separated list of fields to ignore.
For example, the following config instructs VictoriaLogs to ignore the `log.offset` and `event.original` fields in the ingested logs:
```yml
output.elasticsearch:
hosts: ["http://localhost:9428/insert/elasticsearch/"]
parameters:
_msg_field: "message"
_time_field: "@timestamp"
_stream_fields: "host.name,log.file.path"
ignore_fields: "log.offset,event.original"
```
When Filebeat ingests logs into VictoriaLogs at a high rate, it may be necessary to tune the `worker` and `bulk_max_size` options.
For example, the following config is optimized for a higher-than-usual ingestion rate:
```yml
output.elasticsearch:
hosts: ["http://localhost:9428/insert/elasticsearch/"]
parameters:
_msg_field: "message"
_time_field: "@timestamp"
_stream_fields: "host.name,log.file.path"
worker: 8
bulk_max_size: 1000
```
If Filebeat sends logs to VictoriaLogs in another datacenter, it may be useful to enable data compression via the `compression_level` option.
This usually reduces network bandwidth usage and costs by up to 5 times:
```yml
output.elasticsearch:
hosts: ["http://localhost:9428/insert/elasticsearch/"]
parameters:
_msg_field: "message"
_time_field: "@timestamp"
_stream_fields: "host.name,log.file.path"
compression_level: 1
```
By default the ingested logs are stored in the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy).
If you need to store logs in another tenant, specify the needed tenant via `headers` in the `output.elasticsearch` section.
For example, the following `filebeat.yml` config instructs Filebeat to store the data in the `(AccountID=12, ProjectID=34)` tenant:
```yml
output.elasticsearch:
hosts: ["http://localhost:9428/insert/elasticsearch/"]
headers:
AccountID: 12
ProjectID: 34
parameters:
_msg_field: "message"
_time_field: "@timestamp"
_stream_fields: "host.name,log.file.path"
```
The ingested log entries can be queried according to [these docs](#querying).
See also the [data ingestion troubleshooting](#data-ingestion-troubleshooting) docs.
#### Logstash setup
Specify the [`output.elasticsearch`](https://www.elastic.co/guide/en/logstash/current/plugins-outputs-elasticsearch.html) section in the `logstash.conf` file
for sending the collected logs to VictoriaLogs:
```conf
output {
elasticsearch {
hosts => ["http://localhost:9428/insert/elasticsearch/"]
parameters => {
"_msg_field" => "message"
"_time_field" => "@timestamp"
"_stream_fields" => "host.name,process.name"
}
}
}
```
Substitute the `localhost:9428` address inside `hosts` with the real TCP address of VictoriaLogs.
The `_msg_field` parameter must contain the name of the field with the log message generated by Logstash. This is usually the `message` field.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) for details.
The `_time_field` parameter must contain the name of the field with the log timestamp generated by Logstash. This is usually the `@timestamp` field.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field) for details.
It is recommended to specify a comma-separated list of field names that uniquely identify every log stream collected by Logstash in the `_stream_fields` parameter.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) for details.
If some [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) aren't needed,
then VictoriaLogs can be instructed to ignore them during data ingestion - just pass the `ignore_fields` parameter with a comma-separated list of fields to ignore.
For example, the following config instructs VictoriaLogs to ignore the `log.offset` and `event.original` fields in the ingested logs:
```conf
output {
elasticsearch {
hosts => ["http://localhost:9428/insert/elasticsearch/"]
parameters => {
"_msg_field" => "message"
"_time_field" => "@timestamp"
"_stream_fields" => "host.hostname,process.name"
"ignore_fields" => "log.offset,event.original"
}
}
}
```
If Logstash sends logs to VictoriaLogs in another datacenter, it may be useful to enable data compression via the `http_compression => true` option.
This usually reduces network bandwidth usage and costs by up to 5 times:
```conf
output {
elasticsearch {
hosts => ["http://localhost:9428/insert/elasticsearch/"]
parameters => {
"_msg_field" => "message"
"_time_field" => "@timestamp"
"_stream_fields" => "host.hostname,process.name"
}
http_compression => true
}
}
```
By default the ingested logs are stored in the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy).
If you need to store logs in another tenant, specify the needed tenant via `custom_headers` in the `output.elasticsearch` section.
For example, the following `logstash.conf` config instructs Logstash to store the data in the `(AccountID=12, ProjectID=34)` tenant:
```conf
output {
elasticsearch {
hosts => ["http://localhost:9428/insert/elasticsearch/"]
custom_headers => {
"AccountID" => "1"
"ProjectID" => "2"
}
parameters => {
"_msg_field" => "message"
"_time_field" => "@timestamp"
"_stream_fields" => "host.hostname,process.name"
}
}
}
```
The ingested log entries can be queried according to [these docs](#querying).
See also [data ingestion troubleshooting](#data-ingestion-trobuleshooting) docs.
### Querying
VictoriaLogs can be queried at the `/select/logsql/query` endpoint. The [LogsQL](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html)
query must be passed via `query` argument. For example, the following query returns all the log entries with the `error` word:
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error'
```
The `query` argument can be passed either in the request URL itself (aka HTTP GET request) or via the request body
with the `x-www-form-urlencoded` encoding (aka HTTP POST request). The HTTP POST is useful for sending long queries
when they do not fit the maximum URL length supported by the used clients and proxies.
See [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) for details on what can be passed to the `query` arg.
The `query` arg must be properly encoded with [percent encoding](https://en.wikipedia.org/wiki/URL_encoding) when passing it to `curl`
or similar tools.
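For example, `curl` can take care of the percent-encoding automatically via its `--data-urlencode` option, which sends the query in the HTTP POST body:
```bash
curl http://localhost:9428/select/logsql/query --data-urlencode 'query=error AND "cannot open file"'
```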
The `/select/logsql/query` endpoint returns [a stream of JSON lines](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON),
where each line contains a JSON-encoded log entry in the form `{"field1":"value1",...,"fieldN":"valueN"}`.
Example response:
```
{"_msg":"error: disconnect from 19.54.37.22: Auth fail [preauth]","_stream":"{}","_time":"2023-01-01T13:32:13Z"}
{"_msg":"some other error","_stream":"{}","_time":"2023-01-01T13:32:15Z"}
```
The matching lines are sent to the response stream as soon as they are found in VictoriaLogs storage.
This means that the returned response may contain billions of lines for queries matching too many log entries.
The response can be interrupted at any time by closing the connection to VictoriaLogs server.
This allows post-processing the returned lines at the client side with the usual Unix commands such as `grep`, `jq`, `less`, `head`, etc.
See [these docs](#querying-via-command-line) for more details.
The returned lines aren't sorted by default, since sorting disables the ability to send matching log entries to the response stream as soon as they are found.
Query results can be sorted either at VictoriaLogs side according [to these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#sorting)
or at client side with the usual `sort` command according to [these docs](#querying-via-command-line).
By default the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy) is queried.
If you need to query another tenant, then specify the needed tenant via HTTP request headers. For example, the following query searches
for log messages at `(AccountID=12, ProjectID=34)` tenant:
```bash
curl http://localhost:9428/select/logsql/query -H 'AccountID: 12' -H 'ProjectID: 34' -d 'query=error'
```
The number of requests to `/select/logsql/query` can be [monitored](#monitoring) with `vl_http_requests_total{path="/select/logsql/query"}` metric.
#### Querying via command-line
VictoriaLogs provides good integration with `curl` and other command-line tools because of the following features:
- VictoriaLogs sends the matching log entries to the response stream as soon as they are found.
This allows forwarding the response stream to arbitrary [Unix pipes](https://en.wikipedia.org/wiki/Pipeline_(Unix)).
- VictoriaLogs automatically adjusts query execution speed to the speed of the client, which reads the response stream.
For example, if the response stream is piped to `less` command, then the query is suspended
until the `less` command reads the next block from the response stream.
- VictoriaLogs automatically cancels query execution when the client closes the response stream.
For example, if the query response is piped to `head` command, then VictoriaLogs stops executing the query
when the `head` command closes the response stream.
These features allow executing queries from the command-line interface, which may potentially select billions of rows,
without the risk of high resource usage (CPU, RAM, disk IO) at the VictoriaLogs server.
For example, the following query can return a very big number of matching log entries (e.g. billions) if VictoriaLogs contains
many log messages with the `error` [word](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#word):
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error'
```
If the command returns a seemingly never-ending response, then just press `ctrl+C` at any time in order to cancel the query.
VictoriaLogs notices that the response stream is closed, so it cancels the query and instantly stops consuming CPU, RAM and disk IO for this query.
Then use the `head` command for investigating the returned log messages and narrowing down the query:
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error' | head -10
```
The `head -10` command reads only the first 10 log messages from the response and then closes the response stream.
This automatically cancels the query at VictoriaLogs side, so it stops consuming CPU, RAM and disk IO resources.
Sometimes it may be more convenient to use the `less` command instead of `head` during the investigation of the returned response:
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error' | less
```
The `less` command reads the response stream on demand, as the user scrolls down the output.
VictoriaLogs suspends query execution when `less` stops reading the response stream.
It doesn't consume CPU and disk IO resources during this time. It resumes query execution
when `less` continues reading the response stream.
Suppose that the initial investigation of the returned query results helped determine that the needed log messages contain
`cannot open file` [phrase](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#phrase-filter).
Then the query can be narrowed down to `error AND "cannot open file"`
(see [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#logical-filter) about `AND` operator).
Then run the updated command in order to continue the investigation:
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error AND "cannot open file"' | head
```
Note that the `query` arg must be properly encoded with [percent encoding](https://en.wikipedia.org/wiki/URL_encoding) when passing it to `curl`
or similar tools.
The `pipe the query to "head" or "less" -> investigate the results -> refine the query` iteration
can be repeated multiple times until the needed log messages are found.
The returned VictoriaLogs query response can be post-processed with any combination of Unix commands,
which are usually used for log analysis - `grep`, `jq`, `awk`, `sort`, `uniq`, `wc`, etc.
For example, the following command uses `wc -l` Unix command for counting the number of log messages
with the `error` [word](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#word)
received from [streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) with `app="nginx"` field
during the last 5 minutes:
```bash
curl http://localhost:9428/select/logsql/query -d 'query=_stream:{app="nginx"} AND _time:[now-5m,now] AND error' | wc -l
```
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter) about `_stream` filter,
[these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#time-filter) about `_time` filter
and [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#logical-filter) about `AND` operator.
The following example shows how to sort query results by the [`_time` field](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field):
```bash
curl http://localhost:9428/select/logsql/query -d 'query=error' | jq -r '._time + " " + ._msg' | sort | less
```
This command uses `jq` for extracting [`_time`](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field)
and [`_msg`](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) fields from the returned results,
and piping them to `sort` command.
Note that the `sort` command needs to read the whole response stream before returning the sorted results, so the command above
can take a non-trivial amount of time if the `query` returns too many results. The solution is to narrow down the `query`
before sorting the results. See [these tips](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#performance-tips)
on how to narrow down query results.
The following example calculates stats on the number of log messages received during the last 5 minutes
grouped by `log.level` [field](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model):
```bash
curl http://localhost:9428/select/logsql/query -d 'query=_time:[now-5m,now] log.level:*' | jq -r '."log.level"' | sort | uniq -c
```
The query selects all the log messages with non-empty `log.level` field via ["any value" filter](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#any-value-filter),
then pipes them to the `jq` command, which extracts the `log.level` field value from the returned JSON stream. The extracted `log.level` values
are then sorted with the `sort` command and, finally, passed to the `uniq -c` command for calculating the needed stats.
See also:
- [Key concepts](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html).
- [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html).
### Monitoring
VictoriaLogs exposes internal metrics in Prometheus exposition format at the `http://localhost:9428/metrics` page.
It is recommended to set up monitoring of these metrics via VictoriaMetrics
(see [these docs](https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter)),
vmagent (see [these docs](https://docs.victoriametrics.com/vmagent.html#how-to-collect-metrics-in-prometheus-format)) or via Prometheus.
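For example, below is a minimal scrape config sketch for Prometheus or vmagent; the `victorialogs` job name is arbitrary, and the target assumes the default `localhost:9428` address used throughout these docs:
```yml
scrape_configs:
  - job_name: victorialogs
    # the default /metrics path is scraped
    static_configs:
      - targets: ["localhost:9428"]
```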
VictoriaLogs emits its own logs to stdout. It is recommended to investigate these logs during troubleshooting.
### Retention
By default VictoriaLogs stores log entries with timestamps in the time range `[now-7d, now]`, while dropping logs outside the given time range.
I.e. it uses a retention of 7 days. The retention can be configured with the `-retentionPeriod` command-line flag.
This flag accepts values starting from `1d` (one day) up to `100y` (100 years). See [these docs](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-durations)
for the supported duration formats.
For example, the following command starts VictoriaLogs with the retention of 8 weeks:
```bash
/path/to/victoria-logs -retentionPeriod=8w
```
VictoriaLogs stores the [ingested](#data-ingestion) logs in per-day partition directories. It automatically drops partition directories
outside the configured retention.
VictoriaLogs automatically drops logs at [data ingestion](#data-ingestion) stage if they have timestamps outside the configured retention.
A sample of the dropped logs is logged with a `WARN` message in order to simplify troubleshooting.
The `vlinsert_rows_dropped_total` [metric](#monitoring) is incremented each time an ingested log entry is dropped because its timestamp is outside the configured retention.
It is recommended to set up the following alerting rule at [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to be notified
when logs with wrong timestamps are ingested into VictoriaLogs:
```metricsql
rate(vlinsert_rows_dropped_total[5m]) > 0
```
By default VictoriaLogs doesn't accept log entries with timestamps bigger than `now+2d`, i.e. more than 2 days in the future.
If you need to accept logs with bigger timestamps, then specify the desired "future retention" via the `-futureRetention` command-line flag.
This flag accepts values starting from `1d`. See [these docs](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-durations)
for the supported duration formats.
For example, the following command starts VictoriaLogs, which accepts logs with timestamps up to a year in the future:
```bash
/path/to/victoria-logs -futureRetention=1y
```
### Storage
VictoriaLogs stores all its data in a single directory - `victoria-logs-data`. The path to the directory can be changed via the `-storageDataPath` command-line flag.
For example, the following command starts VictoriaLogs, which stores the data at `/var/lib/victoria-logs`:
```bash
/path/to/victoria-logs -storageDataPath=/var/lib/victoria-logs
```
VictoriaLogs automatically creates the `-storageDataPath` directory on the first run if it is missing.


@ -0,0 +1,37 @@
# VictoriaLogs roadmap
The VictoriaLogs Preview is ready for evaluation in production. It is recommended to run it alongside existing solutions
such as ElasticSearch and Grafana Loki and to compare their resource usage and usability.
It isn't recommended to migrate from existing solutions to VictoriaLogs Preview yet.
The following functionality is available in VictoriaLogs Preview:
- [Data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
- [Querying](https://docs.victoriametrics.com/VictoriaLogs/#querying).
- [Querying via command-line](https://docs.victoriametrics.com/VictoriaLogs/#querying-via-command-line).
See [operation docs](https://docs.victoriametrics.com/VictoriaLogs/#operation) for details.
The following functionality is planned in the future versions of VictoriaLogs:
- Support for [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) from popular log collectors and formats:
- Promtail (aka Grafana Loki)
- Vector.dev
- Fluentbit
- Fluentd
- Syslog
- Add missing functionality to [LogsQL](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html):
- [Stream context](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-context).
- [Transformation functions](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#transformations).
- [Post-filtering](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#post-filters).
- [Stats calculations](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stats).
- [Sorting](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#sorting).
- [Limiters](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#limiters).
- The ability to use subqueries inside [in()](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#multi-exact-filter) function.
- Live tailing for [LogsQL filters](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#filters) aka `tail -f`.
- Web UI with the following abilities:
- Explore the ingested logs.
- Build graphs over time for the ingested logs.
- Ability to make instant snapshots and backups in the way [similar to VictoriaMetrics](https://docs.victoriametrics.com/#how-to-work-with-snapshots).
- Cluster version of VictoriaLogs.
- Ability to store data to object storage (such as S3, GCS, Minio).


@ -0,0 +1,219 @@
# VictoriaLogs key concepts
## Data model
VictoriaLogs works with structured logs. Every log entry may contain an arbitrary number of `key=value` pairs (aka fields).
A single log entry can be expressed as a single-level [JSON](https://www.json.org/json-en.html) object with string keys and values.
For example:
```json
{
"job": "my-app",
"instance": "host123:4567",
"level": "error",
"client_ip": "1.2.3.4",
"trace_id": "1234-56789-abcdef",
"_msg": "failed to serve the client request"
}
```
VictoriaLogs automatically transforms multi-level JSON (aka nested JSON) into single-level JSON
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) according to the following rules:
- Nested dictionaries are flattened by concatenating dictionary keys with `.` char. For example, the following multi-level JSON
is transformed into the following single-level JSON:
```json
{
"host": {
"name": "foobar"
"os": {
"version": "1.2.3"
}
}
}
```
```json
{
"host.name": "foobar",
"host.os.version": "1.2.3"
}
```
- Arrays, numbers and boolean values are converted into strings. This simplifies [full-text search](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) over such values.
For example, the following JSON with an array, a number and a boolean value is converted into the following JSON with string values:
```json
{
"tags": ["foo", "bar"],
"offset": 12345,
"is_error": false
}
```
```json
{
"tags": "[\"foo\", \"bar\"]",
"offset": "12345",
"is_error": "false"
}
```
Both the field name and the field value may contain arbitrary chars. Such chars must be encoded
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
according to [JSON string encoding](https://www.rfc-editor.org/rfc/rfc7159.html#section-7).
Unicode chars must be encoded with [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoding:
```json
{
"label with whitepsace": "value\nwith\nnewlines",
"Поле": "价值",
}
```
VictoriaLogs automatically indexes all the fields in all the [ingested](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) logs.
This enables [full-text search](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) across all the fields.
VictoriaLogs supports the following field types:
* [`_msg` field](#message-field)
* [`_time` field](#time-field)
* [`_stream` fields](#stream-fields)
* [other fields](#other-fields)
### Message field
Every ingested [log entry](#data-model) must contain at least a `_msg` field with the actual log message. For example, this is the minimal
log entry, which can be ingested into VictoriaLogs:
```json
{
"_msg": "some log message"
}
```
If the actual log message is stored in a field other than `_msg`, then it is possible to specify the real log message field
via the `_msg_field` query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
For example, if the log message is located in the `event.original` field, then specify the `_msg_field=event.original` query arg
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
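For instance, when ingesting logs via the Logstash `output.elasticsearch` plugin from the [data ingestion docs](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion), this corresponds to the following `parameters` entry (a sketch reusing the endpoint and field names from those docs):
```conf
output {
  elasticsearch {
    hosts => ["http://localhost:9428/insert/elasticsearch/"]
    parameters => {
      # take the log message from the event.original field
      "_msg_field" => "event.original"
      "_time_field" => "@timestamp"
    }
  }
}
```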
### Time field
The ingested [log entries](#data-model) may contain `_time` field with the timestamp of the ingested log entry.
For example:
```json
{
"_msg": "some log message",
"_time": "2023-04-12T06:38:11.095Z"
}
```
If the actual timestamp is stored in a field other than `_time`, then it is possible to specify the real timestamp
field via the `_time_field` query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
For example, if the timestamp is located in the `event.created` field, then specify the `_time_field=event.created` query arg
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
If the `_time` field is missing, then the data ingestion time is used as the log entry timestamp.
The log entry timestamp allows quickly narrowing down the search to a particular time range.
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#time-filter) for details.
### Stream fields
Some [structured logging](#data-model) fields may uniquely identify the application instance that generates log entries.
This may be either a single field such as `instance=host123:456` or a set of fields such as
`(datacenter=..., env=..., job=..., instance=...)` or
`(kubernetes.namespace=..., kubernetes.node.name=..., kubernetes.pod.name=..., kubernetes.container.name=...)`.
Log entries received from a single application instance form a log stream in VictoriaLogs.
VictoriaLogs optimizes storing and querying of individual log streams. This provides the following benefits:
- Reduced disk space usage, since a log stream from a single application instance is usually compressed better
than a mixed log stream from multiple distinct applications.
- Increased query performance, since VictoriaLogs needs to scan smaller amounts of data
when [searching by stream labels](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter).
VictoriaLogs cannot automatically determine which fields uniquely identify every log stream,
so it stores all the received log entries in a single default stream - `{}`.
This may lead to suboptimal resource usage and query performance.
Therefore it is recommended to specify stream-level fields via the `_stream_fields` query arg
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
For example, if logs from Kubernetes containers have the following fields:
```json
{
"kubernetes.namespace": "some-namespace",
"kubernetes.node.name": "some-node",
"kubernetes.pod.name": "some-pod",
"kubernetes.container.name": "some-container",
"_msg": "some log message"
}
```
then specify `_stream_fields=kubernetes.namespace,kubernetes.node.name,kubernetes.pod.name,kubernetes.container.name`
query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) in order to properly store
per-container logs into distinct streams.
#### How to determine which fields must be associated with log streams?
[Log streams](#stream-fields) can be associated with fields that simultaneously meet the following conditions:
- Fields that remain constant across log entries received from a single application instance.
- Fields that uniquely identify the application instance. For example, `instance`, `host`, `container`, etc.
Sometimes a single application instance may generate multiple log streams and store them into distinct log files.
In this case it is OK to associate the log stream with filepath fields such as `log.file.path` in addition to the instance-specific fields.
Structured logs may contain a big number of fields, which do not change across log entries received from a single application instance.
There is no need to associate all these fields with a log stream - it is enough to associate only those fields that uniquely identify
the application instance across all the ingested logs. Additionally, some fields such as `datacenter`, `environment`, `namespace`, `job` or `app`
can be associated with a log stream in order to optimize searching by these fields with [stream filtering](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter).
Never associate log streams with fields that may change across log entries of the same application instance. See [these docs](#high-cardinality) for details.
#### High cardinality
Some fields in the [ingested logs](#data-model) may contain a big number of unique values across log entries.
For example, fields with names such as `ip`, `user_id` or `trace_id` tend to contain a big number of unique values.
VictoriaLogs works perfectly with such fields unless they are associated with [log streams](#stream-fields).
Never associate high-cardinality fields with [log streams](#stream-fields), since this may result
in the following issues:
- Performance degradation during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
and [querying](https://docs.victoriametrics.com/VictoriaLogs/#querying)
- Increased memory usage
- Increased CPU usage
- Increased disk space usage
- Increased disk read / write IO
VictoriaLogs exposes `vl_streams_created_total` [metric](https://docs.victoriametrics.com/VictoriaLogs/#monitoring),
which shows the number of created streams since the last VictoriaLogs restart. If this metric grows at a rapid rate
over a long period of time, then there is a high chance of hitting the high-cardinality issues mentioned above.
VictoriaLogs can log all the newly registered streams when the `-logNewStreams` command-line flag is passed to it.
This can help narrow down and eliminate high-cardinality fields from [log streams](#stream-fields).
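For example, the following command starts VictoriaLogs with logging of newly registered streams enabled (assuming `-logNewStreams` is a boolean flag):
```bash
/path/to/victoria-logs -logNewStreams
```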
### Other fields
The rest of [structured logging](#data-model) fields are optional. They can be used for simplifying and optimizing search queries.
For example, it is usually faster to search over a dedicated `trace_id` field instead of searching for the `trace_id` inside long log message.
E.g. the `trace_id:XXXX-YYYY-ZZZZ` query usually works faster than the `_msg:"trace_id=XXXX-YYYY-ZZZZ"` query.
See [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) for more details.
## Multitenancy
VictoriaLogs supports multitenancy. A tenant is identified by the `(AccountID, ProjectID)` pair, where `AccountID` and `ProjectID` are arbitrary 32-bit unsigned integers.
The `AccountID` and `ProjectID` fields can be set during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
and [querying](https://docs.victoriametrics.com/VictoriaLogs/#querying) via `AccountID` and `ProjectID` request headers.
If `AccountID` and/or `ProjectID` request headers aren't set, then the default `0` value is used.
VictoriaLogs has very low overhead for per-tenant management, so it is OK to have thousands of tenants in a single VictoriaLogs instance.
VictoriaLogs doesn't perform per-tenant authorization. Use [vmauth](https://docs.victoriametrics.com/vmauth.html) or similar tools for per-tenant authorization.
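For example, the following command from the [querying docs](https://docs.victoriametrics.com/VictoriaLogs/#querying) reads logs from the `(AccountID=12, ProjectID=34)` tenant by setting the corresponding request headers:
```bash
curl http://localhost:9428/select/logsql/query -H 'AccountID: 12' -H 'ProjectID: 34' -d 'query=error'
```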

lib/logstorage/arena.go Normal file

@ -0,0 +1,31 @@
package logstorage
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
)
type arena struct {
b []byte
}
func (a *arena) reset() {
a.b = a.b[:0]
}
// copyBytes appends a copy of b to the arena and returns the copied byte slice.
func (a *arena) copyBytes(b []byte) []byte {
ab := a.b
abLen := len(ab)
ab = append(ab, b...)
result := ab[abLen:]
a.b = ab
return result
}
// newBytes returns a byte slice of the given size allocated at the end of the arena; the caller is expected to fill it.
func (a *arena) newBytes(size int) []byte {
ab := a.b
abLen := len(ab)
ab = bytesutil.ResizeWithCopyMayOverallocate(ab, abLen+size)
result := ab[abLen:]
a.b = ab
return result
}

lib/logstorage/block.go Normal file

@ -0,0 +1,650 @@
package logstorage
import (
"fmt"
"sort"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// block represents a block of log entries.
type block struct {
// timestamps contains timestamps for log entries.
timestamps []int64
// columns contains values for fields seen in log entries.
columns []column
// constColumns contains fields with constant values across all the block entries.
constColumns []Field
}
func (b *block) reset() {
b.timestamps = b.timestamps[:0]
cs := b.columns
for i := range cs {
cs[i].reset()
}
b.columns = cs[:0]
ccs := b.constColumns
for i := range ccs {
ccs[i].Reset()
}
b.constColumns = ccs[:0]
}
// uncompressedSizeBytes returns the total size of the original log entries stored in b.
//
// It is supposed that every log entry has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func (b *block) uncompressedSizeBytes() uint64 {
rowsCount := uint64(b.Len())
// Take into account timestamps
n := rowsCount * uint64(len(time.RFC3339Nano))
// Take into account columns
cs := b.columns
for i := range cs {
c := &cs[i]
nameLen := uint64(len(c.name))
if nameLen == 0 {
nameLen = uint64(len("_msg"))
}
for _, v := range c.values {
if len(v) > 0 {
n += nameLen + 2 + uint64(len(v))
}
}
}
// Take into account constColumns
ccs := b.constColumns
for i := range ccs {
cc := &ccs[i]
nameLen := uint64(len(cc.Name))
if nameLen == 0 {
nameLen = uint64(len("_msg"))
}
n += rowsCount * (2 + nameLen + uint64(len(cc.Value)))
}
return n
}
// uncompressedRowsSizeBytes returns the size of the uncompressed rows.
//
// It is supposed that every row has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func uncompressedRowsSizeBytes(rows [][]Field) uint64 {
n := uint64(0)
for _, fields := range rows {
n += uncompressedRowSizeBytes(fields)
}
return n
}
// uncompressedRowSizeBytes returns the size of uncompressed row.
//
// It is supposed that the row has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func uncompressedRowSizeBytes(fields []Field) uint64 {
n := uint64(len(time.RFC3339Nano)) // log timestamp
for _, f := range fields {
nameLen := len(f.Name)
if nameLen == 0 {
nameLen = len("_msg")
}
n += uint64(2 + nameLen + len(f.Value))
}
return n
}
// column contains values for the given field name seen in log entries.
type column struct {
// name is the field name
name string
// values is the values seen for the given log entries.
values []string
}
func (c *column) reset() {
c.name = ""
values := c.values
for i := range values {
values[i] = ""
}
c.values = values[:0]
}
func (c *column) areSameValues() bool {
values := c.values
if len(values) < 2 {
return true
}
value := values[0]
for _, v := range values[1:] {
if value != v {
return false
}
}
return true
}
// resizeValues resizes c.values to hold valuesLen entries and returns the resized slice.
func (c *column) resizeValues(valuesLen int) []string {
values := c.values
if n := valuesLen - cap(values); n > 0 {
values = append(values[:cap(values)], make([]string, n)...)
}
values = values[:valuesLen]
c.values = values
return values
}
// mustWriteTo writes c to sw and updates ch accordingly.
func (c *column) mustWriteTo(ch *columnHeader, sw *streamWriters) {
ch.reset()
valuesWriter := &sw.fieldValuesWriter
bloomFilterWriter := &sw.fieldBloomFilterWriter
if c.name == "" {
valuesWriter = &sw.messageValuesWriter
bloomFilterWriter = &sw.messageBloomFilterWriter
}
ch.name = c.name
// encode values
ve := getValuesEncoder()
ch.valueType, ch.minValue, ch.maxValue = ve.encode(c.values, &ch.valuesDict)
bb := longTermBufPool.Get()
defer longTermBufPool.Put(bb)
// marshal values
bb.B = marshalStringsBlock(bb.B[:0], ve.values)
putValuesEncoder(ve)
ch.valuesSize = uint64(len(bb.B))
if ch.valuesSize > maxValuesBlockSize {
logger.Panicf("BUG: too valuesSize: %d bytes; mustn't exceed %d bytes", ch.valuesSize, maxValuesBlockSize)
}
ch.valuesOffset = valuesWriter.bytesWritten
valuesWriter.MustWrite(bb.B)
// create and marshal bloom filter for c.values
if ch.valueType != valueTypeDict {
tokensBuf := getTokensBuf()
tokensBuf.A = tokenizeStrings(tokensBuf.A[:0], c.values)
bb.B = bloomFilterMarshal(bb.B[:0], tokensBuf.A)
putTokensBuf(tokensBuf)
} else {
// there is no need to encode a bloom filter for the dictionary type,
// since it isn't used during querying - all the dictionary values are available in ch.valuesDict
bb.B = bb.B[:0]
}
ch.bloomFilterSize = uint64(len(bb.B))
if ch.bloomFilterSize > maxBloomFilterBlockSize {
logger.Panicf("BUG: too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", ch.bloomFilterSize, maxBloomFilterBlockSize)
}
ch.bloomFilterOffset = bloomFilterWriter.bytesWritten
bloomFilterWriter.MustWrite(bb.B)
}
func (b *block) assertValid() {
// Check that timestamps are in ascending order
timestamps := b.timestamps
for i := 1; i < len(timestamps); i++ {
if timestamps[i-1] > timestamps[i] {
logger.Panicf("BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d",
timestamps[i-1], timestamps[i])
}
}
// Check that the number of items in each column matches the number of items in the block.
itemsCount := len(timestamps)
columns := b.columns
for _, c := range columns {
if len(c.values) != itemsCount {
logger.Panicf("BUG: unexpected number of values for column %q: got %d; want %d", c.name, len(c.values), itemsCount)
}
}
}
// MustInitFromRows initializes b from the given timestamps and rows.
//
// It is expected that timestamps are sorted.
func (b *block) MustInitFromRows(timestamps []int64, rows [][]Field) {
b.reset()
assertTimestampsSorted(timestamps)
b.timestamps = append(b.timestamps, timestamps...)
b.mustInitFromRows(rows)
b.sortColumnsByName()
}
func (b *block) mustInitFromRows(rows [][]Field) {
rowsLen := len(rows)
if rowsLen == 0 {
// Nothing to do
return
}
if areSameFieldsInRows(rows) {
// Fast path - all the log entries have the same fields
fields := rows[0]
for i := range fields {
f := &fields[i]
if areSameValuesForColumn(rows, i) {
cc := b.extendConstColumns()
cc.Name = f.Name
cc.Value = f.Value
} else {
c := b.extendColumns()
c.name = f.Name
values := c.resizeValues(rowsLen)
for j := range rows {
values[j] = rows[j][i].Value
}
}
}
return
}
// Slow path - log entries contain different set of fields
// Determine indexes for columns
columnIdxs := getColumnIdxs()
for i := range rows {
fields := rows[i]
for j := range fields {
name := fields[j].Name
if _, ok := columnIdxs[name]; !ok {
columnIdxs[name] = len(columnIdxs)
}
}
}
// Initialize columns
cs := b.resizeColumns(len(columnIdxs))
for name, idx := range columnIdxs {
c := &cs[idx]
c.name = name
c.resizeValues(rowsLen)
}
// Write rows to block
for i := range rows {
for _, f := range rows[i] {
idx := columnIdxs[f.Name]
cs[idx].values[i] = f.Value
}
}
putColumnIdxs(columnIdxs)
// Detect const columns
for i := len(cs) - 1; i >= 0; i-- {
c := &cs[i]
if !c.areSameValues() {
continue
}
cc := b.extendConstColumns()
cc.Name = c.name
cc.Value = c.values[0]
c.reset()
if i < len(cs)-1 {
swapColumns(c, &cs[len(cs)-1])
}
cs = cs[:len(cs)-1]
}
b.columns = cs
}
func swapColumns(a, b *column) {
*a, *b = *b, *a
}
// areSameValuesForColumn returns true if all the rows have the same value in the column at colIdx.
func areSameValuesForColumn(rows [][]Field, colIdx int) bool {
if len(rows) < 2 {
return true
}
value := rows[0][colIdx].Value
rows = rows[1:]
for i := range rows {
if value != rows[i][colIdx].Value {
return false
}
}
return true
}
func assertTimestampsSorted(timestamps []int64) {
for i := range timestamps {
if i > 0 && timestamps[i-1] > timestamps[i] {
logger.Panicf("BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d",
timestamps[i-1], timestamps[i])
}
}
}
func (b *block) extendConstColumns() *Field {
ccs := b.constColumns
if cap(ccs) > len(ccs) {
ccs = ccs[:len(ccs)+1]
} else {
ccs = append(ccs, Field{})
}
b.constColumns = ccs
return &ccs[len(ccs)-1]
}
func (b *block) extendColumns() *column {
cs := b.columns
if cap(cs) > len(cs) {
cs = cs[:len(cs)+1]
} else {
cs = append(cs, column{})
}
b.columns = cs
return &cs[len(cs)-1]
}
func (b *block) resizeColumns(columnsLen int) []column {
cs := b.columns[:0]
if n := columnsLen - cap(cs); n > 0 {
cs = append(cs[:cap(cs)], make([]column, n)...)
}
cs = cs[:columnsLen]
b.columns = cs
return cs
}
func (b *block) sortColumnsByName() {
if len(b.columns)+len(b.constColumns) > maxColumnsPerBlock {
logger.Panicf("BUG: too big number of columns detected in the block: %d; the number of columns mustn't exceed %d",
len(b.columns)+len(b.constColumns), maxColumnsPerBlock)
}
cs := getColumnsSorter()
cs.columns = b.columns
sort.Sort(cs)
putColumnsSorter(cs)
ccs := getConstColumnsSorter()
ccs.columns = b.constColumns
sort.Sort(ccs)
putConstColumnsSorter(ccs)
}
// Len returns the number of log entries in b.
func (b *block) Len() int {
return len(b.timestamps)
}
// InitFromBlockData unmarshals bd to b.
//
// sbu and vd are used as a temporary storage for unmarshaled column values.
//
// The b becomes outdated after sbu or vd is reset.
func (b *block) InitFromBlockData(bd *blockData, sbu *stringsBlockUnmarshaler, vd *valuesDecoder) error {
b.reset()
if bd.rowsCount > maxRowsPerBlock {
return fmt.Errorf("too many entries found in the block: %d; mustn't exceed %d", bd.rowsCount, maxRowsPerBlock)
}
rowsCount := int(bd.rowsCount)
// unmarshal timestamps
td := &bd.timestampsData
var err error
b.timestamps, err = encoding.UnmarshalTimestamps(b.timestamps[:0], td.data, td.marshalType, td.minTimestamp, rowsCount)
if err != nil {
return fmt.Errorf("cannot unmarshal timestamps: %w", err)
}
// unmarshal columns
cds := bd.columnsData
cs := b.resizeColumns(len(cds))
for i := range cds {
cd := &cds[i]
c := &cs[i]
c.name = cd.name
c.values, err = sbu.unmarshal(c.values[:0], cd.valuesData, uint64(rowsCount))
if err != nil {
return fmt.Errorf("cannot unmarshal column %d: %w", i, err)
}
if err = vd.decodeInplace(c.values, cd.valueType, &cd.valuesDict); err != nil {
return fmt.Errorf("cannot decode column values: %w", err)
}
}
// unmarshal constColumns
b.constColumns = append(b.constColumns[:0], bd.constColumns...)
return nil
}
// mustWriteTo writes b with the given sid to sw and updates bh accordingly
func (b *block) mustWriteTo(sid *streamID, bh *blockHeader, sw *streamWriters) {
// Do not store the version used for encoding directly in the block data, since:
// - all the blocks in the same part use the same encoding
// - the block encoding version can be put in metadata file for the part (aka metadataFilename)
b.assertValid()
bh.reset()
bh.streamID = *sid
bh.uncompressedSizeBytes = b.uncompressedSizeBytes()
bh.rowsCount = uint64(b.Len())
// Marshal timestamps
mustWriteTimestampsTo(&bh.timestampsHeader, b.timestamps, sw)
// Marshal columns
cs := b.columns
csh := getColumnsHeader()
chs := csh.resizeColumnHeaders(len(cs))
for i := range cs {
cs[i].mustWriteTo(&chs[i], sw)
}
csh.constColumns = append(csh.constColumns[:0], b.constColumns...)
bb := longTermBufPool.Get()
bb.B = csh.marshal(bb.B)
putColumnsHeader(csh)
bh.columnsHeaderOffset = sw.columnsHeaderWriter.bytesWritten
bh.columnsHeaderSize = uint64(len(bb.B))
if bh.columnsHeaderSize > maxColumnsHeaderSize {
logger.Panicf("BUG: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", bh.columnsHeaderSize, maxColumnsHeaderSize)
}
sw.columnsHeaderWriter.MustWrite(bb.B)
longTermBufPool.Put(bb)
}
// appendRows appends log entries from b to dst.
func (b *block) appendRows(dst *rows) {
// copy timestamps
dst.timestamps = append(dst.timestamps, b.timestamps...)
// copy columns
fieldsBuf := dst.fieldsBuf
ccs := b.constColumns
cs := b.columns
for i := range b.timestamps {
fieldsLen := len(fieldsBuf)
// copy const columns
for j := range ccs {
cc := &ccs[j]
fieldsBuf = append(fieldsBuf, Field{
Name: cc.Name,
Value: cc.Value,
})
}
// copy other columns
for j := range cs {
c := &cs[j]
value := c.values[i]
if len(value) == 0 {
continue
}
fieldsBuf = append(fieldsBuf, Field{
Name: c.name,
Value: value,
})
}
dst.rows = append(dst.rows, fieldsBuf[fieldsLen:])
}
dst.fieldsBuf = fieldsBuf
}
// areSameFieldsInRows returns true if all the rows contain the same field names in the same order.
func areSameFieldsInRows(rows [][]Field) bool {
if len(rows) < 2 {
return true
}
fields := rows[0]
rows = rows[1:]
for i := range rows {
leFields := rows[i]
if len(fields) != len(leFields) {
return false
}
for j := range leFields {
if leFields[j].Name != fields[j].Name {
return false
}
}
}
return true
}
var columnIdxsPool sync.Pool
func getColumnIdxs() map[string]int {
v := columnIdxsPool.Get()
if v == nil {
return make(map[string]int)
}
return v.(map[string]int)
}
func putColumnIdxs(m map[string]int) {
for k := range m {
delete(m, k)
}
columnIdxsPool.Put(m)
}
func getBlock() *block {
v := blockPool.Get()
if v == nil {
return &block{}
}
return v.(*block)
}
func putBlock(b *block) {
b.reset()
blockPool.Put(b)
}
var blockPool sync.Pool
type columnsSorter struct {
columns []column
}
func (cs *columnsSorter) reset() {
cs.columns = nil
}
func (cs *columnsSorter) Len() int {
return len(cs.columns)
}
func (cs *columnsSorter) Less(i, j int) bool {
columns := cs.columns
return columns[i].name < columns[j].name
}
func (cs *columnsSorter) Swap(i, j int) {
columns := cs.columns
columns[i], columns[j] = columns[j], columns[i]
}
func getColumnsSorter() *columnsSorter {
v := columnsSorterPool.Get()
if v == nil {
return &columnsSorter{}
}
return v.(*columnsSorter)
}
func putColumnsSorter(cs *columnsSorter) {
cs.reset()
columnsSorterPool.Put(cs)
}
var columnsSorterPool sync.Pool
type constColumnsSorter struct {
columns []Field
}
func (ccs *constColumnsSorter) reset() {
ccs.columns = nil
}
func (ccs *constColumnsSorter) Len() int {
return len(ccs.columns)
}
func (ccs *constColumnsSorter) Less(i, j int) bool {
columns := ccs.columns
return columns[i].Name < columns[j].Name
}
func (ccs *constColumnsSorter) Swap(i, j int) {
columns := ccs.columns
columns[i], columns[j] = columns[j], columns[i]
}
func getConstColumnsSorter() *constColumnsSorter {
v := constColumnsSorterPool.Get()
if v == nil {
return &constColumnsSorter{}
}
return v.(*constColumnsSorter)
}
func putConstColumnsSorter(ccs *constColumnsSorter) {
ccs.reset()
constColumnsSorterPool.Put(ccs)
}
var constColumnsSorterPool sync.Pool
// mustWriteTimestampsTo writes timestamps to sw and updates th accordingly
func mustWriteTimestampsTo(th *timestampsHeader, timestamps []int64, sw *streamWriters) {
th.reset()
bb := longTermBufPool.Get()
bb.B, th.marshalType, th.minTimestamp = encoding.MarshalTimestamps(bb.B[:0], timestamps, 64)
if len(bb.B) > maxTimestampsBlockSize {
logger.Panicf("BUG: too big block with timestamps: %d bytes; the maximum supported size is %d bytes", len(bb.B), maxTimestampsBlockSize)
}
th.maxTimestamp = timestamps[len(timestamps)-1]
th.blockOffset = sw.timestampsWriter.bytesWritten
th.blockSize = uint64(len(bb.B))
sw.timestampsWriter.MustWrite(bb.B)
longTermBufPool.Put(bb)
}


@ -0,0 +1,383 @@
package logstorage
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// blockData contains packed data for a single block.
//
// The main purpose of this struct is to reduce the work needed during background merge of parts.
// If the block is full, then the blockData can be written to the destination part
// without the need to unpack it.
type blockData struct {
// streamID is id of the stream for the data
streamID streamID
// uncompressedSizeBytes is the original (uncompressed) size of log entries stored in the block
uncompressedSizeBytes uint64
// rowsCount is the number of log entries in the block
rowsCount uint64
// timestampsData contains the encoded timestamps data for the block
timestampsData timestampsData
// columnsData contains packed per-column data.
columnsData []columnData
// constColumns contains data for const columns across the block.
constColumns []Field
// a is used for storing byte slices for timestamps and columns.
//
// It reduces fragmentation for them.
a arena
}
// reset resets bd for subsequent re-use
func (bd *blockData) reset() {
bd.streamID.reset()
bd.uncompressedSizeBytes = 0
bd.rowsCount = 0
bd.timestampsData.reset()
cds := bd.columnsData
for i := range cds {
cds[i].reset()
}
bd.columnsData = cds[:0]
ccs := bd.constColumns
for i := range ccs {
ccs[i].Reset()
}
bd.constColumns = ccs[:0]
bd.a.reset()
}
func (bd *blockData) resizeColumnsData(columnsDataLen int) []columnData {
cds := bd.columnsData
if n := columnsDataLen - cap(cds); n > 0 {
cds = append(cds[:cap(cds)], make([]columnData, n)...)
}
cds = cds[:columnsDataLen]
bd.columnsData = cds
return cds
}
// copyFrom copies src to bd.
func (bd *blockData) copyFrom(src *blockData) {
bd.reset()
bd.streamID = src.streamID
bd.uncompressedSizeBytes = src.uncompressedSizeBytes
bd.rowsCount = src.rowsCount
bd.timestampsData.copyFrom(&src.timestampsData, &bd.a)
cdsSrc := src.columnsData
cds := bd.resizeColumnsData(len(cdsSrc))
for i := range cds {
cds[i].copyFrom(&cdsSrc[i], &bd.a)
}
bd.columnsData = cds
bd.constColumns = append(bd.constColumns[:0], src.constColumns...)
}
// unmarshalRows appends log entries unmarshaled from bd to dst.
//
// The returned log entries remain valid until sbu or vd is reset.
func (bd *blockData) unmarshalRows(dst *rows, sbu *stringsBlockUnmarshaler, vd *valuesDecoder) error {
b := getBlock()
defer putBlock(b)
if err := b.InitFromBlockData(bd, sbu, vd); err != nil {
return err
}
b.appendRows(dst)
return nil
}
// mustWriteTo writes bd with the given sid to sw and updates bh accordingly
func (bd *blockData) mustWriteTo(bh *blockHeader, sw *streamWriters) {
// Do not store the version used for encoding directly in the block data, since:
// - all the blocks in the same part use the same encoding
// - the block encoding version can be put in metadata file for the part (aka metadataFilename)
bh.reset()
bh.streamID = bd.streamID
bh.uncompressedSizeBytes = bd.uncompressedSizeBytes
bh.rowsCount = bd.rowsCount
// Marshal timestamps
bd.timestampsData.mustWriteTo(&bh.timestampsHeader, sw)
// Marshal columns
cds := bd.columnsData
csh := getColumnsHeader()
chs := csh.resizeColumnHeaders(len(cds))
for i := range cds {
cds[i].mustWriteTo(&chs[i], sw)
}
csh.constColumns = append(csh.constColumns[:0], bd.constColumns...)
bb := longTermBufPool.Get()
bb.B = csh.marshal(bb.B)
putColumnsHeader(csh)
bh.columnsHeaderOffset = sw.columnsHeaderWriter.bytesWritten
bh.columnsHeaderSize = uint64(len(bb.B))
if bh.columnsHeaderSize > maxColumnsHeaderSize {
logger.Panicf("BUG: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", bh.columnsHeaderSize, maxColumnsHeaderSize)
}
sw.columnsHeaderWriter.MustWrite(bb.B)
longTermBufPool.Put(bb)
}
// mustReadFrom reads block data associated with bh from sr to bd.
func (bd *blockData) mustReadFrom(bh *blockHeader, sr *streamReaders) {
bd.reset()
bd.streamID = bh.streamID
bd.uncompressedSizeBytes = bh.uncompressedSizeBytes
bd.rowsCount = bh.rowsCount
// Read timestamps
bd.timestampsData.mustReadFrom(&bh.timestampsHeader, sr, &bd.a)
// Read columns
if bh.columnsHeaderOffset != sr.columnsHeaderReader.bytesRead {
logger.Panicf("FATAL: %s: unexpected columnsHeaderOffset=%d; must equal to the number of bytes read: %d",
sr.columnsHeaderReader.Path(), bh.columnsHeaderOffset, sr.columnsHeaderReader.bytesRead)
}
columnsHeaderSize := bh.columnsHeaderSize
if columnsHeaderSize > maxColumnsHeaderSize {
logger.Panicf("BUG: %s: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", sr.columnsHeaderReader.Path(), columnsHeaderSize, maxColumnsHeaderSize)
}
bb := longTermBufPool.Get()
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(columnsHeaderSize))
sr.columnsHeaderReader.MustReadFull(bb.B)
csh := getColumnsHeader()
if err := csh.unmarshal(bb.B); err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal columnsHeader: %s", sr.columnsHeaderReader.Path(), err)
}
longTermBufPool.Put(bb)
chs := csh.columnHeaders
cds := bd.resizeColumnsData(len(chs))
for i := range chs {
cds[i].mustReadFrom(&chs[i], sr, &bd.a)
}
bd.constColumns = append(bd.constColumns[:0], csh.constColumns...)
putColumnsHeader(csh)
}
// timestampsData contains the encoded timestamps data.
type timestampsData struct {
// data contains packed timestamps data.
data []byte
// marshalType is the marshal type for timestamps
marshalType encoding.MarshalType
// minTimestamp is the minimum timestamp in the timestamps data
minTimestamp int64
// maxTimestamp is the maximum timestamp in the timestamps data
maxTimestamp int64
}
// reset resets td for subsequent re-use
func (td *timestampsData) reset() {
td.data = nil
td.marshalType = 0
td.minTimestamp = 0
td.maxTimestamp = 0
}
// copyFrom copies src to td.
func (td *timestampsData) copyFrom(src *timestampsData, a *arena) {
td.reset()
td.data = a.copyBytes(src.data)
td.marshalType = src.marshalType
td.minTimestamp = src.minTimestamp
td.maxTimestamp = src.maxTimestamp
}
// mustWriteTo writes td to sw and updates th accordingly
func (td *timestampsData) mustWriteTo(th *timestampsHeader, sw *streamWriters) {
th.reset()
th.marshalType = td.marshalType
th.minTimestamp = td.minTimestamp
th.maxTimestamp = td.maxTimestamp
th.blockOffset = sw.timestampsWriter.bytesWritten
th.blockSize = uint64(len(td.data))
if th.blockSize > maxTimestampsBlockSize {
logger.Panicf("BUG: too big timestampsHeader.blockSize: %d bytes; mustn't exceed %d bytes", th.blockSize, maxTimestampsBlockSize)
}
sw.timestampsWriter.MustWrite(td.data)
}
// mustReadFrom reads timestamps data associated with th from sr to td.
func (td *timestampsData) mustReadFrom(th *timestampsHeader, sr *streamReaders, a *arena) {
td.reset()
td.marshalType = th.marshalType
td.minTimestamp = th.minTimestamp
td.maxTimestamp = th.maxTimestamp
timestampsReader := &sr.timestampsReader
if th.blockOffset != timestampsReader.bytesRead {
logger.Panicf("FATAL: %s: unexpected timestampsHeader.blockOffset=%d; must equal to the number of bytes read: %d",
timestampsReader.Path(), th.blockOffset, timestampsReader.bytesRead)
}
timestampsBlockSize := th.blockSize
if timestampsBlockSize > maxTimestampsBlockSize {
logger.Panicf("FATAL: %s: too big timestamps block with %d bytes; the maximum supported block size is %d bytes",
timestampsReader.Path(), timestampsBlockSize, maxTimestampsBlockSize)
}
td.data = a.newBytes(int(timestampsBlockSize))
timestampsReader.MustReadFull(td.data)
}
// columnData contains packed data for a single column.
type columnData struct {
// name is the column name
name string
// valueType is the type of values stored in valuesData
valueType valueType
// minValue is the minimum encoded uint* or float64 value in the columnHeader
//
// It is used for fast detection of whether the given columnHeader contains values in the given range
minValue uint64
// maxValue is the maximum encoded uint* or float64 value in the columnHeader
//
// It is used for fast detection of whether the given columnHeader contains values in the given range
maxValue uint64
// valuesDict contains unique values for valueType = valueTypeDict
valuesDict valuesDict
// valuesData contains packed values data for the given column
valuesData []byte
// bloomFilterData contains packed bloomFilter data for the given column
bloomFilterData []byte
}
// reset resets cd for subsequent re-use
func (cd *columnData) reset() {
cd.name = ""
cd.valueType = 0
cd.minValue = 0
cd.maxValue = 0
cd.valuesDict.reset()
cd.valuesData = nil
cd.bloomFilterData = nil
}
// copyFrom copies src to cd.
func (cd *columnData) copyFrom(src *columnData, a *arena) {
cd.reset()
cd.name = src.name
cd.valueType = src.valueType
cd.minValue = src.minValue
cd.maxValue = src.maxValue
cd.valuesDict.copyFrom(&src.valuesDict)
cd.valuesData = a.copyBytes(src.valuesData)
cd.bloomFilterData = a.copyBytes(src.bloomFilterData)
}
// mustWriteTo writes cd to sw and updates ch accordingly.
func (cd *columnData) mustWriteTo(ch *columnHeader, sw *streamWriters) {
ch.reset()
valuesWriter := &sw.fieldValuesWriter
bloomFilterWriter := &sw.fieldBloomFilterWriter
if cd.name == "" {
valuesWriter = &sw.messageValuesWriter
bloomFilterWriter = &sw.messageBloomFilterWriter
}
ch.name = cd.name
ch.valueType = cd.valueType
ch.minValue = cd.minValue
ch.maxValue = cd.maxValue
ch.valuesDict.copyFrom(&cd.valuesDict)
// marshal values
ch.valuesSize = uint64(len(cd.valuesData))
if ch.valuesSize > maxValuesBlockSize {
logger.Panicf("BUG: too big valuesSize: %d bytes; mustn't exceed %d bytes", ch.valuesSize, maxValuesBlockSize)
}
ch.valuesOffset = valuesWriter.bytesWritten
valuesWriter.MustWrite(cd.valuesData)
// marshal bloom filter
ch.bloomFilterSize = uint64(len(cd.bloomFilterData))
if ch.bloomFilterSize > maxBloomFilterBlockSize {
logger.Panicf("BUG: too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", ch.bloomFilterSize, maxBloomFilterBlockSize)
}
ch.bloomFilterOffset = bloomFilterWriter.bytesWritten
bloomFilterWriter.MustWrite(cd.bloomFilterData)
}
// mustReadFrom reads columns data associated with ch from sr to cd.
func (cd *columnData) mustReadFrom(ch *columnHeader, sr *streamReaders, a *arena) {
cd.reset()
valuesReader := &sr.fieldValuesReader
bloomFilterReader := &sr.fieldBloomFilterReader
if ch.name == "" {
valuesReader = &sr.messageValuesReader
bloomFilterReader = &sr.messageBloomFilterReader
}
cd.name = ch.name
cd.valueType = ch.valueType
cd.minValue = ch.minValue
cd.maxValue = ch.maxValue
cd.valuesDict.copyFrom(&ch.valuesDict)
// read values
if ch.valuesOffset != valuesReader.bytesRead {
logger.Panicf("FATAL: %s: unexpected columnHeader.valuesOffset=%d; must equal to the number of bytes read: %d",
valuesReader.Path(), ch.valuesOffset, valuesReader.bytesRead)
}
valuesSize := ch.valuesSize
if valuesSize > maxValuesBlockSize {
logger.Panicf("FATAL: %s: values block size cannot exceed %d bytes; got %d bytes", valuesReader.Path(), maxValuesBlockSize, valuesSize)
}
cd.valuesData = a.newBytes(int(valuesSize))
valuesReader.MustReadFull(cd.valuesData)
// read bloom filter
// bloom filter is missing in valueTypeDict.
if ch.valueType != valueTypeDict {
if ch.bloomFilterOffset != bloomFilterReader.bytesRead {
logger.Panicf("FATAL: %s: unexpected columnHeader.bloomFilterOffset=%d; must equal to the number of bytes read: %d",
bloomFilterReader.Path(), ch.bloomFilterOffset, bloomFilterReader.bytesRead)
}
bloomFilterSize := ch.bloomFilterSize
if bloomFilterSize > maxBloomFilterBlockSize {
logger.Panicf("FATAL: %s: bloom filter block size cannot exceed %d bytes; got %d bytes", bloomFilterReader.Path(), maxBloomFilterBlockSize, bloomFilterSize)
}
cd.bloomFilterData = a.newBytes(int(bloomFilterSize))
bloomFilterReader.MustReadFull(cd.bloomFilterData)
}
}


@ -0,0 +1,106 @@
package logstorage
import (
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
func TestBlockDataReset(t *testing.T) {
bd := &blockData{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 432,
},
},
uncompressedSizeBytes: 2344,
rowsCount: 134,
timestampsData: timestampsData{
data: []byte("foo"),
marshalType: encoding.MarshalTypeDeltaConst,
minTimestamp: 1234,
maxTimestamp: 23443,
},
columnsData: []columnData{
{
name: "foo",
valueType: valueTypeUint16,
valuesData: []byte("aaa"),
bloomFilterData: []byte("bsdf"),
},
},
constColumns: []Field{
{
Name: "foo",
Value: "bar",
},
},
}
bd.reset()
bdZero := &blockData{
columnsData: []columnData{},
constColumns: []Field{},
}
if !reflect.DeepEqual(bd, bdZero) {
t.Fatalf("unexpected non-zero blockData after reset: %v", bd)
}
}
func TestBlockDataCopyFrom(t *testing.T) {
f := func(bd *blockData) {
t.Helper()
var bd2 blockData
bd2.copyFrom(bd)
bd2.a.b = nil
if !reflect.DeepEqual(bd, &bd2) {
t.Fatalf("unexpected blockData copy\ngot\n%v\nwant\n%v", &bd2, bd)
}
// Try copying it again to the same destination
bd2.copyFrom(bd)
bd2.a.b = nil
if !reflect.DeepEqual(bd, &bd2) {
t.Fatalf("unexpected blockData copy to the same destination\ngot\n%v\nwant\n%v", &bd2, bd)
}
}
f(&blockData{})
bd := &blockData{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 432,
},
},
uncompressedSizeBytes: 8943,
rowsCount: 134,
timestampsData: timestampsData{
data: []byte("foo"),
marshalType: encoding.MarshalTypeDeltaConst,
minTimestamp: 1234,
maxTimestamp: 23443,
},
columnsData: []columnData{
{
name: "foo",
valueType: valueTypeUint16,
valuesData: []byte("aaa"),
bloomFilterData: []byte("bsdf"),
},
{
name: "bar",
valuesData: []byte("aaa"),
bloomFilterData: []byte("bsdf"),
},
},
constColumns: []Field{
{
Name: "foobar",
Value: "baz",
},
},
}
f(bd)
}


@ -0,0 +1,766 @@
package logstorage
import (
"fmt"
"math"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// blockHeader contains information about a single block.
//
// blockHeader is stored in the indexFilename file.
type blockHeader struct {
// streamID is a stream id for entries in the block
streamID streamID
// uncompressedSizeBytes is the original (uncompressed) size of log entries stored in the block
uncompressedSizeBytes uint64
// rowsCount is the number of log entries stored in the block
rowsCount uint64
// timestampsHeader contains information about timestamps for log entries in the block
timestampsHeader timestampsHeader
// columnsHeaderOffset is the offset of columnsHeader at columnsHeaderFilename
columnsHeaderOffset uint64
// columnsHeaderSize is the size of columnsHeader at columnsHeaderFilename
columnsHeaderSize uint64
}
// reset resets bh, so it can be re-used.
func (bh *blockHeader) reset() {
bh.streamID.reset()
bh.uncompressedSizeBytes = 0
bh.rowsCount = 0
bh.timestampsHeader.reset()
bh.columnsHeaderOffset = 0
bh.columnsHeaderSize = 0
}
func (bh *blockHeader) copyFrom(src *blockHeader) {
bh.reset()
bh.streamID = src.streamID
bh.uncompressedSizeBytes = src.uncompressedSizeBytes
bh.rowsCount = src.rowsCount
bh.timestampsHeader.copyFrom(&src.timestampsHeader)
bh.columnsHeaderOffset = src.columnsHeaderOffset
bh.columnsHeaderSize = src.columnsHeaderSize
}
// marshal appends the marshaled bh to dst and returns the result.
func (bh *blockHeader) marshal(dst []byte) []byte {
// Do not store the version used for encoding directly in the block header, since:
// - all the block headers in the same part use the same encoding
// - the block header encoding version can be put in metadata file for the part (aka metadataFilename)
dst = bh.streamID.marshal(dst)
dst = encoding.MarshalVarUint64(dst, bh.uncompressedSizeBytes)
dst = encoding.MarshalVarUint64(dst, bh.rowsCount)
dst = bh.timestampsHeader.marshal(dst)
dst = encoding.MarshalVarUint64(dst, bh.columnsHeaderOffset)
dst = encoding.MarshalVarUint64(dst, bh.columnsHeaderSize)
return dst
}
// unmarshal unmarshals bh from src and returns the remaining tail.
func (bh *blockHeader) unmarshal(src []byte) ([]byte, error) {
bh.reset()
srcOrig := src
// unmarshal bh.streamID
tail, err := bh.streamID.unmarshal(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal streamID: %w", err)
}
src = tail
// unmarshal bh.uncompressedSizeBytes
tail, n, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal uncompressedSizeBytes: %w", err)
}
bh.uncompressedSizeBytes = n
src = tail
// unmarshal bh.rowsCount
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal rowsCount: %w", err)
}
if n > maxRowsPerBlock {
return srcOrig, fmt.Errorf("too big value for rowsCount: %d; mustn't exceed %d", n, maxRowsPerBlock)
}
bh.rowsCount = n
src = tail
// unmarshal bh.timestampsHeader
tail, err = bh.timestampsHeader.unmarshal(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal timestampsHeader: %w", err)
}
src = tail
// unmarshal columnsHeaderOffset
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal columnsHeaderOffset: %w", err)
}
bh.columnsHeaderOffset = n
src = tail
// unmarshal columnsHeaderSize
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal columnsHeaderSize: %w", err)
}
if n > maxColumnsHeaderSize {
return srcOrig, fmt.Errorf("too big value for columnsHeaderSize: %d; mustn't exceed %d", n, maxColumnsHeaderSize)
}
bh.columnsHeaderSize = n
src = tail
return src, nil
}
func getBlockHeader() *blockHeader {
v := blockHeaderPool.Get()
if v == nil {
return &blockHeader{}
}
return v.(*blockHeader)
}
func putBlockHeader(bh *blockHeader) {
bh.reset()
blockHeaderPool.Put(bh)
}
var blockHeaderPool sync.Pool
// unmarshalBlockHeaders appends blockHeader entries unmarshaled from src to dst and returns the result.
func unmarshalBlockHeaders(dst []blockHeader, src []byte) ([]blockHeader, error) {
dstOrig := dst
for len(src) > 0 {
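// Reuse spare capacity in dst when possible in order to reduce memory allocations.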
if len(dst) < cap(dst) {
dst = dst[:len(dst)+1]
} else {
dst = append(dst, blockHeader{})
}
bh := &dst[len(dst)-1]
tail, err := bh.unmarshal(src)
if err != nil {
return dstOrig, fmt.Errorf("cannot unmarshal blockHeader entries: %w", err)
}
src = tail
}
if err := validateBlockHeaders(dst[len(dstOrig):]); err != nil {
return dstOrig, err
}
return dst, nil
}
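// validateBlockHeaders verifies that bhs are sorted by (streamID, timestampsHeader.minTimestamp),
// since the code that searches and merges blocks relies on this ordering.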
func validateBlockHeaders(bhs []blockHeader) error {
for i := 1; i < len(bhs); i++ {
bhCurr := &bhs[i]
bhPrev := &bhs[i-1]
if bhCurr.streamID.less(&bhPrev.streamID) {
return fmt.Errorf("unexpected blockHeader with smaller streamID=%s after bigger streamID=%s at position %d", &bhCurr.streamID, &bhPrev.streamID, i)
}
if !bhCurr.streamID.equal(&bhPrev.streamID) {
continue
}
thCurr := bhCurr.timestampsHeader
thPrev := bhPrev.timestampsHeader
if thCurr.minTimestamp < thPrev.minTimestamp {
return fmt.Errorf("unexpected blockHeader with smaller timestamp=%d after bigger timestamp=%d at position %d", thCurr.minTimestamp, thPrev.minTimestamp, i)
}
}
return nil
}
func resetBlockHeaders(bhs []blockHeader) []blockHeader {
for i := range bhs {
bhs[i].reset()
}
return bhs[:0]
}
func getColumnsHeader() *columnsHeader {
v := columnsHeaderPool.Get()
if v == nil {
return &columnsHeader{}
}
return v.(*columnsHeader)
}
func putColumnsHeader(csh *columnsHeader) {
csh.reset()
columnsHeaderPool.Put(csh)
}
var columnsHeaderPool sync.Pool
// columnsHeader contains information about columns in a single block.
//
// columnsHeader is stored in the columnsHeaderFilename file.
type columnsHeader struct {
// columnHeaders contains the information about every column seen in the block.
columnHeaders []columnHeader
// constColumns contain fields with constant values across all the block entries.
constColumns []Field
}
func (csh *columnsHeader) reset() {
chs := csh.columnHeaders
for i := range chs {
chs[i].reset()
}
csh.columnHeaders = chs[:0]
ccs := csh.constColumns
for i := range ccs {
ccs[i].Reset()
}
csh.constColumns = ccs[:0]
}
func (csh *columnsHeader) getConstColumnValue(name string) string {
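// The _msg field is stored under an empty column name, so translate the name before the lookup.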
if name == "_msg" {
name = ""
}
ccs := csh.constColumns
for i := range ccs {
cc := &ccs[i]
if cc.Name == name {
return cc.Value
}
}
return ""
}
func (csh *columnsHeader) getColumnHeader(name string) *columnHeader {
if name == "_msg" {
name = ""
}
chs := csh.columnHeaders
for i := range chs {
ch := &chs[i]
if ch.name == name {
return ch
}
}
return nil
}
func (csh *columnsHeader) resizeConstColumns(columnsLen int) []Field {
ccs := csh.constColumns
if n := columnsLen - cap(ccs); n > 0 {
ccs = append(ccs[:cap(ccs)], make([]Field, n)...)
}
ccs = ccs[:columnsLen]
csh.constColumns = ccs
return ccs
}
func (csh *columnsHeader) resizeColumnHeaders(columnHeadersLen int) []columnHeader {
chs := csh.columnHeaders
if n := columnHeadersLen - cap(chs); n > 0 {
chs = append(chs[:cap(chs)], make([]columnHeader, n)...)
}
chs = chs[:columnHeadersLen]
csh.columnHeaders = chs
return chs
}
func (csh *columnsHeader) marshal(dst []byte) []byte {
chs := csh.columnHeaders
dst = encoding.MarshalVarUint64(dst, uint64(len(chs)))
for i := range chs {
dst = chs[i].marshal(dst)
}
ccs := csh.constColumns
dst = encoding.MarshalVarUint64(dst, uint64(len(ccs)))
for i := range ccs {
dst = ccs[i].marshal(dst)
}
return dst
}
func (csh *columnsHeader) unmarshal(src []byte) error {
csh.reset()
// unmarshal columnHeaders
tail, n, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return fmt.Errorf("cannot unmarshal columnHeaders len: %w", err)
}
if n > maxColumnsPerBlock {
return fmt.Errorf("too many column headers: %d; mustn't exceed %d", n, maxColumnsPerBlock)
}
src = tail
chs := csh.resizeColumnHeaders(int(n))
for i := range chs {
tail, err = chs[i].unmarshal(src)
if err != nil {
return fmt.Errorf("cannot unmarshal columnHeader %d out of %d columnHeaders: %w", i, len(chs), err)
}
src = tail
}
csh.columnHeaders = chs
// unmarshal constColumns
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return fmt.Errorf("cannot unmarshal constColumns len: %w", err)
}
if n+uint64(len(csh.columnHeaders)) > maxColumnsPerBlock {
return fmt.Errorf("too many columns: %d; mustn't exceed %d", n+uint64(len(csh.columnHeaders)), maxColumnsPerBlock)
}
src = tail
ccs := csh.resizeConstColumns(int(n))
for i := range ccs {
tail, err = ccs[i].unmarshal(src)
if err != nil {
return fmt.Errorf("cannot unmarshal constColumn %d out of %d columns: %w", i, len(ccs), err)
}
src = tail
}
// Verify that the src is empty
if len(src) > 0 {
return fmt.Errorf("unexpected non-empty tail left after unmarshaling columnsHeader: len(tail)=%d", len(src))
}
return nil
}
// columnHeader contains information for values which belong to a single label in a single block.
//
// The main column with an empty name is stored in messageValuesFilename,
// while the rest of the columns are stored in fieldValuesFilename.
// This allows minimizing disk read IO when filtering by non-message columns.
//
// Every block column also contains a bloom filter for all the tokens stored in the column.
// This bloom filter is used for quickly determining whether the given block may contain the given tokens.
//
// Tokens in the bloom filter depend on valueType:
//
// - valueTypeString stores lowercased tokens seen in all the values
// - valueTypeDict doesn't store anything in the bloom filter, since all the encoded values
// are available directly in the valuesDict field
// - valueTypeUint8, valueTypeUint16, valueTypeUint32 and valueTypeUint64 store encoded uint values
// - valueTypeFloat64 stores encoded float64 values
// - valueTypeIPv4 stores IPv4 addresses encoded into uint32 values
// - valueTypeTimestampISO8601 stores timestamps encoded into uint64 values
//
// The bloom filter for the main column with an empty name is stored in messageBloomFilename,
// while bloom filters for the rest of the columns are stored in fieldBloomFilename.
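//
// For example (an illustrative sketch, not a description of the actual filter implementation):
// a hypothetical range filter like `response_size:>=1000` over a valueTypeUint16 column may skip
// the whole block when its maxValue is smaller than 1000, without reading the values or the bloom filter.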
type columnHeader struct {
// name contains column name aka label name
name string
// valueType is the type of values stored in the block
valueType valueType
// minValue is the minimum encoded value for uint*, ipv4, timestamp and float64 value in the columnHeader
//
// It is used for fast detection of whether the given columnHeader contains values in the given range
minValue uint64
// maxValue is the maximum encoded value for uint*, ipv4, timestamp and float64 value in the columnHeader
//
// It is used for fast detection of whether the given columnHeader contains values in the given range
maxValue uint64
// valuesDict contains unique values for valueType = valueTypeDict
valuesDict valuesDict
// valuesOffset contains the offset of the block in either messageValuesFilename or fieldValuesFilename
valuesOffset uint64
// valuesSize contains the size of the block in either messageValuesFilename or fieldValuesFilename
valuesSize uint64
// bloomFilterOffset contains the offset of the bloom filter in either messageBloomFilename or fieldBloomFilename
bloomFilterOffset uint64
// bloomFilterSize contains the size of the bloom filter in either messageBloomFilename or fieldBloomFilename
bloomFilterSize uint64
}
// reset resets ch
func (ch *columnHeader) reset() {
ch.name = ""
ch.valueType = 0
ch.minValue = 0
ch.maxValue = 0
ch.valuesDict.reset()
ch.valuesOffset = 0
ch.valuesSize = 0
ch.bloomFilterOffset = 0
ch.bloomFilterSize = 0
}
// marshal appends marshaled ch to dst and returns the result.
func (ch *columnHeader) marshal(dst []byte) []byte {
// check minValue/maxValue
if ch.valueType == valueTypeFloat64 {
minValue := math.Float64frombits(ch.minValue)
maxValue := math.Float64frombits(ch.maxValue)
if minValue > maxValue {
logger.Panicf("BUG: minValue=%g must be smaller than maxValue=%g", minValue, maxValue)
}
} else {
if ch.minValue > ch.maxValue {
logger.Panicf("BUG: minValue=%d must be smaller than maxValue=%d", ch.minValue, ch.maxValue)
}
}
// Encode common fields - ch.name and ch.valueType
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(ch.name))
dst = append(dst, byte(ch.valueType))
// Encode other fields depending on ch.valueType
switch ch.valueType {
case valueTypeString:
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeDict:
dst = ch.valuesDict.marshal(dst)
dst = ch.marshalValues(dst)
case valueTypeUint8:
dst = append(dst, byte(ch.minValue))
dst = append(dst, byte(ch.maxValue))
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeUint16:
dst = encoding.MarshalUint16(dst, uint16(ch.minValue))
dst = encoding.MarshalUint16(dst, uint16(ch.maxValue))
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeUint32:
dst = encoding.MarshalUint32(dst, uint32(ch.minValue))
dst = encoding.MarshalUint32(dst, uint32(ch.maxValue))
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeUint64:
dst = encoding.MarshalUint64(dst, ch.minValue)
dst = encoding.MarshalUint64(dst, ch.maxValue)
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeFloat64:
// float64 values are encoded as uint64 via math.Float64bits()
dst = encoding.MarshalUint64(dst, ch.minValue)
dst = encoding.MarshalUint64(dst, ch.maxValue)
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeIPv4:
dst = encoding.MarshalUint32(dst, uint32(ch.minValue))
dst = encoding.MarshalUint32(dst, uint32(ch.maxValue))
dst = ch.marshalValuesAndBloomFilters(dst)
case valueTypeTimestampISO8601:
// timestamps are encoded in nanoseconds
dst = encoding.MarshalUint64(dst, ch.minValue)
dst = encoding.MarshalUint64(dst, ch.maxValue)
dst = ch.marshalValuesAndBloomFilters(dst)
default:
logger.Panicf("BUG: unknown valueType=%d", ch.valueType)
}
return dst
}
func (ch *columnHeader) marshalValuesAndBloomFilters(dst []byte) []byte {
dst = ch.marshalValues(dst)
dst = ch.marshalBloomFilters(dst)
return dst
}
func (ch *columnHeader) marshalValues(dst []byte) []byte {
dst = encoding.MarshalVarUint64(dst, ch.valuesOffset)
dst = encoding.MarshalVarUint64(dst, ch.valuesSize)
return dst
}
func (ch *columnHeader) marshalBloomFilters(dst []byte) []byte {
dst = encoding.MarshalVarUint64(dst, ch.bloomFilterOffset)
dst = encoding.MarshalVarUint64(dst, ch.bloomFilterSize)
return dst
}
// unmarshal unmarshals ch from src and returns the tail left after unmarshaling.
func (ch *columnHeader) unmarshal(src []byte) ([]byte, error) {
ch.reset()
srcOrig := src
// Unmarshal column name
tail, data, err := encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal column name: %w", err)
}
// Do not use bytesutil.InternBytes(data) here, since it is slower than string(data) in production workloads
ch.name = string(data)
src = tail
// Unmarshal value type
if len(src) < 1 {
return srcOrig, fmt.Errorf("cannot unmarshal valueType from 0 bytes for column %q; need at least 1 byte", ch.name)
}
ch.valueType = valueType(src[0])
src = src[1:]
// Unmarshal the rest of data depending on valueType
switch ch.valueType {
case valueTypeString:
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeString for column %q: %w", ch.name, err)
}
src = tail
case valueTypeDict:
tail, err = ch.valuesDict.unmarshal(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal dict at valueTypeDict for column %q: %w", ch.name, err)
}
src = tail
tail, err = ch.unmarshalValues(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values at valueTypeDict for column %q: %w", ch.name, err)
}
src = tail
case valueTypeUint8:
if len(src) < 2 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint8 from %d bytes for column %q; need at least 2 bytes", len(src), ch.name)
}
ch.minValue = uint64(src[0])
ch.maxValue = uint64(src[1])
src = src[2:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint8 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeUint16:
if len(src) < 4 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint16 from %d bytes for column %q; need at least 4 bytes", len(src), ch.name)
}
ch.minValue = uint64(encoding.UnmarshalUint16(src))
ch.maxValue = uint64(encoding.UnmarshalUint16(src[2:]))
src = src[4:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint16 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeUint32:
if len(src) < 8 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint32 from %d bytes for column %q; need at least 8 bytes", len(src), ch.name)
}
ch.minValue = uint64(encoding.UnmarshalUint32(src))
ch.maxValue = uint64(encoding.UnmarshalUint32(src[4:]))
src = src[8:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint32 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeUint64:
if len(src) < 16 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint64 from %d bytes for column %q; need at least 16 bytes", len(src), ch.name)
}
ch.minValue = encoding.UnmarshalUint64(src)
ch.maxValue = encoding.UnmarshalUint64(src[8:])
src = src[16:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint64 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeFloat64:
if len(src) < 16 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeFloat64 from %d bytes for column %q; need at least 16 bytes", len(src), ch.name)
}
// min and max values must be converted to real values with math.Float64frombits() during querying.
ch.minValue = encoding.UnmarshalUint64(src)
ch.maxValue = encoding.UnmarshalUint64(src[8:])
src = src[16:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeFloat64 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeIPv4:
if len(src) < 8 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeIPv4 from %d bytes for column %q; need at least 8 bytes", len(src), ch.name)
}
ch.minValue = uint64(encoding.UnmarshalUint32(src))
ch.maxValue = uint64(encoding.UnmarshalUint32(src[4:]))
src = src[8:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeIPv4 for column %q: %w", ch.name, err)
}
src = tail
case valueTypeTimestampISO8601:
if len(src) < 16 {
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeTimestampISO8601 from %d bytes for column %q; need at least 16 bytes",
len(src), ch.name)
}
ch.minValue = encoding.UnmarshalUint64(src)
ch.maxValue = encoding.UnmarshalUint64(src[8:])
src = src[16:]
tail, err = ch.unmarshalValuesAndBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeTimestampISO8601 for column %q: %w", ch.name, err)
}
src = tail
default:
return srcOrig, fmt.Errorf("unexpected valueType=%d for column %q", ch.valueType, ch.name)
}
return src, nil
}
func (ch *columnHeader) unmarshalValuesAndBloomFilters(src []byte) ([]byte, error) {
srcOrig := src
tail, err := ch.unmarshalValues(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal values: %w", err)
}
src = tail
tail, err = ch.unmarshalBloomFilters(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal bloom filters: %w", err)
}
src = tail
return src, nil
}
func (ch *columnHeader) unmarshalValues(src []byte) ([]byte, error) {
srcOrig := src
tail, n, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal valuesOffset: %w", err)
}
ch.valuesOffset = n
src = tail
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal valuesSize: %w", err)
}
if n > maxValuesBlockSize {
return srcOrig, fmt.Errorf("too big valuesSize: %d bytes; mustn't exceed %d bytes", n, maxValuesBlockSize)
}
ch.valuesSize = n
src = tail
return src, nil
}
func (ch *columnHeader) unmarshalBloomFilters(src []byte) ([]byte, error) {
srcOrig := src
tail, n, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal bloomFilterOffset: %w", err)
}
ch.bloomFilterOffset = n
src = tail
tail, n, err = encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal bloomFilterSize: %w", err)
}
if n > maxBloomFilterBlockSize {
return srcOrig, fmt.Errorf("too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", n, maxBloomFilterBlockSize)
}
ch.bloomFilterSize = n
src = tail
return src, nil
}
// timestampsHeader contains the information about timestamps block.
type timestampsHeader struct {
// blockOffset is an offset of timestamps block inside timestampsFilename file
blockOffset uint64
// blockSize is the size of the timestamps block inside timestampsFilename file
blockSize uint64
// minTimestamp is the minimum timestamp seen in the block
minTimestamp int64
// maxTimestamp is the maximum timestamp seen in the block
maxTimestamp int64
// marshalType is the type used for encoding the timestamps block
marshalType encoding.MarshalType
}
// reset resets th, so it can be reused
func (th *timestampsHeader) reset() {
th.blockOffset = 0
th.blockSize = 0
th.minTimestamp = 0
th.maxTimestamp = 0
th.marshalType = 0
}
func (th *timestampsHeader) copyFrom(src *timestampsHeader) {
th.blockOffset = src.blockOffset
th.blockSize = src.blockSize
th.minTimestamp = src.minTimestamp
th.maxTimestamp = src.maxTimestamp
th.marshalType = src.marshalType
}
// marshal appends marshaled th to dst and returns the result.
func (th *timestampsHeader) marshal(dst []byte) []byte {
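// The encoding is fixed-size: four uint64 fields plus a single marshalType byte, i.e. 33 bytes.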
dst = encoding.MarshalUint64(dst, th.blockOffset)
dst = encoding.MarshalUint64(dst, th.blockSize)
dst = encoding.MarshalUint64(dst, uint64(th.minTimestamp))
dst = encoding.MarshalUint64(dst, uint64(th.maxTimestamp))
dst = append(dst, byte(th.marshalType))
return dst
}
// unmarshal unmarshals th from src and returns the tail left after the unmarshaling.
func (th *timestampsHeader) unmarshal(src []byte) ([]byte, error) {
th.reset()
if len(src) < 33 {
return src, fmt.Errorf("cannot unmarshal timestampsHeader from %d bytes; need at least 33 bytes", len(src))
}
th.blockOffset = encoding.UnmarshalUint64(src)
th.blockSize = encoding.UnmarshalUint64(src[8:])
th.minTimestamp = int64(encoding.UnmarshalUint64(src[16:]))
th.maxTimestamp = int64(encoding.UnmarshalUint64(src[24:]))
th.marshalType = encoding.MarshalType(src[32])
return src[33:], nil
}

View file

@@ -0,0 +1,454 @@
package logstorage
import (
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
func TestBlockHeaderMarshalUnmarshal(t *testing.T) {
f := func(bh *blockHeader, marshaledLen int) {
t.Helper()
data := bh.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected lengths of the marshaled blockHeader; got %d; want %d", len(data), marshaledLen)
}
bh2 := &blockHeader{}
tail, err := bh2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error in unmarshal: %s", err)
}
if len(tail) > 0 {
t.Fatalf("unexpected non-empty tail after unmarshal: %X", tail)
}
if !reflect.DeepEqual(bh, bh2) {
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", bh2, bh)
}
}
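// An empty blockHeader is expected to marshal to 61 bytes: presumably 24 bytes of streamID
// (8-byte TenantID plus 16-byte u128), 33 bytes of timestampsHeader and four single-byte varints
// for the remaining zero fields.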
f(&blockHeader{}, 61)
f(&blockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
lo: 3443,
hi: 23434,
},
},
uncompressedSizeBytes: 4344,
rowsCount: 1234,
timestampsHeader: timestampsHeader{
blockOffset: 13234,
blockSize: 8843,
minTimestamp: -4334,
maxTimestamp: 23434,
marshalType: encoding.MarshalTypeNearestDelta2,
},
columnsHeaderOffset: 4384,
columnsHeaderSize: 894,
}, 65)
}
func TestColumnsHeaderMarshalUnmarshal(t *testing.T) {
f := func(csh *columnsHeader, marshaledLen int) {
t.Helper()
data := csh.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected lengths of the marshaled columnsHeader; got %d; want %d", len(data), marshaledLen)
}
csh2 := &columnsHeader{}
err := csh2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error in unmarshal: %s", err)
}
if !reflect.DeepEqual(csh, csh2) {
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", csh2, csh)
}
}
f(&columnsHeader{}, 2)
f(&columnsHeader{
columnHeaders: []columnHeader{
{
name: "foobar",
valueType: valueTypeString,
valuesOffset: 12345,
valuesSize: 23434,
bloomFilterOffset: 89843,
bloomFilterSize: 8934,
},
{
name: "message",
valueType: valueTypeUint16,
minValue: 123,
maxValue: 456,
valuesOffset: 3412345,
valuesSize: 234434,
bloomFilterOffset: 83,
bloomFilterSize: 34,
},
},
constColumns: []Field{
{
Name: "foo",
Value: "bar",
},
},
}, 50)
}
func TestBlockHeaderUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
bh := getBlockHeader()
defer putBlockHeader(bh)
tail, err := bh.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail;\ngot\n%q\nwant\n%q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
bh := blockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
lo: 3443,
hi: 23434,
},
},
uncompressedSizeBytes: 4344,
rowsCount: 1234,
timestampsHeader: timestampsHeader{
blockOffset: 13234,
blockSize: 8843,
minTimestamp: -4334,
maxTimestamp: 23434,
marshalType: encoding.MarshalTypeNearestDelta2,
},
columnsHeaderOffset: 4384,
columnsHeaderSize: 894,
}
data := bh.marshal(nil)
for len(data) > 0 {
data = data[:len(data)-1]
f(data)
}
}
func TestColumnsHeaderUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
csh := getColumnsHeader()
defer putColumnsHeader(csh)
err := csh.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
}
f(nil)
f([]byte("foo"))
csh := columnsHeader{
columnHeaders: []columnHeader{
{
name: "foobar",
valueType: valueTypeString,
valuesOffset: 12345,
valuesSize: 23434,
bloomFilterOffset: 89843,
bloomFilterSize: 8934,
},
{
name: "message",
valueType: valueTypeUint16,
minValue: 123,
maxValue: 456,
valuesOffset: 3412345,
valuesSize: 234434,
bloomFilterOffset: 83,
bloomFilterSize: 34,
},
},
constColumns: []Field{
{
Name: "foo",
Value: "bar",
},
},
}
data := csh.marshal(nil)
for len(data) > 0 {
data = data[:len(data)-1]
f(data)
}
}
func TestBlockHeaderReset(t *testing.T) {
bh := &blockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
lo: 3443,
hi: 23434,
},
},
uncompressedSizeBytes: 8984,
rowsCount: 1234,
timestampsHeader: timestampsHeader{
blockOffset: 13234,
blockSize: 8843,
minTimestamp: -4334,
maxTimestamp: 23434,
marshalType: encoding.MarshalTypeNearestDelta2,
},
columnsHeaderOffset: 12332,
columnsHeaderSize: 234,
}
bh.reset()
bhZero := &blockHeader{}
if !reflect.DeepEqual(bh, bhZero) {
t.Fatalf("unexpected non-zero blockHeader after reset: %v", bh)
}
}
func TestColumnsHeaderReset(t *testing.T) {
csh := &columnsHeader{
columnHeaders: []columnHeader{
{
name: "foobar",
valueType: valueTypeString,
valuesOffset: 12345,
valuesSize: 23434,
bloomFilterOffset: 89843,
bloomFilterSize: 8934,
},
{
name: "message",
valueType: valueTypeUint16,
minValue: 123,
maxValue: 456,
valuesOffset: 3412345,
valuesSize: 234434,
bloomFilterOffset: 83,
bloomFilterSize: 34,
},
},
constColumns: []Field{
{
Name: "foo",
Value: "bar",
},
},
}
csh.reset()
cshZero := &columnsHeader{
columnHeaders: []columnHeader{},
constColumns: []Field{},
}
if !reflect.DeepEqual(csh, cshZero) {
t.Fatalf("unexpected non-zero columnsHeader after reset: %v", csh)
}
}
func TestMarshalUnmarshalBlockHeaders(t *testing.T) {
f := func(bhs []blockHeader, marshaledLen int) {
t.Helper()
var data []byte
for i := range bhs {
data = bhs[i].marshal(data)
}
if len(data) != marshaledLen {
t.Fatalf("unexpected length for marshaled blockHeader entries; got %d; want %d", len(data), marshaledLen)
}
bhs2, err := unmarshalBlockHeaders(nil, data)
if err != nil {
t.Fatalf("unexpected error when unmarshaling blockHeader entries: %s", err)
}
if !reflect.DeepEqual(bhs, bhs2) {
t.Fatalf("unexpected blockHeader entries unmarshaled\ngot\n%v\nwant\n%v", bhs2, bhs)
}
}
f(nil, 0)
f([]blockHeader{{}}, 61)
f([]blockHeader{
{},
{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
lo: 3443,
hi: 23434,
},
},
uncompressedSizeBytes: 89894,
rowsCount: 1234,
timestampsHeader: timestampsHeader{
blockOffset: 13234,
blockSize: 8843,
minTimestamp: -4334,
maxTimestamp: 23434,
marshalType: encoding.MarshalTypeNearestDelta2,
},
columnsHeaderOffset: 12332,
columnsHeaderSize: 234,
},
}, 127)
}
func TestColumnHeaderMarshalUnmarshal(t *testing.T) {
f := func(ch *columnHeader, marshaledLen int) {
t.Helper()
data := ch.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected marshaled length of columnHeader; got %d; want %d", len(data), marshaledLen)
}
var ch2 columnHeader
tail, err := ch2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error in umarshal(%v): %s", ch, err)
}
if len(tail) > 0 {
t.Fatalf("unexpected non-empty tail after unmarshal(%v): %X", ch, tail)
}
if !reflect.DeepEqual(ch, &ch2) {
t.Fatalf("unexpected columnHeader after unmarshal;\ngot\n%v\nwant\n%v", &ch2, ch)
}
}
f(&columnHeader{
name: "foo",
valueType: valueTypeUint8,
}, 11)
ch := &columnHeader{
name: "foobar",
valueType: valueTypeDict,
valuesOffset: 12345,
valuesSize: 254452,
}
ch.valuesDict.getOrAdd("abc")
f(ch, 18)
}
func TestColumnHeaderUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var ch columnHeader
tail, err := ch.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
ch := &columnHeader{
name: "abc",
valueType: valueTypeUint16,
bloomFilterSize: 3244,
}
data := ch.marshal(nil)
f(data[:len(data)-1])
}
func TestColumnHeaderReset(t *testing.T) {
ch := &columnHeader{
name: "foobar",
valueType: valueTypeUint16,
valuesOffset: 12345,
valuesSize: 254452,
bloomFilterOffset: 34898234,
bloomFilterSize: 873434,
}
ch.valuesDict.getOrAdd("abc")
ch.reset()
chZero := &columnHeader{}
chZero.valuesDict.values = []string{}
if !reflect.DeepEqual(ch, chZero) {
t.Fatalf("unexpected non-zero columnHeader after reset: %v", ch)
}
}
func TestTimestampsHeaderMarshalUnmarshal(t *testing.T) {
f := func(th *timestampsHeader, marshaledLen int) {
t.Helper()
data := th.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected length of marshaled timestampsHeader; got %d; want %d", len(data), marshaledLen)
}
var th2 timestampsHeader
tail, err := th2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error in unmarshal(%v): %s", th, err)
}
if len(tail) > 0 {
t.Fatalf("unexpected non-nil tail after unmarshal(%v): %X", th, tail)
}
if !reflect.DeepEqual(th, &th2) {
t.Fatalf("unexpected timestampsHeader after unmarshal; got\n%v\nwant\n%v", &th2, th)
}
}
f(&timestampsHeader{}, 33)
f(&timestampsHeader{
blockOffset: 12345,
blockSize: 3424834,
minTimestamp: -123443,
maxTimestamp: 234343,
marshalType: encoding.MarshalTypeZSTDNearestDelta,
}, 33)
}
func TestTimestampsHeaderUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var th timestampsHeader
tail, err := th.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
}
func TestTimestampsHeaderReset(t *testing.T) {
th := &timestampsHeader{
blockOffset: 12345,
blockSize: 3424834,
minTimestamp: -123443,
maxTimestamp: 234343,
marshalType: encoding.MarshalTypeZSTDNearestDelta,
}
th.reset()
thZero := &timestampsHeader{}
if !reflect.DeepEqual(th, thZero) {
t.Fatalf("unexpected non-zero timestampsHeader after reset: %v", th)
}
}

View file

@@ -0,0 +1,645 @@
package logstorage
import (
"strconv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
type blockSearchWork struct {
// p is the part where the block belongs to.
p *part
// so contains search options for the block search
so *searchOptions
// bh is the header of the block to search.
bh blockHeader
}
func newBlockSearchWork(p *part, so *searchOptions, bh *blockHeader) *blockSearchWork {
var bsw blockSearchWork
bsw.p = p
bsw.so = so
bsw.bh.copyFrom(bh)
return &bsw
}
func getBlockSearch() *blockSearch {
v := blockSearchPool.Get()
if v == nil {
return &blockSearch{}
}
return v.(*blockSearch)
}
func putBlockSearch(bs *blockSearch) {
bs.reset()
blockSearchPool.Put(bs)
}
var blockSearchPool sync.Pool
type blockSearch struct {
// bsw is the actual work to perform on the given block pointed to by bsw.bh
bsw *blockSearchWork
// br contains result for the search in the block after search() call
br blockResult
// timestampsCache contains cached timestamps for the given block.
timestampsCache *encoding.Int64s
// bloomFilterCache contains cached bloom filters for requested columns in the given block
bloomFilterCache map[string]*bloomFilter
// valuesCache contains cached values for requested columns in the given block
valuesCache map[string]*stringBucket
// sbu is used for unmarshaling local columns
sbu stringsBlockUnmarshaler
// csh is the columnsHeader associated with the given block
csh columnsHeader
}
func (bs *blockSearch) reset() {
bs.bsw = nil
bs.br.reset()
if bs.timestampsCache != nil {
encoding.PutInt64s(bs.timestampsCache)
bs.timestampsCache = nil
}
bloomFilterCache := bs.bloomFilterCache
for k, bf := range bloomFilterCache {
putBloomFilter(bf)
delete(bloomFilterCache, k)
}
valuesCache := bs.valuesCache
for k, values := range valuesCache {
putStringBucket(values)
delete(valuesCache, k)
}
bs.sbu.reset()
bs.csh.reset()
}
func (bs *blockSearch) partPath() string {
return bs.bsw.p.path
}
func (bs *blockSearch) search(bsw *blockSearchWork) {
bs.reset()
bs.bsw = bsw
bs.csh.initFromBlockHeader(bsw.p, &bsw.bh)
// search rows matching the given filter
bm := getFilterBitmap(int(bsw.bh.rowsCount))
bm.setBits()
bs.bsw.so.filter.apply(bs, bm)
bs.br.mustInit(bs, bm)
if bm.isZero() {
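// The filter matched no rows in the block, so there are no columns to fetch.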
putFilterBitmap(bm)
return
}
// fetch the requested columns to bs.br.
for _, columnName := range bs.bsw.so.resultColumnNames {
switch columnName {
case "_stream":
bs.br.addStreamColumn(bs)
case "_time":
bs.br.addTimeColumn(bs)
default:
v := bs.csh.getConstColumnValue(columnName)
if v != "" {
bs.br.addConstColumn(v)
continue
}
ch := bs.csh.getColumnHeader(columnName)
if ch == nil {
bs.br.addConstColumn("")
} else {
bs.br.addColumn(bs, ch, bm)
}
}
}
putFilterBitmap(bm)
}
func (csh *columnsHeader) initFromBlockHeader(p *part, bh *blockHeader) {
bb := longTermBufPool.Get()
columnsHeaderSize := bh.columnsHeaderSize
if columnsHeaderSize > maxColumnsHeaderSize {
logger.Panicf("FATAL: %s: columns header size cannot exceed %d bytes; got %d bytes", p.path, maxColumnsHeaderSize, columnsHeaderSize)
}
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(columnsHeaderSize))
p.columnsHeaderFile.MustReadAt(bb.B, int64(bh.columnsHeaderOffset))
if err := csh.unmarshal(bb.B); err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal columns header: %s", p.path, err)
}
longTermBufPool.Put(bb)
}
// getBloomFilterForColumn returns bloom filter for the given ch.
//
// The returned bloom filter belongs to bs, so it becomes invalid after bs reset.
func (bs *blockSearch) getBloomFilterForColumn(ch *columnHeader) *bloomFilter {
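// Fast path - the bloom filter for ch has already been read and cached for this block.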
bf := bs.bloomFilterCache[ch.name]
if bf != nil {
return bf
}
p := bs.bsw.p
bloomFilterFile := p.fieldBloomFilterFile
if ch.name == "" {
bloomFilterFile = p.messageBloomFilterFile
}
bb := longTermBufPool.Get()
bloomFilterSize := ch.bloomFilterSize
if bloomFilterSize > maxBloomFilterBlockSize {
logger.Panicf("FATAL: %s: bloom filter block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxBloomFilterBlockSize, bloomFilterSize)
}
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(bloomFilterSize))
bloomFilterFile.MustReadAt(bb.B, int64(ch.bloomFilterOffset))
bf = getBloomFilter()
if err := bf.unmarshal(bb.B); err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal bloom filter: %s", bs.partPath(), err)
}
longTermBufPool.Put(bb)
if bs.bloomFilterCache == nil {
bs.bloomFilterCache = make(map[string]*bloomFilter)
}
bs.bloomFilterCache[ch.name] = bf
return bf
}
// getValuesForColumn returns block values for the given ch.
//
// The returned values belong to bs, so they become invalid after bs reset.
func (bs *blockSearch) getValuesForColumn(ch *columnHeader) []string {
values := bs.valuesCache[ch.name]
if values != nil {
return values.a
}
p := bs.bsw.p
valuesFile := p.fieldValuesFile
if ch.name == "" {
valuesFile = p.messageValuesFile
}
bb := longTermBufPool.Get()
valuesSize := ch.valuesSize
if valuesSize > maxValuesBlockSize {
logger.Panicf("FATAL: %s: values block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxValuesBlockSize, valuesSize)
}
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(valuesSize))
valuesFile.MustReadAt(bb.B, int64(ch.valuesOffset))
values = getStringBucket()
var err error
values.a, err = bs.sbu.unmarshal(values.a[:0], bb.B, bs.bsw.bh.rowsCount)
longTermBufPool.Put(bb)
if err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal column %q: %s", bs.partPath(), ch.name, err)
}
if bs.valuesCache == nil {
bs.valuesCache = make(map[string]*stringBucket)
}
bs.valuesCache[ch.name] = values
return values.a
}
// getTimestamps returns timestamps for the given bs.
//
// The returned timestamps belong to bs, so they become invalid after bs reset.
func (bs *blockSearch) getTimestamps() []int64 {
timestamps := bs.timestampsCache
if timestamps != nil {
return timestamps.A
}
p := bs.bsw.p
bb := longTermBufPool.Get()
th := &bs.bsw.bh.timestampsHeader
blockSize := th.blockSize
if blockSize > maxTimestampsBlockSize {
logger.Panicf("FATAL: %s: timestamps block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxTimestampsBlockSize, blockSize)
}
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(blockSize))
p.timestampsFile.MustReadAt(bb.B, int64(th.blockOffset))
rowsCount := int(bs.bsw.bh.rowsCount)
timestamps = encoding.GetInt64s(rowsCount)
var err error
timestamps.A, err = encoding.UnmarshalTimestamps(timestamps.A[:0], bb.B, th.marshalType, th.minTimestamp, rowsCount)
longTermBufPool.Put(bb)
if err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal timestamps: %s", bs.partPath(), err)
}
bs.timestampsCache = timestamps
return timestamps.A
}
// mustReadBlockHeaders reads the block headers referenced by ih from p, appends them to dst and returns the result.
func (ih *indexBlockHeader) mustReadBlockHeaders(dst []blockHeader, p *part) []blockHeader {
bbCompressed := longTermBufPool.Get()
indexBlockSize := ih.indexBlockSize
if indexBlockSize > maxIndexBlockSize {
logger.Panicf("FATAL: %s: index block size cannot exceed %d bytes; got %d bytes", p.indexFile.Path(), maxIndexBlockSize, indexBlockSize)
}
bbCompressed.B = bytesutil.ResizeNoCopyMayOverallocate(bbCompressed.B, int(indexBlockSize))
p.indexFile.MustReadAt(bbCompressed.B, int64(ih.indexBlockOffset))
bb := longTermBufPool.Get()
var err error
bb.B, err = encoding.DecompressZSTD(bb.B, bbCompressed.B)
longTermBufPool.Put(bbCompressed)
if err != nil {
logger.Panicf("FATAL: %s: cannot decompress indexBlock read at offset %d with size %d: %s", p.indexFile.Path(), ih.indexBlockOffset, ih.indexBlockSize, err)
}
dst, err = unmarshalBlockHeaders(dst, bb.B)
longTermBufPool.Put(bb)
if err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal block headers read at offset %d with size %d: %s", p.indexFile.Path(), ih.indexBlockOffset, ih.indexBlockSize, err)
}
return dst
}
type blockResult struct {
buf []byte
valuesBuf []string
// streamID is streamID for the given blockResult
streamID streamID
// cs contain values for result columns
cs []blockResultColumn
// timestamps contain timestamps for the selected log entries
timestamps []int64
}
func (br *blockResult) reset() {
br.buf = br.buf[:0]
vb := br.valuesBuf
for i := range vb {
vb[i] = ""
}
br.valuesBuf = vb[:0]
br.streamID.reset()
cs := br.cs
for i := range cs {
cs[i].reset()
}
br.cs = cs[:0]
br.timestamps = br.timestamps[:0]
}
func (br *blockResult) RowsCount() int {
return len(br.timestamps)
}
func (br *blockResult) mustInit(bs *blockSearch, bm *filterBitmap) {
br.reset()
br.streamID = bs.bsw.bh.streamID
if !bm.isZero() {
// Initialize timestamps, since they are used for determining the number of rows in br.RowsCount()
srcTimestamps := bs.getTimestamps()
dstTimestamps := br.timestamps[:0]
bm.forEachSetBit(func(idx int) bool {
ts := srcTimestamps[idx]
dstTimestamps = append(dstTimestamps, ts)
return true
})
br.timestamps = dstTimestamps
}
}
func (br *blockResult) addColumn(bs *blockSearch, ch *columnHeader, bm *filterBitmap) {
buf := br.buf
valuesBuf := br.valuesBuf
valuesBufLen := len(valuesBuf)
var dictValues []string
appendValue := func(v string) {
bufLen := len(buf)
buf = append(buf, v...)
s := bytesutil.ToUnsafeString(buf[bufLen:])
valuesBuf = append(valuesBuf, s)
}
switch ch.valueType {
case valueTypeString:
visitValues(bs, ch, bm, func(v string) bool {
appendValue(v)
return true
})
case valueTypeDict:
dictValues = ch.valuesDict.values
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 1 {
logger.Panicf("FATAL: %s: unexpected dict value size for column %q; got %d bytes; want 1 byte", bs.partPath(), ch.name, len(v))
}
dictIdx := v[0]
if int(dictIdx) >= len(dictValues) {
logger.Panicf("FATAL: %s: too big dict index for column %q: %d; should be smaller than %d", bs.partPath(), ch.name, dictIdx, len(dictValues))
}
appendValue(v)
return true
})
case valueTypeUint8:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 1 {
logger.Panicf("FATAL: %s: unexpected size for uint8 column %q; got %d bytes; want 1 byte", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeUint16:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 2 {
logger.Panicf("FATAL: %s: unexpected size for uint16 column %q; got %d bytes; want 2 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeUint32:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 4 {
logger.Panicf("FATAL: %s: unexpected size for uint32 column %q; got %d bytes; want 4 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeUint64:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 8 {
logger.Panicf("FATAL: %s: unexpected size for uint64 column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeFloat64:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 8 {
logger.Panicf("FATAL: %s: unexpected size for float64 column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeIPv4:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 4 {
logger.Panicf("FATAL: %s: unexpected size for ipv4 column %q; got %d bytes; want 4 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
case valueTypeTimestampISO8601:
visitValues(bs, ch, bm, func(v string) bool {
if len(v) != 8 {
logger.Panicf("FATAL: %s: unexpected size for timestmap column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
}
appendValue(v)
return true
})
default:
logger.Panicf("FATAL: %s: unknown valueType=%d for column %q", bs.partPath(), ch.valueType, ch.name)
}
encodedValues := valuesBuf[valuesBufLen:]
valuesBufLen = len(valuesBuf)
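// Copy dict values into valuesBuf, so the resulting blockResultColumn doesn't reference memory owned by ch.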
for _, v := range dictValues {
appendValue(v)
}
dictValues = valuesBuf[valuesBufLen:]
br.cs = append(br.cs, blockResultColumn{
valueType: ch.valueType,
dictValues: dictValues,
encodedValues: encodedValues,
})
br.buf = buf
br.valuesBuf = valuesBuf
}
func (br *blockResult) addTimeColumn(bs *blockSearch) {
br.cs = append(br.cs, blockResultColumn{
isTime: true,
})
}
func (br *blockResult) addStreamColumn(bs *blockSearch) {
bb := bbPool.Get()
bb.B = bs.bsw.p.pt.appendStreamTagsByStreamID(bb.B[:0], &br.streamID)
if len(bb.B) > 0 {
st := GetStreamTags()
mustUnmarshalStreamTags(st, bb.B)
bb.B = st.marshalString(bb.B[:0])
PutStreamTags(st)
}
s := bytesutil.ToUnsafeString(bb.B)
br.addConstColumn(s)
bbPool.Put(bb)
}
func (br *blockResult) addConstColumn(value string) {
buf := br.buf
bufLen := len(buf)
buf = append(buf, value...)
s := bytesutil.ToUnsafeString(buf[bufLen:])
br.buf = buf
valuesBuf := br.valuesBuf
valuesBufLen := len(valuesBuf)
valuesBuf = append(valuesBuf, s)
br.valuesBuf = valuesBuf
br.cs = append(br.cs, blockResultColumn{
isConst: true,
valueType: valueTypeUnknown,
encodedValues: valuesBuf[valuesBufLen:],
})
}
// getColumnValues returns values for the column with the given idx.
//
// The returned values are valid until br.reset() is called.
func (br *blockResult) getColumnValues(idx int) []string {
c := &br.cs[idx]
if c.values != nil {
return c.values
}
buf := br.buf
valuesBuf := br.valuesBuf
valuesBufLen := len(valuesBuf)
if c.isConst {
v := c.encodedValues[0]
for range br.timestamps {
valuesBuf = append(valuesBuf, v)
}
c.values = valuesBuf[valuesBufLen:]
br.valuesBuf = valuesBuf
return c.values
}
if c.isTime {
for _, timestamp := range br.timestamps {
t := time.Unix(0, timestamp).UTC()
bufLen := len(buf)
buf = t.AppendFormat(buf, time.RFC3339Nano)
s := bytesutil.ToUnsafeString(buf[bufLen:])
valuesBuf = append(valuesBuf, s)
}
c.values = valuesBuf[valuesBufLen:]
br.buf = buf
br.valuesBuf = valuesBuf
return c.values
}
appendValue := func(v string) {
bufLen := len(buf)
buf = append(buf, v...)
s := bytesutil.ToUnsafeString(buf[bufLen:])
valuesBuf = append(valuesBuf, s)
}
switch c.valueType {
case valueTypeString:
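// String values are stored as is, so they can be used without additional decoding.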
c.values = c.encodedValues
return c.values
case valueTypeDict:
dictValues := c.dictValues
for _, v := range c.encodedValues {
dictIdx := v[0]
appendValue(dictValues[dictIdx])
}
case valueTypeUint8:
bb := bbPool.Get()
for _, v := range c.encodedValues {
n := uint64(v[0])
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeUint16:
bb := bbPool.Get()
for _, v := range c.encodedValues {
b := bytesutil.ToUnsafeBytes(v)
n := uint64(encoding.UnmarshalUint16(b))
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeUint32:
bb := bbPool.Get()
for _, v := range c.encodedValues {
b := bytesutil.ToUnsafeBytes(v)
n := uint64(encoding.UnmarshalUint32(b))
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeUint64:
bb := bbPool.Get()
for _, v := range c.encodedValues {
b := bytesutil.ToUnsafeBytes(v)
n := encoding.UnmarshalUint64(b)
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeFloat64:
bb := bbPool.Get()
for _, v := range c.encodedValues {
bb.B = toFloat64String(bb.B[:0], v)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeIPv4:
bb := bbPool.Get()
for _, v := range c.encodedValues {
bb.B = toIPv4String(bb.B[:0], v)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
case valueTypeTimestampISO8601:
bb := bbPool.Get()
for _, v := range c.encodedValues {
bb.B = toTimestampISO8601String(bb.B[:0], v)
appendValue(bytesutil.ToUnsafeString(bb.B))
}
bbPool.Put(bb)
default:
logger.Panicf("BUG: unknown valueType=%d", c.valueType)
}
c.values = valuesBuf[valuesBufLen:]
br.buf = buf
br.valuesBuf = valuesBuf
return c.values
}
type blockResultColumn struct {
// isConst is set to true if the column is const.
//
// The column value is stored in encodedValues[0]
isConst bool
// isTime is set to true if the column contains _time values.
//
// The column values are stored in blockResult.timestamps
isTime bool
// valueType is the type of non-const value
valueType valueType
// dictValues contain dictionary values for valueTypeDict column
dictValues []string
// encodedValues contain encoded values for non-const column
encodedValues []string
// values contain decoded values after getColumnValues() call for the given column
values []string
}
func (c *blockResultColumn) reset() {
c.isConst = false
c.isTime = false
c.valueType = valueTypeUnknown
c.dictValues = nil
c.encodedValues = nil
c.values = nil
}

View file

@@ -0,0 +1,288 @@
package logstorage
import (
"container/heap"
"fmt"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// mustMergeBlockStreams merges bsrs to bsw and updates ph accordingly.
//
// Finalize() is guaranteed to be called on bsrs and bsw before returning from the func.
func mustMergeBlockStreams(ph *partHeader, bsw *blockStreamWriter, bsrs []*blockStreamReader, stopCh <-chan struct{}) {
bsm := getBlockStreamMerger()
bsm.mustInit(bsw, bsrs)
for len(bsm.readersHeap) > 0 {
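// The reader at the top of the heap points at the block with the smallest (streamID, minTimestamp) pair across all the readers.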
if needStop(stopCh) {
break
}
bsr := bsm.readersHeap[0]
bsm.mustWriteBlock(&bsr.blockData, bsw)
if bsr.NextBlock() {
heap.Fix(&bsm.readersHeap, 0)
} else {
heap.Pop(&bsm.readersHeap)
}
}
bsm.mustFlushRows()
putBlockStreamMerger(bsm)
bsw.Finalize(ph)
mustCloseBlockStreamReaders(bsrs)
}
// blockStreamMerger merges block streams
type blockStreamMerger struct {
// bsw is the block stream writer to write the merged blocks.
bsw *blockStreamWriter
// bsrs contains the original readers passed to mustInit().
// They are used by ReadersPaths()
bsrs []*blockStreamReader
// readersHeap contains a heap of readers to read blocks to merge.
readersHeap blockStreamReadersHeap
// streamID is the stream ID for the pending data.
streamID streamID
// sbu is the unmarshaler for strings in rows and rowsTmp.
sbu *stringsBlockUnmarshaler
// vd is the decoder for unmarshaled strings.
vd *valuesDecoder
// bd is the pending blockData.
// bd is unpacked into rows when needed.
bd blockData
// rows contains the pending log entries.
rows rows
// rowsTmp is temporary storage for log entries during merge.
rowsTmp rows
// uncompressedRowsSizeBytes is the current size of uncompressed rows.
//
// It is used for flushing rows to blocks when their size reaches maxUncompressedBlockSize
uncompressedRowsSizeBytes uint64
}
func (bsm *blockStreamMerger) reset() {
bsm.bsw = nil
rhs := bsm.readersHeap
for i := range rhs {
rhs[i] = nil
}
bsm.readersHeap = rhs[:0]
bsm.streamID.reset()
bsm.resetRows()
}
func (bsm *blockStreamMerger) resetRows() {
if bsm.sbu != nil {
putStringsBlockUnmarshaler(bsm.sbu)
bsm.sbu = nil
}
if bsm.vd != nil {
putValuesDecoder(bsm.vd)
bsm.vd = nil
}
bsm.bd.reset()
bsm.rows.reset()
bsm.rowsTmp.reset()
bsm.uncompressedRowsSizeBytes = 0
}
func (bsm *blockStreamMerger) mustInit(bsw *blockStreamWriter, bsrs []*blockStreamReader) {
bsm.reset()
bsm.bsw = bsw
bsm.bsrs = bsrs
rsh := bsm.readersHeap[:0]
for _, bsr := range bsrs {
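// Pull the first block from every reader; readers without any blocks do not participate in the merge.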
if bsr.NextBlock() {
rsh = append(rsh, bsr)
}
}
bsm.readersHeap = rsh
heap.Init(&bsm.readersHeap)
}
// mustWriteBlock writes bd to bsm
func (bsm *blockStreamMerger) mustWriteBlock(bd *blockData, bsw *blockStreamWriter) {
bsm.checkNextBlock(bd)
switch {
case !bd.streamID.equal(&bsm.streamID):
// The bd contains another streamID.
// Write the current log entries under the current streamID, then process the bd.
bsm.mustFlushRows()
bsm.streamID = bd.streamID
if bd.uncompressedSizeBytes >= maxUncompressedBlockSize {
// Fast path - write full bd to the output without extracting log entries from it.
bsw.MustWriteBlockData(bd)
} else {
// Slow path - copy the bd to the current bsm.bd.
bsm.bd.copyFrom(bd)
}
case bd.uncompressedSizeBytes >= maxUncompressedBlockSize:
// The bd contains the same streamID and it is full,
// so it can be written next after the current log entries
// without the need to merge the bd with the current log entries.
// Write the current log entries and then the bd.
bsm.mustFlushRows()
bsw.MustWriteBlockData(bd)
default:
// The bd contains the same streamID and it isn't full,
// so it must be merged with the current log entries.
bsm.mustMergeRows(bd)
}
}
// checkNextBlock checks whether the bd can be written next after the current data.
func (bsm *blockStreamMerger) checkNextBlock(bd *blockData) {
if len(bsm.rows.timestamps) > 0 && bsm.bd.rowsCount > 0 {
logger.Panicf("BUG: bsm.bd must be empty when bsm.rows isn't empty! got %d log entries in bsm.bd", bsm.bd.rowsCount)
}
if bd.streamID.less(&bsm.streamID) {
logger.Panicf("FATAL: cannot merge %s: the streamID=%s for the next block is smaller than the streamID=%s for the current block",
bsm.ReadersPaths(), &bd.streamID, &bsm.streamID)
}
if !bd.streamID.equal(&bsm.streamID) {
return
}
// streamID at bd equals streamID at bsm. Check that the minTimestamp in bd is bigger than or equal to the minTimestamp at bsm.
if bd.rowsCount == 0 {
return
}
nextMinTimestamp := bd.timestampsData.minTimestamp
if len(bsm.rows.timestamps) == 0 {
if bsm.bd.rowsCount == 0 {
return
}
minTimestamp := bsm.bd.timestampsData.minTimestamp
if nextMinTimestamp < minTimestamp {
logger.Panicf("FATAL: cannot merge %s: the next block's minTimestamp=%d is smaller than the minTimestamp=%d for the current block",
bsm.ReadersPaths(), nextMinTimestamp, minTimestamp)
}
return
}
minTimestamp := bsm.rows.timestamps[0]
if nextMinTimestamp < minTimestamp {
logger.Panicf("FATAL: cannot merge %s: the next block's minTimestamp=%d is smaller than the minTimestamp=%d for log entries for the current block",
bsm.ReadersPaths(), nextMinTimestamp, minTimestamp)
}
}
// ReadersPaths returns paths for input blockStreamReaders
func (bsm *blockStreamMerger) ReadersPaths() string {
paths := make([]string, len(bsm.bsrs))
for i, bsr := range bsm.bsrs {
paths[i] = bsr.Path()
}
return fmt.Sprintf("[%s]", strings.Join(paths, ","))
}
// mustMergeRows merges the current log entries inside bsm with bd log entries.
func (bsm *blockStreamMerger) mustMergeRows(bd *blockData) {
if bsm.bd.rowsCount > 0 {
// Unmarshal log entries from bsm.bd
bsm.mustUnmarshalRows(&bsm.bd)
bsm.bd.reset()
}
// Unmarshal log entries from bd
rowsLen := len(bsm.rows.timestamps)
bsm.mustUnmarshalRows(bd)
// Merge unmarshaled log entries
timestamps := bsm.rows.timestamps
rows := bsm.rows.rows
bsm.rowsTmp.mergeRows(timestamps[:rowsLen], timestamps[rowsLen:], rows[:rowsLen], rows[rowsLen:])
bsm.rows, bsm.rowsTmp = bsm.rowsTmp, bsm.rows
bsm.rowsTmp.reset()
if bsm.uncompressedRowsSizeBytes >= maxUncompressedBlockSize {
bsm.mustFlushRows()
}
}
func (bsm *blockStreamMerger) mustUnmarshalRows(bd *blockData) {
rowsLen := len(bsm.rows.timestamps)
if bsm.sbu == nil {
bsm.sbu = getStringsBlockUnmarshaler()
}
if bsm.vd == nil {
bsm.vd = getValuesDecoder()
}
if err := bd.unmarshalRows(&bsm.rows, bsm.sbu, bsm.vd); err != nil {
logger.Panicf("FATAL: cannot merge %s: cannot unmarshal log entries from blockData: %s", bsm.ReadersPaths(), err)
}
bsm.uncompressedRowsSizeBytes += uncompressedRowsSizeBytes(bsm.rows.rows[rowsLen:])
}
func (bsm *blockStreamMerger) mustFlushRows() {
if len(bsm.rows.timestamps) == 0 {
bsm.bsw.MustWriteBlockData(&bsm.bd)
} else {
bsm.bsw.MustWriteRows(&bsm.streamID, bsm.rows.timestamps, bsm.rows.rows)
}
bsm.resetRows()
}
func getBlockStreamMerger() *blockStreamMerger {
v := blockStreamMergerPool.Get()
if v == nil {
return &blockStreamMerger{}
}
return v.(*blockStreamMerger)
}
func putBlockStreamMerger(bsm *blockStreamMerger) {
bsm.reset()
blockStreamMergerPool.Put(bsm)
}
var blockStreamMergerPool sync.Pool
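// blockStreamReadersHeap implements heap.Interface over blockStreamReader entries.
// Readers are ordered by (streamID, minTimestamp) of their current block, so the top of the heap
// always points at the block to be written next.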
type blockStreamReadersHeap []*blockStreamReader
func (h *blockStreamReadersHeap) Len() int {
return len(*h)
}
func (h *blockStreamReadersHeap) Less(i, j int) bool {
x := *h
a := &x[i].blockData
b := &x[j].blockData
if !a.streamID.equal(&b.streamID) {
return a.streamID.less(&b.streamID)
}
return a.timestampsData.minTimestamp < b.timestampsData.minTimestamp
}
func (h *blockStreamReadersHeap) Swap(i, j int) {
x := *h
x[i], x[j] = x[j], x[i]
}
func (h *blockStreamReadersHeap) Push(v interface{}) {
bsr := v.(*blockStreamReader)
*h = append(*h, bsr)
}
func (h *blockStreamReadersHeap) Pop() interface{} {
x := *h
bsr := x[len(x)-1]
x[len(x)-1] = nil
*h = x[:len(x)-1]
return bsr
}

View file

@@ -0,0 +1,383 @@
package logstorage
import (
"path/filepath"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
type readerWithStats struct {
r filestream.ReadCloser
bytesRead uint64
}
func (r *readerWithStats) reset() {
r.r = nil
r.bytesRead = 0
}
func (r *readerWithStats) init(rc filestream.ReadCloser) {
r.reset()
r.r = rc
}
// Path returns the path to r file
func (r *readerWithStats) Path() string {
return r.r.Path()
}
// MustReadFull reads len(data) bytes from r into data.
func (r *readerWithStats) MustReadFull(data []byte) {
fs.MustReadData(r.r, data)
r.bytesRead += uint64(len(data))
}
func (r *readerWithStats) Read(p []byte) (int, error) {
n, err := r.r.Read(p)
r.bytesRead += uint64(n)
return n, err
}
func (r *readerWithStats) MustClose() {
r.r.MustClose()
r.r = nil
}
// streamReaders contains readers for blockStreamReader
type streamReaders struct {
metaindexReader readerWithStats
indexReader readerWithStats
columnsHeaderReader readerWithStats
timestampsReader readerWithStats
fieldValuesReader readerWithStats
fieldBloomFilterReader readerWithStats
messageValuesReader readerWithStats
messageBloomFilterReader readerWithStats
}
func (sr *streamReaders) reset() {
sr.metaindexReader.reset()
sr.indexReader.reset()
sr.columnsHeaderReader.reset()
sr.timestampsReader.reset()
sr.fieldValuesReader.reset()
sr.fieldBloomFilterReader.reset()
sr.messageValuesReader.reset()
sr.messageBloomFilterReader.reset()
}
func (sr *streamReaders) init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader, fieldValuesReader, fieldBloomFilterReader,
messageValuesReader, messageBloomFilterReader filestream.ReadCloser,
) {
sr.metaindexReader.init(metaindexReader)
sr.indexReader.init(indexReader)
sr.columnsHeaderReader.init(columnsHeaderReader)
sr.timestampsReader.init(timestampsReader)
sr.fieldValuesReader.init(fieldValuesReader)
sr.fieldBloomFilterReader.init(fieldBloomFilterReader)
sr.messageValuesReader.init(messageValuesReader)
sr.messageBloomFilterReader.init(messageBloomFilterReader)
}
func (sr *streamReaders) totalBytesRead() uint64 {
n := uint64(0)
n += sr.metaindexReader.bytesRead
n += sr.indexReader.bytesRead
n += sr.columnsHeaderReader.bytesRead
n += sr.timestampsReader.bytesRead
n += sr.fieldValuesReader.bytesRead
n += sr.fieldBloomFilterReader.bytesRead
n += sr.messageValuesReader.bytesRead
n += sr.messageBloomFilterReader.bytesRead
return n
}
func (sr *streamReaders) MustClose() {
sr.metaindexReader.MustClose()
sr.indexReader.MustClose()
sr.columnsHeaderReader.MustClose()
sr.timestampsReader.MustClose()
sr.fieldValuesReader.MustClose()
sr.fieldBloomFilterReader.MustClose()
sr.messageValuesReader.MustClose()
sr.messageBloomFilterReader.MustClose()
}
// blockStreamReader is used for reading blocks in streaming manner from a part.
type blockStreamReader struct {
// blockData contains the data for the last read block
blockData blockData
// ph is the header for the part
ph partHeader
// streamReaders contains data readers in stream mode
streamReaders streamReaders
// indexBlockHeaders contains the list of all the indexBlockHeader entries for the part
indexBlockHeaders []indexBlockHeader
// blockHeaders contains the list of blockHeader entries for the current indexBlockHeader pointed by nextIndexBlockIdx
blockHeaders []blockHeader
// nextIndexBlockIdx is the index of the next item to read from indexBlockHeaders
nextIndexBlockIdx int
// nextBlockIdx is the index of the next item to read from blockHeaders
nextBlockIdx int
// globalUncompressedSizeBytes is the total size of log entries seen in the part
globalUncompressedSizeBytes uint64
// globalRowsCount is the number of log entries seen in the part
globalRowsCount uint64
// globalBlocksCount is the number of blocks seen in the part
globalBlocksCount uint64
// sidLast is the stream id for the previously read block
sidLast streamID
// minTimestampLast is the minimum timestamp for the previously read block
minTimestampLast int64
}
// reset resets bsr, so it can be re-used
func (bsr *blockStreamReader) reset() {
bsr.blockData.reset()
bsr.ph.reset()
bsr.streamReaders.reset()
ihs := bsr.indexBlockHeaders
if len(ihs) > 10e3 {
// The length of ihs is unbounded, so it is better to drop an overly long indexBlockHeaders slice in order to reduce memory usage
ihs = nil
}
for i := range ihs {
ihs[i].reset()
}
bsr.indexBlockHeaders = ihs[:0]
bhs := bsr.blockHeaders
for i := range bhs {
bhs[i].reset()
}
bsr.blockHeaders = bhs[:0]
bsr.nextIndexBlockIdx = 0
bsr.nextBlockIdx = 0
bsr.globalUncompressedSizeBytes = 0
bsr.globalRowsCount = 0
bsr.globalBlocksCount = 0
bsr.sidLast.reset()
bsr.minTimestampLast = 0
}
// Path returns part path for bsr (e.g. file path, url or in-memory reference)
func (bsr *blockStreamReader) Path() string {
path := bsr.streamReaders.metaindexReader.Path()
return filepath.Dir(path)
}
// MustInitFromInmemoryPart initializes bsr from mp.
func (bsr *blockStreamReader) MustInitFromInmemoryPart(mp *inmemoryPart) {
bsr.reset()
bsr.ph = mp.ph
// Initialize streamReaders
metaindexReader := mp.metaindex.NewReader()
indexReader := mp.index.NewReader()
columnsHeaderReader := mp.columnsHeader.NewReader()
timestampsReader := mp.timestamps.NewReader()
fieldValuesReader := mp.fieldValues.NewReader()
fieldBloomFilterReader := mp.fieldBloomFilter.NewReader()
messageValuesReader := mp.messageValues.NewReader()
messageBloomFilterReader := mp.messageBloomFilter.NewReader()
bsr.streamReaders.init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader,
fieldValuesReader, fieldBloomFilterReader, messageValuesReader, messageBloomFilterReader)
// Read metaindex data
bsr.indexBlockHeaders = mustReadIndexBlockHeaders(bsr.indexBlockHeaders[:0], &bsr.streamReaders.metaindexReader)
}
// MustInitFromFilePart initializes bsr from file part at the given path.
func (bsr *blockStreamReader) MustInitFromFilePart(path string) {
bsr.reset()
// Files in the part are always read without OS cache pollution,
// since they are usually deleted after the merge.
const nocache = true
metaindexPath := filepath.Join(path, metaindexFilename)
indexPath := filepath.Join(path, indexFilename)
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
timestampsPath := filepath.Join(path, timestampsFilename)
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
messageValuesPath := filepath.Join(path, messageValuesFilename)
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
bsr.ph.mustReadMetadata(path)
// Open data readers
metaindexReader := filestream.MustOpen(metaindexPath, nocache)
indexReader := filestream.MustOpen(indexPath, nocache)
columnsHeaderReader := filestream.MustOpen(columnsHeaderPath, nocache)
timestampsReader := filestream.MustOpen(timestampsPath, nocache)
fieldValuesReader := filestream.MustOpen(fieldValuesPath, nocache)
fieldBloomFilterReader := filestream.MustOpen(fieldBloomFilterPath, nocache)
messageValuesReader := filestream.MustOpen(messageValuesPath, nocache)
messageBloomFilterReader := filestream.MustOpen(messageBloomFilterPath, nocache)
// Initialize streamReaders
bsr.streamReaders.init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader,
fieldValuesReader, fieldBloomFilterReader, messageValuesReader, messageBloomFilterReader)
// Read metaindex data
bsr.indexBlockHeaders = mustReadIndexBlockHeaders(bsr.indexBlockHeaders[:0], &bsr.streamReaders.metaindexReader)
}
// NextBlock reads the next block from bsr and puts it into bsr.blockData.
//
// false is returned if there are no other blocks.
func (bsr *blockStreamReader) NextBlock() bool {
for bsr.nextBlockIdx >= len(bsr.blockHeaders) {
if !bsr.nextIndexBlock() {
return false
}
}
ih := &bsr.indexBlockHeaders[bsr.nextIndexBlockIdx-1]
bh := &bsr.blockHeaders[bsr.nextBlockIdx]
th := &bh.timestampsHeader
// Validate bh
if bh.streamID.less(&bsr.sidLast) {
logger.Panicf("FATAL: %s: blockHeader.streamID=%s cannot be smaller than the streamID from the previously read block: %s", bsr.Path(), &bh.streamID, &bsr.sidLast)
}
if bh.streamID.equal(&bsr.sidLast) && th.minTimestamp < bsr.minTimestampLast {
logger.Panicf("FATAL: %s: timestamps.minTimestamp=%d cannot be smaller than the minTimestamp for the previously read block for the same streamID: %d",
bsr.Path(), th.minTimestamp, bsr.minTimestampLast)
}
bsr.minTimestampLast = th.minTimestamp
bsr.sidLast = bh.streamID
if th.minTimestamp < ih.minTimestamp {
logger.Panicf("FATAL: %s: timestampsHeader.minTimestamp=%d cannot be smaller than indexBlockHeader.minTimestamp=%d", bsr.Path(), th.minTimestamp, ih.minTimestamp)
}
if th.maxTimestamp > ih.maxTimestamp {
logger.Panicf("FATAL: %s: timestampsHeader.maxTimestamp=%d cannot be bigger than indexBlockHeader.maxTimestamp=%d", bsr.Path(), th.maxTimestamp, ih.minTimestamp)
}
// Read bsr.blockData
bsr.blockData.mustReadFrom(bh, &bsr.streamReaders)
bsr.globalUncompressedSizeBytes += bh.uncompressedSizeBytes
bsr.globalRowsCount += bh.rowsCount
bsr.globalBlocksCount++
if bsr.globalUncompressedSizeBytes > bsr.ph.UncompressedSizeBytes {
logger.Panicf("FATAL: %s: too big size of entries read: %d; mustn't exceed partHeader.UncompressedSizeBytes=%d",
bsr.Path(), bsr.globalUncompressedSizeBytes, bsr.ph.UncompressedSizeBytes)
}
if bsr.globalRowsCount > bsr.ph.RowsCount {
logger.Panicf("FATAL: %s: too many log entries read so far: %d; mustn't exceed partHeader.RowsCount=%d", bsr.Path(), bsr.globalRowsCount, bsr.ph.RowsCount)
}
if bsr.globalBlocksCount > bsr.ph.BlocksCount {
logger.Panicf("FATAL: %s: too many blocks read so far: %d; mustn't exceed partHeader.BlocksCount=%d", bsr.Path(), bsr.globalBlocksCount, bsr.ph.BlocksCount)
}
// The block has been successfully read
bsr.nextBlockIdx++
return true
}
func (bsr *blockStreamReader) nextIndexBlock() bool {
// Advance to the next indexBlockHeader
if bsr.nextIndexBlockIdx >= len(bsr.indexBlockHeaders) {
// No more blocks left
// Validate bsr.ph
totalBytesRead := bsr.streamReaders.totalBytesRead()
if bsr.ph.CompressedSizeBytes != totalBytesRead {
logger.Panicf("FATAL: %s: partHeader.CompressedSizeBytes=%d must match the size of data read: %d", bsr.Path(), bsr.ph.CompressedSizeBytes, totalBytesRead)
}
if bsr.ph.UncompressedSizeBytes != bsr.globalUncompressedSizeBytes {
logger.Panicf("FATAL: %s: partHeader.UncompressedSizeBytes=%d must match the size of entries read: %d",
bsr.Path(), bsr.ph.UncompressedSizeBytes, bsr.globalUncompressedSizeBytes)
}
if bsr.ph.RowsCount != bsr.globalRowsCount {
logger.Panicf("FATAL: %s: partHeader.RowsCount=%d must match the number of log entries read: %d", bsr.Path(), bsr.ph.RowsCount, bsr.globalRowsCount)
}
if bsr.ph.BlocksCount != bsr.globalBlocksCount {
logger.Panicf("FATAL: %s: partHeader.BlocksCount=%d must match the number of blocks read: %d", bsr.Path(), bsr.ph.BlocksCount, bsr.globalBlocksCount)
}
return false
}
ih := &bsr.indexBlockHeaders[bsr.nextIndexBlockIdx]
// Validate ih
metaindexReader := &bsr.streamReaders.metaindexReader
if ih.minTimestamp < bsr.ph.MinTimestamp {
logger.Panicf("FATAL: %s: indexBlockHeader.minTimestamp=%d cannot be smaller than partHeader.MinTimestamp=%d",
metaindexReader.Path(), ih.minTimestamp, bsr.ph.MinTimestamp)
}
if ih.maxTimestamp > bsr.ph.MaxTimestamp {
logger.Panicf("FATAL: %s: indexBlockHeader.maxTimestamp=%d cannot be bigger than partHeader.MaxTimestamp=%d",
metaindexReader.Path(), ih.maxTimestamp, bsr.ph.MaxTimestamp)
}
// Read indexBlock for the given ih
bb := longTermBufPool.Get()
bb.B = ih.mustReadNextIndexBlock(bb.B[:0], &bsr.streamReaders)
bsr.blockHeaders = resetBlockHeaders(bsr.blockHeaders)
var err error
bsr.blockHeaders, err = unmarshalBlockHeaders(bsr.blockHeaders[:0], bb.B)
longTermBufPool.Put(bb)
if err != nil {
logger.Panicf("FATAL: %s: cannot unmarshal blockHeader entries: %s", bsr.streamReaders.indexReader.Path(), err)
}
bsr.nextIndexBlockIdx++
bsr.nextBlockIdx = 0
return true
}
// MustClose closes bsr.
func (bsr *blockStreamReader) MustClose() {
bsr.streamReaders.MustClose()
bsr.reset()
}
// getBlockStreamReader returns blockStreamReader.
//
// The returned blockStreamReader must be initialized with MustInitFromInmemoryPart() or MustInitFromFilePart().
// Call putBlockStreamReader() when the returned blockStreamReader is no longer needed.
func getBlockStreamReader() *blockStreamReader {
v := blockStreamReaderPool.Get()
if v == nil {
v = &blockStreamReader{}
}
bsr := v.(*blockStreamReader)
return bsr
}
// putBlockStreamReader returns bsr to the pool.
//
// bsr cannot be used after returning to the pool.
func putBlockStreamReader(bsr *blockStreamReader) {
bsr.reset()
blockStreamReaderPool.Put(bsr)
}
var blockStreamReaderPool sync.Pool
// mustCloseBlockStreamReaders calls MustClose() on the given bsrs.
func mustCloseBlockStreamReaders(bsrs []*blockStreamReader) {
for _, bsr := range bsrs {
bsr.MustClose()
}
}
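
A minimal usage sketch for the reader defined above; the part path and the processBlockData callback are hypothetical placeholders, and the calls simply chain the helpers from this file:

bsr := getBlockStreamReader()
bsr.MustInitFromFilePart("/path/to/part") // hypothetical part directory
for bsr.NextBlock() {
	// bsr.blockData holds the data for the block that has just been read.
	processBlockData(&bsr.blockData) // placeholder for caller-specific logic
}
bsr.MustClose()
putBlockStreamReader(bsr)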

View file

@ -0,0 +1,362 @@
package logstorage
import (
"path/filepath"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// writerWithStats writes data to w and tracks the total amount of data written in bytesWritten.
type writerWithStats struct {
w filestream.WriteCloser
bytesWritten uint64
}
func (w *writerWithStats) reset() {
w.w = nil
w.bytesWritten = 0
}
func (w *writerWithStats) init(wc filestream.WriteCloser) {
w.reset()
w.w = wc
}
func (w *writerWithStats) Path() string {
return w.w.Path()
}
func (w *writerWithStats) MustWrite(data []byte) {
fs.MustWriteData(w.w, data)
w.bytesWritten += uint64(len(data))
}
// MustClose closes the underlying w.
func (w *writerWithStats) MustClose() {
w.w.MustClose()
}
// streamWriters contains writers for blockStreamWriter
type streamWriters struct {
metaindexWriter writerWithStats
indexWriter writerWithStats
columnsHeaderWriter writerWithStats
timestampsWriter writerWithStats
fieldValuesWriter writerWithStats
fieldBloomFilterWriter writerWithStats
messageValuesWriter writerWithStats
messageBloomFilterWriter writerWithStats
}
func (sw *streamWriters) reset() {
sw.metaindexWriter.reset()
sw.indexWriter.reset()
sw.columnsHeaderWriter.reset()
sw.timestampsWriter.reset()
sw.fieldValuesWriter.reset()
sw.fieldBloomFilterWriter.reset()
sw.messageValuesWriter.reset()
sw.messageBloomFilterWriter.reset()
}
func (sw *streamWriters) init(metaindexWriter, indexWriter, columnsHeaderWriter, timestampsWriter, fieldValuesWriter, fieldBloomFilterWriter,
messageValuesWriter, messageBloomFilterWriter filestream.WriteCloser,
) {
sw.metaindexWriter.init(metaindexWriter)
sw.indexWriter.init(indexWriter)
sw.columnsHeaderWriter.init(columnsHeaderWriter)
sw.timestampsWriter.init(timestampsWriter)
sw.fieldValuesWriter.init(fieldValuesWriter)
sw.fieldBloomFilterWriter.init(fieldBloomFilterWriter)
sw.messageValuesWriter.init(messageValuesWriter)
sw.messageBloomFilterWriter.init(messageBloomFilterWriter)
}
func (sw *streamWriters) totalBytesWritten() uint64 {
n := uint64(0)
n += sw.metaindexWriter.bytesWritten
n += sw.indexWriter.bytesWritten
n += sw.columnsHeaderWriter.bytesWritten
n += sw.timestampsWriter.bytesWritten
n += sw.fieldValuesWriter.bytesWritten
n += sw.fieldBloomFilterWriter.bytesWritten
n += sw.messageValuesWriter.bytesWritten
n += sw.messageBloomFilterWriter.bytesWritten
return n
}
func (sw *streamWriters) MustClose() {
sw.metaindexWriter.MustClose()
sw.indexWriter.MustClose()
sw.columnsHeaderWriter.MustClose()
sw.timestampsWriter.MustClose()
sw.fieldValuesWriter.MustClose()
sw.fieldBloomFilterWriter.MustClose()
sw.messageValuesWriter.MustClose()
sw.messageBloomFilterWriter.MustClose()
}
// blockStreamWriter is used for writing blocks into the underlying storage in streaming manner.
type blockStreamWriter struct {
// streamWriters contains writer for block data
streamWriters streamWriters
// sidLast is the streamID for the last written block
sidLast streamID
// sidFirst is the streamID for the first block in the current indexBlock
sidFirst streamID
// minTimestampLast is the minimum timestamp seen for the last written block
minTimestampLast int64
// minTimestamp is the minimum timestamp seen across written blocks for the current indexBlock
minTimestamp int64
// maxTimestamp is the maximum timestamp seen across written blocks for the current indexBlock
maxTimestamp int64
// hasWrittenBlocks is set to true if at least a single block is written to the current indexBlock
hasWrittenBlocks bool
// globalUncompressedSizeBytes is the total size of all the log entries written via bsw
globalUncompressedSizeBytes uint64
// globalRowsCount is the total number of log entries written via bsw
globalRowsCount uint64
// globalBlocksCount is the total number of blocks written to bsw
globalBlocksCount uint64
// globalMinTimestamp is the minimum timestamp seen across all the blocks written to bsw
globalMinTimestamp int64
// globalMaxTimestamp is the maximum timestamp seen across all the blocks written to bsw
globalMaxTimestamp int64
// indexBlockData contains marshaled blockHeader data, which isn't written yet to indexFilename
indexBlockData []byte
// metaindexData contains marshaled indexBlockHeader data, which isn't written yet to metaindexFilename
metaindexData []byte
// indexBlockHeader is used for marshaling the data to metaindexData
indexBlockHeader indexBlockHeader
}
// reset resets bsw for subsequent re-use.
func (bsw *blockStreamWriter) reset() {
bsw.streamWriters.reset()
bsw.sidLast.reset()
bsw.sidFirst.reset()
bsw.minTimestampLast = 0
bsw.minTimestamp = 0
bsw.maxTimestamp = 0
bsw.hasWrittenBlocks = false
bsw.globalUncompressedSizeBytes = 0
bsw.globalRowsCount = 0
bsw.globalBlocksCount = 0
bsw.globalMinTimestamp = 0
bsw.globalMaxTimestamp = 0
bsw.indexBlockData = bsw.indexBlockData[:0]
if len(bsw.metaindexData) > 1024*1024 {
// The length of bsw.metaindexData is unbounded, so drop an overly long buffer
// in order to conserve memory.
bsw.metaindexData = nil
} else {
bsw.metaindexData = bsw.metaindexData[:0]
}
bsw.indexBlockHeader.reset()
}
// MustInitForInmemoryPart initializes bsw from mp
func (bsw *blockStreamWriter) MustInitForInmemoryPart(mp *inmemoryPart) {
bsw.reset()
bsw.streamWriters.init(&mp.metaindex, &mp.index, &mp.columnsHeader, &mp.timestamps, &mp.fieldValues, &mp.fieldBloomFilter, &mp.messageValues, &mp.messageBloomFilter)
}
// MustInitForFilePart initializes bsw for writing data to file part located at path.
//
// if nocache is true, then the written data doesn't go to OS page cache.
func (bsw *blockStreamWriter) MustInitForFilePart(path string, nocache bool) {
bsw.reset()
fs.MustMkdirFailIfExist(path)
metaindexPath := filepath.Join(path, metaindexFilename)
indexPath := filepath.Join(path, indexFilename)
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
timestampsPath := filepath.Join(path, timestampsFilename)
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
messageValuesPath := filepath.Join(path, messageValuesFilename)
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
// Always cache the metaindex file, since it is re-read immediately after part creation
metaindexWriter := filestream.MustCreate(metaindexPath, false)
indexWriter := filestream.MustCreate(indexPath, nocache)
columnsHeaderWriter := filestream.MustCreate(columnsHeaderPath, nocache)
timestampsWriter := filestream.MustCreate(timestampsPath, nocache)
fieldValuesWriter := filestream.MustCreate(fieldValuesPath, nocache)
fieldBloomFilterWriter := filestream.MustCreate(fieldBloomFilterPath, nocache)
messageValuesWriter := filestream.MustCreate(messageValuesPath, nocache)
messageBloomFilterWriter := filestream.MustCreate(messageBloomFilterPath, nocache)
bsw.streamWriters.init(metaindexWriter, indexWriter, columnsHeaderWriter, timestampsWriter,
fieldValuesWriter, fieldBloomFilterWriter, messageValuesWriter, messageBloomFilterWriter)
}
// MustWriteRows writes timestamps with rows under the given sid to bsw.
//
// timestamps must be sorted.
// sid must be bigger or equal to the sid for the previously written rows.
func (bsw *blockStreamWriter) MustWriteRows(sid *streamID, timestamps []int64, rows [][]Field) {
if len(timestamps) == 0 {
return
}
b := getBlock()
b.MustInitFromRows(timestamps, rows)
bsw.MustWriteBlock(sid, b)
putBlock(b)
}
// MustWriteBlockData writes bd to bsw.
//
// The bd.streamID must be bigger or equal to the streamID for the previously written blocks.
func (bsw *blockStreamWriter) MustWriteBlockData(bd *blockData) {
if bd.rowsCount == 0 {
return
}
bsw.mustWriteBlockInternal(&bd.streamID, nil, bd)
}
// MustWriteBlock writes b under the given sid to bsw.
//
// The sid must be bigger or equal to the sid for the previously written blocks.
// The minimum timestamp in b must be bigger or equal to the minimum timestamp written to the same sid.
func (bsw *blockStreamWriter) MustWriteBlock(sid *streamID, b *block) {
rowsCount := b.Len()
if rowsCount == 0 {
return
}
bsw.mustWriteBlockInternal(sid, b, nil)
}
func (bsw *blockStreamWriter) mustWriteBlockInternal(sid *streamID, b *block, bd *blockData) {
if sid.less(&bsw.sidLast) {
logger.Panicf("BUG: the sid=%s cannot be smaller than the previously written sid=%s", sid, &bsw.sidLast)
}
hasWrittenBlocks := bsw.hasWrittenBlocks
if !hasWrittenBlocks {
bsw.sidFirst = *sid
bsw.hasWrittenBlocks = true
}
isSeenSid := sid.equal(&bsw.sidLast)
bsw.sidLast = *sid
bh := getBlockHeader()
if b != nil {
b.mustWriteTo(sid, bh, &bsw.streamWriters)
} else {
bd.mustWriteTo(bh, &bsw.streamWriters)
}
th := &bh.timestampsHeader
if bsw.globalRowsCount == 0 || th.minTimestamp < bsw.globalMinTimestamp {
bsw.globalMinTimestamp = th.minTimestamp
}
if bsw.globalRowsCount == 0 || th.maxTimestamp > bsw.globalMaxTimestamp {
bsw.globalMaxTimestamp = th.maxTimestamp
}
if !hasWrittenBlocks || th.minTimestamp < bsw.minTimestamp {
bsw.minTimestamp = th.minTimestamp
}
if !hasWrittenBlocks || th.maxTimestamp > bsw.maxTimestamp {
bsw.maxTimestamp = th.maxTimestamp
}
if isSeenSid && th.minTimestamp < bsw.minTimestampLast {
logger.Panicf("BUG: the block for sid=%s cannot contain timestamp smaller than %d, but it contains timestamp %d", sid, bsw.minTimestampLast, th.minTimestamp)
}
bsw.minTimestampLast = th.minTimestamp
bsw.globalUncompressedSizeBytes += bh.uncompressedSizeBytes
bsw.globalRowsCount += bh.rowsCount
bsw.globalBlocksCount++
// Marshal bh
bsw.indexBlockData = bh.marshal(bsw.indexBlockData)
putBlockHeader(bh)
if len(bsw.indexBlockData) > maxUncompressedIndexBlockSize {
bsw.mustFlushIndexBlock(bsw.indexBlockData)
bsw.indexBlockData = bsw.indexBlockData[:0]
}
}
func (bsw *blockStreamWriter) mustFlushIndexBlock(data []byte) {
if len(data) > 0 {
bsw.indexBlockHeader.mustWriteIndexBlock(data, bsw.sidFirst, bsw.minTimestamp, bsw.maxTimestamp, &bsw.streamWriters)
bsw.metaindexData = bsw.indexBlockHeader.marshal(bsw.metaindexData)
}
bsw.hasWrittenBlocks = false
bsw.minTimestamp = 0
bsw.maxTimestamp = 0
bsw.sidFirst.reset()
}
// Finalize() finalizes the data write process and updates ph with the finalized stats
//
// It closes the writers passed to MustInitForInmemoryPart() or MustInitForFilePart().
//
// bsw can be re-used after calling Finalize().
func (bsw *blockStreamWriter) Finalize(ph *partHeader) {
ph.UncompressedSizeBytes = bsw.globalUncompressedSizeBytes
ph.RowsCount = bsw.globalRowsCount
ph.BlocksCount = bsw.globalBlocksCount
ph.MinTimestamp = bsw.globalMinTimestamp
ph.MaxTimestamp = bsw.globalMaxTimestamp
bsw.mustFlushIndexBlock(bsw.indexBlockData)
// Write metaindex data
bb := longTermBufPool.Get()
bb.B = encoding.CompressZSTDLevel(bb.B[:0], bsw.metaindexData, 1)
bsw.streamWriters.metaindexWriter.MustWrite(bb.B)
if len(bb.B) < 1024*1024 {
longTermBufPool.Put(bb)
}
ph.CompressedSizeBytes = bsw.streamWriters.totalBytesWritten()
bsw.streamWriters.MustClose()
bsw.reset()
}
var longTermBufPool bytesutil.ByteBufferPool
// getBlockStreamWriter returns new blockStreamWriter from the pool.
//
// Return back the blockStreamWriter to the pool when it is no longer needed by calling putBlockStreamWriter.
func getBlockStreamWriter() *blockStreamWriter {
v := blockStreamWriterPool.Get()
if v == nil {
return &blockStreamWriter{}
}
return v.(*blockStreamWriter)
}
// putBlockStreamWriter returns bsw to the pool.
func putBlockStreamWriter(bsw *blockStreamWriter) {
bsw.reset()
blockStreamWriterPool.Put(bsw)
}
var blockStreamWriterPool sync.Pool
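
A minimal usage sketch for the writer defined above, assuming a hypothetical destination path and already-prepared sid, timestamps and rows values:

bsw := getBlockStreamWriter()
bsw.MustInitForFilePart("/path/to/new-part", true) // hypothetical path; nocache=true skips the OS page cache
bsw.MustWriteRows(&sid, timestamps, rows) // timestamps must be sorted; sid must not be smaller than the previous one
var ph partHeader
bsw.Finalize(&ph) // ph now holds the stats for the written part
putBlockStreamWriter(bsw)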

View file

@ -0,0 +1,179 @@
package logstorage
import (
"fmt"
"reflect"
"testing"
)
func TestBlockMustInitFromRows(t *testing.T) {
f := func(timestamps []int64, rows [][]Field, bExpected *block) {
t.Helper()
b := getBlock()
defer putBlock(b)
b.MustInitFromRows(timestamps, rows)
if b.uncompressedSizeBytes() >= maxUncompressedBlockSize {
t.Fatalf("expecting non-full block")
}
if !reflect.DeepEqual(b, bExpected) {
t.Fatalf("unexpected block;\ngot\n%v\nwant\n%v", b, bExpected)
}
if n := b.Len(); n != len(timestamps) {
t.Fatalf("unexpected block len; got %d; want %d", n, len(timestamps))
}
b.assertValid()
}
// Empty log entries
f(nil, nil, &block{})
f([]int64{}, [][]Field{}, &block{})
// A single row
timestamps := []int64{1234}
rows := [][]Field{
{
{
Name: "msg",
Value: "foo",
},
{
Name: "level",
Value: "error",
},
},
}
bExpected := &block{
timestamps: []int64{1234},
constColumns: []Field{
{
Name: "level",
Value: "error",
},
{
Name: "msg",
Value: "foo",
},
},
}
f(timestamps, rows, bExpected)
// Multiple log entries with the same set of fields
timestamps = []int64{3, 5}
rows = [][]Field{
{
{
Name: "job",
Value: "foo",
},
{
Name: "instance",
Value: "host1",
},
},
{
{
Name: "job",
Value: "foo",
},
{
Name: "instance",
Value: "host2",
},
},
}
bExpected = &block{
timestamps: []int64{3, 5},
columns: []column{
{
name: "instance",
values: []string{"host1", "host2"},
},
},
constColumns: []Field{
{
Name: "job",
Value: "foo",
},
},
}
f(timestamps, rows, bExpected)
// Multiple log entries with distinct set of fields
timestamps = []int64{3, 5, 10}
rows = [][]Field{
{
{
Name: "msg",
Value: "foo",
},
{
Name: "b",
Value: "xyz",
},
},
{
{
Name: "b",
Value: "xyz",
},
{
Name: "a",
Value: "aaa",
},
},
{
{
Name: "b",
Value: "xyz",
},
},
}
bExpected = &block{
timestamps: []int64{3, 5, 10},
columns: []column{
{
name: "a",
values: []string{"", "aaa", ""},
},
{
name: "msg",
values: []string{"foo", "", ""},
},
},
constColumns: []Field{
{
Name: "b",
Value: "xyz",
},
},
}
f(timestamps, rows, bExpected)
}
func TestBlockMustInitFromRowsFullBlock(t *testing.T) {
const rowsCount = 2000
timestamps := make([]int64, rowsCount)
rows := make([][]Field, rowsCount)
for i := range timestamps {
fields := make([]Field, 10)
for j := range fields {
fields[j] = Field{
Name: fmt.Sprintf("field_%d", j),
Value: "very very looooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooong value",
}
}
rows[i] = fields
}
b := getBlock()
defer putBlock(b)
b.MustInitFromRows(timestamps, rows)
if n := b.Len(); n != len(rows) {
t.Fatalf("unexpected total log entries; got %d; want %d", n, len(rows))
}
if b.uncompressedSizeBytes() < maxUncompressedBlockSize {
t.Fatalf("expecting full block")
}
b.assertValid()
}

View file

@ -0,0 +1,46 @@
package logstorage
import (
"fmt"
"testing"
)
func BenchmarkBlock_MustInitFromRows(b *testing.B) {
for _, rowsPerBlock := range []int{1, 10, 100, 1000, 10000} {
b.Run(fmt.Sprintf("rowsPerBlock_%d", rowsPerBlock), func(b *testing.B) {
benchmarkBlockMustInitFromRows(b, rowsPerBlock)
})
}
}
func benchmarkBlockMustInitFromRows(b *testing.B, rowsPerBlock int) {
timestamps, rows := newTestRows(rowsPerBlock, 10)
b.ReportAllocs()
b.SetBytes(int64(len(timestamps)))
b.RunParallel(func(pb *testing.PB) {
block := getBlock()
defer putBlock(block)
for pb.Next() {
block.MustInitFromRows(timestamps, rows)
if n := block.Len(); n != len(timestamps) {
panic(fmt.Errorf("unexpected block length; got %d; want %d", n, len(timestamps)))
}
}
})
}
func newTestRows(rowsCount, fieldsPerRow int) ([]int64, [][]Field) {
timestamps := make([]int64, rowsCount)
rows := make([][]Field, rowsCount)
for i := range timestamps {
timestamps[i] = int64(i) * 1e9
fields := make([]Field, fieldsPerRow)
for j := range fields {
f := &fields[j]
f.Name = fmt.Sprintf("field_%d", j)
f.Value = fmt.Sprintf("value_%d_%d", i, j)
}
rows[i] = fields
}
return timestamps, rows
}

View file

@ -0,0 +1,176 @@
package logstorage
import (
"fmt"
"sync"
"unsafe"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/cespare/xxhash/v2"
)
// bloomFilterHashesCount is the number of different hashes to use for bloom filter.
const bloomFilterHashesCount = 6
// bloomFilterBitsPerItem is the number of bits to use per each token.
const bloomFilterBitsPerItem = 16
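// With the standard Bloom filter estimate (1 - e^(-k/r))^k, where k = bloomFilterHashesCount = 6
// and r = bloomFilterBitsPerItem = 16, the expected false positive rate is roughly
// (1 - e^(-6/16))^6 ≈ 0.001, which is consistent with the threshold checked in TestBloomFilterFalsePositive.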
// bloomFilterMarshal appends marshaled bloom filter for tokens to dst and returns the result.
func bloomFilterMarshal(dst []byte, tokens []string) []byte {
bf := getBloomFilter()
bf.mustInit(tokens)
dst = bf.marshal(dst)
putBloomFilter(bf)
return dst
}
type bloomFilter struct {
bits []uint64
}
func (bf *bloomFilter) reset() {
bits := bf.bits
for i := range bits {
bits[i] = 0
}
bf.bits = bits[:0]
}
// marshal appends marshaled bf to dst and returns the result.
func (bf *bloomFilter) marshal(dst []byte) []byte {
bits := bf.bits
for _, word := range bits {
dst = encoding.MarshalUint64(dst, word)
}
return dst
}
// unmarshal unmarshals bf from src.
func (bf *bloomFilter) unmarshal(src []byte) error {
if len(src)%8 != 0 {
return fmt.Errorf("cannot unmarshal bloomFilter from src with size not multiple by 8; len(src)=%d", len(src))
}
bf.reset()
wordsCount := len(src) / 8
bits := bf.bits
if n := wordsCount - cap(bits); n > 0 {
bits = append(bits[:cap(bits)], make([]uint64, n)...)
}
bits = bits[:wordsCount]
for i := range bits {
bits[i] = encoding.UnmarshalUint64(src)
src = src[8:]
}
bf.bits = bits
return nil
}
// mustInit initializes bf with the given tokens
func (bf *bloomFilter) mustInit(tokens []string) {
bitsCount := len(tokens) * bloomFilterBitsPerItem
wordsCount := (bitsCount + 63) / 64
bits := bf.bits
if n := wordsCount - cap(bits); n > 0 {
bits = append(bits[:cap(bits)], make([]uint64, n)...)
}
bits = bits[:wordsCount]
bloomFilterAdd(bits, tokens)
bf.bits = bits
}
// bloomFilterAdd adds the given tokens to the bloom filter bits
func bloomFilterAdd(bits []uint64, tokens []string) {
maxBits := uint64(len(bits)) * 64
var buf [8]byte
hp := (*uint64)(unsafe.Pointer(&buf[0]))
for _, token := range tokens {
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
for i := 0; i < bloomFilterHashesCount; i++ {
hi := xxhash.Sum64(buf[:])
(*hp)++
idx := hi % maxBits
i := idx / 64
j := idx % 64
mask := uint64(1) << j
w := bits[i]
if (w & mask) == 0 {
bits[i] = w | mask
}
}
}
}
// containsAll returns true if bf contains all the given tokens.
func (bf *bloomFilter) containsAll(tokens []string) bool {
bits := bf.bits
if len(bits) == 0 {
return true
}
maxBits := uint64(len(bits)) * 64
var buf [8]byte
hp := (*uint64)(unsafe.Pointer(&buf[0]))
for _, token := range tokens {
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
for i := 0; i < bloomFilterHashesCount; i++ {
hi := xxhash.Sum64(buf[:])
(*hp)++
idx := hi % maxBits
i := idx / 64
j := idx % 64
mask := uint64(1) << j
w := bits[i]
if (w & mask) == 0 {
// The token is missing
return false
}
}
}
return true
}
// containsAny returns true if bf contains at least a single token from the given tokens.
func (bf *bloomFilter) containsAny(tokens []string) bool {
bits := bf.bits
if len(bits) == 0 {
return true
}
maxBits := uint64(len(bits)) * 64
var buf [8]byte
hp := (*uint64)(unsafe.Pointer(&buf[0]))
nextToken:
for _, token := range tokens {
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
for i := 0; i < bloomFilterHashesCount; i++ {
hi := xxhash.Sum64(buf[:])
(*hp)++
idx := hi % maxBits
i := idx / 64
j := idx % 64
mask := uint64(1) << j
w := bits[i]
if (w & mask) == 0 {
// The token is missing. Check the next token
continue nextToken
}
}
// It is likely the token exists in the bloom filter
return true
}
return false
}
func getBloomFilter() *bloomFilter {
v := bloomFilterPool.Get()
if v == nil {
return &bloomFilter{}
}
return v.(*bloomFilter)
}
func putBloomFilter(bf *bloomFilter) {
bf.reset()
bloomFilterPool.Put(bf)
}
var bloomFilterPool sync.Pool

View file

@ -0,0 +1,84 @@
package logstorage
import (
"fmt"
"testing"
)
func TestBloomFilter(t *testing.T) {
f := func(tokens []string) {
t.Helper()
data := bloomFilterMarshal(nil, tokens)
bf := getBloomFilter()
defer putBloomFilter(bf)
if err := bf.unmarshal(data); err != nil {
t.Fatalf("unexpected error when unmarshaling bloom filter: %s", err)
}
for _, token := range tokens {
if !bf.containsAny([]string{token}) {
t.Fatalf("bloomFilterContains must return true for the added token %q", token)
}
}
if !bf.containsAll(tokens) {
t.Fatalf("bloomFilterContains must return true for the added tokens")
}
}
f(nil)
f([]string{"foo"})
f([]string{"foo", "bar", "baz"})
// 10k tokens
tokens := make([]string, 10000)
for i := range tokens {
tokens[i] = fmt.Sprintf("token_%d", i)
}
f(tokens)
}
func TestBloomFilterUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
bf := getBloomFilter()
defer putBloomFilter(bf)
if err := bf.unmarshal(data); err == nil {
t.Fatalf("expecting non-nil error")
}
}
f([]byte("a"))
f([]byte("foo"))
}
func TestBloomFilterUnmarshalGarbage(t *testing.T) {
data := []byte("01234567")
var bf bloomFilter
if err := bf.unmarshal(data); err != nil {
t.Fatalf("unexpected error: %s", err)
}
}
func TestBloomFilterFalsePositive(t *testing.T) {
tokens := make([]string, 20000)
for i := range tokens {
tokens[i] = fmt.Sprintf("token_%d", i)
}
data := bloomFilterMarshal(nil, tokens)
bf := getBloomFilter()
defer putBloomFilter(bf)
if err := bf.unmarshal(data); err != nil {
t.Fatalf("unexpected error when unmarshaling bloom filter: %s", err)
}
// count the number of false positives on 20K non-existing tokens
falsePositives := 0
for i := range tokens {
token := fmt.Sprintf("non-existing-token_%d", i)
if bf.containsAny([]string{token}) {
falsePositives++
}
}
p := float64(falsePositives) / float64(len(tokens))
maxFalsePositive := 0.0011
if p > maxFalsePositive {
t.Fatalf("too high false positive rate; got %.4f; want %.4f max", p, maxFalsePositive)
}
}

32
lib/logstorage/consts.go Normal file
View file

@ -0,0 +1,32 @@
package logstorage
// maxUncompressedIndexBlockSize is the maximum length of an uncompressed block with blockHeader entries (aka index block).
//
// The real block length can exceed this value by a small percentage because of the block write details.
const maxUncompressedIndexBlockSize = 128 * 1024
// maxUncompressedBlockSize is the maximum size of uncompressed block in bytes.
//
// The real uncompressed block can exceed this value by up to 2 times because of block merge details.
const maxUncompressedBlockSize = 2 * 1024 * 1024
// maxRowsPerBlock is the maximum number of log entries a single block can contain.
const maxRowsPerBlock = 8 * 1024 * 1024
// maxColumnsPerBlock is the maximum number of columns per block.
const maxColumnsPerBlock = 10000
// maxIndexBlockSize is the maximum size of the block with blockHeader entries (aka indexBlock)
const maxIndexBlockSize = 8 * 1024 * 1024
// maxTimestampsBlockSize is the maximum size of timestamps block
const maxTimestampsBlockSize = 8 * 1024 * 1024
// maxValuesBlockSize is the maximum size of values block
const maxValuesBlockSize = 8 * 1024 * 1024
// maxBloomFilterBlockSize is the maximum size of bloom filter block
const maxBloomFilterBlockSize = 8 * 1024 * 1024
// maxColumnsHeaderSize is the maximum size of columnsHeader block
const maxColumnsHeaderSize = 8 * 1024 * 1024

990
lib/logstorage/datadb.go Normal file
View file

@ -0,0 +1,990 @@
package logstorage
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
)
// Default number of parts to merge at once.
//
// This number has been obtained empirically - it gives the lowest possible overhead.
// See appendPartsToMerge tests for details.
const defaultPartsToMerge = 15
// minMergeMultiplier is the minimum multiplier for the size of the output part
// compared to the size of the maximum input part for the merge.
//
// A higher value reduces write amplification (disk write IO induced by the merge),
// while increasing the number of unmerged parts.
// The value of 1.7 is good enough for production workloads.
const minMergeMultiplier = 1.7
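// For example, merging 15 parts of roughly equal size produces an output part about 15x bigger than
// the largest input part, while merging one big part with a few tiny ones gives a multiplier close to 1;
// appendPartsToMerge below skips such low-multiplier merges, since they mostly re-write already merged data.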
// The maximum number of inmemory parts in the partition.
//
// If the number of inmemory parts reaches this value, then assisted merge runs during data ingestion.
const maxInmemoryPartsPerPartition = 20
// datadb represents a database with log data
type datadb struct {
// pt is the partition the datadb belongs to
pt *partition
// mergeIdx is used for generating unique directory names for parts
mergeIdx uint64
// path is the path to the directory with log data
path string
// flushInterval is interval for flushing the inmemory parts to disk
flushInterval time.Duration
// inmemoryParts contains a list of inmemory parts
inmemoryParts []*partWrapper
// fileParts contains a list of file-based parts
fileParts []*partWrapper
// partsLock protects parts from concurrent access
partsLock sync.Mutex
// wg is used for determining when background workers stop
wg sync.WaitGroup
// stopCh is used for notifying background workers to stop
stopCh chan struct{}
// mergeDoneCond is used for pace-limiting the data ingestion rate
mergeDoneCond *sync.Cond
// inmemoryPartsFlushersCount is the number of currently running in-memory parts flushers
//
// This variable must be accessed under partsLock.
inmemoryPartsFlushersCount int
// mergeWorkersCount is the number of currently running merge workers
//
// This variable must be accessed under partsLock.
mergeWorkersCount int
}
// partWrapper is a wrapper for opened part.
type partWrapper struct {
// refCount is the number of references to p.
//
// When the number of references reaches zero, then p is closed.
refCount int32
// The flag, which is set when the part must be deleted after refCount reaches zero.
mustBeDeleted uint32
// p is an opened part
p *part
// mp references inmemory part used for initializing p.
mp *inmemoryPart
// isInMerge is set to true if the part takes part in merge.
isInMerge bool
// The deadline when in-memory part must be flushed to disk.
flushDeadline time.Time
}
func (pw *partWrapper) incRef() {
atomic.AddInt32(&pw.refCount, 1)
}
func (pw *partWrapper) decRef() {
n := atomic.AddInt32(&pw.refCount, -1)
if n > 0 {
return
}
deletePath := ""
if pw.mp == nil {
if atomic.LoadUint32(&pw.mustBeDeleted) != 0 {
deletePath = pw.p.path
}
} else {
putInmemoryPart(pw.mp)
pw.mp = nil
}
mustClosePart(pw.p)
pw.p = nil
if deletePath != "" {
fs.MustRemoveAll(deletePath)
}
}
func mustCreateDatadb(path string) {
fs.MustMkdirFailIfExist(path)
mustWritePartNames(path, []string{})
}
// mustOpenDatadb opens datadb at the given path with the given flushInterval for in-memory data.
func mustOpenDatadb(pt *partition, path string, flushInterval time.Duration) *datadb {
// Remove temporary directories, which may be left after unclean shutdown.
fs.MustRemoveTemporaryDirs(path)
partNames := mustReadPartNames(path)
mustRemoveUnusedDirs(path, partNames)
pws := make([]*partWrapper, len(partNames))
for i, partName := range partNames {
partPath := filepath.Join(path, partName)
p := mustOpenFilePart(pt, partPath)
pws[i] = newPartWrapper(p, nil, time.Time{})
}
ddb := &datadb{
pt: pt,
mergeIdx: uint64(time.Now().UnixNano()),
flushInterval: flushInterval,
path: path,
fileParts: pws,
stopCh: make(chan struct{}),
}
ddb.mergeDoneCond = sync.NewCond(&ddb.partsLock)
// Start merge workers in the hope they'll merge the remaining parts
ddb.partsLock.Lock()
n := getMergeWorkersCount()
for i := 0; i < n; i++ {
ddb.startMergeWorkerLocked()
}
ddb.partsLock.Unlock()
return ddb
}
// startInmemoryPartsFlusherLocked starts a background flusher, which flushes in-memory parts to disk.
//
// This function must be called under partsLock.
func (ddb *datadb) startInmemoryPartsFlusherLocked() {
if ddb.inmemoryPartsFlushersCount >= 1 {
return
}
ddb.inmemoryPartsFlushersCount++
ddb.wg.Add(1)
go func() {
ddb.flushInmemoryParts()
ddb.wg.Done()
}()
}
func (ddb *datadb) flushInmemoryParts() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
ddb.partsLock.Lock()
pws := make([]*partWrapper, 0, len(ddb.inmemoryParts))
pws = appendNotInMergePartsLocked(pws, ddb.inmemoryParts)
currentTime := time.Now()
partsToFlush := pws[:0]
for _, pw := range pws {
if pw.flushDeadline.Before(currentTime) {
partsToFlush = append(partsToFlush, pw)
}
}
setInMergeLocked(partsToFlush)
if len(pws) == 0 {
ddb.inmemoryPartsFlushersCount--
}
ddb.partsLock.Unlock()
if len(pws) == 0 {
// There are no in-memory parts, so stop the flusher.
return
}
ddb.mustMergePartsFinal(partsToFlush)
select {
case <-ddb.stopCh:
return
case <-ticker.C:
}
}
}
// startMergeWorkerLocked starts a merge worker.
//
// This function must be called under locked partsLock.
func (ddb *datadb) startMergeWorkerLocked() {
if ddb.mergeWorkersCount >= getMergeWorkersCount() {
return
}
ddb.mergeWorkersCount++
ddb.wg.Add(1)
go func() {
globalMergeLimitCh <- struct{}{}
ddb.mustMergeExistingParts()
<-globalMergeLimitCh
ddb.wg.Done()
}()
}
// globalMergeLimitCh limits the number of concurrent merges across all the partitions
var globalMergeLimitCh = make(chan struct{}, getMergeWorkersCount())
func getMergeWorkersCount() int {
n := cgroup.AvailableCPUs()
if n < 4 {
// Use a bigger number of workers on systems with a small number of CPU cores,
// since a single worker may stay busy for a long time when merging big parts.
// The remaining workers can then continue performing merges
// for newly added small parts.
return 4
}
return n
}
func (ddb *datadb) mustMergeExistingParts() {
for !needStop(ddb.stopCh) {
maxOutBytes := ddb.availableDiskSpace()
ddb.partsLock.Lock()
parts := make([]*partWrapper, 0, len(ddb.inmemoryParts)+len(ddb.fileParts))
parts = appendNotInMergePartsLocked(parts, ddb.inmemoryParts)
parts = appendNotInMergePartsLocked(parts, ddb.fileParts)
pws := appendPartsToMerge(nil, parts, maxOutBytes)
setInMergeLocked(pws)
if len(pws) == 0 {
ddb.mergeWorkersCount--
}
ddb.partsLock.Unlock()
if len(pws) == 0 {
// Nothing to merge at the moment.
return
}
partsSize := getCompressedSize(pws)
if !ddb.reserveDiskSpace(partsSize) {
// There is no free disk space for the merge,
// because concurrent merge workers already reserved the disk space.
// Try again with smaller maxOutBytes.
ddb.releasePartsToMerge(pws)
continue
}
ddb.mustMergeParts(pws, false)
ddb.releaseDiskSpace(partsSize)
}
}
// appendNotInMergePartsLocked appends src parts with isInMerge=false to dst and returns the result.
//
// This function must be called under partsLock.
func appendNotInMergePartsLocked(dst, src []*partWrapper) []*partWrapper {
for _, pw := range src {
if !pw.isInMerge {
dst = append(dst, pw)
}
}
return dst
}
// setInMergeLocked sets isInMerge flag for pws.
//
// This function must be called under partsLock.
func setInMergeLocked(pws []*partWrapper) {
for _, pw := range pws {
if pw.isInMerge {
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to true")
}
pw.isInMerge = true
}
}
func assertIsInMerge(pws []*partWrapper) {
for _, pw := range pws {
if !pw.isInMerge {
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to false")
}
}
}
// mustMergeParts merges pws to a single resulting part.
//
// if isFinal is set, then the resulting part will be saved to disk.
//
// All the parts inside pws must have isInMerge field set to true.
func (ddb *datadb) mustMergeParts(pws []*partWrapper, isFinal bool) {
if len(pws) == 0 {
// Nothing to merge.
return
}
assertIsInMerge(pws)
startTime := time.Now()
// Initialize destination paths.
dstPartType := ddb.getDstPartType(pws, isFinal)
mergeIdx := ddb.nextMergeIdx()
dstPartPath := ddb.getDstPartPath(dstPartType, mergeIdx)
if isFinal && len(pws) == 1 && pws[0].mp != nil {
// Fast path: flush a single in-memory part to disk.
mp := pws[0].mp
mp.MustStoreToDisk(dstPartPath)
pwNew := ddb.openCreatedPart(&mp.ph, pws, nil, dstPartPath)
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
return
}
// Prepare blockStreamReaders for source parts.
bsrs := mustOpenBlockStreamReaders(pws)
// Prepare BlockStreamWriter for destination part.
srcSize := uint64(0)
srcRowsCount := uint64(0)
srcBlocksCount := uint64(0)
for _, pw := range pws {
srcSize += pw.p.ph.CompressedSizeBytes
srcRowsCount += pw.p.ph.RowsCount
srcBlocksCount += pw.p.ph.BlocksCount
}
bsw := getBlockStreamWriter()
var mpNew *inmemoryPart
if dstPartType == partInmemory {
mpNew = getInmemoryPart()
bsw.MustInitForInmemoryPart(mpNew)
} else {
nocache := !shouldUsePageCacheForPartSize(srcSize)
bsw.MustInitForFilePart(dstPartPath, nocache)
}
// Merge source parts to destination part.
var ph partHeader
stopCh := ddb.stopCh
if isFinal {
// The final merge shouldn't be stopped even if ddb.stopCh is closed.
stopCh = nil
}
mustMergeBlockStreams(&ph, bsw, bsrs, stopCh)
putBlockStreamWriter(bsw)
for _, bsr := range bsrs {
putBlockStreamReader(bsr)
}
// Persist partHeader for destination part after the merge.
if mpNew != nil {
mpNew.ph = ph
} else {
ph.mustWriteMetadata(dstPartPath)
// Make sure the created part directory listing is synced.
fs.MustSyncPath(dstPartPath)
}
if needStop(stopCh) {
ddb.releasePartsToMerge(pws)
ddb.mergeDoneCond.Broadcast()
// Remove incomplete destination part
if dstPartType == partFile {
fs.MustRemoveAll(dstPartPath)
}
return
}
// Atomically swap the source parts with the newly created part.
pwNew := ddb.openCreatedPart(&ph, pws, mpNew, dstPartPath)
dstSize := uint64(0)
dstRowsCount := uint64(0)
dstBlocksCount := uint64(0)
if pwNew != nil {
pDst := pwNew.p
dstSize = pDst.ph.CompressedSizeBytes
dstRowsCount = pDst.ph.RowsCount
dstBlocksCount = pDst.ph.BlocksCount
}
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
d := time.Since(startTime)
if d <= 30*time.Second {
return
}
// Log stats for long merges.
durationSecs := d.Seconds()
rowsPerSec := int(float64(srcRowsCount) / durationSecs)
logger.Infof("merged (%d parts, %d rows, %d blocks, %d bytes) into (1 part, %d rows, %d blocks, %d bytes) in %.3f seconds at %d rows/sec to %q",
len(pws), srcRowsCount, srcBlocksCount, srcSize, dstRowsCount, dstBlocksCount, dstSize, durationSecs, rowsPerSec, dstPartPath)
}
func (ddb *datadb) nextMergeIdx() uint64 {
return atomic.AddUint64(&ddb.mergeIdx, 1)
}
type partType int
var (
partInmemory = partType(0)
partFile = partType(1)
)
func (ddb *datadb) getDstPartType(pws []*partWrapper, isFinal bool) partType {
if isFinal {
return partFile
}
dstPartSize := getCompressedSize(pws)
if dstPartSize > getMaxInmemoryPartSize() {
return partFile
}
if !areAllInmemoryParts(pws) {
// If at least a single source part is located in file,
// then the destination part must be in file for durability reasons.
return partFile
}
return partInmemory
}
func (ddb *datadb) getDstPartPath(dstPartType partType, mergeIdx uint64) string {
ptPath := ddb.path
dstPartPath := ""
if dstPartType != partInmemory {
dstPartPath = filepath.Join(ptPath, fmt.Sprintf("%016X", mergeIdx))
}
return dstPartPath
}
func (ddb *datadb) openCreatedPart(ph *partHeader, pws []*partWrapper, mpNew *inmemoryPart, dstPartPath string) *partWrapper {
// Open the created part.
if ph.RowsCount == 0 {
// The created part is empty. Remove it
if mpNew == nil {
fs.MustRemoveAll(dstPartPath)
}
return nil
}
var p *part
var flushDeadline time.Time
if mpNew != nil {
// Open the created part from memory.
p = mustOpenInmemoryPart(ddb.pt, mpNew)
flushDeadline = ddb.getFlushToDiskDeadline(pws)
} else {
// Open the created part from disk.
p = mustOpenFilePart(ddb.pt, dstPartPath)
}
return newPartWrapper(p, mpNew, flushDeadline)
}
func (ddb *datadb) mustAddRows(lr *LogRows) {
if len(lr.streamIDs) == 0 {
return
}
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
p := mustOpenInmemoryPart(ddb.pt, mp)
flushDeadline := time.Now().Add(ddb.flushInterval)
pw := newPartWrapper(p, mp, flushDeadline)
ddb.partsLock.Lock()
ddb.inmemoryParts = append(ddb.inmemoryParts, pw)
ddb.startInmemoryPartsFlusherLocked()
if len(ddb.inmemoryParts) > defaultPartsToMerge {
ddb.startMergeWorkerLocked()
}
for len(ddb.inmemoryParts) > maxInmemoryPartsPerPartition {
// limit the pace for data ingestion if too many inmemory parts are created
ddb.mergeDoneCond.Wait()
}
ddb.partsLock.Unlock()
}
// DatadbStats contains various stats for datadb.
type DatadbStats struct {
// InmemoryRowsCount is the number of rows, which weren't flushed to disk yet.
InmemoryRowsCount uint64
// FileRowsCount is the number of rows stored on disk.
FileRowsCount uint64
// InmemoryParts is the number of in-memory parts, which weren't flushed to disk yet.
InmemoryParts uint64
// FileParts is the number of file-based parts stored on disk.
FileParts uint64
// InmemoryBlocks is the number of in-memory blocks, which weren't flushed to disk yet.
InmemoryBlocks uint64
// FileBlocks is the number of file-based blocks stored on disk.
FileBlocks uint64
// CompressedInmemorySize is the size of compressed data stored in memory.
CompressedInmemorySize uint64
// CompressedFileSize is the size of compressed data stored on disk.
CompressedFileSize uint64
// UncompressedInmemorySize is the size of uncompressed data stored in memory.
UncompressedInmemorySize uint64
// UncompressedFileSize is the size of uncompressed data stored on disk.
UncompressedFileSize uint64
}
func (s *DatadbStats) reset() {
*s = DatadbStats{}
}
// RowsCount returns the number of rows stored in datadb.
func (s *DatadbStats) RowsCount() uint64 {
return s.InmemoryRowsCount + s.FileRowsCount
}
// updateStats updates s with ddb stats
func (ddb *datadb) updateStats(s *DatadbStats) {
ddb.partsLock.Lock()
s.InmemoryRowsCount += getRowsCount(ddb.inmemoryParts)
s.FileRowsCount += getRowsCount(ddb.fileParts)
s.InmemoryParts += uint64(len(ddb.inmemoryParts))
s.FileParts += uint64(len(ddb.fileParts))
s.InmemoryBlocks += getBlocksCount(ddb.inmemoryParts)
s.FileBlocks += getBlocksCount(ddb.fileParts)
s.CompressedInmemorySize += getCompressedSize(ddb.inmemoryParts)
s.CompressedFileSize += getCompressedSize(ddb.fileParts)
s.UncompressedInmemorySize += getUncompressedSize(ddb.inmemoryParts)
s.UncompressedFileSize += getUncompressedSize(ddb.fileParts)
ddb.partsLock.Unlock()
}
// debugFlush() makes sure that the recently ingested data is available for search.
func (ddb *datadb) debugFlush() {
// Nothing to do, since all the ingested data is available for search via ddb.inmemoryParts.
}
func (ddb *datadb) mustMergePartsFinal(pws []*partWrapper) {
assertIsInMerge(pws)
var pwsChunk []*partWrapper
for len(pws) > 0 {
pwsChunk = appendPartsToMerge(pwsChunk[:0], pws, (1<<64)-1)
if len(pwsChunk) == 0 {
pwsChunk = append(pwsChunk[:0], pws...)
}
ddb.mustMergeParts(pwsChunk, true)
partsToRemove := partsToMap(pwsChunk)
removedParts := 0
pws, removedParts = removeParts(pws, partsToRemove)
if removedParts != len(pwsChunk) {
logger.Panicf("BUG: unexpected number of parts removed; got %d; want %d", removedParts, len(pwsChunk))
}
}
}
func partsToMap(pws []*partWrapper) map[*partWrapper]struct{} {
m := make(map[*partWrapper]struct{}, len(pws))
for _, pw := range pws {
m[pw] = struct{}{}
}
if len(m) != len(pws) {
logger.Panicf("BUG: %d duplicate parts found out of %d parts", len(pws)-len(m), len(pws))
}
return m
}
func (ddb *datadb) swapSrcWithDstParts(pws []*partWrapper, pwNew *partWrapper, dstPartType partType) {
// Atomically unregister old parts and add new part to pt.
partsToRemove := partsToMap(pws)
removedInmemoryParts := 0
removedFileParts := 0
ddb.partsLock.Lock()
ddb.inmemoryParts, removedInmemoryParts = removeParts(ddb.inmemoryParts, partsToRemove)
ddb.fileParts, removedFileParts = removeParts(ddb.fileParts, partsToRemove)
if pwNew != nil {
switch dstPartType {
case partInmemory:
ddb.inmemoryParts = append(ddb.inmemoryParts, pwNew)
ddb.startInmemoryPartsFlusherLocked()
case partFile:
ddb.fileParts = append(ddb.fileParts, pwNew)
default:
logger.Panicf("BUG: unknown partType=%d", dstPartType)
}
if len(ddb.inmemoryParts)+len(ddb.fileParts) > defaultPartsToMerge {
ddb.startMergeWorkerLocked()
}
}
// Atomically store the updated list of file-based parts on disk.
// This must be performed under partsLock in order to prevent from races
// when multiple concurrently running goroutines update the list.
if removedFileParts > 0 || pwNew != nil && dstPartType == partFile {
partNames := getPartNames(ddb.fileParts)
mustWritePartNames(ddb.path, partNames)
}
ddb.partsLock.Unlock()
removedParts := removedInmemoryParts + removedFileParts
if removedParts != len(partsToRemove) {
logger.Panicf("BUG: unexpected number of parts removed; got %d, want %d", removedParts, len(partsToRemove))
}
// Mark old parts as must be deleted and decrement reference count,
// so they are eventually closed and deleted.
for _, pw := range pws {
atomic.StoreUint32(&pw.mustBeDeleted, 1)
pw.decRef()
}
ddb.mergeDoneCond.Broadcast()
}
func removeParts(pws []*partWrapper, partsToRemove map[*partWrapper]struct{}) ([]*partWrapper, int) {
dst := pws[:0]
for _, pw := range pws {
if _, ok := partsToRemove[pw]; !ok {
dst = append(dst, pw)
}
}
for i := len(dst); i < len(pws); i++ {
pws[i] = nil
}
return dst, len(pws) - len(dst)
}
func mustOpenBlockStreamReaders(pws []*partWrapper) []*blockStreamReader {
bsrs := make([]*blockStreamReader, 0, len(pws))
for _, pw := range pws {
bsr := getBlockStreamReader()
if pw.mp != nil {
bsr.MustInitFromInmemoryPart(pw.mp)
} else {
bsr.MustInitFromFilePart(pw.p.path)
}
bsrs = append(bsrs, bsr)
}
return bsrs
}
func newPartWrapper(p *part, mp *inmemoryPart, flushDeadline time.Time) *partWrapper {
pw := &partWrapper{
p: p,
mp: mp,
flushDeadline: flushDeadline,
}
// Increase reference counter for newly created part - it is decreased when the part
// is removed from the list of open parts.
pw.incRef()
return pw
}
func (ddb *datadb) getFlushToDiskDeadline(pws []*partWrapper) time.Time {
d := time.Now().Add(ddb.flushInterval)
for _, pw := range pws {
if pw.mp != nil && pw.flushDeadline.Before(d) {
d = pw.flushDeadline
}
}
return d
}
func getMaxInmemoryPartSize() uint64 {
// Allocate 10% of allowed memory for in-memory parts.
n := uint64(0.1 * float64(memory.Allowed()) / maxInmemoryPartsPerPartition)
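// For example, with 8GiB of allowed memory every in-memory part is capped at roughly 40MiB
// (0.1 * 8GiB / 20 parts).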
if n < 1e6 {
n = 1e6
}
return n
}
func areAllInmemoryParts(pws []*partWrapper) bool {
for _, pw := range pws {
if pw.mp == nil {
return false
}
}
return true
}
func (ddb *datadb) releasePartsToMerge(pws []*partWrapper) {
ddb.partsLock.Lock()
for _, pw := range pws {
if !pw.isInMerge {
logger.Panicf("BUG: missing isInMerge flag on the part %q", pw.p.path)
}
pw.isInMerge = false
}
ddb.partsLock.Unlock()
}
func (ddb *datadb) availableDiskSpace() uint64 {
available := fs.MustGetFreeSpace(ddb.path)
reserved := atomic.LoadUint64(&reservedDiskSpace)
if available < reserved {
return 0
}
return available - reserved
}
func (ddb *datadb) reserveDiskSpace(n uint64) bool {
available := fs.MustGetFreeSpace(ddb.path)
reserved := atomic.AddUint64(&reservedDiskSpace, n)
if available > reserved {
return true
}
ddb.releaseDiskSpace(n)
return false
}
func (ddb *datadb) releaseDiskSpace(n uint64) {
atomic.AddUint64(&reservedDiskSpace, -n)
}
// reservedDiskSpace tracks global reserved disk space for currently executed
// background merges across all the partitions.
//
// This helps avoid starting background merges when there is no free disk space.
var reservedDiskSpace uint64
func needStop(stopCh <-chan struct{}) bool {
select {
case <-stopCh:
return true
default:
return false
}
}
// mustCloseDatadb can be called only when nobody accesses ddb.
func mustCloseDatadb(ddb *datadb) {
// Stop background workers
close(ddb.stopCh)
ddb.wg.Wait()
// flush in-memory data to disk
pws := append([]*partWrapper{}, ddb.inmemoryParts...)
setInMergeLocked(pws)
ddb.mustMergePartsFinal(pws)
// There is no need to use ddb.partsLock here, since nobody should access ddb now.
for _, pw := range ddb.inmemoryParts {
pw.decRef()
if pw.refCount != 0 {
logger.Panicf("BUG: there are %d references to inmemoryPart", pw.refCount)
}
}
ddb.inmemoryParts = nil
for _, pw := range ddb.fileParts {
pw.decRef()
if pw.refCount != 0 {
logger.Panicf("BUG: ther are %d references to filePart", pw.refCount)
}
}
ddb.fileParts = nil
ddb.path = ""
ddb.pt = nil
}
func getPartNames(pws []*partWrapper) []string {
partNames := make([]string, 0, len(pws))
for _, pw := range pws {
if pw.mp != nil {
// Skip in-memory parts
continue
}
partName := filepath.Base(pw.p.path)
partNames = append(partNames, partName)
}
sort.Strings(partNames)
return partNames
}
func mustWritePartNames(path string, partNames []string) {
data, err := json.Marshal(partNames)
if err != nil {
logger.Panicf("BUG: cannot marshal partNames to JSON: %s", err)
}
partNamesPath := filepath.Join(path, partsFilename)
fs.MustWriteAtomic(partNamesPath, data, true)
}
func mustReadPartNames(path string) []string {
partNamesPath := filepath.Join(path, partsFilename)
data, err := os.ReadFile(partNamesPath)
if err != nil {
logger.Panicf("FATAL: cannot read %s: %s", partNamesPath, err)
}
var partNames []string
if err := json.Unmarshal(data, &partNames); err != nil {
logger.Panicf("FATAL: cannot parse %s: %s", partNamesPath, err)
}
return partNames
}
// mustRemoveUnusedDirs removes dirs at path, which are missing in partNames.
//
// These dirs may be left after unclean shutdown.
func mustRemoveUnusedDirs(path string, partNames []string) {
des := fs.MustReadDir(path)
m := make(map[string]struct{}, len(partNames))
for _, partName := range partNames {
m[partName] = struct{}{}
}
removedDirs := 0
for _, de := range des {
if !fs.IsDirOrSymlink(de) {
// Skip non-directories.
continue
}
fn := de.Name()
if _, ok := m[fn]; !ok {
deletePath := filepath.Join(path, fn)
fs.MustRemoveAll(deletePath)
removedDirs++
}
}
if removedDirs > 0 {
fs.MustSyncPath(path)
}
}
// appendPartsToMerge finds optimal parts to merge from src,
// appends them to dst and returns the result.
func appendPartsToMerge(dst, src []*partWrapper, maxOutBytes uint64) []*partWrapper {
if len(src) < 2 {
// There is no need in merging zero or one part :)
return dst
}
// Filter out too big parts.
// This should reduce N for O(N^2) algorithm below.
maxInPartBytes := uint64(float64(maxOutBytes) / minMergeMultiplier)
tmp := make([]*partWrapper, 0, len(src))
for _, pw := range src {
if pw.p.ph.CompressedSizeBytes > maxInPartBytes {
continue
}
tmp = append(tmp, pw)
}
src = tmp
sortPartsForOptimalMerge(src)
maxSrcParts := defaultPartsToMerge
if maxSrcParts > len(src) {
maxSrcParts = len(src)
}
minSrcParts := (maxSrcParts + 1) / 2
if minSrcParts < 2 {
minSrcParts = 2
}
// Exhaustive search for parts giving the lowest write amplification when merged.
var pws []*partWrapper
maxM := float64(0)
for i := minSrcParts; i <= maxSrcParts; i++ {
for j := 0; j <= len(src)-i; j++ {
a := src[j : j+i]
if a[0].p.ph.CompressedSizeBytes*uint64(len(a)) < a[len(a)-1].p.ph.CompressedSizeBytes {
// Do not merge parts with too big difference in size,
// since this results in unbalanced merges.
continue
}
outSize := getCompressedSize(a)
if outSize > maxOutBytes {
// There is no need in verifying remaining parts with bigger sizes.
break
}
m := float64(outSize) / float64(a[len(a)-1].p.ph.CompressedSizeBytes)
if m < maxM {
continue
}
maxM = m
pws = a
}
}
minM := float64(defaultPartsToMerge) / 2
if minM < minMergeMultiplier {
minM = minMergeMultiplier
}
if maxM < minM {
// There is no sense in merging parts with too small m,
// since this leads to high disk write IO.
return dst
}
return append(dst, pws...)
}
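The "merge multiplier" m selected above can be computed for any candidate run of parts as sketched below. This is illustrative only and not part of this commit; it assumes the candidate slice is non-empty and sorted as in sortPartsForOptimalMerge, so its last element is the biggest part.

func exampleMergeMultiplier(candidate []*partWrapper) float64 {
	// Total compressed bytes that would be written by merging the candidate parts.
	outSize := getCompressedSize(candidate)
	// The biggest input part; a higher outSize relative to it means more useful work
	// is done per merge, i.e. lower overall write amplification.
	biggest := candidate[len(candidate)-1].p.ph.CompressedSizeBytes
	return float64(outSize) / float64(biggest)
}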
func sortPartsForOptimalMerge(pws []*partWrapper) {
// Sort src parts by size and in reverse timestamp order.
// This should improve the locality of adjacent points in the merged parts.
sort.Slice(pws, func(i, j int) bool {
a := &pws[i].p.ph
b := &pws[j].p.ph
if a.CompressedSizeBytes == b.CompressedSizeBytes {
return a.MinTimestamp > b.MinTimestamp
}
return a.CompressedSizeBytes < b.CompressedSizeBytes
})
}
func getCompressedSize(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.CompressedSizeBytes
}
return n
}
func getUncompressedSize(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.UncompressedSizeBytes
}
return n
}
func getRowsCount(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.RowsCount
}
return n
}
func getBlocksCount(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.BlocksCount
}
return n
}
func shouldUsePageCacheForPartSize(size uint64) bool {
mem := memory.Remaining() / defaultPartsToMerge
return size <= uint64(mem)
}

View file

@ -0,0 +1,91 @@
package logstorage
import (
"math/rand"
"testing"
)
func TestAppendPartsToMergeManyParts(t *testing.T) {
// Verify that a big number of parts is merged into a minimal number of parts
// using the minimum number of merges.
var sizes []uint64
maxOutSize := uint64(0)
r := rand.New(rand.NewSource(1))
for i := 0; i < 1024; i++ {
n := uint64(uint32(r.NormFloat64() * 1e9))
n++
maxOutSize += n
sizes = append(sizes, n)
}
pws := newTestPartWrappersForSizes(sizes)
iterationsCount := 0
sizeMergedTotal := uint64(0)
for {
pms := appendPartsToMerge(nil, pws, maxOutSize)
if len(pms) == 0 {
break
}
m := make(map[*partWrapper]bool)
for _, pw := range pms {
m[pw] = true
}
var pwsNew []*partWrapper
size := uint64(0)
for _, pw := range pws {
if m[pw] {
size += pw.p.ph.CompressedSizeBytes
} else {
pwsNew = append(pwsNew, pw)
}
}
pw := &partWrapper{
p: &part{
ph: partHeader{
CompressedSizeBytes: size,
},
},
}
sizeMergedTotal += size
pwsNew = append(pwsNew, pw)
pws = pwsNew
iterationsCount++
}
sizes = newTestSizesFromPartWrappers(pws)
sizeTotal := uint64(0)
for _, size := range sizes {
sizeTotal += uint64(size)
}
overhead := float64(sizeMergedTotal) / float64(sizeTotal)
if overhead > 2.1 {
t.Fatalf("too big overhead; sizes=%d, iterationsCount=%d, sizeTotal=%d, sizeMergedTotal=%d, overhead=%f",
sizes, iterationsCount, sizeTotal, sizeMergedTotal, overhead)
}
if len(sizes) > 18 {
t.Fatalf("too many sizes %d; sizes=%d, iterationsCount=%d, sizeTotal=%d, sizeMergedTotal=%d, overhead=%f",
len(sizes), sizes, iterationsCount, sizeTotal, sizeMergedTotal, overhead)
}
}
func newTestSizesFromPartWrappers(pws []*partWrapper) []uint64 {
var sizes []uint64
for _, pw := range pws {
sizes = append(sizes, pw.p.ph.CompressedSizeBytes)
}
return sizes
}
func newTestPartWrappersForSizes(sizes []uint64) []*partWrapper {
var pws []*partWrapper
for _, size := range sizes {
pw := &partWrapper{
p: &part{
ph: partHeader{
CompressedSizeBytes: size,
},
},
}
pws = append(pws, pw)
}
return pws
}

314
lib/logstorage/encoding.go Normal file
View file

@ -0,0 +1,314 @@
package logstorage
import (
"fmt"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
// marshalStringsBlock marshals a and appends the result to dst.
//
// The marshaled strings block can be unmarshaled with stringsBlockUnmarshaler.
func marshalStringsBlock(dst []byte, a []string) []byte {
// Encode string lengths
u64s := encoding.GetUint64s(len(a))
aLens := u64s.A[:0]
for _, s := range a {
aLens = append(aLens, uint64(len(s)))
}
u64s.A = aLens
dst = marshalUint64Block(dst, u64s.A)
encoding.PutUint64s(u64s)
// Encode strings
bb := bbPool.Get()
b := bb.B
for _, s := range a {
b = append(b, s...)
}
bb.B = b
dst = marshalBytesBlock(dst, bb.B)
bbPool.Put(bb)
return dst
}
// stringsBlockUnmarshaler is used for unmarshaling the block returned from marshalStringsBlock()
//
// Use getStringsBlockUnmarshaler() to obtain the unmarshaler from the pool in order to save memory allocations.
type stringsBlockUnmarshaler struct {
// data contains the data for the unmarshaled values
data []byte
}
func (sbu *stringsBlockUnmarshaler) reset() {
sbu.data = sbu.data[:0]
}
// unmarshal unmarshals itemsCount strings from src, appends them to dst and returns the result.
//
// The returned strings are valid until sbu.reset() call.
func (sbu *stringsBlockUnmarshaler) unmarshal(dst []string, src []byte, itemsCount uint64) ([]string, error) {
u64s := encoding.GetUint64s(0)
defer encoding.PutUint64s(u64s)
// Decode string lengths
var tail []byte
var err error
u64s.A, tail, err = unmarshalUint64Block(u64s.A[:0], src, itemsCount)
if err != nil {
return dst, fmt.Errorf("cannot unmarshal string lengths: %w", err)
}
aLens := u64s.A
src = tail
// Read bytes block into sbu.data
dataLen := len(sbu.data)
sbu.data, tail, err = unmarshalBytesBlock(sbu.data, src)
if err != nil {
return dst, fmt.Errorf("cannot unmarshal bytes block with strings: %w", err)
}
if len(tail) > 0 {
return dst, fmt.Errorf("unexpected non-empty tail after reading bytes block with strings; len(tail)=%d", len(tail))
}
// Decode strings from sbu.data into dst
data := sbu.data[dataLen:]
for _, sLen := range aLens {
if uint64(len(data)) < sLen {
return dst, fmt.Errorf("cannot unmarshal a string with the length %d bytes from %d bytes", sLen, len(data))
}
s := bytesutil.ToUnsafeString(data[:sLen])
data = data[sLen:]
dst = append(dst, s)
}
return dst, nil
}
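A round-trip sketch for the strings block API above (illustrative only, not part of this commit). The block does not store the number of strings, so the caller must pass the original item count to unmarshal.

func exampleStringsBlockRoundTrip(a []string) error {
	data := marshalStringsBlock(nil, a)

	sbu := getStringsBlockUnmarshaler()
	defer putStringsBlockUnmarshaler(sbu)

	values, err := sbu.unmarshal(nil, data, uint64(len(a)))
	if err != nil {
		return err
	}
	// values point into sbu.data, so they must be used or copied
	// before sbu is returned to the pool.
	if len(values) != len(a) {
		return fmt.Errorf("unexpected number of strings; got %d; want %d", len(values), len(a))
	}
	return nil
}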
// marshalUint64Block appends marshaled a to dst and returns the result.
func marshalUint64Block(dst []byte, a []uint64) []byte {
bb := bbPool.Get()
bb.B = marshalUint64Items(bb.B[:0], a)
dst = marshalBytesBlock(dst, bb.B)
bbPool.Put(bb)
return dst
}
// unmarshalUint64Block appends unmarshaled from src itemsCount uint64 items to dst and returns the result.
func unmarshalUint64Block(dst []uint64, src []byte, itemsCount uint64) ([]uint64, []byte, error) {
bb := bbPool.Get()
defer bbPool.Put(bb)
// Unmarshal the underlying bytes block
var err error
bb.B, src, err = unmarshalBytesBlock(bb.B[:0], src)
if err != nil {
return dst, src, fmt.Errorf("cannot unmarshal bytes block: %w", err)
}
// Unmarshal the items from bb.
dst, err = unmarshalUint64Items(dst, bb.B, itemsCount)
if err != nil {
return dst, src, fmt.Errorf("cannot unmarshal %d uint64 items from bytes block of length %d bytes: %w", itemsCount, len(bb.B), err)
}
return dst, src, nil
}
const (
uintBlockType8 = 0
uintBlockType16 = 1
uintBlockType32 = 2
uintBlockType64 = 3
)
// marshalUint64Items appends the marshaled a items to dst and returns the result.
func marshalUint64Items(dst []byte, a []uint64) []byte {
// Do not marshal len(a), since the unmarshaler is expected to know it.
nMax := uint64(0)
for _, n := range a {
if n > nMax {
nMax = n
}
}
switch {
case nMax < (1 << 8):
dst = append(dst, uintBlockType8)
for _, n := range a {
dst = append(dst, byte(n))
}
case nMax < (1 << 16):
dst = append(dst, uintBlockType16)
for _, n := range a {
dst = encoding.MarshalUint16(dst, uint16(n))
}
case nMax < (1 << 32):
dst = append(dst, uintBlockType32)
for _, n := range a {
dst = encoding.MarshalUint32(dst, uint32(n))
}
default:
dst = append(dst, uintBlockType64)
for _, n := range a {
dst = encoding.MarshalUint64(dst, uint64(n))
}
}
return dst
}
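A worked example of the wire layout produced by marshalUint64Items (illustrative only, not part of this commit). The byte values in the comments assume the big-endian fixed-width encoders from lib/encoding; lengths are never stored, so the decoder must know itemsCount in advance.

func exampleUint64ItemsLayout() {
	// Every item fits in one byte => 1 type byte + 1 byte per item.
	small := marshalUint64Items(nil, []uint64{3, 250})
	fmt.Printf("% x\n", small) // 00 03 fa

	// At least one item needs two bytes => 1 type byte + 2 bytes per item.
	wide := marshalUint64Items(nil, []uint64{3, 300})
	fmt.Printf("% x\n", wide) // 01 00 03 01 2c
}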
// unmarshalUint64Items appends unmarshaled from src itemsCount uint64 items to dst and returns the result.
func unmarshalUint64Items(dst []uint64, src []byte, itemsCount uint64) ([]uint64, error) {
// Unmarshal block type
if len(src) < 1 {
return dst, fmt.Errorf("cannot unmarshal uint64 block type from empty src")
}
blockType := src[0]
src = src[1:]
switch blockType {
case uintBlockType8:
// A block with items smaller than 1<<8, i.e. one byte per item
if uint64(len(src)) != itemsCount {
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), itemsCount)
}
for _, v := range src {
dst = append(dst, uint64(v))
}
case uintBlockType16:
// A block with items smaller than 1<<16, i.e. two bytes per item
if uint64(len(src)) != 2*itemsCount {
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 2*itemsCount)
}
for len(src) > 0 {
v := encoding.UnmarshalUint16(src)
src = src[2:]
dst = append(dst, uint64(v))
}
case uintBlockType32:
// A block with items smaller than 1<<32, i.e. four bytes per item
if uint64(len(src)) != 4*itemsCount {
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 4*itemsCount)
}
for len(src) > 0 {
v := encoding.UnmarshalUint32(src)
src = src[4:]
dst = append(dst, uint64(v))
}
case uintBlockType64:
// A block with arbitrary uint64 items, i.e. eight bytes per item
if uint64(len(src)) != 8*itemsCount {
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 8*itemsCount)
}
for len(src) > 0 {
v := encoding.UnmarshalUint64(src)
src = src[8:]
dst = append(dst, v)
}
default:
return dst, fmt.Errorf("unexpected uint64 block type: %d; want 0, 1, 2 or 3", blockType)
}
return dst, nil
}
const (
marshalBytesTypePlain = 0
marshalBytesTypeZSTD = 1
)
func marshalBytesBlock(dst, src []byte) []byte {
if len(src) < 128 {
// Marshal the block in plain without compression
dst = append(dst, marshalBytesTypePlain)
dst = append(dst, byte(len(src)))
return append(dst, src...)
}
// Compress the block
dst = append(dst, marshalBytesTypeZSTD)
bb := bbPool.Get()
bb.B = encoding.CompressZSTDLevel(bb.B[:0], src, 1)
dst = encoding.MarshalVarUint64(dst, uint64(len(bb.B)))
dst = append(dst, bb.B...)
bbPool.Put(bb)
return dst
}
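The two on-disk layouts produced by marshalBytesBlock, summarized as a sketch (illustrative only, not part of this commit):

// len(src) <  128: [marshalBytesTypePlain][len(src) as a single byte][src...]
// len(src) >= 128: [marshalBytesTypeZSTD][varuint length of compressed data][zstd(src)...]
func exampleBytesBlockType(src []byte) byte {
	data := marshalBytesBlock(nil, src)
	return data[0] // marshalBytesTypePlain (0) or marshalBytesTypeZSTD (1)
}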
func unmarshalBytesBlock(dst, src []byte) ([]byte, []byte, error) {
if len(src) < 1 {
return dst, src, fmt.Errorf("cannot unmarshal block type from empty src")
}
blockType := src[0]
src = src[1:]
switch blockType {
case marshalBytesTypePlain:
// Plain block
// Read block length
if len(src) < 1 {
return dst, src, fmt.Errorf("cannot unmarshal plain block size from empty src")
}
blockLen := int(src[0])
src = src[1:]
if len(src) < blockLen {
return dst, src, fmt.Errorf("cannot read plain block with the size %d bytes from %b bytes", blockLen, len(src))
}
// Copy the block to dst
dst = append(dst, src[:blockLen]...)
src = src[blockLen:]
return dst, src, nil
case marshalBytesTypeZSTD:
// Compressed block
// Read block length
tail, blockLen, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return dst, src, fmt.Errorf("cannot unmarshal compressed block size: %w", err)
}
src = tail
if uint64(len(src)) < blockLen {
return dst, src, fmt.Errorf("cannot read compressed block with the size %d bytes from %d bytes", blockLen, len(src))
}
compressedBlock := src[:blockLen]
src = src[blockLen:]
// Decompress the block
bb := bbPool.Get()
bb.B, err = encoding.DecompressZSTD(bb.B[:0], compressedBlock)
if err != nil {
return dst, src, fmt.Errorf("cannot decompress block: %w", err)
}
// Copy the decompressed block to dst.
dst = append(dst, bb.B...)
bbPool.Put(bb)
return dst, src, nil
default:
return dst, src, fmt.Errorf("unexpected block type: %d; supported types: 0, 1", blockType)
}
}
var bbPool bytesutil.ByteBufferPool
// getStringsBlockUnmarshaler returns stringsBlockUnmarshaler from the pool.
//
// Return back the stringsBlockUnmarshaler to the pool by calling putStringsBlockUnmarshaler().
func getStringsBlockUnmarshaler() *stringsBlockUnmarshaler {
v := sbuPool.Get()
if v == nil {
return &stringsBlockUnmarshaler{}
}
return v.(*stringsBlockUnmarshaler)
}
// putStringsBlockUnmarshaler returns back sbu to the pool.
//
// sbu mustn't be used after returning to the pool.
func putStringsBlockUnmarshaler(sbu *stringsBlockUnmarshaler) {
sbu.reset()
sbuPool.Put(sbu)
}
var sbuPool sync.Pool

View file

@ -0,0 +1,86 @@
package logstorage
import (
"fmt"
"reflect"
"strings"
"testing"
)
func TestMarshalUnmarshalStringsBlock(t *testing.T) {
f := func(logs string, blockLenExpected int) {
t.Helper()
var a []string
if logs != "" {
a = strings.Split(logs, "\n")
}
data := marshalStringsBlock(nil, a)
if len(data) != blockLenExpected {
t.Fatalf("unexpected block length; got %d; want %d; block=%q", len(data), blockLenExpected, data)
}
sbu := getStringsBlockUnmarshaler()
values, err := sbu.unmarshal(nil, data, uint64(len(a)))
if err != nil {
t.Fatalf("cannot unmarshal strings block: %s", err)
}
if !reflect.DeepEqual(values, a) {
t.Fatalf("unexpected strings after unmarshaling;\ngot\n%q\nwant\n%q", values, a)
}
putStringsBlockUnmarshaler(sbu)
}
f("", 5)
f("foo", 9)
f(`foo
bar
baz
`, 18)
f(`
Apr 28 13:39:06 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
Apr 28 13:40:05 localhost kernel: [35544.823503] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 1 (2437/0 MHz)
Apr 28 13:40:15 localhost kernel: [35554.295612] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 2 (2447/0 MHz)
Apr 28 13:43:37 localhost NetworkManager[1516]: <info> [1651142617.3668] manager: NetworkManager state is now CONNECTED_GLOBAL
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Activating via systemd: service name='org.freedesktop.nm_dispatcher' unit='dbus-org.freedesktop.nm-dispatcher.service' requested by ':1.13' (uid=0 pid=1516 comm="/usr/sbin/NetworkManager --no-daemon " label="unconfined")
Apr 28 13:43:37 localhost systemd[1]: Starting Network Manager Script Dispatcher Service...
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] The default IPv4 route is: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Not a paid data plan: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Found usable connection: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Successfully activated service 'org.freedesktop.nm_dispatcher'
Apr 28 13:43:37 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
Apr 28 13:48:01 localhost kernel: [36020.497807] CPU2: Core temperature above threshold, cpu clock throttled (total events = 22034)
Apr 28 13:48:01 localhost kernel: [36020.497809] CPU1: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU3: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU2: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497812] CPU0: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU2: Core temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU0: Core temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499856] CPU1: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499857] CPU3: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499858] CPU0: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499859] CPU2: Package temperature/speed normal
`, 951)
// Generate a string longer than 1<<16 bytes
s := "foo"
for len(s) < (1 << 16) {
s += s
}
s += "\n"
lines := s
f(lines, 36)
lines += s
f(lines, 52)
// Generate more than 256 strings
lines = ""
for i := 0; i < 1000; i++ {
lines += fmt.Sprintf("line %d\n", i)
}
f(lines, 766)
}

View file

@ -0,0 +1,73 @@
package logstorage
import (
"fmt"
"strings"
"testing"
)
func BenchmarkMarshalStringsBlock(b *testing.B) {
block := strings.Split(benchLogs, "\n")
b.SetBytes(int64(len(benchLogs)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var buf []byte
for pb.Next() {
buf = marshalStringsBlock(buf[:0], block)
}
})
}
func BenchmarkStringsBlockUnmarshaler_Unmarshal(b *testing.B) {
block := strings.Split(benchLogs, "\n")
data := marshalStringsBlock(nil, block)
b.SetBytes(int64(len(benchLogs)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
sbu := getStringsBlockUnmarshaler()
var values []string
for pb.Next() {
var err error
values, err = sbu.unmarshal(values[:0], data, uint64(len(block)))
if err != nil {
panic(fmt.Errorf("unexpected error: %w", err))
}
sbu.reset()
}
putStringsBlockUnmarshaler(sbu)
})
}
const benchLogs = `
Apr 28 13:39:06 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
Apr 28 13:40:05 localhost kernel: [35544.823503] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 1 (2437/0 MHz)
Apr 28 13:40:15 localhost kernel: [35554.295612] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 2 (2447/0 MHz)
Apr 28 13:43:37 localhost NetworkManager[1516]: <info> [1651142617.3668] manager: NetworkManager state is now CONNECTED_GLOBAL
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Activating via systemd: service name='org.freedesktop.nm_dispatcher' unit='dbus-org.freedesktop.nm-dispatcher.service' requested by ':1.13' (uid=0 pid=1516 comm="/usr/sbin/NetworkManager --no-daemon " label="unconfined")
Apr 28 13:43:37 localhost systemd[1]: Starting Network Manager Script Dispatcher Service...
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] The default IPv4 route is: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Not a paid data plan: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Found usable connection: /org/freedesktop/NetworkManager/ActiveConnection/10
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Successfully activated service 'org.freedesktop.nm_dispatcher'
Apr 28 13:43:37 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
Apr 28 13:48:01 localhost kernel: [36020.497807] CPU2: Core temperature above threshold, cpu clock throttled (total events = 22034)
Apr 28 13:48:01 localhost kernel: [36020.497809] CPU1: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU3: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU2: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.497812] CPU0: Package temperature above threshold, cpu clock throttled (total events = 27400)
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU2: Core temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU0: Core temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499856] CPU1: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499857] CPU3: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499858] CPU0: Package temperature/speed normal
Apr 28 13:48:01 localhost kernel: [36020.499859] CPU2: Package temperature/speed normal
`

View file

@ -0,0 +1,22 @@
package logstorage
const (
metaindexFilename = "metaindex.bin"
indexFilename = "index.bin"
columnsHeaderFilename = "columns_header.bin"
timestampsFilename = "timestamps.bin"
fieldValuesFilename = "field_values.bin"
fieldBloomFilename = "field_bloom.bin"
messageValuesFilename = "message_values.bin"
messageBloomFilename = "message_bloom.bin"
metadataFilename = "metadata.json"
partsFilename = "parts.json"
streamIDCacheFilename = "stream_id.bin"
indexdbDirname = "indexdb"
datadbDirname = "datadb"
cacheDirname = "cache"
partitionsDirname = "partitions"
)

3053
lib/logstorage/filters.go Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

38
lib/logstorage/hash128.go Normal file
View file

@ -0,0 +1,38 @@
package logstorage
import (
"sync"
"github.com/cespare/xxhash/v2"
)
func hash128(data []byte) u128 {
h := getHasher()
_, _ = h.Write(data)
hi := h.Sum64()
_, _ = h.Write(magicSuffixForHash)
lo := h.Sum64()
putHasher(h)
return u128{
hi: hi,
lo: lo,
}
}
var magicSuffixForHash = []byte("magic!")
func getHasher() *xxhash.Digest {
v := hasherPool.Get()
if v == nil {
return xxhash.New()
}
return v.(*xxhash.Digest)
}
func putHasher(h *xxhash.Digest) {
h.Reset()
hasherPool.Put(h)
}
var hasherPool sync.Pool

View file

@ -0,0 +1,24 @@
package logstorage
import (
"testing"
)
func TestHash128(t *testing.T) {
f := func(data string, hashExpected u128) {
t.Helper()
h := hash128([]byte(data))
if !h.equal(&hashExpected) {
t.Fatalf("unexpected hash; got %s; want %s", &h, &hashExpected)
}
}
f("", u128{
hi: 17241709254077376921,
lo: 13138662262368978769,
})
f("abc", u128{
hi: 4952883123889572249,
lo: 3255951525518405514,
})
}

View file

@ -0,0 +1,29 @@
package logstorage
import (
"fmt"
"sync/atomic"
"testing"
)
func BenchmarkHash128(b *testing.B) {
a := make([][]byte, 100)
for i := range a {
a[i] = []byte(fmt.Sprintf("some string %d", i))
}
b.ReportAllocs()
b.SetBytes(int64(len(a)))
b.RunParallel(func(pb *testing.PB) {
var n uint64
for pb.Next() {
for _, b := range a {
h := hash128(b)
n += h.hi
n += h.lo
}
}
atomic.AddUint64(&GlobalSinkU64, n)
})
}
var GlobalSinkU64 uint64

View file

@ -0,0 +1,164 @@
package logstorage
import (
"fmt"
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// indexBlockHeader contains index information about multiple blocks.
//
// It allows locating the block by streamID and/or by time range.
type indexBlockHeader struct {
// streamID is the minimum streamID covered by the indexBlockHeader
streamID streamID
// minTimestamp is the minimum timestamp seen across blocks covered by the indexBlockHeader
minTimestamp int64
// maxTimestamp is the maximum timestamp seen across blocks covered by the indexBlockHeader
maxTimestamp int64
// indexBlockOffset is an offset of the linked index block at indexFilename
indexBlockOffset uint64
// indexBlockSize is the size of the linked index block at indexFilename
indexBlockSize uint64
}
// reset resets ih for subsequent re-use.
func (ih *indexBlockHeader) reset() {
ih.streamID.reset()
ih.minTimestamp = 0
ih.maxTimestamp = 0
ih.indexBlockOffset = 0
ih.indexBlockSize = 0
}
// mustWriteIndexBlock writes data with the given additional args to sw and updates ih accordingly.
func (ih *indexBlockHeader) mustWriteIndexBlock(data []byte, sidFirst streamID, minTimestamp, maxTimestamp int64, sw *streamWriters) {
ih.streamID = sidFirst
ih.minTimestamp = minTimestamp
ih.maxTimestamp = maxTimestamp
bb := longTermBufPool.Get()
bb.B = encoding.CompressZSTDLevel(bb.B[:0], data, 1)
ih.indexBlockOffset = sw.indexWriter.bytesWritten
ih.indexBlockSize = uint64(len(bb.B))
sw.indexWriter.MustWrite(bb.B)
longTermBufPool.Put(bb)
}
// mustReadNextIndexBlock reads the next index block associated with ih from sr, appends it to dst and returns the result.
func (ih *indexBlockHeader) mustReadNextIndexBlock(dst []byte, sr *streamReaders) []byte {
indexReader := &sr.indexReader
indexBlockSize := ih.indexBlockSize
if indexBlockSize > maxIndexBlockSize {
logger.Panicf("FATAL: %s: indexBlockHeader.indexBlockSize=%d cannot exceed %d bytes", indexReader.Path(), indexBlockSize, maxIndexBlockSize)
}
if ih.indexBlockOffset != indexReader.bytesRead {
logger.Panicf("FATAL: %s: indexBlockHeader.indexBlockOffset=%d must equal to %d", indexReader.Path(), ih.indexBlockOffset, indexReader.bytesRead)
}
bbCompressed := longTermBufPool.Get()
bbCompressed.B = bytesutil.ResizeNoCopyMayOverallocate(bbCompressed.B, int(indexBlockSize))
indexReader.MustReadFull(bbCompressed.B)
// Decompress bbCompressed to dst
var err error
dst, err = encoding.DecompressZSTD(dst, bbCompressed.B)
longTermBufPool.Put(bbCompressed)
if err != nil {
logger.Panicf("FATAL: %s: cannot decompress indexBlock read at offset %d with size %d: %s", indexReader.Path(), ih.indexBlockOffset, indexBlockSize, err)
}
return dst
}
// marshal appends marshaled ih to dst and returns the result.
func (ih *indexBlockHeader) marshal(dst []byte) []byte {
dst = ih.streamID.marshal(dst)
dst = encoding.MarshalUint64(dst, uint64(ih.minTimestamp))
dst = encoding.MarshalUint64(dst, uint64(ih.maxTimestamp))
dst = encoding.MarshalUint64(dst, ih.indexBlockOffset)
dst = encoding.MarshalUint64(dst, ih.indexBlockSize)
return dst
}
// unmarshal unmarshals ih from src and returns the tail left.
func (ih *indexBlockHeader) unmarshal(src []byte) ([]byte, error) {
srcOrig := src
// unmarshal ih.streamID
tail, err := ih.streamID.unmarshal(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal streamID: %w", err)
}
src = tail
// unmarshal the rest of indexBlockHeader fields
if len(src) < 32 {
return srcOrig, fmt.Errorf("cannot unmarshal indexBlockHeader from %d bytes; need at least 32 bytes", len(src))
}
ih.minTimestamp = int64(encoding.UnmarshalUint64(src))
ih.maxTimestamp = int64(encoding.UnmarshalUint64(src[8:]))
ih.indexBlockOffset = encoding.UnmarshalUint64(src[16:])
ih.indexBlockSize = encoding.UnmarshalUint64(src[24:])
return src[32:], nil
}
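As the tests below assert, a marshaled indexBlockHeader always occupies 56 bytes: 24 bytes of streamID (an 8-byte tenantID plus a 16-byte u128 id) followed by four 8-byte big-endian fields. A tiny sketch (illustrative only, not part of this commit):

func exampleIndexBlockHeaderSize() int {
	var ih indexBlockHeader
	return len(ih.marshal(nil)) // always 56, regardless of the field values
}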
// mustReadIndexBlockHeaders reads indexBlockHeader entries from r, appends them to dst and returns the result.
func mustReadIndexBlockHeaders(dst []indexBlockHeader, r *readerWithStats) []indexBlockHeader {
data, err := io.ReadAll(r)
if err != nil {
logger.Panicf("FATAL: cannot read indexBlockHeader entries from %s: %s", r.Path(), err)
}
bb := longTermBufPool.Get()
bb.B, err = encoding.DecompressZSTD(bb.B[:0], data)
if err != nil {
logger.Panicf("FATAL: cannot decompress indexBlockHeader entries from %s: %s", r.Path(), err)
}
dst, err = unmarshalIndexBlockHeaders(dst, bb.B)
if len(bb.B) < 1024*1024 {
longTermBufPool.Put(bb)
}
if err != nil {
logger.Panicf("FATAL: cannot parse indexBlockHeader entries from %s: %s", r.Path(), err)
}
return dst
}
// unmarshalIndexBlockHeaders appends unmarshaled from src indexBlockHeader entries to dst and returns the result.
func unmarshalIndexBlockHeaders(dst []indexBlockHeader, src []byte) ([]indexBlockHeader, error) {
dstOrig := dst
for len(src) > 0 {
if len(dst) < cap(dst) {
dst = dst[:len(dst)+1]
} else {
dst = append(dst, indexBlockHeader{})
}
ih := &dst[len(dst)-1]
tail, err := ih.unmarshal(src)
if err != nil {
return dstOrig, fmt.Errorf("cannot unmarshal indexBlockHeader %d: %w", len(dst)-len(dstOrig), err)
}
src = tail
}
if err := validateIndexBlockHeaders(dst[len(dstOrig):]); err != nil {
return dstOrig, err
}
return dst, nil
}
func validateIndexBlockHeaders(ihs []indexBlockHeader) error {
for i := 1; i < len(ihs); i++ {
if ihs[i].streamID.less(&ihs[i-1].streamID) {
return fmt.Errorf("unexpected indexBlockHeader with smaller streamID=%s after bigger streamID=%s", &ihs[i].streamID, &ihs[i-1].streamID)
}
}
return nil
}

View file

@ -0,0 +1,138 @@
package logstorage
import (
"reflect"
"testing"
)
func TestIndexBlockHeaderMarshalUnmarshal(t *testing.T) {
f := func(ih *indexBlockHeader, marshaledLen int) {
t.Helper()
data := ih.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected marshaled length of indexBlockHeader; got %d; want %d", len(data), marshaledLen)
}
var ih2 indexBlockHeader
tail, err := ih2.unmarshal(data)
if err != nil {
t.Fatalf("cannot unmarshal indexBlockHeader: %s", err)
}
if len(tail) > 0 {
t.Fatalf("unexpected non-empty tail left after unmarshaling indexBlockHeader: %X", tail)
}
if !reflect.DeepEqual(ih, &ih2) {
t.Fatalf("unexpected unmarshaled indexBlockHeader\ngot\n%v\nwant\n%v", &ih2, ih)
}
}
f(&indexBlockHeader{}, 56)
f(&indexBlockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
hi: 214,
lo: 2111,
},
},
minTimestamp: 1234,
maxTimestamp: 898943,
indexBlockOffset: 234,
indexBlockSize: 898,
}, 56)
}
func TestIndexBlockHeaderUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var ih indexBlockHeader
tail, err := ih.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
ih := &indexBlockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
hi: 214,
lo: 2111,
},
},
minTimestamp: 1234,
maxTimestamp: 898943,
indexBlockOffset: 234,
indexBlockSize: 898,
}
data := ih.marshal(nil)
for len(data) > 0 {
data = data[:len(data)-1]
f(data)
}
}
func TestIndexBlockHeaderReset(t *testing.T) {
ih := &indexBlockHeader{
streamID: streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
hi: 214,
lo: 2111,
},
},
minTimestamp: 1234,
maxTimestamp: 898943,
indexBlockOffset: 234,
indexBlockSize: 898,
}
ih.reset()
ihZero := &indexBlockHeader{}
if !reflect.DeepEqual(ih, ihZero) {
t.Fatalf("unexpected non-zero indexBlockHeader after reset: %v", ih)
}
}
func TestMarshalUnmarshalIndexBlockHeaders(t *testing.T) {
f := func(ihs []indexBlockHeader, marshaledLen int) {
t.Helper()
var data []byte
for i := range ihs {
data = ihs[i].marshal(data)
}
if len(data) != marshaledLen {
t.Fatalf("unexpected marshaled length for indexBlockHeader entries; got %d; want %d", len(data), marshaledLen)
}
ihs2, err := unmarshalIndexBlockHeaders(nil, data)
if err != nil {
t.Fatalf("cannot unmarshal indexBlockHeader entries: %s", err)
}
if !reflect.DeepEqual(ihs, ihs2) {
t.Fatalf("unexpected indexBlockHeader entries after unmarshaling\ngot\n%v\nwant\n%v", ihs2, ihs)
}
}
f(nil, 0)
f([]indexBlockHeader{{}}, 56)
f([]indexBlockHeader{
{
indexBlockOffset: 234,
indexBlockSize: 5432,
},
{
minTimestamp: -123,
},
}, 112)
}

900
lib/logstorage/indexdb.go Normal file
View file

@ -0,0 +1,900 @@
package logstorage
import (
"bytes"
"fmt"
"io"
"sort"
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/mergeset"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
)
const (
// (tenantID:streamID) entries have this prefix
//
// These entries are used for detecting whether the given stream is already registered
nsPrefixStreamID = 0
// (tenantID:streamID -> streamTagsCanonical) entries have this prefix
nsPrefixStreamIDToStreamTags = 1
// (tenantID:name:value => streamIDs) entries have this prefix
nsPrefixTagToStreamIDs = 2
)
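For illustration (not part of this commit), a (tenantID:streamID) registration key is laid out with the helpers defined later in this file; the other two row types append the canonical stream tags, or the tag name/value plus streamIDs, after the same 9-byte common prefix.

func exampleStreamIDRegistrationKey(sid *streamID) []byte {
	key := marshalCommonPrefix(nil, nsPrefixStreamID, sid.tenantID)
	key = sid.id.marshal(key)
	return key // 1 prefix byte + 8 tenantID bytes + 16 streamID bytes
}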
// IndexdbStats contains indexdb stats
type IndexdbStats struct {
// StreamsCreatedTotal is the number of log streams created since the indexdb initialization.
StreamsCreatedTotal uint64
}
type indexdb struct {
// streamsCreatedTotal is the number of log streams created since the indexdb initialization.
streamsCreatedTotal uint64
// path is the path to indexdb
path string
// partitionName is the name of the partition for the indexdb.
partitionName string
// tb is the storage for indexdb
tb *mergeset.Table
// indexSearchPool is a pool of indexSearch struct for the given indexdb
indexSearchPool sync.Pool
// the generation of the streamFilterCache.
// It is updated each time a new item is added to tb.
streamFilterCacheGeneration uint32
// s is the storage that the indexdb belongs to.
s *Storage
}
func mustCreateIndexdb(path string) {
fs.MustMkdirFailIfExist(path)
}
func mustOpenIndexdb(path, partitionName string, s *Storage) *indexdb {
idb := &indexdb{
path: path,
partitionName: partitionName,
s: s,
}
isReadOnly := uint32(0)
idb.tb = mergeset.MustOpenTable(path, idb.invalidateStreamFilterCache, mergeTagToStreamIDsRows, &isReadOnly)
return idb
}
func mustCloseIndexdb(idb *indexdb) {
idb.tb.MustClose()
idb.tb = nil
idb.s = nil
idb.partitionName = ""
idb.path = ""
}
func (idb *indexdb) debugFlush() {
idb.tb.DebugFlush()
}
func (idb *indexdb) updateStats(d *IndexdbStats) {
d.StreamsCreatedTotal += atomic.LoadUint64(&idb.streamsCreatedTotal)
}
func (idb *indexdb) appendStreamTagsByStreamID(dst []byte, sid *streamID) []byte {
is := idb.getIndexSearch()
defer idb.putIndexSearch(is)
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixStreamIDToStreamTags, sid.tenantID)
kb.B = sid.id.marshal(kb.B)
if err := ts.FirstItemWithPrefix(kb.B); err != nil {
if err == io.EOF {
return dst
}
logger.Panicf("FATAL: unexpected error when searching for StreamTags by streamID=%s in indexdb: %s", sid, err)
}
data := ts.Item[len(kb.B):]
dst = append(dst, data...)
return dst
}
// hasStreamID returns true if streamID exists in idb
func (idb *indexdb) hasStreamID(sid *streamID) bool {
is := idb.getIndexSearch()
defer idb.putIndexSearch(is)
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B, nsPrefixStreamID, sid.tenantID)
kb.B = sid.id.marshal(kb.B)
if err := ts.FirstItemWithPrefix(kb.B); err != nil {
if err == io.EOF {
return false
}
logger.Panicf("FATAL: unexpected error when searching for streamID=%s in indexdb: %s", sid, err)
}
return len(kb.B) == len(ts.Item)
}
type indexSearch struct {
idb *indexdb
ts mergeset.TableSearch
kb bytesutil.ByteBuffer
}
func (idb *indexdb) getIndexSearch() *indexSearch {
v := idb.indexSearchPool.Get()
if v == nil {
v = &indexSearch{
idb: idb,
}
}
is := v.(*indexSearch)
is.ts.Init(idb.tb)
return is
}
func (idb *indexdb) putIndexSearch(is *indexSearch) {
is.idb = nil
is.ts.MustClose()
is.kb.Reset()
idb.indexSearchPool.Put(is)
}
// searchStreamIDs returns streamIDs for the given tenantIDs and the given stream filters
func (idb *indexdb) searchStreamIDs(tenantIDs []TenantID, sf *StreamFilter) []streamID {
// Try obtaining streamIDs from cache
streamIDs, ok := idb.loadStreamIDsFromCache(tenantIDs, sf)
if ok {
// Fast path - streamIDs found in the cache.
return streamIDs
}
// Slow path - collect streamIDs from indexdb.
// Collect streamIDs for all the specified tenantIDs.
is := idb.getIndexSearch()
m := make(map[streamID]struct{})
for _, tenantID := range tenantIDs {
for _, asf := range sf.orFilters {
is.updateStreamIDs(m, tenantID, asf)
}
}
idb.putIndexSearch(is)
// Convert the collected streamIDs from m to sorted slice.
streamIDs = make([]streamID, 0, len(m))
for streamID := range m {
streamIDs = append(streamIDs, streamID)
}
sortStreamIDs(streamIDs)
// Store the collected streamIDs to cache.
idb.storeStreamIDsToCache(tenantIDs, sf, streamIDs)
return streamIDs
}
func sortStreamIDs(streamIDs []streamID) {
sort.Slice(streamIDs, func(i, j int) bool {
return streamIDs[i].less(&streamIDs[j])
})
}
func (is *indexSearch) updateStreamIDs(dst map[streamID]struct{}, tenantID TenantID, asf *andStreamFilter) {
var m map[u128]struct{}
for _, tf := range asf.tagFilters {
ids := is.getStreamIDsForTagFilter(tenantID, tf)
if len(ids) == 0 {
// There is no need in checking the remaining filters,
// since the result will be empty in any case.
return
}
if m == nil {
m = ids
} else {
for id := range m {
if _, ok := ids[id]; !ok {
delete(m, id)
}
}
}
}
var sid streamID
for id := range m {
sid.tenantID = tenantID
sid.id = id
dst[sid] = struct{}{}
}
}
func (is *indexSearch) getStreamIDsForTagFilter(tenantID TenantID, tf *streamTagFilter) map[u128]struct{} {
switch tf.op {
case "=":
if tf.value == "" {
// (field="")
return is.getStreamIDsForEmptyTagValue(tenantID, tf.tagName)
}
// (field="value")
return is.getStreamIDsForNonEmptyTagValue(tenantID, tf.tagName, tf.value)
case "!=":
if tf.value == "" {
// (field!="")
return is.getStreamIDsForTagName(tenantID, tf.tagName)
}
// (field!="value") => (all and not field="value")
ids := is.getStreamIDsForTenant(tenantID)
idsForTag := is.getStreamIDsForNonEmptyTagValue(tenantID, tf.tagName, tf.value)
for id := range idsForTag {
delete(ids, id)
}
return ids
case "=~":
re := tf.getRegexp()
if re.MatchString("") {
// (field=~"|re") => (field="" or field=~"re")
ids := is.getStreamIDsForEmptyTagValue(tenantID, tf.tagName)
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
for id := range idsForRe {
ids[id] = struct{}{}
}
return ids
}
return is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
case "!~":
re := tf.getRegexp()
if re.MatchString("") {
// (field!~"|re") => (field!="" and not field=~"re")
ids := is.getStreamIDsForTagName(tenantID, tf.tagName)
if len(ids) == 0 {
return ids
}
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
for id := range idsForRe {
delete(ids, id)
}
return ids
}
// (field!~"re") => (all and not field=~"re")
ids := is.getStreamIDsForTenant(tenantID)
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
for id := range idsForRe {
delete(ids, id)
}
return ids
default:
logger.Panicf("BUG: unexpected operation in stream tag filter: %q", tf.op)
return nil
}
}
func (is *indexSearch) getStreamIDsForNonEmptyTagValue(tenantID TenantID, tagName, tagValue string) map[u128]struct{} {
ids := make(map[u128]struct{})
var sp tagToStreamIDsRowParser
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagValue))
prefix := kb.B
ts.Seek(prefix)
for ts.NextItem() {
item := ts.Item
if !bytes.HasPrefix(item, prefix) {
break
}
tail := item[len(prefix):]
sp.UpdateStreamIDs(ids, tail)
}
if err := ts.Error(); err != nil {
logger.Panicf("FATAL: unexpected error: %s", err)
}
return ids
}
func (is *indexSearch) getStreamIDsForEmptyTagValue(tenantID TenantID, tagName string) map[u128]struct{} {
ids := is.getStreamIDsForTenant(tenantID)
idsForTag := is.getStreamIDsForTagName(tenantID, tagName)
for id := range idsForTag {
delete(ids, id)
}
return ids
}
func (is *indexSearch) getStreamIDsForTenant(tenantID TenantID) map[u128]struct{} {
ids := make(map[u128]struct{})
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixStreamID, tenantID)
prefix := kb.B
ts.Seek(prefix)
var id u128
for ts.NextItem() {
item := ts.Item
if !bytes.HasPrefix(item, prefix) {
break
}
tail, err := id.unmarshal(item[len(prefix):])
if err != nil {
logger.Panicf("FATAL: cannot unmarshal streamID from (tenantID:streamID) entry: %s", err)
}
if len(tail) > 0 {
logger.Panicf("FATAL: unexpected non-empty tail left after unmarshaling streamID from (tenantID:streamID); tail len=%d", len(tail))
}
ids[id] = struct{}{}
}
if err := ts.Error(); err != nil {
logger.Panicf("FATAL: unexpected error: %s", err)
}
return ids
}
func (is *indexSearch) getStreamIDsForTagName(tenantID TenantID, tagName string) map[u128]struct{} {
ids := make(map[u128]struct{})
var sp tagToStreamIDsRowParser
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
prefix := kb.B
ts.Seek(prefix)
for ts.NextItem() {
item := ts.Item
if !bytes.HasPrefix(item, prefix) {
break
}
tail := item[len(prefix):]
n := bytes.IndexByte(tail, tagSeparatorChar)
if n < 0 {
logger.Panicf("FATAL: cannot find the end of tag value")
}
tail = tail[n+1:]
sp.UpdateStreamIDs(ids, tail)
}
if err := ts.Error(); err != nil {
logger.Panicf("FATAL: unexpected error: %s", err)
}
return ids
}
func (is *indexSearch) getStreamIDsForTagRegexp(tenantID TenantID, tagName string, re *regexutil.PromRegex) map[u128]struct{} {
ids := make(map[u128]struct{})
var sp tagToStreamIDsRowParser
var tagValue, prevMatchingTagValue []byte
var err error
ts := &is.ts
kb := &is.kb
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
prefix := kb.B
ts.Seek(prefix)
for ts.NextItem() {
item := ts.Item
if !bytes.HasPrefix(item, prefix) {
break
}
tail := item[len(prefix):]
tail, tagValue, err = unmarshalTagValue(tagValue[:0], tail)
if err != nil {
logger.Panicf("FATAL: cannot unmarshal tag value: %s", err)
}
if !bytes.Equal(tagValue, prevMatchingTagValue) {
if !re.MatchString(bytesutil.ToUnsafeString(tagValue)) {
continue
}
prevMatchingTagValue = append(prevMatchingTagValue[:0], tagValue...)
}
sp.UpdateStreamIDs(ids, tail)
}
if err := ts.Error(); err != nil {
logger.Panicf("FATAL: unexpected error: %s", err)
}
return ids
}
func (idb *indexdb) mustRegisterStream(streamID *streamID, streamTagsCanonical []byte) {
st := GetStreamTags()
mustUnmarshalStreamTags(st, streamTagsCanonical)
tenantID := streamID.tenantID
bi := getBatchItems()
buf := bi.buf[:0]
items := bi.items[:0]
// Register tenantID:streamID entry.
bufLen := len(buf)
buf = marshalCommonPrefix(buf, nsPrefixStreamID, tenantID)
buf = streamID.id.marshal(buf)
items = append(items, buf[bufLen:])
// Register tenantID:streamID -> streamTagsCanonical entry.
bufLen = len(buf)
buf = marshalCommonPrefix(buf, nsPrefixStreamIDToStreamTags, tenantID)
buf = streamID.id.marshal(buf)
buf = append(buf, streamTagsCanonical...)
items = append(items, buf[bufLen:])
// Register tenantID:name:value -> streamIDs entries.
tags := st.tags
for i := range tags {
bufLen = len(buf)
buf = marshalCommonPrefix(buf, nsPrefixTagToStreamIDs, tenantID)
buf = tags[i].indexdbMarshal(buf)
buf = streamID.id.marshal(buf)
items = append(items, buf[bufLen:])
}
PutStreamTags(st)
// Add items to the storage
idb.tb.AddItems(items)
bi.buf = buf
bi.items = items
putBatchItems(bi)
atomic.AddUint64(&idb.streamsCreatedTotal, 1)
}
func (idb *indexdb) invalidateStreamFilterCache() {
// This function must be fast, since it is called each
// time a new indexdb entry is added.
atomic.AddUint32(&idb.streamFilterCacheGeneration, 1)
}
func (idb *indexdb) marshalStreamFilterCacheKey(dst []byte, tenantIDs []TenantID, sf *StreamFilter) []byte {
dst = encoding.MarshalUint32(dst, idb.streamFilterCacheGeneration)
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(idb.partitionName))
dst = encoding.MarshalVarUint64(dst, uint64(len(tenantIDs)))
for i := range tenantIDs {
dst = tenantIDs[i].marshal(dst)
}
dst = sf.marshalForCacheKey(dst)
return dst
}
func (idb *indexdb) loadStreamIDsFromCache(tenantIDs []TenantID, sf *StreamFilter) ([]streamID, bool) {
bb := bbPool.Get()
bb.B = idb.marshalStreamFilterCacheKey(bb.B[:0], tenantIDs, sf)
data := idb.s.streamFilterCache.GetBig(nil, bb.B)
bbPool.Put(bb)
if len(data) == 0 {
// Cache miss
return nil, false
}
// Cache hit - unpack streamIDs from data.
tail, n, err := encoding.UnmarshalVarUint64(data)
if err != nil {
logger.Panicf("BUG: unexpected error when unmarshaling the number of streamIDs from cache: %s", err)
}
src := tail
streamIDs := make([]streamID, n)
for i := uint64(0); i < n; i++ {
tail, err = streamIDs[i].unmarshal(src)
if err != nil {
logger.Panicf("BUG: unexpected error when unmarshaling streamID #%d: %s", i, err)
}
src = tail
}
if len(src) > 0 {
logger.Panicf("BUG: unexpected non-empty tail left with len=%d", len(src))
}
return streamIDs, true
}
func (idb *indexdb) storeStreamIDsToCache(tenantIDs []TenantID, sf *StreamFilter, streamIDs []streamID) {
// marshal streamIDs
var b []byte
b = encoding.MarshalVarUint64(b, uint64(len(streamIDs)))
for i := 0; i < len(streamIDs); i++ {
b = streamIDs[i].marshal(b)
}
// Store marshaled streamIDs to cache.
bb := bbPool.Get()
bb.B = idb.marshalStreamFilterCacheKey(bb.B[:0], tenantIDs, sf)
idb.s.streamFilterCache.SetBig(bb.B, b)
bbPool.Put(bb)
}
type batchItems struct {
buf []byte
items [][]byte
}
func (bi *batchItems) reset() {
bi.buf = bi.buf[:0]
items := bi.items
for i := range items {
items[i] = nil
}
bi.items = items[:0]
}
func getBatchItems() *batchItems {
v := batchItemsPool.Get()
if v == nil {
return &batchItems{}
}
return v.(*batchItems)
}
func putBatchItems(bi *batchItems) {
bi.reset()
batchItemsPool.Put(bi)
}
var batchItemsPool sync.Pool
func mergeTagToStreamIDsRows(data []byte, items []mergeset.Item) ([]byte, []mergeset.Item) {
// Perform quick checks whether items contain rows starting from nsPrefixTagToStreamIDs
// based on the fact that items are sorted.
if len(items) <= 2 {
// The first and the last row must remain unchanged.
return data, items
}
firstItem := items[0].Bytes(data)
if len(firstItem) > 0 && firstItem[0] > nsPrefixTagToStreamIDs {
return data, items
}
lastItem := items[len(items)-1].Bytes(data)
if len(lastItem) > 0 && lastItem[0] < nsPrefixTagToStreamIDs {
return data, items
}
// items contain at least one row starting from nsPrefixTagToStreamIDs. Merge rows with common tag.
tsm := getTagToStreamIDsRowsMerger()
tsm.dataCopy = append(tsm.dataCopy[:0], data...)
tsm.itemsCopy = append(tsm.itemsCopy[:0], items...)
sp := &tsm.sp
spPrev := &tsm.spPrev
dstData := data[:0]
dstItems := items[:0]
for i, it := range items {
item := it.Bytes(data)
if len(item) == 0 || item[0] != nsPrefixTagToStreamIDs || i == 0 || i == len(items)-1 {
// Write rows not starting with nsPrefixTagToStreamIDs as-is.
// Additionally write the first and the last row as-is in order to preserve
// sort order for adjacent blocks.
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
dstData = append(dstData, item...)
dstItems = append(dstItems, mergeset.Item{
Start: uint32(len(dstData) - len(item)),
End: uint32(len(dstData)),
})
continue
}
if err := sp.Init(item); err != nil {
logger.Panicf("FATAL: cannot parse row during merge: %s", err)
}
if sp.StreamIDsLen() >= maxStreamIDsPerRow {
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
dstData = append(dstData, item...)
dstItems = append(dstItems, mergeset.Item{
Start: uint32(len(dstData) - len(item)),
End: uint32(len(dstData)),
})
continue
}
if !sp.EqualPrefix(spPrev) {
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
}
sp.ParseStreamIDs()
tsm.pendingStreamIDs = append(tsm.pendingStreamIDs, sp.StreamIDs...)
spPrev, sp = sp, spPrev
if len(tsm.pendingStreamIDs) >= maxStreamIDsPerRow {
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
}
}
if len(tsm.pendingStreamIDs) > 0 {
logger.Panicf("BUG: tsm.pendingStreamIDs must be empty at this point; got %d items", len(tsm.pendingStreamIDs))
}
if !checkItemsSorted(dstData, dstItems) {
// Items could become unsorted if initial items contain duplicate streamIDs:
//
// item1: 1, 1, 5
// item2: 1, 4
//
// Items could become the following after the merge:
//
// item1: 1, 5
// item2: 1, 4
//
// i.e. item1 > item2
//
// Leave the original items unmerged, so they can be merged next time.
// This case should be quite rare - it may happen when the same new log stream
// is registered simultaneously from multiple concurrent goroutines.
dstData = append(dstData[:0], tsm.dataCopy...)
dstItems = append(dstItems[:0], tsm.itemsCopy...)
if !checkItemsSorted(dstData, dstItems) {
logger.Panicf("BUG: the original items weren't sorted; items=%q", dstItems)
}
}
putTagToStreamIDsRowsMerger(tsm)
return dstData, dstItems
}
// maxStreamIDsPerRow limits the number of streamIDs in tenantID:name:value -> streamIDs row.
//
// This reduces overhead on index and metaindex in lib/mergeset.
const maxStreamIDsPerRow = 32
type u128Sorter []u128
func (s u128Sorter) Len() int { return len(s) }
func (s u128Sorter) Less(i, j int) bool {
return s[i].less(&s[j])
}
func (s u128Sorter) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
type tagToStreamIDsRowsMerger struct {
pendingStreamIDs u128Sorter
sp tagToStreamIDsRowParser
spPrev tagToStreamIDsRowParser
itemsCopy []mergeset.Item
dataCopy []byte
}
func (tsm *tagToStreamIDsRowsMerger) Reset() {
tsm.pendingStreamIDs = tsm.pendingStreamIDs[:0]
tsm.sp.Reset()
tsm.spPrev.Reset()
tsm.itemsCopy = tsm.itemsCopy[:0]
tsm.dataCopy = tsm.dataCopy[:0]
}
func (tsm *tagToStreamIDsRowsMerger) flushPendingStreamIDs(dstData []byte, dstItems []mergeset.Item, sp *tagToStreamIDsRowParser) ([]byte, []mergeset.Item) {
if len(tsm.pendingStreamIDs) == 0 {
// Nothing to flush
return dstData, dstItems
}
// Use sort.Sort instead of sort.Slice in order to reduce memory allocations.
sort.Sort(&tsm.pendingStreamIDs)
tsm.pendingStreamIDs = removeDuplicateStreamIDs(tsm.pendingStreamIDs)
// Marshal pendingStreamIDs
dstDataLen := len(dstData)
dstData = sp.MarshalPrefix(dstData)
pendingStreamIDs := tsm.pendingStreamIDs
for i := range pendingStreamIDs {
dstData = pendingStreamIDs[i].marshal(dstData)
}
dstItems = append(dstItems, mergeset.Item{
Start: uint32(dstDataLen),
End: uint32(len(dstData)),
})
tsm.pendingStreamIDs = tsm.pendingStreamIDs[:0]
return dstData, dstItems
}
func removeDuplicateStreamIDs(sortedStreamIDs []u128) []u128 {
if len(sortedStreamIDs) < 2 {
return sortedStreamIDs
}
hasDuplicates := false
for i := 1; i < len(sortedStreamIDs); i++ {
if sortedStreamIDs[i-1] == sortedStreamIDs[i] {
hasDuplicates = true
break
}
}
if !hasDuplicates {
return sortedStreamIDs
}
dstStreamIDs := sortedStreamIDs[:1]
for i := 1; i < len(sortedStreamIDs); i++ {
if sortedStreamIDs[i-1] == sortedStreamIDs[i] {
continue
}
dstStreamIDs = append(dstStreamIDs, sortedStreamIDs[i])
}
return dstStreamIDs
}
func getTagToStreamIDsRowsMerger() *tagToStreamIDsRowsMerger {
v := tsmPool.Get()
if v == nil {
return &tagToStreamIDsRowsMerger{}
}
return v.(*tagToStreamIDsRowsMerger)
}
func putTagToStreamIDsRowsMerger(tsm *tagToStreamIDsRowsMerger) {
tsm.Reset()
tsmPool.Put(tsm)
}
var tsmPool sync.Pool
type tagToStreamIDsRowParser struct {
// TenantID contains TenantID of the parsed row
TenantID TenantID
// StreamIDs contains parsed StreamIDs after ParseStreamIDs call
StreamIDs []u128
// streamIDsParsed is set to true after ParseStreamIDs call
streamIDsParsed bool
// Tag contains parsed tag after Init call
Tag streamTag
// tail contains the remaining unparsed streamIDs
tail []byte
}
func (sp *tagToStreamIDsRowParser) Reset() {
sp.TenantID.Reset()
sp.StreamIDs = sp.StreamIDs[:0]
sp.streamIDsParsed = false
sp.Tag.reset()
sp.tail = nil
}
// Init initializes sp from b, which should contain encoded tenantID:name:value -> streamIDs row.
//
// b cannot be re-used until Reset call.
//
// ParseStreamIDs() must be called later for obtaining sp.StreamIDs from the given tail.
func (sp *tagToStreamIDsRowParser) Init(b []byte) error {
tail, nsPrefix, err := unmarshalCommonPrefix(&sp.TenantID, b)
if err != nil {
return fmt.Errorf("invalid tenantID:name:value -> streamIDs row %q: %w", b, err)
}
if nsPrefix != nsPrefixTagToStreamIDs {
return fmt.Errorf("invalid prefix for tenantID:name:value -> streamIDs row %q; got %d; want %d", b, nsPrefix, nsPrefixTagToStreamIDs)
}
tail, err = sp.Tag.indexdbUnmarshal(tail)
if err != nil {
return fmt.Errorf("cannot unmarshal tag from tenantID:name:value -> streamIDs row %q: %w", b, err)
}
if err = sp.InitOnlyTail(tail); err != nil {
return fmt.Errorf("cannot initialize tail from tenantID:name:value -> streamIDs row %q: %w", b, err)
}
return nil
}
// MarshalPrefix marshals row prefix without tail to dst.
func (sp *tagToStreamIDsRowParser) MarshalPrefix(dst []byte) []byte {
dst = marshalCommonPrefix(dst, nsPrefixTagToStreamIDs, sp.TenantID)
dst = sp.Tag.indexdbMarshal(dst)
return dst
}
// InitOnlyTail initializes sp.tail from tail, which must contain streamIDs.
//
// tail cannot be re-used until Reset call.
//
// ParseStreamIDs() must be called later for obtaining sp.StreamIDs from the given tail.
func (sp *tagToStreamIDsRowParser) InitOnlyTail(tail []byte) error {
if len(tail) == 0 {
return fmt.Errorf("missing streamID in the tenantID:name:value -> streamIDs row")
}
if len(tail)%16 != 0 {
return fmt.Errorf("invalid tail length in the tenantID:name:value -> streamIDs row; got %d bytes; must be multiple of 16 bytes", len(tail))
}
sp.tail = tail
sp.streamIDsParsed = false
return nil
}
// EqualPrefix returns true if prefixes for sp and x are equal.
//
// Prefix contains (tenantID:name:value)
func (sp *tagToStreamIDsRowParser) EqualPrefix(x *tagToStreamIDsRowParser) bool {
if !sp.TenantID.equal(&x.TenantID) {
return false
}
if !sp.Tag.equal(&x.Tag) {
return false
}
return true
}
// StreamIDsLen returns the number of StreamIDs in the sp.tail
func (sp *tagToStreamIDsRowParser) StreamIDsLen() int {
return len(sp.tail) / 16
}
// ParseStreamIDs parses StreamIDs from sp.tail into sp.StreamIDs.
func (sp *tagToStreamIDsRowParser) ParseStreamIDs() {
if sp.streamIDsParsed {
return
}
tail := sp.tail
n := len(tail) / 16
streamIDs := sp.StreamIDs[:0]
if n <= cap(streamIDs) {
streamIDs = streamIDs[:n]
} else {
streamIDs = append(streamIDs[:cap(streamIDs)], make([]u128, n-cap(streamIDs))...)
}
sp.StreamIDs = streamIDs
for i := 0; i < n; i++ {
var err error
tail, err = streamIDs[i].unmarshal(tail)
if err != nil {
logger.Panicf("FATAL: cannot unmarshal streamID: %s", err)
}
}
sp.streamIDsParsed = true
}
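An end-to-end sketch of the row format this parser expects (illustrative only, not part of this commit): the common prefix, the marshaled tag, then a multiple of 16 bytes of streamIDs. It assumes ids is non-empty, since InitOnlyTail rejects rows without streamIDs, and relies on streamTag.indexdbMarshal as used in mustRegisterStream above.

func exampleParseTagToStreamIDsRow(tenantID TenantID, tag streamTag, ids []u128) ([]u128, error) {
	row := marshalCommonPrefix(nil, nsPrefixTagToStreamIDs, tenantID)
	row = tag.indexdbMarshal(row)
	for i := range ids {
		row = ids[i].marshal(row)
	}

	var sp tagToStreamIDsRowParser
	if err := sp.Init(row); err != nil {
		return nil, err
	}
	sp.ParseStreamIDs()
	// Copy the parsed streamIDs, since sp.StreamIDs is reused after Reset.
	return append([]u128{}, sp.StreamIDs...), nil
}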
func (sp *tagToStreamIDsRowParser) UpdateStreamIDs(ids map[u128]struct{}, tail []byte) {
sp.Reset()
if err := sp.InitOnlyTail(tail); err != nil {
logger.Panicf("FATAL: cannot parse '(date, tag) -> streamIDs' row: %s", err)
}
sp.ParseStreamIDs()
for _, id := range sp.StreamIDs {
ids[id] = struct{}{}
}
}
// commonPrefixLen is the length of common prefix for indexdb rows
// 1 byte for ns* prefix + 8 bytes for tenantID
const commonPrefixLen = 1 + 8
func marshalCommonPrefix(dst []byte, nsPrefix byte, tenantID TenantID) []byte {
dst = append(dst, nsPrefix)
dst = tenantID.marshal(dst)
return dst
}
func unmarshalCommonPrefix(dstTenantID *TenantID, src []byte) ([]byte, byte, error) {
if len(src) < commonPrefixLen {
return nil, 0, fmt.Errorf("cannot unmarshal common prefix from %d bytes; need at least %d bytes; data=%X", len(src), commonPrefixLen, src)
}
prefix := src[0]
src = src[1:]
tail, err := dstTenantID.unmarshal(src)
if err != nil {
return nil, 0, fmt.Errorf("cannot unmarshal tenantID: %s", err)
}
return tail, prefix, nil
}
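A round-trip sketch for the 9-byte common prefix shared by all indexdb rows (illustrative only, not part of this commit):

func exampleCommonPrefixRoundTrip(tenantID TenantID) (byte, TenantID, error) {
	buf := marshalCommonPrefix(nil, nsPrefixTagToStreamIDs, tenantID)

	var parsedTenantID TenantID
	tail, nsPrefix, err := unmarshalCommonPrefix(&parsedTenantID, buf)
	if err != nil {
		return 0, TenantID{}, err
	}
	_ = tail // empty here, since no payload follows the prefix
	return nsPrefix, parsedTenantID, nil
}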
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
if len(items) == 0 {
return true
}
prevItem := items[0].String(data)
for _, it := range items[1:] {
currItem := it.String(data)
if prevItem > currItem {
return false
}
prevItem = currItem
}
return true
}

View file

@ -0,0 +1,253 @@
package logstorage
import (
"fmt"
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
func TestStorageSearchStreamIDs(t *testing.T) {
const path = "TestStorageSearchStreamIDs"
const partitionName = "foobar"
s := newTestStorage()
mustCreateIndexdb(path)
idb := mustOpenIndexdb(path, partitionName, s)
tenantID := TenantID{
AccountID: 123,
ProjectID: 567,
}
getStreamIDForTags := func(tags map[string]string) (streamID, []byte) {
st := GetStreamTags()
for k, v := range tags {
st.Add(k, v)
}
streamTagsCanonical := st.MarshalCanonical(nil)
PutStreamTags(st)
id := hash128(streamTagsCanonical)
sid := streamID{
tenantID: tenantID,
id: id,
}
return sid, streamTagsCanonical
}
// Create indexdb entries
const jobsCount = 7
const instancesCount = 5
for i := 0; i < jobsCount; i++ {
for j := 0; j < instancesCount; j++ {
sid, streamTagsCanonical := getStreamIDForTags(map[string]string{
"job": fmt.Sprintf("job-%d", i),
"instance": fmt.Sprintf("instance-%d", j),
})
idb.mustRegisterStream(&sid, streamTagsCanonical)
}
}
idb.debugFlush()
f := func(streamFilter string, expectedStreamIDs []streamID) {
t.Helper()
sf := mustNewStreamFilter(streamFilter)
if expectedStreamIDs == nil {
expectedStreamIDs = []streamID{}
}
sortStreamIDs(expectedStreamIDs)
for i := 0; i < 3; i++ {
streamIDs := idb.searchStreamIDs([]TenantID{tenantID}, sf)
if !reflect.DeepEqual(streamIDs, expectedStreamIDs) {
t.Fatalf("unexpected streamIDs on iteration %d; got %v; want %v", i, streamIDs, expectedStreamIDs)
}
}
}
t.Run("missing-tenant-id", func(t *testing.T) {
tenantID := TenantID{
AccountID: 1,
ProjectID: 2,
}
sf := mustNewStreamFilter(`{job="job-0",instance="instance-0"}`)
for i := 0; i < 3; i++ {
streamIDs := idb.searchStreamIDs([]TenantID{tenantID}, sf)
if len(streamIDs) > 0 {
t.Fatalf("unexpected non-empty streamIDs on iteration %d: %d", i, len(streamIDs))
}
}
})
t.Run("missing-job", func(t *testing.T) {
f(`{job="non-existing-job",instance="instance-0"}`, nil)
})
t.Run("missing-job-re", func(t *testing.T) {
f(`{job=~"non-existing-job|",instance="instance-0"}`, nil)
})
t.Run("missing-job-negative-re", func(t *testing.T) {
f(`{job!~"job.+",instance="instance-0"}`, nil)
})
t.Run("empty-job", func(t *testing.T) {
f(`{job="",instance="instance-0"}`, nil)
})
t.Run("missing-instance", func(t *testing.T) {
f(`{job="job-0",instance="non-existing-instance"}`, nil)
})
t.Run("missing-instance-re", func(t *testing.T) {
f(`{job="job-0",instance=~"non-existing-instance|"}`, nil)
})
t.Run("missing-instance-negative-re", func(t *testing.T) {
f(`{job="job-0",instance!~"instance.+"}`, nil)
})
t.Run("empty-instance", func(t *testing.T) {
f(`{job="job-0",instance=""}`, nil)
})
t.Run("non-existing-tag", func(t *testing.T) {
f(`{job="job-0",instance="instance-0",non_existing_tag="foobar"}`, nil)
})
t.Run("non-existing-non-empty-tag", func(t *testing.T) {
f(`{job="job-0",instance="instance-0",non_existing_tag!=""}`, nil)
})
t.Run("non-existing-tag-re", func(t *testing.T) {
f(`{job="job-0",instance="instance-0",non_existing_tag=~"foo.+"}`, nil)
})
t.Run("non-existing-non-empty-tag-re", func(t *testing.T) {
f(`{job="job-0",instance="instance-0",non_existing_tag!~""}`, nil)
})
t.Run("match-job-instance", func(t *testing.T) {
sid, _ := getStreamIDForTags(map[string]string{
"instance": "instance-0",
"job": "job-0",
})
f(`{job="job-0",instance="instance-0"}`, []streamID{sid})
})
t.Run("match-non-existing-tag", func(t *testing.T) {
sid, _ := getStreamIDForTags(map[string]string{
"instance": "instance-0",
"job": "job-0",
})
f(`{job="job-0",instance="instance-0",non_existing_tag=~"foo|"}`, []streamID{sid})
})
t.Run("match-job", func(t *testing.T) {
var streamIDs []streamID
for i := 0; i < instancesCount; i++ {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", i),
"job": "job-0",
})
streamIDs = append(streamIDs, sid)
}
f(`{job="job-0"}`, streamIDs)
})
t.Run("match-instance", func(t *testing.T) {
var streamIDs []streamID
for i := 0; i < jobsCount; i++ {
sid, _ := getStreamIDForTags(map[string]string{
"instance": "instance-1",
"job": fmt.Sprintf("job-%d", i),
})
streamIDs = append(streamIDs, sid)
}
f(`{instance="instance-1"}`, streamIDs)
})
t.Run("match-re", func(t *testing.T) {
var streamIDs []streamID
for _, instanceID := range []int{3, 1} {
for _, jobID := range []int{0, 2} {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", instanceID),
"job": fmt.Sprintf("job-%d", jobID),
})
streamIDs = append(streamIDs, sid)
}
}
f(`{job=~"job-(0|2)",instance=~"instance-[13]"}`, streamIDs)
})
t.Run("match-re-empty-match", func(t *testing.T) {
var streamIDs []streamID
for _, instanceID := range []int{3, 1} {
for _, jobID := range []int{0, 2} {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", instanceID),
"job": fmt.Sprintf("job-%d", jobID),
})
streamIDs = append(streamIDs, sid)
}
}
f(`{job=~"job-(0|2)|",instance=~"instance-[13]"}`, streamIDs)
})
t.Run("match-negative-re", func(t *testing.T) {
var instanceIDs []int
for i := 0; i < instancesCount; i++ {
if i != 0 && i != 1 {
instanceIDs = append(instanceIDs, i)
}
}
var jobIDs []int
for i := 0; i < jobsCount; i++ {
if i > 2 {
jobIDs = append(jobIDs, i)
}
}
var streamIDs []streamID
for _, instanceID := range instanceIDs {
for _, jobID := range jobIDs {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", instanceID),
"job": fmt.Sprintf("job-%d", jobID),
})
streamIDs = append(streamIDs, sid)
}
}
f(`{job!~"job-[0-2]",instance!~"instance-(0|1)"}`, streamIDs)
})
t.Run("match-negative-re-empty-match", func(t *testing.T) {
var instanceIDs []int
for i := 0; i < instancesCount; i++ {
if i != 0 && i != 1 {
instanceIDs = append(instanceIDs, i)
}
}
var jobIDs []int
for i := 0; i < jobsCount; i++ {
if i > 2 {
jobIDs = append(jobIDs, i)
}
}
var streamIDs []streamID
for _, instanceID := range instanceIDs {
for _, jobID := range jobIDs {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", instanceID),
"job": fmt.Sprintf("job-%d", jobID),
})
streamIDs = append(streamIDs, sid)
}
}
f(`{job!~"job-[0-2]",instance!~"instance-(0|1)|"}`, streamIDs)
})
t.Run("match-negative-job", func(t *testing.T) {
instanceIDs := []int{2}
var jobIDs []int
for i := 0; i < jobsCount; i++ {
if i != 1 {
jobIDs = append(jobIDs, i)
}
}
var streamIDs []streamID
for _, instanceID := range instanceIDs {
for _, jobID := range jobIDs {
sid, _ := getStreamIDForTags(map[string]string{
"instance": fmt.Sprintf("instance-%d", instanceID),
"job": fmt.Sprintf("job-%d", jobID),
})
streamIDs = append(streamIDs, sid)
}
}
f(`{instance="instance-2",job!="job-1"}`, streamIDs)
})
mustCloseIndexdb(idb)
fs.MustRemoveAll(path)
closeTestStorage(s)
}

View file

@ -0,0 +1,155 @@
package logstorage
import (
"path/filepath"
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
// inmemoryPart is an in-memory part.
type inmemoryPart struct {
// ph contains partHeader information for the given in-memory part.
ph partHeader
metaindex bytesutil.ByteBuffer
index bytesutil.ByteBuffer
columnsHeader bytesutil.ByteBuffer
timestamps bytesutil.ByteBuffer
fieldValues bytesutil.ByteBuffer
fieldBloomFilter bytesutil.ByteBuffer
messageValues bytesutil.ByteBuffer
messageBloomFilter bytesutil.ByteBuffer
}
// reset resets mp, so it can be re-used
func (mp *inmemoryPart) reset() {
mp.ph.reset()
mp.metaindex.Reset()
mp.index.Reset()
mp.columnsHeader.Reset()
mp.timestamps.Reset()
mp.fieldValues.Reset()
mp.fieldBloomFilter.Reset()
mp.messageValues.Reset()
mp.messageBloomFilter.Reset()
}
// mustInitFromRows initializes mp from lr.
func (mp *inmemoryPart) mustInitFromRows(lr *LogRows) {
mp.reset()
if len(lr.timestamps) == 0 {
return
}
sort.Sort(lr)
bsw := getBlockStreamWriter()
bsw.MustInitForInmemoryPart(mp)
trs := getTmpRows()
var sidPrev *streamID
uncompressedBlockSizeBytes := uint64(0)
timestamps := lr.timestamps
rows := lr.rows
streamIDs := lr.streamIDs
for i := range timestamps {
streamID := &streamIDs[i]
if sidPrev == nil {
sidPrev = streamID
}
if uncompressedBlockSizeBytes >= maxUncompressedBlockSize || !streamID.equal(sidPrev) {
bsw.MustWriteRows(sidPrev, trs.timestamps, trs.rows)
trs.reset()
sidPrev = streamID
uncompressedBlockSizeBytes = 0
}
fields := rows[i]
trs.timestamps = append(trs.timestamps, timestamps[i])
trs.rows = append(trs.rows, fields)
uncompressedBlockSizeBytes += uncompressedRowSizeBytes(fields)
}
bsw.MustWriteRows(sidPrev, trs.timestamps, trs.rows)
putTmpRows(trs)
bsw.Finalize(&mp.ph)
putBlockStreamWriter(bsw)
}
// MustStoreToDisk stores mp to disk at the given path.
func (mp *inmemoryPart) MustStoreToDisk(path string) {
fs.MustMkdirFailIfExist(path)
metaindexPath := filepath.Join(path, metaindexFilename)
indexPath := filepath.Join(path, indexFilename)
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
timestampsPath := filepath.Join(path, timestampsFilename)
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
messageValuesPath := filepath.Join(path, messageValuesFilename)
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
fs.MustWriteSync(metaindexPath, mp.metaindex.B)
fs.MustWriteSync(indexPath, mp.index.B)
fs.MustWriteSync(columnsHeaderPath, mp.columnsHeader.B)
fs.MustWriteSync(timestampsPath, mp.timestamps.B)
fs.MustWriteSync(fieldValuesPath, mp.fieldValues.B)
fs.MustWriteSync(fieldBloomFilterPath, mp.fieldBloomFilter.B)
fs.MustWriteSync(messageValuesPath, mp.messageValues.B)
fs.MustWriteSync(messageBloomFilterPath, mp.messageBloomFilter.B)
mp.ph.mustWriteMetadata(path)
fs.MustSyncPath(path)
// Do not sync parent directory - it must be synced by the caller.
}
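// exampleInmemoryPartFlow is an illustrative sketch and is not called by the storage code.
// It shows the typical inmemoryPart lifecycle: collect rows into LogRows, convert them into
// an in-memory part via mustInitFromRows and persist the part with MustStoreToDisk.
// The tenantID, fields, timestamp and path values below are arbitrary.
func exampleInmemoryPartFlow(path string) {
	lr := GetLogRows([]string{"job"}, nil)
	tenantID := TenantID{
		AccountID: 1,
		ProjectID: 2,
	}
	fields := []Field{
		{Name: "job", Value: "foobar"},
		{Name: "", Value: "some log message"},
	}
	lr.MustAdd(tenantID, 1686000000000000000, fields)

	mp := getInmemoryPart()
	mp.mustInitFromRows(lr)
	PutLogRows(lr)

	mp.MustStoreToDisk(path)
	putInmemoryPart(mp)
}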
// tmpRows is used as a helper for inmemoryPart.mustInitFromRows()
type tmpRows struct {
timestamps []int64
rows [][]Field
}
func (trs *tmpRows) reset() {
trs.timestamps = trs.timestamps[:0]
rows := trs.rows
for i := range rows {
rows[i] = nil
}
trs.rows = rows[:0]
}
func getTmpRows() *tmpRows {
v := tmpRowsPool.Get()
if v == nil {
return &tmpRows{}
}
return v.(*tmpRows)
}
func putTmpRows(trs *tmpRows) {
trs.reset()
tmpRowsPool.Put(trs)
}
var tmpRowsPool sync.Pool
func getInmemoryPart() *inmemoryPart {
v := inmemoryPartPool.Get()
if v == nil {
return &inmemoryPart{}
}
return v.(*inmemoryPart)
}
func putInmemoryPart(mp *inmemoryPart) {
mp.reset()
inmemoryPartPool.Put(mp)
}
var inmemoryPartPool sync.Pool

View file

@ -0,0 +1,343 @@
package logstorage
import (
"fmt"
"math"
"math/rand"
"reflect"
"sort"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
func TestInmemoryPartMustInitFromRows(t *testing.T) {
f := func(lr *LogRows, blocksCountExpected int, compressionRateExpected float64) {
t.Helper()
uncompressedSizeBytesExpected := uncompressedRowsSizeBytes(lr.rows)
rowsCountExpected := len(lr.timestamps)
minTimestampExpected := int64(math.MaxInt64)
maxTimestampExpected := int64(math.MinInt64)
// make a copy of lr - it is used for comparing the results later,
// since lr may be modified by inmemoryPart.mustInitFromRows()
lrOrig := GetLogRows(nil, nil)
for i, timestamp := range lr.timestamps {
if timestamp < minTimestampExpected {
minTimestampExpected = timestamp
}
if timestamp > maxTimestampExpected {
maxTimestampExpected = timestamp
}
lrOrig.mustAddInternal(lr.streamIDs[i], timestamp, lr.rows[i], lr.streamTagsCanonicals[i])
}
// Create inmemory part from lr
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
// Check mp.ph
ph := &mp.ph
checkCompressionRate(t, ph, compressionRateExpected)
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
t.Fatalf("unexpected UncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
}
if ph.RowsCount != uint64(rowsCountExpected) {
t.Fatalf("unexpected rowsCount in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
}
if ph.BlocksCount != uint64(blocksCountExpected) {
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
}
if ph.RowsCount > 0 {
if ph.MinTimestamp != minTimestampExpected {
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
}
if ph.MaxTimestamp != maxTimestampExpected {
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
}
}
// Read log entries from mp to lrResult
sbu := getStringsBlockUnmarshaler()
defer putStringsBlockUnmarshaler(sbu)
vd := getValuesDecoder()
defer putValuesDecoder(vd)
lrResult := mp.readLogRows(sbu, vd)
putInmemoryPart(mp)
// compare lrOrig to lrResult
if err := checkEqualRows(lrResult, lrOrig); err != nil {
t.Fatalf("unequal log entries: %s", err)
}
}
f(GetLogRows(nil, nil), 0, 0)
// Check how inmemoryPart works with a single stream
f(newTestLogRows(1, 1, 0), 1, 0.8)
f(newTestLogRows(1, 2, 0), 1, 0.9)
f(newTestLogRows(1, 10, 0), 1, 2.0)
f(newTestLogRows(1, 1000, 0), 1, 7.1)
f(newTestLogRows(1, 20000, 0), 2, 7.2)
// Check how inmemoryPart works with multiple streams
f(newTestLogRows(2, 1, 0), 2, 0.8)
f(newTestLogRows(10, 1, 0), 10, 0.9)
f(newTestLogRows(100, 1, 0), 100, 1.0)
f(newTestLogRows(10, 5, 0), 10, 1.4)
f(newTestLogRows(10, 1000, 0), 10, 7.2)
f(newTestLogRows(100, 100, 0), 100, 5.0)
}
func checkCompressionRate(t *testing.T, ph *partHeader, compressionRateExpected float64) {
t.Helper()
compressionRate := float64(ph.UncompressedSizeBytes) / float64(ph.CompressedSizeBytes)
if math.Abs(compressionRate-compressionRateExpected) > 0.1 {
t.Fatalf("unexpected compression rate; got %.1f; want %.1f", compressionRate, compressionRateExpected)
}
}
func TestInmemoryPartInitFromBlockStreamReaders(t *testing.T) {
f := func(lrs []*LogRows, blocksCountExpected int, compressionRateExpected float64) {
t.Helper()
uncompressedSizeBytesExpected := uint64(0)
rowsCountExpected := 0
minTimestampExpected := int64(math.MaxInt64)
maxTimestampExpected := int64(math.MinInt64)
// make a copy of lrs in order to compare the results after merge.
lrOrig := GetLogRows(nil, nil)
for _, lr := range lrs {
uncompressedSizeBytesExpected += uncompressedRowsSizeBytes(lr.rows)
rowsCountExpected += len(lr.timestamps)
for j, timestamp := range lr.timestamps {
if timestamp < minTimestampExpected {
minTimestampExpected = timestamp
}
if timestamp > maxTimestampExpected {
maxTimestampExpected = timestamp
}
lrOrig.mustAddInternal(lr.streamIDs[j], timestamp, lr.rows[j], lr.streamTagsCanonicals[j])
}
}
// Initialize readers from lrs
var mpsSrc []*inmemoryPart
var bsrs []*blockStreamReader
for _, lr := range lrs {
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
mpsSrc = append(mpsSrc, mp)
bsr := getBlockStreamReader()
bsr.MustInitFromInmemoryPart(mp)
bsrs = append(bsrs, bsr)
}
defer func() {
for _, bsr := range bsrs {
putBlockStreamReader(bsr)
}
for _, mp := range mpsSrc {
putInmemoryPart(mp)
}
}()
// Merge data from bsrs into mpDst
mpDst := getInmemoryPart()
bsw := getBlockStreamWriter()
bsw.MustInitForInmemoryPart(mpDst)
mustMergeBlockStreams(&mpDst.ph, bsw, bsrs, nil)
putBlockStreamWriter(bsw)
// Check mpDst.ph stats
ph := &mpDst.ph
checkCompressionRate(t, ph, compressionRateExpected)
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
t.Fatalf("unexpected uncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
}
if ph.RowsCount != uint64(rowsCountExpected) {
t.Fatalf("unexpected number of entries in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
}
if ph.BlocksCount != uint64(blocksCountExpected) {
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
}
if ph.RowsCount > 0 {
if ph.MinTimestamp != minTimestampExpected {
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
}
if ph.MaxTimestamp != maxTimestampExpected {
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
}
}
// Read log entries from mpDst to lrResult
sbu := getStringsBlockUnmarshaler()
defer putStringsBlockUnmarshaler(sbu)
vd := getValuesDecoder()
defer putValuesDecoder(vd)
lrResult := mpDst.readLogRows(sbu, vd)
putInmemoryPart(mpDst)
// compare lrOrig to lrResult
if err := checkEqualRows(lrResult, lrOrig); err != nil {
t.Fatalf("unequal log entries: %s", err)
}
}
// Check empty readers
f(nil, 0, 0)
f([]*LogRows{GetLogRows(nil, nil)}, 0, 0)
f([]*LogRows{GetLogRows(nil, nil), GetLogRows(nil, nil)}, 0, 0)
// Check merge with a single reader
f([]*LogRows{newTestLogRows(1, 1, 0)}, 1, 0.8)
f([]*LogRows{newTestLogRows(1, 10, 0)}, 1, 2.0)
f([]*LogRows{newTestLogRows(1, 100, 0)}, 1, 4.9)
f([]*LogRows{newTestLogRows(1, 1000, 0)}, 1, 7.1)
f([]*LogRows{newTestLogRows(1, 10000, 0)}, 1, 7.4)
f([]*LogRows{newTestLogRows(10, 1, 0)}, 10, 0.9)
f([]*LogRows{newTestLogRows(100, 1, 0)}, 100, 1.0)
f([]*LogRows{newTestLogRows(1000, 1, 0)}, 1000, 1.0)
f([]*LogRows{newTestLogRows(10, 10, 0)}, 10, 2.1)
f([]*LogRows{newTestLogRows(10, 100, 0)}, 10, 4.9)
// Check merge with multiple readers
f([]*LogRows{
newTestLogRows(1, 1, 0),
newTestLogRows(1, 1, 1),
}, 2, 0.9)
f([]*LogRows{
newTestLogRows(2, 2, 0),
newTestLogRows(2, 2, 0),
}, 2, 1.8)
f([]*LogRows{
newTestLogRows(1, 20, 0),
newTestLogRows(1, 10, 1),
newTestLogRows(1, 5, 2),
}, 3, 2.2)
f([]*LogRows{
newTestLogRows(10, 20, 0),
newTestLogRows(20, 10, 1),
newTestLogRows(30, 5, 2),
}, 60, 2.0)
f([]*LogRows{
newTestLogRows(10, 20, 0),
newTestLogRows(20, 10, 1),
newTestLogRows(30, 5, 2),
newTestLogRows(20, 7, 3),
newTestLogRows(10, 9, 4),
}, 90, 1.9)
}
func newTestLogRows(streams, rowsPerStream int, seed int64) *LogRows {
streamTags := []string{
"some-stream-tag",
}
lr := GetLogRows(streamTags, nil)
rng := rand.New(rand.NewSource(seed))
var fields []Field
for i := 0; i < streams; i++ {
tenantID := TenantID{
AccountID: rng.Uint32(),
ProjectID: rng.Uint32(),
}
for j := 0; j < rowsPerStream; j++ {
// Add stream tags
fields = append(fields[:0], Field{
Name: "some-stream-tag",
Value: fmt.Sprintf("some-stream-value-%d", i),
})
// Add the remaining tags
for k := 0; k < 5; k++ {
if rng.Float64() < 0.5 {
fields = append(fields, Field{
Name: fmt.Sprintf("field_%d", k),
Value: fmt.Sprintf("value_%d_%d_%d", i, j, k),
})
}
}
// add a message field
fields = append(fields, Field{
Name: "",
Value: fmt.Sprintf("some row number %d at stream %d", j, i),
})
// add a field with constant value
fields = append(fields, Field{
Name: "job",
Value: "foobar",
})
// add a field with uint value
fields = append(fields, Field{
Name: "response_size_bytes",
Value: fmt.Sprintf("%d", rng.Intn(1234)),
})
// shuffle fields in order to check the de-shuffling algorithm
rng.Shuffle(len(fields), func(i, j int) {
fields[i], fields[j] = fields[j], fields[i]
})
timestamp := rng.Int63()
lr.MustAdd(tenantID, timestamp, fields)
}
}
return lr
}
func checkEqualRows(lrResult, lrOrig *LogRows) error {
if len(lrResult.timestamps) != len(lrOrig.timestamps) {
return fmt.Errorf("unexpected length LogRows; got %d; want %d", len(lrResult.timestamps), len(lrOrig.timestamps))
}
sort.Sort(lrResult)
sort.Sort(lrOrig)
sortFieldNames := func(fields []Field) {
sort.Slice(fields, func(i, j int) bool {
return fields[i].Name < fields[j].Name
})
}
for i := range lrOrig.timestamps {
if !lrOrig.streamIDs[i].equal(&lrResult.streamIDs[i]) {
return fmt.Errorf("unexpected streamID for log entry %d\ngot\n%s\nwant\n%s", i, &lrResult.streamIDs[i], &lrOrig.streamIDs[i])
}
if lrOrig.timestamps[i] != lrResult.timestamps[i] {
return fmt.Errorf("unexpected timestamp for log entry %d\ngot\n%d\nwant\n%d", i, lrResult.timestamps[i], lrOrig.timestamps[i])
}
fieldsOrig := lrOrig.rows[i]
fieldsResult := lrResult.rows[i]
if len(fieldsOrig) != len(fieldsResult) {
return fmt.Errorf("unexpected number of fields at log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
}
sortFieldNames(fieldsOrig)
sortFieldNames(fieldsResult)
if !reflect.DeepEqual(fieldsOrig, fieldsResult) {
return fmt.Errorf("unexpected fields for log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
}
}
return nil
}
// readLogRows reads log entries from mp.
//
// This function is for testing and debugging purposes only.
func (mp *inmemoryPart) readLogRows(sbu *stringsBlockUnmarshaler, vd *valuesDecoder) *LogRows {
lr := GetLogRows(nil, nil)
bsr := getBlockStreamReader()
defer putBlockStreamReader(bsr)
bsr.MustInitFromInmemoryPart(mp)
var tmp rows
for bsr.NextBlock() {
bd := &bsr.blockData
streamID := bd.streamID
if err := bd.unmarshalRows(&tmp, sbu, vd); err != nil {
logger.Panicf("BUG: cannot unmarshal log entries from inmemoryPart: %s", err)
}
for i, timestamp := range tmp.timestamps {
lr.MustAdd(streamID.tenantID, timestamp, tmp.rows[i])
lr.streamIDs[len(lr.streamIDs)-1] = streamID
}
tmp.reset()
}
return lr
}

View file

@ -0,0 +1,34 @@
package logstorage
import (
"fmt"
"testing"
)
func BenchmarkInmemoryPart_MustInitFromRows(b *testing.B) {
for _, streams := range []int{1, 10, 100} {
b.Run(fmt.Sprintf("streams_%d", streams), func(b *testing.B) {
for _, rowsPerStream := range []int{1, 10, 100, 1000} {
b.Run(fmt.Sprintf("rowsPerStream_%d", rowsPerStream), func(b *testing.B) {
benchmarkInmemoryPartMustInitFromRows(b, streams, rowsPerStream)
})
}
})
}
}
func benchmarkInmemoryPartMustInitFromRows(b *testing.B, streams, rowsPerStream int) {
b.ReportAllocs()
b.SetBytes(int64(streams * rowsPerStream))
b.RunParallel(func(pb *testing.PB) {
lr := newTestLogRows(streams, rowsPerStream, 0)
mp := getInmemoryPart()
for pb.Next() {
mp.mustInitFromRows(lr)
if mp.ph.RowsCount != uint64(len(lr.timestamps)) {
panic(fmt.Errorf("unexpecte number of entries in the output stream; got %d; want %d", mp.ph.RowsCount, len(lr.timestamps)))
}
}
putInmemoryPart(mp)
})
}

277
lib/logstorage/log_rows.go Normal file
View file

@ -0,0 +1,277 @@
package logstorage
import (
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
)
// LogRows holds a set of rows needed for Storage.MustAddRows
//
// LogRows must be obtained via GetLogRows()
type LogRows struct {
// buf holds all the bytes referred by items in LogRows
buf []byte
// fieldsBuf holds all the fields referred by items in LogRows
fieldsBuf []Field
// streamIDs holds streamIDs for rows added to LogRows
streamIDs []streamID
// streamTagsCanonicals holds streamTagsCanonical entries for rows added to LogRows
streamTagsCanonicals [][]byte
// timestamps holds timestamps for rows added to LogRows
timestamps []int64
// rows holds fields for rows added to LogRows.
rows [][]Field
// sf is a helper for sorting fields in every added row
sf sortedFields
// streamFields contains names for stream fields
streamFields map[string]struct{}
// ignoreFields contains names for log fields, which must be skipped during data ingestion
ignoreFields map[string]struct{}
}
type sortedFields []Field
func (sf *sortedFields) Len() int {
return len(*sf)
}
func (sf *sortedFields) Less(i, j int) bool {
a := *sf
return a[i].Name < a[j].Name
}
func (sf *sortedFields) Swap(i, j int) {
a := *sf
a[i], a[j] = a[j], a[i]
}
// RowFormatter implements fmt.Stringer for []Field aka a single log row
type RowFormatter []Field
// String returns user-readable representation for rf
func (rf *RowFormatter) String() string {
b := append([]byte{}, '{')
fields := *rf
if len(fields) > 0 {
b = append(b, fields[0].String()...)
fields = fields[1:]
for _, field := range fields {
b = append(b, ',')
b = append(b, field.String()...)
}
}
b = append(b, '}')
return string(b)
}
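// exampleRowFormatterUsage is an illustrative sketch and is not used elsewhere.
// It shows how to obtain a user-readable representation of a single row via RowFormatter;
// the exact per-field formatting is defined by Field.String().
func exampleRowFormatterUsage(fields []Field) string {
	rf := RowFormatter(fields)
	return rf.String()
}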
// Reset resets lr
func (lr *LogRows) Reset() {
lr.buf = lr.buf[:0]
fb := lr.fieldsBuf
for i := range fb {
fb[i].Reset()
}
lr.fieldsBuf = fb[:0]
sids := lr.streamIDs
for i := range sids {
sids[i].reset()
}
lr.streamIDs = sids[:0]
sns := lr.streamTagsCanonicals
for i := range sns {
sns[i] = nil
}
lr.streamTagsCanonicals = sns[:0]
lr.timestamps = lr.timestamps[:0]
rows := lr.rows
for i := range rows {
rows[i] = nil
}
lr.rows = rows[:0]
lr.sf = nil
sfs := lr.streamFields
for k := range sfs {
delete(sfs, k)
}
ifs := lr.ignoreFields
for k := range ifs {
delete(ifs, k)
}
}
// NeedFlush returns true if lr contains too much data, so it must be flushed to the storage.
func (lr *LogRows) NeedFlush() bool {
return len(lr.buf) > (maxUncompressedBlockSize/8)*7
}
// MustAdd adds a log entry with the given args to lr.
//
// It is OK to modify the args after returning from the function,
// since lr copies all the args to internal data.
func (lr *LogRows) MustAdd(tenantID TenantID, timestamp int64, fields []Field) {
// Compose StreamTags from fields according to lr.streamFields
sfs := lr.streamFields
st := GetStreamTags()
for i := range fields {
f := &fields[i]
if _, ok := sfs[f.Name]; ok {
st.Add(f.Name, f.Value)
}
}
// Marshal StreamTags
bb := bbPool.Get()
bb.B = st.MarshalCanonical(bb.B)
PutStreamTags(st)
// Calculate the id for the StreamTags
var sid streamID
sid.tenantID = tenantID
sid.id = hash128(bb.B)
// Store the row
lr.mustAddInternal(sid, timestamp, fields, bb.B)
bbPool.Put(bb)
}
func (lr *LogRows) mustAddInternal(sid streamID, timestamp int64, fields []Field, streamTagsCanonical []byte) {
buf := lr.buf
bufLen := len(buf)
buf = append(buf, streamTagsCanonical...)
lr.streamTagsCanonicals = append(lr.streamTagsCanonicals, buf[bufLen:])
lr.streamIDs = append(lr.streamIDs, sid)
lr.timestamps = append(lr.timestamps, timestamp)
// Store all the fields
ifs := lr.ignoreFields
fb := lr.fieldsBuf
fieldsLen := len(fb)
for i := range fields {
f := &fields[i]
if _, ok := ifs[f.Name]; ok {
// Skip fields from the ifs map
continue
}
if f.Value == "" {
// Skip fields without values
continue
}
fb = append(fb, Field{})
dstField := &fb[len(fb)-1]
bufLen = len(buf)
if f.Name != "_msg" {
buf = append(buf, f.Name...)
}
dstField.Name = bytesutil.ToUnsafeString(buf[bufLen:])
bufLen = len(buf)
buf = append(buf, f.Value...)
dstField.Value = bytesutil.ToUnsafeString(buf[bufLen:])
}
lr.sf = fb[fieldsLen:]
sort.Sort(&lr.sf)
lr.rows = append(lr.rows, lr.sf)
lr.fieldsBuf = fb
lr.buf = buf
}
// GetLogRows returns LogRows from the pool for the given streamFields and ignoreFields.
//
// streamFields is a set of field names, which must be associated with the stream.
// ignoreFields is a set of field names, which must be skipped during data ingestion.
//
// Return the LogRows to the pool with PutLogRows() when it is no longer needed.
func GetLogRows(streamFields, ignoreFields []string) *LogRows {
v := logRowsPool.Get()
if v == nil {
v = &LogRows{}
}
lr := v.(*LogRows)
// Initialize streamFields
sfs := lr.streamFields
if sfs == nil {
sfs = make(map[string]struct{}, len(streamFields))
lr.streamFields = sfs
}
for _, f := range streamFields {
sfs[f] = struct{}{}
}
// Initialize ignoreFields
ifs := lr.ignoreFields
if ifs == nil {
ifs = make(map[string]struct{}, len(ignoreFields))
lr.ignoreFields = ifs
}
for _, f := range ignoreFields {
if f != "" {
ifs[f] = struct{}{}
}
}
return lr
}
// PutLogRows returns lr to the pool.
func PutLogRows(lr *LogRows) {
lr.Reset()
logRowsPool.Put(lr)
}
var logRowsPool sync.Pool
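// exampleLogRowsUsage is an illustrative sketch and is not part of the ingestion path.
// It shows the intended GetLogRows / MustAdd / PutLogRows lifecycle together with the NeedFlush
// hint. The flushToStorage callback is a stand-in for caller-side code such as Storage.MustAddRows.
func exampleLogRowsUsage(tenantID TenantID, timestamps []int64, rows [][]Field, flushToStorage func(lr *LogRows)) {
	streamFields := []string{"job", "instance"}
	lr := GetLogRows(streamFields, nil)
	for i, fields := range rows {
		lr.MustAdd(tenantID, timestamps[i], fields)
		if lr.NeedFlush() {
			// Flush the accumulated rows and start over with a fresh LogRows,
			// since Reset() also clears the streamFields / ignoreFields settings.
			flushToStorage(lr)
			PutLogRows(lr)
			lr = GetLogRows(streamFields, nil)
		}
	}
	flushToStorage(lr)
	PutLogRows(lr)
}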
// Len returns the number of items in lr.
func (lr *LogRows) Len() int {
return len(lr.streamIDs)
}
// Less returns true if (streamID, timestamp) for row i is smaller than the (streamID, timestamp) for row j
func (lr *LogRows) Less(i, j int) bool {
a := &lr.streamIDs[i]
b := &lr.streamIDs[j]
if !a.equal(b) {
return a.less(b)
}
return lr.timestamps[i] < lr.timestamps[j]
}
// Swap swaps rows i and j in lr.
func (lr *LogRows) Swap(i, j int) {
a := &lr.streamIDs[i]
b := &lr.streamIDs[j]
*a, *b = *b, *a
tsA, tsB := &lr.timestamps[i], &lr.timestamps[j]
*tsA, *tsB = *tsB, *tsA
snA, snB := &lr.streamTagsCanonicals[i], &lr.streamTagsCanonicals[j]
*snA, *snB = *snB, *snA
fieldsA, fieldsB := &lr.rows[i], &lr.rows[j]
*fieldsA, *fieldsB = *fieldsB, *fieldsA
}

View file

@ -0,0 +1,83 @@
package logstorage
import (
"testing"
)
func BenchmarkLogRowsMustAdd(b *testing.B) {
rows := newBenchRows(map[string]string{
"input.type": "filestream",
"ecs.version": "8.0.0",
"host.hostname": "foobar-baz-abc",
"host.architecture": "x86_64",
"host.name": "foobar-baz-abc",
"host.os.codename": "bionic",
"host.os.type": "linux",
"host.os.platform": "ubuntu",
"host.os.version": "18.04.6 LTS (Bionic Beaver)",
"host.os.family": "debian",
"host.os.name": "Ubuntu",
"host.os.kernel": "4.15.0-211-generic",
"host.id": "a634d50249af449dbcb3ce724822568a",
"host.containerized": "false",
"host.ip": `["10.0.0.42","10.224.112.1","172.20.0.1","172.18.0.1","172.19.0.1","fc00:f853:ccd:e793::1","fe80::1","172.21.0.1","172.17.0.1"]`,
"host.mac": `["02-42-42-90-52-D9","02-42-C6-48-A6-84","02-42-FD-91-7E-17","52-54-00-F5-13-E7","54-E1-AD-89-1A-4C","F8-34-41-3C-C0-85"]`,
"agent.ephemeral_id": "6c251f67-7210-4cef-8f72-a9546cbb48cc",
"agent.id": "e97243c5-5ef3-4dc1-8828-504f68731e87",
"agent.name": "foobar-baz-abc",
"agent.type": "filebeat",
"agent.version": "8.8.0",
"log.file.path": "/var/log/auth.log",
"log.offset": "37908",
}, []string{
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=0)",
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=1)",
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=2)",
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=3)",
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=4)",
})
streamFields := []string{
"host.hostname",
"agent.name",
"log.file.path",
}
b.ReportAllocs()
b.SetBytes(int64(len(rows)))
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
benchmarkLogRowsMustAdd(rows, streamFields)
}
})
}
func benchmarkLogRowsMustAdd(rows [][]Field, streamFields []string) {
lr := GetLogRows(streamFields, nil)
var tid TenantID
for i, fields := range rows {
tid.AccountID = uint32(i)
tid.ProjectID = uint32(2 * i)
timestamp := int64(i) * 1000
lr.MustAdd(tid, timestamp, fields)
}
PutLogRows(lr)
}
func newBenchRows(constFields map[string]string, messages []string) [][]Field {
rows := make([][]Field, 0, len(messages))
for _, msg := range messages {
row := make([]Field, 0, len(constFields)+1)
for k, v := range constFields {
row = append(row, Field{
Name: k,
Value: v,
})
}
row = append(row, Field{
Name: "_msg",
Value: msg,
})
rows = append(rows, row)
}
return rows
}

1100
lib/logstorage/parser.go Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,966 @@
package logstorage
import (
"math"
"reflect"
"testing"
"time"
)
func TestLexer(t *testing.T) {
f := func(s string, tokensExpected []string) {
t.Helper()
lex := newLexer(s)
for _, tokenExpected := range tokensExpected {
lex.nextToken()
if lex.token != tokenExpected {
t.Fatalf("unexpected token; got %q; want %q", lex.token, tokenExpected)
}
}
lex.nextToken()
if lex.token != "" {
t.Fatalf("unexpected tail token: %q", lex.token)
}
}
f("", nil)
f(" ", nil)
f("foo", []string{"foo"})
f(ест123", []string{ест123"})
f("foo:bar", []string{"foo", ":", "bar"})
f(` re ( "тест(\":" ) `, []string{"re", "(", `тест(":`, ")"})
f(" `foo, bar`* AND baz:(abc or 'd\\'\"ЙЦУК `'*)", []string{"foo, bar", "*", "AND", "baz", ":", "(", "abc", "or", `d'"ЙЦУК ` + "`", "*", ")"})
f(`_stream:{foo="bar",a=~"baz", b != 'cd',"d,}a"!~abc}`,
[]string{"_stream", ":", "{", "foo", "=", "bar", ",", "a", "=~", "baz", ",", "b", "!=", "cd", ",", "d,}a", "!~", "abc", "}"})
}
func TestNewStreamFilterSuccess(t *testing.T) {
f := func(s, resultExpected string) {
t.Helper()
sf, err := newStreamFilter(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
result := sf.String()
if result != resultExpected {
t.Fatalf("unexpected StreamFilter; got %s; want %s", result, resultExpected)
}
}
f("{}", "{}")
f(`{foo="bar"}`, `{foo="bar"}`)
f(`{ "foo" =~ "bar.+" , baz!="a" or x="y"}`, `{foo=~"bar.+",baz!="a" or x="y"}`)
f(`{"a b"='c}"d' OR de="aaa"}`, `{"a b"="c}\"d" or de="aaa"}`)
f(`{a="b", c="d" or x="y"}`, `{a="b",c="d" or x="y"}`)
}
func TestNewStreamFilterFailure(t *testing.T) {
f := func(s string) {
t.Helper()
sf, err := newStreamFilter(s)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if sf != nil {
t.Fatalf("expecting nil sf; got %v", sf)
}
}
f("")
f("}")
f("{")
f("{foo")
f("{foo}")
f("{'foo")
f("{foo=")
f("{foo or bar}")
f("{foo=bar")
f("{foo=bar baz}")
f("{foo='bar' baz='x'}")
}
func TestParseTimeRange(t *testing.T) {
f := func(s string, minTimestampExpected, maxTimestampExpected int64) {
t.Helper()
q, err := ParseQuery("_time:" + s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
tf, ok := q.f.(*timeFilter)
if !ok {
t.Fatalf("unexpected filter; got %T; want *timeFilter; filter: %s", q.f, q.f)
}
if tf.stringRepr != s {
t.Fatalf("unexpected string represenation for timeFilter; got %q; want %q", tf.stringRepr, s)
}
if tf.minTimestamp != minTimestampExpected {
t.Fatalf("unexpected minTimestamp; got %s; want %s", timestampToString(tf.minTimestamp), timestampToString(minTimestampExpected))
}
if tf.maxTimestamp != maxTimestampExpected {
t.Fatalf("unexpected maxTimestamp; got %s; want %s", timestampToString(tf.maxTimestamp), timestampToString(maxTimestampExpected))
}
}
var minTimestamp, maxTimestamp int64
// _time:YYYY -> _time:[YYYY, YYYY+1)
minTimestamp = time.Date(2023, time.January, 1, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2024, time.January, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023", minTimestamp, maxTimestamp)
f("2023Z", minTimestamp, maxTimestamp)
// _time:YYYY-hh:mm -> _time:[YYYY-hh:mm, (YYYY+1)-hh:mm)
minTimestamp = time.Date(2023, time.January, 1, 2, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2024, time.January, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02:00", minTimestamp, maxTimestamp)
// _time:YYYY+hh:mm -> _time:[YYYY+hh:mm, (YYYY+1)+hh:mm)
minTimestamp = time.Date(2022, time.December, 31, 22, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.December, 31, 22, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023+02:00", minTimestamp, maxTimestamp)
// _time:YYYY-MM -> _time:[YYYY-MM, YYYY-MM+1)
minTimestamp = time.Date(2023, time.February, 1, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02", minTimestamp, maxTimestamp)
f("2023-02Z", minTimestamp, maxTimestamp)
// _time:YYYY-MM-hh:mm -> _time:[YYYY-MM-hh:mm, (YYYY-MM+1)-hh:mm)
minTimestamp = time.Date(2023, time.February, 1, 2, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-02:00", minTimestamp, maxTimestamp)
// March
minTimestamp = time.Date(2023, time.March, 1, 2, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.April, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-03-02:00", minTimestamp, maxTimestamp)
// _time:YYYY-MM+hh:mm -> _time:[YYYY-MM+hh:mm, (YYYY-MM+1)+hh:mm)
minTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 31, 21, 35, 0, 0, time.UTC).UnixNano() - 1
f("2023-03+02:25", minTimestamp, maxTimestamp)
// February with timezone offset
minTimestamp = time.Date(2023, time.January, 31, 21, 35, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano() - 1
f("2023-02+02:25", minTimestamp, maxTimestamp)
// February with timezone offset at leap year
minTimestamp = time.Date(2024, time.January, 31, 21, 35, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2024, time.February, 29, 21, 35, 0, 0, time.UTC).UnixNano() - 1
f("2024-02+02:25", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DD
minTimestamp = time.Date(2023, time.February, 12, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.February, 13, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-12", minTimestamp, maxTimestamp)
f("2023-02-12Z", minTimestamp, maxTimestamp)
// February 28
minTimestamp = time.Date(2023, time.February, 28, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28", minTimestamp, maxTimestamp)
// January 31
minTimestamp = time.Date(2023, time.January, 31, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.February, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-01-31", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DD-hh:mm
minTimestamp = time.Date(2023, time.January, 31, 2, 25, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.February, 1, 2, 25, 0, 0, time.UTC).UnixNano() - 1
f("2023-01-31-02:25", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DD+hh:mm
minTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 21, 35, 0, 0, time.UTC).UnixNano() - 1
f("2023-03-01+02:25", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH
minTimestamp = time.Date(2023, time.February, 28, 23, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28T23", minTimestamp, maxTimestamp)
f("2023-02-28T23Z", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH-hh:mm
minTimestamp = time.Date(2023, time.February, 28, 01, 25, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.February, 28, 02, 25, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-27T23-02:25", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH+hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 35, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 00, 35, 0, 0, time.UTC).UnixNano() - 1
f("2023-03-01T02+02:25", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28T23:59", minTimestamp, maxTimestamp)
f("2023-02-28T23:59Z", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM-hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28T22:59-01:00", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM+hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-03-01T00:59+01:00", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM:SS-hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28T23:59:59", minTimestamp, maxTimestamp)
f("2023-02-28T23:59:59Z", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM:SS-hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-02-28T22:59:59-01:00", minTimestamp, maxTimestamp)
// _time:YYYY-MM-DDTHH:MM:SS+hh:mm
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f("2023-03-01T00:59:59+01:00", minTimestamp, maxTimestamp)
// _time:(start, end)
minTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() + 1
maxTimestamp = time.Date(2023, time.April, 6, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f(`(2023-03-01,2023-04-06)`, minTimestamp, maxTimestamp)
// _time:[start, end)
minTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.April, 6, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f(`[2023-03-01,2023-04-06)`, minTimestamp, maxTimestamp)
// _time:(start, end]
minTimestamp = time.Date(2023, time.March, 1, 21, 20, 0, 0, time.UTC).UnixNano() + 1
maxTimestamp = time.Date(2023, time.April, 7, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f(`(2023-03-01T21:20,2023-04-06]`, minTimestamp, maxTimestamp)
// _time:[start, end]
minTimestamp = time.Date(2023, time.February, 28, 21, 40, 0, 0, time.UTC).UnixNano()
maxTimestamp = time.Date(2023, time.April, 7, 0, 0, 0, 0, time.UTC).UnixNano() - 1
f(`[2023-03-01+02:20,2023-04-06T23]`, minTimestamp, maxTimestamp)
}
func TestParseSequenceFilter(t *testing.T) {
f := func(s, fieldNameExpected string, phrasesExpected []string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
sf, ok := q.f.(*sequenceFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *sequenceFilter; filter: %s", q.f, q.f)
}
if sf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", sf.fieldName, fieldNameExpected)
}
if !reflect.DeepEqual(sf.phrases, phrasesExpected) {
t.Fatalf("unexpected phrases\ngot\n%q\nwant\n%q", sf.phrases, phrasesExpected)
}
}
f(`seq()`, ``, nil)
f(`foo:seq(foo)`, `foo`, []string{"foo"})
f(`_msg:seq("foo bar,baz")`, `_msg`, []string{"foo bar,baz"})
f(`seq(foo,bar-baz.aa"bb","c,)d")`, ``, []string{"foo", `bar-baz.aa"bb"`, "c,)d"})
}
func TestParseInFilter(t *testing.T) {
f := func(s, fieldNameExpected string, valuesExpected []string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
f, ok := q.f.(*inFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *inFilter; filter: %s", q.f, q.f)
}
if f.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", f.fieldName, fieldNameExpected)
}
if !reflect.DeepEqual(f.values, valuesExpected) {
t.Fatalf("unexpected values\ngot\n%q\nwant\n%q", f.values, valuesExpected)
}
}
f(`in()`, ``, nil)
f(`foo:in(foo)`, `foo`, []string{"foo"})
f(`:in("foo bar,baz")`, ``, []string{"foo bar,baz"})
f(`ip:in(1.2.3.4, 5.6.7.8, 9.10.11.12)`, `ip`, []string{"1.2.3.4", "5.6.7.8", "9.10.11.12"})
f(`foo-bar:in(foo,bar-baz.aa"bb","c,)d")`, `foo-bar`, []string{"foo", `bar-baz.aa"bb"`, "c,)d"})
}
func TestParseIPv4RangeFilter(t *testing.T) {
f := func(s, fieldNameExpected string, minValueExpected, maxValueExpected uint32) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
rf, ok := q.f.(*ipv4RangeFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *ipv4RangeFilter; filter: %s", q.f, q.f)
}
if rf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
}
if rf.minValue != minValueExpected {
t.Fatalf("unexpected minValue; got %08x; want %08x", rf.minValue, minValueExpected)
}
if rf.maxValue != maxValueExpected {
t.Fatalf("unexpected maxValue; got %08x; want %08x", rf.maxValue, maxValueExpected)
}
}
f(`ipv4_range(1.2.3.4, 5.6.7.8)`, ``, 0x01020304, 0x05060708)
f(`_msg:ipv4_range("0.0.0.0", 255.255.255.255)`, `_msg`, 0, 0xffffffff)
f(`ip:ipv4_range(1.2.3.0/24)`, `ip`, 0x01020300, 0x010203ff)
f(`:ipv4_range("1.2.3.34/24")`, ``, 0x01020300, 0x010203ff)
f(`ipv4_range("1.2.3.34/20")`, ``, 0x01020000, 0x01020fff)
f(`ipv4_range("1.2.3.15/32")`, ``, 0x0102030f, 0x0102030f)
f(`ipv4_range(1.2.3.34/0)`, ``, 0, 0xffffffff)
}
func TestParseStringRangeFilter(t *testing.T) {
f := func(s, fieldNameExpected, minValueExpected, maxValueExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
rf, ok := q.f.(*stringRangeFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *stringRangeFilter; filter: %s", q.f, q.f)
}
if rf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
}
if rf.minValue != minValueExpected {
t.Fatalf("unexpected minValue; got %q; want %q", rf.minValue, minValueExpected)
}
if rf.maxValue != maxValueExpected {
t.Fatalf("unexpected maxValue; got %q; want %q", rf.maxValue, maxValueExpected)
}
}
f("string_range(foo, bar)", ``, "foo", "bar")
f(`abc:string_range("foo,bar", "baz) !")`, `abc`, `foo,bar`, `baz) !`)
}
func TestParseRegexpFilter(t *testing.T) {
f := func(s, reExpected string) {
t.Helper()
q, err := ParseQuery("re(" + s + ")")
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
rf, ok := q.f.(*regexpFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *regexpFilter; filter: %s", q.f, q.f)
}
if reString := rf.re.String(); reString != reExpected {
t.Fatalf("unexpected regexp; got %q; want %q", reString, reExpected)
}
}
f(`""`, ``)
f(`foo`, `foo`)
f(`"foo.+|bar.*"`, `foo.+|bar.*`)
f(`"foo(bar|baz),x[y]"`, `foo(bar|baz),x[y]`)
}
func TestParseAnyCasePhraseFilter(t *testing.T) {
f := func(s, fieldNameExpected, phraseExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
pf, ok := q.f.(*anyCasePhraseFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *anyCasePhraseFilter; filter: %s", q.f, q.f)
}
if pf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
}
if pf.phrase != phraseExpected {
t.Fatalf("unexpected phrase; got %q; want %q", pf.phrase, phraseExpected)
}
}
f(`i("")`, ``, ``)
f(`i(foo)`, ``, `foo`)
f(`abc-de.fg:i(foo-bar+baz)`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":i("foo-bar+baz")`, `abc-de.fg`, `foo-bar+baz`)
}
func TestParseAnyCasePrefixFilter(t *testing.T) {
f := func(s, fieldNameExpected, prefixExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
pf, ok := q.f.(*anyCasePrefixFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *anyCasePrefixFilter; filter: %s", q.f, q.f)
}
if pf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
}
if pf.prefix != prefixExpected {
t.Fatalf("unexpected prefix; got %q; want %q", pf.prefix, prefixExpected)
}
}
f(`i(*)`, ``, ``)
f(`i(""*)`, ``, ``)
f(`i(foo*)`, ``, `foo`)
f(`abc-de.fg:i(foo-bar+baz*)`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":i("foo-bar+baz"*)`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":i("foo-bar*baz *"*)`, `abc-de.fg`, `foo-bar*baz *`)
}
func TestParsePhraseFilter(t *testing.T) {
f := func(s, fieldNameExpected, phraseExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
pf, ok := q.f.(*phraseFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *phraseFilter; filter: %s", q.f, q.f)
}
if pf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
}
if pf.phrase != phraseExpected {
t.Fatalf("unexpected prefix; got %q; want %q", pf.phrase, phraseExpected)
}
}
f(`""`, ``, ``)
f(`foo`, ``, `foo`)
f(`abc-de.fg:foo-bar+baz`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":"foo-bar+baz"`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":"foo-bar*baz *"`, `abc-de.fg`, `foo-bar*baz *`)
f(`"foo:bar*,( baz"`, ``, `foo:bar*,( baz`)
}
func TestParsePrefixFilter(t *testing.T) {
f := func(s, fieldNameExpected, prefixExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
pf, ok := q.f.(*prefixFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *prefixFilter; filter: %s", q.f, q.f)
}
if pf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
}
if pf.prefix != prefixExpected {
t.Fatalf("unexpected prefix; got %q; want %q", pf.prefix, prefixExpected)
}
}
f(`*`, ``, ``)
f(`""*`, ``, ``)
f(`foo*`, ``, `foo`)
f(`abc-de.fg:foo-bar+baz*`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":"foo-bar+baz"*`, `abc-de.fg`, `foo-bar+baz`)
f(`"abc-de.fg":"foo-bar*baz *"*`, `abc-de.fg`, `foo-bar*baz *`)
}
func TestParseRangeFilter(t *testing.T) {
f := func(s, fieldNameExpected string, minValueExpected, maxValueExpected float64) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
rf, ok := q.f.(*rangeFilter)
if !ok {
t.Fatalf("unexpected filter type; got %T; want *ipv4RangeFilter; filter: %s", q.f, q.f)
}
if rf.fieldName != fieldNameExpected {
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
}
if rf.minValue != minValueExpected {
t.Fatalf("unexpected minValue; got %v; want %v", rf.minValue, minValueExpected)
}
if rf.maxValue != maxValueExpected {
t.Fatalf("unexpected maxValue; got %v; want %v", rf.maxValue, maxValueExpected)
}
}
f(`range[-1.234, +2e5]`, ``, -1.234, 2e5)
f(`foo:range[-1.234e-5, 2e5]`, `foo`, -1.234e-5, 2e5)
f(`range:range["-1.234e5", "-2e-5"]`, `range`, -1.234e5, -2e-5)
f(`_msg:range[1, 2]`, `_msg`, 1, 2)
f(`:range(1, 2)`, ``, math.Nextafter(1, math.Inf(1)), math.Nextafter(2, math.Inf(-1)))
f(`range[1, 2)`, ``, 1, math.Nextafter(2, math.Inf(-1)))
f(`range("1", 2]`, ``, math.Nextafter(1, math.Inf(1)), 2)
}
func TestParseQuerySuccess(t *testing.T) {
f := func(s, resultExpected string) {
t.Helper()
q, err := ParseQuery(s)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
result := q.String()
if result != resultExpected {
t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
}
}
f("foo", "foo")
f(":foo", "foo")
f(`"":foo`, "foo")
f(`"" bar`, `"" bar`)
f(`!''`, `!""`)
f(`foo:""`, `foo:""`)
f(`!foo:""`, `!foo:""`)
f(`not foo:""`, `!foo:""`)
f(`not(foo)`, `!foo`)
f(`not (foo)`, `!foo`)
f(`not ( foo or bar )`, `!(foo or bar)`)
f(`foo:!""`, `!foo:""`)
f("_msg:foo", "foo")
f("'foo:bar'", `"foo:bar"`)
f("'!foo'", `"!foo"`)
f("foo 'and' and bar", `foo "and" bar`)
f("foo bar", "foo bar")
f("foo and bar", "foo bar")
f("foo AND bar", "foo bar")
f("foo or bar", "foo or bar")
f("foo OR bar", "foo or bar")
f("not foo", "!foo")
f("! foo", "!foo")
f("not !`foo bar`", `"foo bar"`)
f("foo or bar and not baz", "foo or bar !baz")
f("'foo bar' !baz", `"foo bar" !baz`)
f("foo:!bar", `!foo:bar`)
f(`foo and bar and baz or x or y or z and zz`, `foo bar baz or x or y or z zz`)
f(`foo and bar and (baz or x or y or z) and zz`, `foo bar (baz or x or y or z) zz`)
f(`(foo or bar or baz) and x and y and (z or zz)`, `(foo or bar or baz) x y (z or zz)`)
f(`(foo or bar or baz) and x and y and not (z or zz)`, `(foo or bar or baz) x y !(z or zz)`)
f(`NOT foo AND bar OR baz`, `!foo bar or baz`)
f(`NOT (foo AND bar) OR baz`, `!(foo bar) or baz`)
f(`foo OR bar AND baz`, `foo or bar baz`)
f(`(foo OR bar) AND baz`, `(foo or bar) baz`)
// parens
f(`foo:(bar baz or not :xxx)`, `foo:bar foo:baz or !foo:xxx`)
f(`(foo:bar and (foo:baz or aa:bb) and xx) and y`, `foo:bar (foo:baz or aa:bb) xx y`)
f("level:error and _msg:(a or b)", "level:error (a or b)")
f("level: ( ((error or warn*) and re(foo))) (not (bar))", `(level:error or level:warn*) level:re("foo") !bar`)
f("!(foo bar or baz and not aa*)", `!(foo bar or baz !aa*)`)
// prefix search
f(`'foo'* and (a:x* and x:* or y:i(""*)) and i("abc def"*)`, `foo* (a:x* x:* or y:i(*)) i("abc def"*)`)
// This isn't a prefix search - it is equivalent to `foo AND *`
f(`foo *`, `foo *`)
f(`"foo" *`, `foo *`)
// empty filter
f(`"" or foo:"" and not bar:""`, `"" or foo:"" !bar:""`)
// _stream filters
f(`_stream:{}`, ``)
f(`_stream:{foo="bar", baz=~"x" OR or!="b", "x=},"="d}{"}`, `_stream:{foo="bar",baz=~"x" or "or"!="b","x=},"="d}{"}`)
f(`_stream:{or=a or ","="b"}`, `_stream:{"or"="a" or ","="b"}`)
f("_stream : { foo = bar , } ", `_stream:{foo="bar"}`)
// _time filters
f(`_time:[-5m,now)`, `_time:[-5m,now)`)
f(`_time:( now-1h , now-5m34s5ms]`, `_time:(now-1h,now-5m34s5ms]`)
f(`_time:[2023, 2023-01)`, `_time:[2023,2023-01)`)
f(`_time:[2023-01-02, 2023-02-03T04)`, `_time:[2023-01-02,2023-02-03T04)`)
f(`_time:[2023-01-02T04:05, 2023-02-03T04:05:06)`, `_time:[2023-01-02T04:05,2023-02-03T04:05:06)`)
f(`_time:[2023-01-02T04:05:06Z, 2023-02-03T04:05:06.234Z)`, `_time:[2023-01-02T04:05:06Z,2023-02-03T04:05:06.234Z)`)
f(`_time:[2023-01-02T04:05:06+02:30, 2023-02-03T04:05:06.234-02:45)`, `_time:[2023-01-02T04:05:06+02:30,2023-02-03T04:05:06.234-02:45)`)
f(`_time:[2023-06-07T23:56:34.3456-02:30, now)`, `_time:[2023-06-07T23:56:34.3456-02:30,now)`)
f(`_time:("2024-01-02+02:00", now)`, `_time:(2024-01-02+02:00,now)`)
f(`_time:now`, `_time:now`)
f(`_time:"now"`, `_time:now`)
f(`_time:2024Z`, `_time:2024Z`)
f(`_time:2024-02:30`, `_time:2024-02:30`)
f(`_time:2024-01-02:30`, `_time:2024-01-02:30`)
f(`_time:2024-01-02:30`, `_time:2024-01-02:30`)
f(`_time:2024-01-02+03:30`, `_time:2024-01-02+03:30`)
f(`_time:2024-01-02T10+03:30`, `_time:2024-01-02T10+03:30`)
f(`_time:2024-01-02T10:20+03:30`, `_time:2024-01-02T10:20+03:30`)
f(`_time:2024-01-02T10:20:40+03:30`, `_time:2024-01-02T10:20:40+03:30`)
f(`_time:2024-01-02T10:20:40-03:30`, `_time:2024-01-02T10:20:40-03:30`)
f(`_time:"2024-01-02T10:20:40Z"`, `_time:2024-01-02T10:20:40Z`)
f(`_time:2023-01-02T04:05:06.789Z`, `_time:2023-01-02T04:05:06.789Z`)
f(`_time:2023-01-02T04:05:06.789-02:30`, `_time:2023-01-02T04:05:06.789-02:30`)
f(`_time:2023-01-02T04:05:06.789+02:30`, `_time:2023-01-02T04:05:06.789+02:30`)
f(`_time:[1234567890, 1400000000]`, `_time:[1234567890,1400000000]`)
// reserved keywords
f("and", `"and"`)
f("and and or", `"and" "or"`)
f("AnD", `"AnD"`)
f("or", `"or"`)
f("re 'and' `or` 'not'", `"re" "and" "or" "not"`)
f("foo:and", `foo:"and"`)
f("'re':or or x", `"re":"or" or x`)
f(`"-"`, `"-"`)
f(`"!"`, `"!"`)
f(`"not"`, `"not"`)
f(`''`, `""`)
// reserved functions
f("exact", `"exact"`)
f("exact:a", `"exact":a`)
f("exact-foo", `exact-foo`)
f("a:exact", `a:"exact"`)
f("a:exact-foo", `a:exact-foo`)
f("exact-foo:b", `exact-foo:b`)
f("exact_prefix", `"exact_prefix"`)
f("exact_prefix:a", `"exact_prefix":a`)
f("exact_prefix-foo", `exact_prefix-foo`)
f("a:exact_prefix", `a:"exact_prefix"`)
f("a:exact_prefix-foo", `a:exact_prefix-foo`)
f("exact_prefix-foo:b", `exact_prefix-foo:b`)
f("i", `"i"`)
f("i-foo", `i-foo`)
f("a:i-foo", `a:i-foo`)
f("i-foo:b", `i-foo:b`)
f("in", `"in"`)
f("in:a", `"in":a`)
f("in-foo", `in-foo`)
f("a:in", `a:"in"`)
f("a:in-foo", `a:in-foo`)
f("in-foo:b", `in-foo:b`)
f("ipv4_range", `"ipv4_range"`)
f("ipv4_range:a", `"ipv4_range":a`)
f("ipv4_range-foo", `ipv4_range-foo`)
f("a:ipv4_range", `a:"ipv4_range"`)
f("a:ipv4_range-foo", `a:ipv4_range-foo`)
f("ipv4_range-foo:b", `ipv4_range-foo:b`)
f("len_range", `"len_range"`)
f("len_range:a", `"len_range":a`)
f("len_range-foo", `len_range-foo`)
f("a:len_range", `a:"len_range"`)
f("a:len_range-foo", `a:len_range-foo`)
f("len_range-foo:b", `len_range-foo:b`)
f("range", `"range"`)
f("range:a", `"range":a`)
f("range-foo", `range-foo`)
f("a:range", `a:"range"`)
f("a:range-foo", `a:range-foo`)
f("range-foo:b", `range-foo:b`)
f("re", `"re"`)
f("re-bar", `re-bar`)
f("a:re-bar", `a:re-bar`)
f("re-bar:a", `re-bar:a`)
f("seq", `"seq"`)
f("seq-a", `seq-a`)
f("x:seq-a", `x:seq-a`)
f("seq-a:x", `seq-a:x`)
f("string_range", `"string_range"`)
f("string_range-a", `string_range-a`)
f("x:string_range-a", `x:string_range-a`)
f("string_range-a:x", `string_range-a:x`)
// exact filter
f("exact(foo)", `exact(foo)`)
f("exact('foo bar),|baz')", `exact("foo bar),|baz")`)
f(`exact(foo-bar,)`, `exact(foo-bar)`)
f(`exact(foo|b:ar)`, `exact("foo|b:ar")`)
f(`foo:exact(f,)`, `foo:exact(f)`)
// exact_prefix filter
f("exact_prefix(foo)", `exact_prefix(foo)`)
f(`exact_prefix("foo bar")`, `exact_prefix("foo bar")`)
f(`exact_prefix(foo-bar,)`, `exact_prefix(foo-bar)`)
f(`exact_prefix(foo|b:ar)`, `exact_prefix("foo|b:ar")`)
f(`foo:exact_prefix(f,)`, `foo:exact_prefix(f)`)
// i filter
f("i(foo)", `i(foo)`)
f("i(foo*)", `i(foo*)`)
f("i(`foo`* )", `i(foo*)`)
f("i(' foo ) bar')", `i(" foo ) bar")`)
f("i('foo bar'*)", `i("foo bar"*)`)
f(`foo:i(foo:bar-baz|aa+bb)`, `foo:i("foo:bar-baz|aa+bb")`)
// in filter
f(`in()`, `in()`)
f(`in(foo)`, `in(foo)`)
f(`in(foo, bar)`, `in(foo,bar)`)
f(`in("foo bar", baz)`, `in("foo bar",baz)`)
f(`foo:in(foo-bar|baz)`, `foo:in("foo-bar|baz")`)
// ipv4_range filter
f(`ipv4_range(1.2.3.4, "5.6.7.8")`, `ipv4_range(1.2.3.4, 5.6.7.8)`)
f(`foo:ipv4_range(1.2.3.4, "5.6.7.8" , )`, `foo:ipv4_range(1.2.3.4, 5.6.7.8)`)
f(`ipv4_range(1.2.3.4)`, `ipv4_range(1.2.3.4, 1.2.3.4)`)
f(`ipv4_range(1.2.3.4/20)`, `ipv4_range(1.2.0.0, 1.2.15.255)`)
f(`ipv4_range(1.2.3.4,)`, `ipv4_range(1.2.3.4, 1.2.3.4)`)
// len_range filter
f(`len_range(10, 20)`, `len_range(10,20)`)
f(`foo:len_range("10", 20, )`, `foo:len_range(10,20)`)
// range filter
f(`range(1.234, 5656.43454)`, `range(1.234,5656.43454)`)
f(`foo:range(-2343.344, 2343.4343)`, `foo:range(-2343.344,2343.4343)`)
f(`range(-1.234e-5 , 2.34E+3)`, `range(-1.234e-5,2.34E+3)`)
f(`range[123, 456)`, `range[123,456)`)
f(`range(123, 445]`, `range(123,445]`)
f(`range("1.234e-4", -23)`, `range(1.234e-4,-23)`)
// re filter
f("re('foo|ba(r.+)')", `re("foo|ba(r.+)")`)
f("re(foo)", `re("foo")`)
f(`foo:re(foo-bar|baz.)`, `foo:re("foo-bar|baz.")`)
// seq filter
f(`seq()`, `seq()`)
f(`seq(foo)`, `seq(foo)`)
f(`seq("foo, bar", baz, abc)`, `seq("foo, bar",baz,abc)`)
f(`foo:seq(foo"bar-baz+aa, b)`, `foo:seq("foo\"bar-baz+aa",b)`)
// string_range filter
f(`string_range(foo, bar)`, `string_range(foo, bar)`)
f(`foo:string_range("foo, bar", baz)`, `foo:string_range("foo, bar", baz)`)
// reserved field names
f(`"_stream"`, `_stream`)
f(`"_time"`, `_time`)
f(`"_msg"`, `_msg`)
f(`_stream and _time or _msg`, `_stream _time or _msg`)
// invalid rune
f("\xff", `"\xff"`)
// ip addresses in the query
f("1.2.3.4 or ip:5.6.7.9", "1.2.3.4 or ip:5.6.7.9")
// '-' and '.' chars in field name and search phrase
f("trace-id.foo.bar:baz", `trace-id.foo.bar:baz`)
f(`custom-Time:2024-01-02T03:04:05+08:00 fooBar OR !baz:xxx`, `custom-Time:"2024-01-02T03:04:05+08:00" fooBar or !baz:xxx`)
f("foo-bar+baz*", `"foo-bar+baz"*`)
f("foo- bar", `foo- bar`)
f("foo -bar", `foo -bar`)
f("foo!bar", `"foo!bar"`)
f("foo:aa!bb:cc", `foo:"aa!bb:cc"`)
f(`foo:bar:baz`, `foo:"bar:baz"`)
f(`foo:(bar baz:xxx)`, `foo:bar foo:"baz:xxx"`)
f(`foo:(_time:abc or not z)`, `foo:"_time:abc" or !foo:z`)
f(`foo:(_msg:a :x _stream:{c="d"})`, `foo:"_msg:a" foo:x foo:"_stream:{c=\"d\"}"`)
f(`:(_msg:a:b c)`, `"a:b" c`)
f(`"foo"bar baz:"a'b"c`, `"\"foo\"bar" baz:"\"a'b\"c"`)
// complex queries
f(`_time:[-1h, now] _stream:{job="foo",env=~"prod|staging"} level:(error or warn*) and not "connection reset by peer"`,
`_time:[-1h,now] _stream:{job="foo",env=~"prod|staging"} (level:error or level:warn*) !"connection reset by peer"`)
f(`(_time:(2023-04-20, now] or _time:[-10m, -1m))
and (_stream:{job="a"} or _stream:{instance!="b"})
and (err* or ip:(ipv4_range(1.2.3.0, 1.2.3.255) and not 1.2.3.4))`,
`(_time:(2023-04-20,now] or _time:[-10m,-1m)) (_stream:{job="a"} or _stream:{instance!="b"}) (err* or ip:ipv4_range(1.2.3.0, 1.2.3.255) !ip:1.2.3.4)`)
}
func TestParseQueryFailure(t *testing.T) {
f := func(s string) {
t.Helper()
q, err := ParseQuery(s)
if q != nil {
t.Fatalf("expecting nil result; got %s", q)
}
if err == nil {
t.Fatalf("expecting non-nil error")
}
}
f("")
f("|")
f("foo|")
f("foo|bar")
f("foo and")
f("foo OR ")
f("not")
f("NOT")
f("not (abc")
f("!")
// invalid parens
f("(")
f("foo (bar ")
f("(foo:'bar")
// missing filter
f(":")
f(": ")
f("foo: ")
f("_msg : ")
f(`"": `)
// invalid quoted strings
f(`"foo`)
f(`'foo`)
f("`foo")
// invalid _stream filters
f("_stream:")
f("_stream:{")
f("_stream:(")
f("_stream:{foo")
f("_stream:{foo}")
f("_stream:{foo=")
f("_stream:{foo='bar")
f("_stream:{foo='bar}")
f("_stream:{foo=bar or")
f("_stream:{foo=bar or}")
f("_stream:{foo=bar or baz}")
f("_stream:{foo=bar baz x=y}")
f("_stream:{foo=bar,")
f("_stream:{foo=bar")
f("_stream:foo")
f("_stream:(foo)")
f("_stream:[foo]")
// invalid _time filters
f("_time:")
f("_time:[")
f("_time:foo")
f("_time:{}")
f("_time:[foo,bar)")
f("_time:(now)")
f("_time:[now,")
f("_time:(now, not now]")
f("_time:(-5m, -1m}")
f("_time:[-")
f("_time:[now-foo,-bar]")
f("_time:[2023-ab,2023]")
f("_time:[fooo-02,2023]")
f("_time:[2023-01-02T04:05:06+12,2023]")
f("_time:[2023-01-02T04:05:06-12,2023]")
f("_time:2023-01-02T04:05:06.789")
// long query with error
f(`very long query with error aaa ffdfd fdfdfd fdfd:( ffdfdfdfdfd`)
// query with unexpected tail
f(`foo | bar`)
// unexpected comma
f(`foo,bar`)
f(`foo, bar`)
f(`foo ,bar`)
// unexpected token
f(`[foo`)
f(`foo]bar`)
f(`foo] bar`)
f(`foo ]bar`)
f(`) foo`)
f(`foo)bar`)
// unknown function
f(`unknown_function(foo)`)
// invalid exact
f(`exact(`)
f(`exact(f, b)`)
f(`exact(foo`)
f(`exact(foo,`)
f(`exact(foo*)`)
f(`exact(foo bar)`)
f(`exact(foo, bar`)
// invalid i
f(`i(`)
f(`i(aa`)
f(`i(aa, bb)`)
f(`i(*`)
f(`i(aaa*`)
f(`i(a**)`)
f(`i("foo`)
f(`i(foo bar)`)
// invalid in
f(`in(`)
f(`in(,)`)
f(`in(f, b c)`)
f(`in(foo`)
f(`in(foo,`)
f(`in(foo*)`)
f(`in(foo, "bar baz"*)`)
f(`in(foo, "bar baz"*, abc)`)
f(`in(foo bar)`)
f(`in(foo, bar`)
// invalid ipv4_range
f(`ipv4_range(`)
f(`ipv4_range(foo,bar)`)
f(`ipv4_range(1.2.3.4*)`)
f(`ipv4_range("1.2.3.4"*)`)
f(`ipv4_range(1.2.3.4`)
f(`ipv4_range(1.2.3.4,`)
f(`ipv4_range(1.2.3.4, 5.6.7)`)
f(`ipv4_range(1.2.3.4, 5.6.7.8`)
f(`ipv4_range(1.2.3.4, 5.6.7.8,`)
f(`ipv4_range(1.2.3.4, 5.6.7.8,,`)
f(`ipv4_range(1.2.3.4, 5.6.7.8,5.3.2.1)`)
// invalid len_range
f(`len_range(`)
f(`len_range(1)`)
f(`len_range(foo, bar)`)
f(`len_range(1, bar)`)
f(`len_range(1, 2`)
f(`len_range(1.2, 3.4)`)
// invalid range
f(`range(`)
f(`range(foo,bar)`)
f(`range(1"`)
f(`range(1,`)
f(`range(1)`)
f(`range(1,)`)
f(`range(1,2,`)
f(`range[1,foo)`)
f(`range[1,2,3)`)
f(`range(1)`)
// invalid re
f("re(")
f("re(a, b)")
f("foo:re(bar")
f("re(`ab(`)")
f(`re(a b)`)
// invalid seq
f(`seq(`)
f(`seq(,)`)
f(`seq(foo`)
f(`seq(foo,`)
f(`seq(foo*)`)
f(`seq(foo*, bar)`)
f(`seq(foo bar)`)
f(`seq(foo, bar`)
// invalid string_range
f(`string_range(`)
f(`string_range(,)`)
f(`string_range(foo`)
f(`string_range(foo,`)
f(`string_range(foo*)`)
f(`string_range(foo bar)`)
f(`string_range(foo, bar`)
f(`string_range(foo)`)
f(`string_range(foo, bar, baz)`)
}

102
lib/logstorage/part.go Normal file

@@ -0,0 +1,102 @@
package logstorage
import (
"path/filepath"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
type part struct {
// pt is the partition the part belongs to
pt *partition
// path is the path to the part on disk.
//
// If the part is in-memory then the path is empty.
path string
// ph contains partHeader for the given part.
ph partHeader
// indexBlockHeaders contains a list of indexBlockHeader entries for the given part.
indexBlockHeaders []indexBlockHeader
indexFile fs.MustReadAtCloser
columnsHeaderFile fs.MustReadAtCloser
timestampsFile fs.MustReadAtCloser
fieldValuesFile fs.MustReadAtCloser
fieldBloomFilterFile fs.MustReadAtCloser
messageValuesFile fs.MustReadAtCloser
messageBloomFilterFile fs.MustReadAtCloser
}
func mustOpenInmemoryPart(pt *partition, mp *inmemoryPart) *part {
var p part
p.pt = pt
p.path = ""
p.ph = mp.ph
// Read metaindex
metaindexReader := mp.metaindex.NewReader()
var mrs readerWithStats
mrs.init(metaindexReader)
p.indexBlockHeaders = mustReadIndexBlockHeaders(p.indexBlockHeaders[:0], &mrs)
// Open data files
p.indexFile = &mp.index
p.columnsHeaderFile = &mp.columnsHeader
p.timestampsFile = &mp.timestamps
p.fieldValuesFile = &mp.fieldValues
p.fieldBloomFilterFile = &mp.fieldBloomFilter
p.messageValuesFile = &mp.messageValues
p.messageBloomFilterFile = &mp.messageBloomFilter
return &p
}
func mustOpenFilePart(pt *partition, path string) *part {
var p part
p.pt = pt
p.path = path
p.ph.mustReadMetadata(path)
metaindexPath := filepath.Join(path, metaindexFilename)
indexPath := filepath.Join(path, indexFilename)
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
timestampsPath := filepath.Join(path, timestampsFilename)
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
messageValuesPath := filepath.Join(path, messageValuesFilename)
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
// Read metaindex
metaindexReader := filestream.MustOpen(metaindexPath, true)
var mrs readerWithStats
mrs.init(metaindexReader)
p.indexBlockHeaders = mustReadIndexBlockHeaders(p.indexBlockHeaders[:0], &mrs)
mrs.MustClose()
// Open data files
p.indexFile = fs.MustOpenReaderAt(indexPath)
p.columnsHeaderFile = fs.MustOpenReaderAt(columnsHeaderPath)
p.timestampsFile = fs.MustOpenReaderAt(timestampsPath)
p.fieldValuesFile = fs.MustOpenReaderAt(fieldValuesPath)
p.fieldBloomFilterFile = fs.MustOpenReaderAt(fieldBloomFilterPath)
p.messageValuesFile = fs.MustOpenReaderAt(messageValuesPath)
p.messageBloomFilterFile = fs.MustOpenReaderAt(messageBloomFilterPath)
return &p
}
func mustClosePart(p *part) {
p.indexFile.MustClose()
p.columnsHeaderFile.MustClose()
p.timestampsFile.MustClose()
p.fieldValuesFile.MustClose()
p.fieldBloomFilterFile.MustClose()
p.messageValuesFile.MustClose()
p.messageBloomFilterFile.MustClose()
p.pt = nil
}
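A minimal usage sketch (not part of the commit) of how the constructors above pair with mustClosePart; the function name is hypothetical, and pt plus partPath are assumed to come from the surrounding partition code.

// Editor's sketch, not part of the commit; assumes it sits in the logstorage package next to part.go.
func examplePartLifecycle(pt *partition, partPath string) {
    p := mustOpenFilePart(pt, partPath)
    // p.ph carries the part metadata; p.indexBlockHeaders drive block lookups during search.
    _ = p.ph.RowsCount
    mustClosePart(p)
}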


@@ -0,0 +1,84 @@
package logstorage
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// partHeader contains the information about a single part
type partHeader struct {
// CompressedSizeBytes is physical size of the part
CompressedSizeBytes uint64
// UncompressedSizeBytes is the original size of log entries stored in the part
UncompressedSizeBytes uint64
// RowsCount is the number of log entries in the part
RowsCount uint64
// BlocksCount is the number of blocks in the part
BlocksCount uint64
// MinTimestamp is the minimum timestamp seen in the part
MinTimestamp int64
// MaxTimestamp is the maximum timestamp seen in the part
MaxTimestamp int64
}
// reset resets ph for subsequent re-use
func (ph *partHeader) reset() {
ph.CompressedSizeBytes = 0
ph.UncompressedSizeBytes = 0
ph.RowsCount = 0
ph.BlocksCount = 0
ph.MinTimestamp = 0
ph.MaxTimestamp = 0
}
// String returns string representation for ph.
func (ph *partHeader) String() string {
return fmt.Sprintf("{CompressedSizeBytes=%d, UncompressedSizeBytes=%d, RowsCount=%d, BlocksCount=%d, MinTimestamp=%s, MaxTimestamp=%s}",
ph.CompressedSizeBytes, ph.UncompressedSizeBytes, ph.RowsCount, ph.BlocksCount, timestampToString(ph.MinTimestamp), timestampToString(ph.MaxTimestamp))
}
func (ph *partHeader) mustReadMetadata(partPath string) {
ph.reset()
metadataPath := filepath.Join(partPath, metadataFilename)
metadata, err := os.ReadFile(metadataPath)
if err != nil {
logger.Panicf("FATAL: cannot read %q: %s", metadataPath, err)
}
if err := json.Unmarshal(metadata, ph); err != nil {
logger.Panicf("FATAL: cannot parse %q: %s", metadataPath, err)
}
// Perform various checks
if ph.MinTimestamp > ph.MaxTimestamp {
logger.Panicf("FATAL: MinTimestamp cannot exceed MaxTimestamp; got %d vs %d", ph.MinTimestamp, ph.MaxTimestamp)
}
}
func (ph *partHeader) mustWriteMetadata(partPath string) {
metadata, err := json.Marshal(ph)
if err != nil {
logger.Panicf("BUG: cannot marshal partHeader: %s", err)
}
metadataPath := filepath.Join(partPath, metadataFilename)
fs.MustWriteSync(metadataPath, metadata)
}
func timestampToString(timestamp int64) string {
t := time.Unix(0, timestamp).UTC()
return strings.Replace(t.Format(timestampForPathname), ".", "", 1)
}
const timestampForPathname = "20060102150405.000000000"
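Since partHeader carries no json struct tags, mustWriteMetadata stores the exported field names verbatim. A quick illustration, assuming it lives in the same package where encoding/json is already imported; the function name and the concrete numbers are made up.

// Editor's sketch, not part of the commit.
func examplePartHeaderJSON() {
    ph := &partHeader{RowsCount: 100, BlocksCount: 2, MinTimestamp: 1, MaxTimestamp: 2}
    data, _ := json.Marshal(ph)
    // string(data) is:
    // {"CompressedSizeBytes":0,"UncompressedSizeBytes":0,"RowsCount":100,"BlocksCount":2,"MinTimestamp":1,"MaxTimestamp":2}
    _ = data
}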


@@ -0,0 +1,21 @@
package logstorage
import (
"reflect"
"testing"
)
func TestPartHeaderReset(t *testing.T) {
ph := &partHeader{
CompressedSizeBytes: 123,
UncompressedSizeBytes: 234,
RowsCount: 1234,
MinTimestamp: 3434,
MaxTimestamp: 32434,
}
ph.reset()
phZero := &partHeader{}
if !reflect.DeepEqual(ph, phZero) {
t.Fatalf("unexpected non-zero partHeader after reset: %v", ph)
}
}

237
lib/logstorage/partition.go Normal file

@@ -0,0 +1,237 @@
package logstorage
import (
"bytes"
"path/filepath"
"sort"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// PartitionStats contains stats for the partition.
type PartitionStats struct {
DatadbStats
IndexdbStats
}
type partition struct {
// s is the parent storage for the partition
s *Storage
// path is the path to the partition directory
path string
// name is the partition name. It is basically the directory name obtained from path.
// It is used for creating keys for partition caches.
name string
// idb is indexdb used for the given partition
idb *indexdb
// ddb is the datadb used for the given partition
ddb *datadb
}
// mustCreatePartition creates a partition at the given path.
//
// The created partition can be opened with mustOpenPartition() after it has been created.
//
// The created partition can be deleted with mustDeletePartition() when it is no longer needed.
func mustCreatePartition(path string) {
fs.MustMkdirFailIfExist(path)
indexdbPath := filepath.Join(path, indexdbDirname)
mustCreateIndexdb(indexdbPath)
datadbPath := filepath.Join(path, datadbDirname)
mustCreateDatadb(datadbPath)
}
// mustDeletePartition deletes partition at the given path.
//
// The partition must be closed with MustClose before deleting it.
func mustDeletePartition(path string) {
fs.MustRemoveAll(path)
}
// mustOpenPartition opens partition at the given path for the given Storage.
//
// The returned partition must be closed when no longer needed with mustClosePartition() call.
func mustOpenPartition(s *Storage, path string) *partition {
name := filepath.Base(path)
// Open indexdb
indexdbPath := filepath.Join(path, indexdbDirname)
idb := mustOpenIndexdb(indexdbPath, name, s)
// Start initializing the partition
pt := &partition{
s: s,
path: path,
name: name,
idb: idb,
}
// Open datadb
datadbPath := filepath.Join(path, datadbDirname)
pt.ddb = mustOpenDatadb(pt, datadbPath, s.flushInterval)
return pt
}
// mustClosePartition closes pt.
//
// The caller must ensure that pt is no longer used before the call to mustClosePartition().
//
// The partition can be deleted if needed after it is closed via mustDeletePartition() call.
func mustClosePartition(pt *partition) {
// Close indexdb
mustCloseIndexdb(pt.idb)
pt.idb = nil
// Close datadb
mustCloseDatadb(pt.ddb)
pt.ddb = nil
pt.name = ""
pt.path = ""
pt.s = nil
}
func (pt *partition) mustAddRows(lr *LogRows) {
// Register rows in indexdb
var pendingRows []int
streamIDs := lr.streamIDs
for i := range lr.timestamps {
streamID := &streamIDs[i]
if pt.hasStreamIDInCache(streamID) {
continue
}
if len(pendingRows) == 0 || !streamIDs[pendingRows[len(pendingRows)-1]].equal(streamID) {
pendingRows = append(pendingRows, i)
}
}
if len(pendingRows) > 0 {
logNewStreams := pt.s.logNewStreams
streamTagsCanonicals := lr.streamTagsCanonicals
sort.Slice(pendingRows, func(i, j int) bool {
return streamIDs[pendingRows[i]].less(&streamIDs[pendingRows[j]])
})
for i, rowIdx := range pendingRows {
streamID := &streamIDs[rowIdx]
if i > 0 && streamIDs[pendingRows[i-1]].equal(streamID) {
continue
}
if pt.hasStreamIDInCache(streamID) {
continue
}
if !pt.idb.hasStreamID(streamID) {
streamTagsCanonical := streamTagsCanonicals[rowIdx]
pt.idb.mustRegisterStream(streamID, streamTagsCanonical)
if logNewStreams {
pt.logNewStream(streamTagsCanonical, lr.rows[rowIdx])
}
}
pt.putStreamIDToCache(streamID)
}
}
// Add rows to datadb
pt.ddb.mustAddRows(lr)
if pt.s.logIngestedRows {
pt.logIngestedRows(lr)
}
}
func (pt *partition) logNewStream(streamTagsCanonical []byte, fields []Field) {
streamTags := getStreamTagsString(streamTagsCanonical)
rf := RowFormatter(fields)
logger.Infof("partition %s: new stream %s for log entry %s", pt.path, streamTags, &rf)
}
func (pt *partition) logIngestedRows(lr *LogRows) {
var rf RowFormatter
for i, fields := range lr.rows {
tf := TimeFormatter(lr.timestamps[i])
streamTags := getStreamTagsString(lr.streamTagsCanonicals[i])
rf = append(rf[:0], fields...)
rf = append(rf, Field{
Name: "_time",
Value: tf.String(),
})
rf = append(rf, Field{
Name: "_stream",
Value: streamTags,
})
sort.Slice(rf, func(i, j int) bool {
return rf[i].Name < rf[j].Name
})
logger.Infof("partition %s: new log entry %s", pt.path, &rf)
}
}
// appendStreamTagsByStreamID appends canonical representation of stream tags for the given sid to dst
// and returns the result.
func (pt *partition) appendStreamTagsByStreamID(dst []byte, sid *streamID) []byte {
// Search for the StreamTags in the cache.
key := bbPool.Get()
defer bbPool.Put(key)
// There is no need to put the partition name into the key here,
// since StreamTags are uniquely identified by streamID.
key.B = sid.marshal(key.B)
dstLen := len(dst)
dst = pt.s.streamTagsCache.GetBig(dst, key.B)
if len(dst) > dstLen {
// Fast path - the StreamTags have been found in cache.
return dst
}
// Slow path - search for StreamTags in idb
dst = pt.idb.appendStreamTagsByStreamID(dst, sid)
if len(dst) > dstLen {
// Store the found StreamTags to cache
pt.s.streamTagsCache.SetBig(key.B, dst[dstLen:])
}
return dst
}
func (pt *partition) hasStreamIDInCache(sid *streamID) bool {
var result [1]byte
bb := bbPool.Get()
bb.B = pt.marshalStreamIDCacheKey(bb.B, sid)
value := pt.s.streamIDCache.Get(result[:0], bb.B)
bbPool.Put(bb)
return bytes.Equal(value, okValue)
}
func (pt *partition) putStreamIDToCache(sid *streamID) {
bb := bbPool.Get()
bb.B = pt.marshalStreamIDCacheKey(bb.B, sid)
pt.s.streamIDCache.Set(bb.B, okValue)
bbPool.Put(bb)
}
func (pt *partition) marshalStreamIDCacheKey(dst []byte, sid *streamID) []byte {
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(pt.name))
dst = sid.marshal(dst)
return dst
}
var okValue = []byte("1")
// debugFlush makes sure that all the recently ingested data becomes searchable
func (pt *partition) debugFlush() {
pt.ddb.debugFlush()
pt.idb.debugFlush()
}
func (pt *partition) updateStats(ps *PartitionStats) {
pt.ddb.updateStats(&ps.DatadbStats)
pt.idb.updateStats(&ps.IndexdbStats)
}
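A usage sketch (not part of the commit) tying the lifecycle functions above together, mirroring how the tests below exercise them; the function name is hypothetical, and s plus lr are assumed to come from elsewhere in the package.

// Editor's sketch, not part of the commit.
func examplePartitionLifecycle(s *Storage, path string, lr *LogRows) {
    mustCreatePartition(path)
    pt := mustOpenPartition(s, path)
    pt.mustAddRows(lr)
    pt.debugFlush() // make the freshly ingested rows searchable immediately
    mustClosePartition(pt)
    mustDeletePartition(path)
}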


@@ -0,0 +1,187 @@
package logstorage
import (
"sync/atomic"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
)
func TestPartitionLifecycle(t *testing.T) {
const path = "TestPartitionLifecycle"
var ddbStats DatadbStats
s := newTestStorage()
for i := 0; i < 3; i++ {
mustCreatePartition(path)
for j := 0; j < 2; j++ {
pt := mustOpenPartition(s, path)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != 0 {
t.Fatalf("unexpected non-zero number of entries in empty partition: %d", n)
}
if ddbStats.InmemoryParts != 0 {
t.Fatalf("unexpected non-zero number of in-memory parts in empty partition: %d", ddbStats.InmemoryParts)
}
if ddbStats.FileParts != 0 {
t.Fatalf("unexpected non-zero number of file parts in empty partition: %d", ddbStats.FileParts)
}
if ddbStats.CompressedInmemorySize != 0 {
t.Fatalf("unexpected non-zero size of inmemory parts for empty partition")
}
if ddbStats.CompressedFileSize != 0 {
t.Fatalf("unexpected non-zero size of file parts for empty partition")
}
time.Sleep(10 * time.Millisecond)
mustClosePartition(pt)
}
mustDeletePartition(path)
}
closeTestStorage(s)
}
func TestPartitionMustAddRowsSerial(t *testing.T) {
const path = "TestPartitionMustAddRowsSerial"
var ddbStats DatadbStats
s := newTestStorage()
mustCreatePartition(path)
pt := mustOpenPartition(s, path)
// Try adding the same entry, one call at a time.
totalRowsCount := uint64(0)
for i := 0; i < 100; i++ {
lr := newTestLogRows(1, 1, 0)
totalRowsCount += uint64(len(lr.timestamps))
pt.mustAddRows(lr)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
}
}
// Try adding a different entry on each call.
for i := 0; i < 100; i++ {
lr := newTestLogRows(1, 1, int64(i))
totalRowsCount += uint64(len(lr.timestamps))
pt.mustAddRows(lr)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
}
}
// Re-open the partition and verify the number of entries remains the same
mustClosePartition(pt)
pt = mustOpenPartition(s, path)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries after re-opening the partition; got %d; want %d", n, totalRowsCount)
}
if ddbStats.InmemoryParts != 0 {
t.Fatalf("unexpected non-zero number of in-memory parts after re-opening the partition: %d", ddbStats.InmemoryParts)
}
if ddbStats.FileParts == 0 {
t.Fatalf("the number of file parts must be greater than 0 after re-opening the partition")
}
// Try adding entries for multiple streams at a time
for i := 0; i < 5; i++ {
lr := newTestLogRows(3, 7, 0)
totalRowsCount += uint64(len(lr.timestamps))
pt.mustAddRows(lr)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
}
time.Sleep(time.Millisecond)
}
// Re-open the partition and verify the number of entries remains the same
mustClosePartition(pt)
pt = mustOpenPartition(s, path)
ddbStats.reset()
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries after re-opening the partition; got %d; want %d", n, totalRowsCount)
}
if ddbStats.InmemoryParts != 0 {
t.Fatalf("unexpected non-zero number of in-memory parts after re-opening the partition: %d", ddbStats.InmemoryParts)
}
if ddbStats.FileParts == 0 {
t.Fatalf("the number of file parts must be greater than 0 after re-opening the partition")
}
mustClosePartition(pt)
mustDeletePartition(path)
closeTestStorage(s)
}
func TestPartitionMustAddRowsConcurrent(t *testing.T) {
const path = "TestPartitionMustAddRowsConcurrent"
s := newTestStorage()
mustCreatePartition(path)
pt := mustOpenPartition(s, path)
const workersCount = 3
totalRowsCount := uint64(0)
doneCh := make(chan struct{}, workersCount)
for i := 0; i < cap(doneCh); i++ {
go func() {
for j := 0; j < 7; j++ {
lr := newTestLogRows(5, 10, int64(j))
pt.mustAddRows(lr)
atomic.AddUint64(&totalRowsCount, uint64(len(lr.timestamps)))
}
doneCh <- struct{}{}
}()
}
timer := timerpool.Get(time.Second)
defer timerpool.Put(timer)
for i := 0; i < cap(doneCh); i++ {
select {
case <-doneCh:
case <-timer.C:
t.Fatalf("timeout")
}
}
var ddbStats DatadbStats
pt.ddb.updateStats(&ddbStats)
if n := ddbStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries; got %d; want %d", n, totalRowsCount)
}
mustClosePartition(pt)
mustDeletePartition(path)
closeTestStorage(s)
}
// newTestStorage creates new storage for tests.
//
// When the storage is no longer needed, closeTestStorage() must be called.
func newTestStorage() *Storage {
streamIDCache := workingsetcache.New(1024 * 1024)
streamFilterCache := workingsetcache.New(1024 * 1024)
return &Storage{
flushInterval: time.Second,
streamIDCache: streamIDCache,
streamFilterCache: streamFilterCache,
}
}
// closeTestStorage closes storage created via newTestStorage().
func closeTestStorage(s *Storage) {
s.streamIDCache.Stop()
s.streamFilterCache.Stop()
}

123
lib/logstorage/rows.go Normal file

@@ -0,0 +1,123 @@
package logstorage
import (
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
// Field is a single field for the log entry.
type Field struct {
// Name is the name of the field
Name string
// Value is the value of the field
Value string
}
// Reset resets f for future re-use.
func (f *Field) Reset() {
f.Name = ""
f.Value = ""
}
// String returns string representation of f.
func (f *Field) String() string {
name := f.Name
if name == "" {
name = "_msg"
}
return fmt.Sprintf("%q:%q", name, f.Value)
}
func (f *Field) marshal(dst []byte) []byte {
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.Name))
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.Value))
return dst
}
func (f *Field) unmarshal(src []byte) ([]byte, error) {
srcOrig := src
// Unmarshal field name
tail, b, err := encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal field name: %w", err)
}
// Do not use bytesutil.InternBytes(b) here, since it works slower than the string(b) in prod
f.Name = string(b)
src = tail
// Unmarshal field value
tail, b, err = encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal field value: %w", err)
}
// Do not use bytesutil.InternBytes(b) here, since it works slower than the string(b) in prod
f.Value = string(b)
src = tail
return src, nil
}
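A brief roundtrip sketch (not part of the commit) for the marshal/unmarshal pair above; the function name is hypothetical and error handling is elided.

// Editor's sketch, not part of the commit.
func exampleFieldRoundtrip() {
    f := Field{Name: "level", Value: "error"}
    buf := f.marshal(nil)
    var f2 Field
    if _, err := f2.unmarshal(buf); err == nil {
        // f2 now equals f
        _ = f2
    }
}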
// rows is an aux structure used during rows merge
type rows struct {
fieldsBuf []Field
timestamps []int64
rows [][]Field
}
// reset resets rs
func (rs *rows) reset() {
fb := rs.fieldsBuf
for i := range fb {
fb[i].Reset()
}
rs.fieldsBuf = fb[:0]
rs.timestamps = rs.timestamps[:0]
rows := rs.rows
for i := range rows {
rows[i] = nil
}
rs.rows = rows[:0]
}
// appendRows appends rows with the given timestamps to rs.
func (rs *rows) appendRows(timestamps []int64, rows [][]Field) {
rs.timestamps = append(rs.timestamps, timestamps...)
fieldsBuf := rs.fieldsBuf
for _, fields := range rows {
fieldsLen := len(fieldsBuf)
fieldsBuf = append(fieldsBuf, fields...)
rs.rows = append(rs.rows, fieldsBuf[fieldsLen:])
}
rs.fieldsBuf = fieldsBuf
}
// mergeRows merges the args and appends them to rs.
func (rs *rows) mergeRows(timestampsA, timestampsB []int64, fieldsA, fieldsB [][]Field) {
for len(timestampsA) > 0 && len(timestampsB) > 0 {
i := 0
minTimestamp := timestampsB[0]
for i < len(timestampsA) && timestampsA[i] <= minTimestamp {
i++
}
rs.appendRows(timestampsA[:i], fieldsA[:i])
fieldsA = fieldsA[i:]
timestampsA = timestampsA[i:]
fieldsA, fieldsB = fieldsB, fieldsA
timestampsA, timestampsB = timestampsB, timestampsA
}
if len(timestampsA) == 0 {
rs.appendRows(timestampsB, fieldsB)
} else {
rs.appendRows(timestampsA, fieldsA)
}
}
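A small worked example (not part of the commit) of the merge above: rows from both inputs come out ordered by timestamp, with each row staying attached to its timestamp. The function name and field values are illustrative.

// Editor's sketch, not part of the commit.
func exampleMergeRows() {
    var rs rows
    tsA := []int64{1, 3}
    tsB := []int64{2, 4}
    fieldsA := [][]Field{{{Name: "a", Value: "1"}}, {{Name: "a", Value: "3"}}}
    fieldsB := [][]Field{{{Name: "b", Value: "2"}}, {{Name: "b", Value: "4"}}}
    rs.mergeRows(tsA, tsB, fieldsA, fieldsB)
    // rs.timestamps is now []int64{1, 2, 3, 4}; rs.rows follows the same order.
}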

287
lib/logstorage/rows_test.go Normal file

@@ -0,0 +1,287 @@
package logstorage
import (
"reflect"
"testing"
)
func TestGetRowsSizeBytes(t *testing.T) {
f := func(rows [][]Field, uncompressedSizeBytesExpected int) {
t.Helper()
sizeBytes := uncompressedRowsSizeBytes(rows)
if sizeBytes != uint64(uncompressedSizeBytesExpected) {
t.Fatalf("unexpected sizeBytes; got %d; want %d", sizeBytes, uncompressedSizeBytesExpected)
}
}
f(nil, 0)
f([][]Field{}, 0)
f([][]Field{{}}, 35)
f([][]Field{{{Name: "foo"}}}, 40)
_, rows := newTestRows(1000, 10)
f(rows, 233900)
}
func TestRowsAppendRows(t *testing.T) {
var rs rows
timestamps := []int64{1}
rows := [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
}
rs.appendRows(timestamps, rows)
if len(rs.timestamps) != 1 {
t.Fatalf("unexpected number of row items; got %d; want 1", len(rs.timestamps))
}
rs.appendRows(timestamps, rows)
if len(rs.timestamps) != 2 {
t.Fatalf("unexpected number of row items; got %d; want 2", len(rs.timestamps))
}
for i := range rs.timestamps {
if rs.timestamps[i] != timestamps[0] {
t.Fatalf("unexpected timestamps copied; got %d; want %d", rs.timestamps[i], timestamps[0])
}
if !reflect.DeepEqual(rs.rows[i], rows[0]) {
t.Fatalf("unexpected fields copied\ngot\n%v\nwant\n%v", rs.rows[i], rows[0])
}
}
// append multiple log entries
timestamps, rows = newTestRows(100, 4)
rs.appendRows(timestamps, rows)
if len(rs.timestamps) != 102 {
t.Fatalf("unexpected number of row items; got %d; want 102", len(rs.timestamps))
}
for i := range timestamps {
if rs.timestamps[i+2] != timestamps[i] {
t.Fatalf("unexpected timestamps copied; got %d; want %d", rs.timestamps[i+2], timestamps[i])
}
if !reflect.DeepEqual(rs.rows[i+2], rows[i]) {
t.Fatalf("unexpected log entry copied\ngot\n%v\nwant\n%v", rs.rows[i+2], rows[i])
}
}
// reset rows
rs.reset()
if len(rs.timestamps) != 0 {
t.Fatalf("unexpected non-zero number of row items after reset: %d", len(rs.timestamps))
}
}
func TestMergeRows(t *testing.T) {
f := func(timestampsA, timestampsB []int64, fieldsA, fieldsB [][]Field, timestampsExpected []int64, rowsExpected [][]Field) {
t.Helper()
var rs rows
rs.mergeRows(timestampsA, timestampsB, fieldsA, fieldsB)
if !reflect.DeepEqual(rs.timestamps, timestampsExpected) {
t.Fatalf("unexpected timestamps after merge\ngot\n%v\nwant\n%v", rs.timestamps, timestampsExpected)
}
if !reflect.DeepEqual(rs.rows, rowsExpected) {
t.Fatalf("unexpected rows after merge\ngot\n%v\nwant\n%v", rs.rows, rowsExpected)
}
// check that the result doesn't change when merging in reverse order
rs.reset()
rs.mergeRows(timestampsB, timestampsA, fieldsB, fieldsA)
if !reflect.DeepEqual(rs.timestamps, timestampsExpected) {
t.Fatalf("unexpected timestamps after reverse merge\ngot\n%v\nwant\n%v", rs.timestamps, timestampsExpected)
}
if !reflect.DeepEqual(rs.rows, rowsExpected) {
t.Fatalf("unexpected rows after reverse merge\ngot\n%v\nwant\n%v", rs.rows, rowsExpected)
}
}
f(nil, nil, nil, nil, nil, nil)
// merge single entry with zero entries
timestampsA := []int64{123}
timestampsB := []int64{}
fieldsA := [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
}
fieldsB := [][]Field{}
resultTimestamps := []int64{123}
resultFields := [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
}
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
// merge two single entries
timestampsA = []int64{123}
timestampsB = []int64{43323}
fieldsA = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
}
fieldsB = [][]Field{
{
{
Name: "asdfds",
Value: "asdfsa",
},
},
}
resultTimestamps = []int64{123, 43323}
resultFields = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "asdfds",
Value: "asdfsa",
},
},
}
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
// merge identical entries
timestampsA = []int64{123, 456}
timestampsB = []int64{123, 456}
fieldsA = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "foo",
Value: "baz",
},
},
}
fieldsB = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "foo",
Value: "baz",
},
},
}
resultTimestamps = []int64{123, 123, 456, 456}
resultFields = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "foo",
Value: "baz",
},
},
{
{
Name: "foo",
Value: "baz",
},
},
}
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
// merge interleaved entries
timestampsA = []int64{12, 13432}
timestampsB = []int64{3, 43323}
fieldsA = [][]Field{
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "xfoo",
Value: "xbar",
},
},
}
fieldsB = [][]Field{
{
{
Name: "asd",
Value: "assa",
},
},
{
{
Name: "asdfds",
Value: "asdfsa",
},
},
}
resultTimestamps = []int64{3, 12, 13432, 43323}
resultFields = [][]Field{
{
{
Name: "asd",
Value: "assa",
},
},
{
{
Name: "foo",
Value: "bar",
},
},
{
{
Name: "xfoo",
Value: "xbar",
},
},
{
{
Name: "asdfds",
Value: "asdfsa",
},
},
}
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
}

532
lib/logstorage/storage.go Normal file

@@ -0,0 +1,532 @@
package logstorage
import (
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
)
// StorageStats represents stats for the storage. It may be obtained by calling Storage.UpdateStats().
type StorageStats struct {
// RowsDroppedTooBigTimestamp is the number of rows dropped during data ingestion because their timestamp is bigger than the maximum allowed
RowsDroppedTooBigTimestamp uint64
// RowsDroppedTooSmallTimestamp is the number of rows dropped during data ingestion because their timestamp is smaller than the minimum allowed
RowsDroppedTooSmallTimestamp uint64
// PartitionsCount is the number of partitions in the storage
PartitionsCount uint64
PartitionStats
}
// Reset resets s.
func (s *StorageStats) Reset() {
*s = StorageStats{}
}
// StorageConfig is the config for the Storage.
type StorageConfig struct {
// Retention is the retention for the ingested data.
//
// Older data is automatically deleted.
Retention time.Duration
// FlushInterval is the interval for flushing the in-memory data to disk at the Storage
FlushInterval time.Duration
// FutureRetention is the allowed retention from the current time into the future for the ingested data.
//
// Log entries with timestamps bigger than now+FutureRetention are ignored.
FutureRetention time.Duration
// LogNewStreams indicates whether to log newly created log streams.
//
// This can be useful for debugging of high cardinality issues.
// https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality
LogNewStreams bool
// LogIngestedRows indicates whether to log the ingested log entries.
//
// This can be useful for debugging of data ingestion.
LogIngestedRows bool
}
// Storage is the storage for log entries.
type Storage struct {
rowsDroppedTooBigTimestamp uint64
rowsDroppedTooSmallTimestamp uint64
// path is the path to the Storage directory
path string
// retention is the retention for the stored data
//
// older data is automatically deleted
retention time.Duration
// flushInterval is the interval for flushing in-memory data to disk
flushInterval time.Duration
// futureRetention is the maximum allowed interval to write data into the future
futureRetention time.Duration
// logNewStreams instructs to log new streams if it is set to true
logNewStreams bool
// logIngestedRows instructs to log all the ingested log entries if it is set to true
logIngestedRows bool
// flockF is a file, which makes sure that the Storage is opened by a single process
flockF *os.File
// partitions is a list of partitions for the Storage.
//
// It must be accessed under partitionsLock.
partitions []*partitionWrapper
// ptwHot is the "hot" partition, where the last rows were ingested.
//
// It must be accessed under partitionsLock.
ptwHot *partitionWrapper
// partitionsLock protects partitions and ptwHot.
partitionsLock sync.Mutex
// stopCh is closed when the Storage must be stopped.
stopCh chan struct{}
// wg is used for waiting for background workers at MustClose().
wg sync.WaitGroup
// streamIDCache caches (partition, streamIDs) seen during data ingestion.
//
// It reduces the load on persistent storage during data ingestion by skipping
// the check whether the given stream is already registered in the persistent storage.
streamIDCache *workingsetcache.Cache
// streamTagsCache caches StreamTags entries keyed by streamID.
//
// There is no need to put partition into the key for StreamTags,
// since StreamTags are uniquely identified by streamID.
//
// It reduces the load on persistent storage during querying
// when StreamTags must be found for the particular streamID
streamTagsCache *workingsetcache.Cache
// streamFilterCache caches streamIDs keyed by (partition, []TenantID, StreamFilter).
//
// It reduces the load on persistent storage during querying by _stream:{...} filter.
streamFilterCache *workingsetcache.Cache
}
type partitionWrapper struct {
// refCount is the number of active references to pt.
// When it reaches zero, pt is closed.
refCount int32
// mustBeDeleted is set to a non-zero value when the partition must be deleted after refCount reaches zero.
mustBeDeleted uint32
// day is the day the partition belongs to: the unix timestamp in nanoseconds divided by the number of nanoseconds per day.
day int64
// pt is the wrapped partition.
pt *partition
}
func newPartitionWrapper(pt *partition, day int64) *partitionWrapper {
pw := &partitionWrapper{
day: day,
pt: pt,
}
pw.incRef()
return pw
}
func (ptw *partitionWrapper) incRef() {
atomic.AddInt32(&ptw.refCount, 1)
}
func (ptw *partitionWrapper) decRef() {
n := atomic.AddInt32(&ptw.refCount, -1)
if n > 0 {
return
}
deletePath := ""
if atomic.LoadUint32(&ptw.mustBeDeleted) != 0 {
deletePath = ptw.pt.path
}
// Close ptw.pt, since nobody refers to it.
mustClosePartition(ptw.pt)
ptw.pt = nil
// Delete partition if needed.
if deletePath != "" {
mustDeletePartition(deletePath)
}
}
func (ptw *partitionWrapper) canAddAllRows(lr *LogRows) bool {
minTimestamp := ptw.day * nsecPerDay
maxTimestamp := minTimestamp + nsecPerDay - 1
for _, ts := range lr.timestamps {
if ts < minTimestamp || ts > maxTimestamp {
return false
}
}
return true
}
// mustCreateStorage creates Storage at the given path.
func mustCreateStorage(path string) {
fs.MustMkdirFailIfExist(path)
partitionsPath := filepath.Join(path, partitionsDirname)
fs.MustMkdirFailIfExist(partitionsPath)
}
// MustOpenStorage opens Storage at the given path.
//
// MustClose must be called on the returned Storage when it is no longer needed.
func MustOpenStorage(path string, cfg *StorageConfig) *Storage {
flushInterval := cfg.FlushInterval
if flushInterval < time.Second {
flushInterval = time.Second
}
retention := cfg.Retention
if retention < 24*time.Hour {
retention = 24 * time.Hour
}
futureRetention := cfg.FutureRetention
if futureRetention < 24*time.Hour {
futureRetention = 24 * time.Hour
}
if !fs.IsPathExist(path) {
mustCreateStorage(path)
}
flockF := fs.MustCreateFlockFile(path)
// Load caches
mem := memory.Allowed()
streamIDCachePath := filepath.Join(path, cacheDirname, streamIDCacheFilename)
streamIDCache := workingsetcache.Load(streamIDCachePath, mem/16)
streamTagsCache := workingsetcache.New(mem / 10)
streamFilterCache := workingsetcache.New(mem / 10)
s := &Storage{
path: path,
retention: retention,
flushInterval: flushInterval,
futureRetention: futureRetention,
logNewStreams: cfg.LogNewStreams,
logIngestedRows: cfg.LogIngestedRows,
flockF: flockF,
stopCh: make(chan struct{}),
streamIDCache: streamIDCache,
streamTagsCache: streamTagsCache,
streamFilterCache: streamFilterCache,
}
partitionsPath := filepath.Join(path, partitionsDirname)
fs.MustMkdirIfNotExist(partitionsPath)
des := fs.MustReadDir(partitionsPath)
ptws := make([]*partitionWrapper, len(des))
for i, de := range des {
fname := de.Name()
// Parse the day for the partition
t, err := time.Parse(partitionNameFormat, fname)
if err != nil {
logger.Panicf("FATAL: cannot parse partition filename %q at %q; it must be in the form YYYYMMDD: %s", fname, partitionsPath, err)
}
day := t.UTC().UnixNano() / nsecPerDay
partitionPath := filepath.Join(partitionsPath, fname)
pt := mustOpenPartition(s, partitionPath)
ptws[i] = newPartitionWrapper(pt, day)
}
sort.Slice(ptws, func(i, j int) bool {
return ptws[i].day < ptws[j].day
})
// Delete partitions from the future if needed
maxAllowedDay := s.getMaxAllowedDay()
j := len(ptws) - 1
for j >= 0 {
ptw := ptws[j]
if ptw.day <= maxAllowedDay {
break
}
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -futureRetention=%dd", ptw.pt.path, durationToDays(s.futureRetention))
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
ptw.decRef()
j--
}
j++
for i := j; i < len(ptws); i++ {
ptws[i] = nil
}
ptws = ptws[:j]
s.partitions = ptws
s.runRetentionWatcher()
return s
}
const partitionNameFormat = "20060102"
func (s *Storage) runRetentionWatcher() {
s.wg.Add(1)
go func() {
s.watchRetention()
s.wg.Done()
}()
}
func (s *Storage) watchRetention() {
ticker := time.NewTicker(time.Hour)
defer ticker.Stop()
for {
var ptwsToDelete []*partitionWrapper
minAllowedDay := s.getMinAllowedDay()
s.partitionsLock.Lock()
// Delete outdated partitions.
// s.partitions are sorted by day, so the partitions, which can become outdated, are located at the beginning of the list
for _, ptw := range s.partitions {
if ptw.day >= minAllowedDay {
break
}
ptwsToDelete = append(ptwsToDelete, ptw)
}
for i := range ptwsToDelete {
s.partitions[i] = nil
}
s.partitions = s.partitions[len(ptwsToDelete):]
s.partitionsLock.Unlock()
for _, ptw := range ptwsToDelete {
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -retentionPeriod=%dd", ptw.pt.path, durationToDays(s.retention))
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
ptw.decRef()
}
select {
case <-s.stopCh:
return
case <-ticker.C:
}
}
}
func (s *Storage) getMinAllowedDay() int64 {
return time.Now().UTC().Add(-s.retention).UnixNano() / nsecPerDay
}
func (s *Storage) getMaxAllowedDay() int64 {
return time.Now().UTC().Add(s.futureRetention).UnixNano() / nsecPerDay
}
// MustClose closes s.
//
// It is expected that nobody uses the storage at the close time.
func (s *Storage) MustClose() {
// Stop background workers
close(s.stopCh)
s.wg.Wait()
// Close partitions
for _, pw := range s.partitions {
pw.decRef()
if pw.refCount != 0 {
logger.Panicf("BUG: there are %d users of partition", pw.refCount)
}
}
s.partitions = nil
// Save caches
streamIDCachePath := filepath.Join(s.path, cacheDirname, streamIDCacheFilename)
if err := s.streamIDCache.Save(streamIDCachePath); err != nil {
logger.Panicf("FATAL: cannot save streamID cache to %q: %s", streamIDCachePath, err)
}
s.streamIDCache.Stop()
s.streamIDCache = nil
s.streamTagsCache.Stop()
s.streamTagsCache = nil
s.streamFilterCache.Stop()
s.streamFilterCache = nil
// release lock file
fs.MustClose(s.flockF)
s.flockF = nil
s.path = ""
}
// MustAddRows adds lr to s.
func (s *Storage) MustAddRows(lr *LogRows) {
// Fast path - try adding all the rows to the hot partition
s.partitionsLock.Lock()
ptwHot := s.ptwHot
if ptwHot != nil {
ptwHot.incRef()
}
s.partitionsLock.Unlock()
if ptwHot != nil {
if ptwHot.canAddAllRows(lr) {
ptwHot.pt.mustAddRows(lr)
ptwHot.decRef()
return
}
ptwHot.decRef()
}
// Slow path - rows cannot be added to the hot partition, so split rows among available partitions
minAllowedDay := s.getMinAllowedDay()
maxAllowedDay := s.getMaxAllowedDay()
m := make(map[int64]*LogRows)
for i, ts := range lr.timestamps {
day := ts / nsecPerDay
if day < minAllowedDay {
rf := RowFormatter(lr.rows[i])
tsf := TimeFormatter(ts)
minAllowedTsf := TimeFormatter(minAllowedDay * nsecPerDay)
tooSmallTimestampLogger.Warnf("skipping log entry with too small timestamp=%s; it must be bigger than %s according "+
"to the configured -retentionPeriod. See https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
"log entry: %s", &tsf, &minAllowedTsf, &rf)
atomic.AddUint64(&s.rowsDroppedTooSmallTimestamp, 1)
continue
}
if day > maxAllowedDay {
rf := RowFormatter(lr.rows[i])
tsf := TimeFormatter(ts)
maxAllowedTsf := TimeFormatter(maxAllowedDay * nsecPerDay)
tooBigTimestampLogger.Warnf("skipping log entry with too big timestamp=%s; it must be smaller than %s according "+
"to the configured -futureRetention; see https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
"log entry: %s", &tsf, &maxAllowedTsf, &rf)
atomic.AddUint64(&s.rowsDroppedTooBigTimestamp, 1)
continue
}
lrPart := m[day]
if lrPart == nil {
lrPart = GetLogRows(nil, nil)
m[day] = lrPart
}
lrPart.mustAddInternal(lr.streamIDs[i], ts, lr.rows[i], lr.streamTagsCanonicals[i])
}
for day, lrPart := range m {
ptw := s.getPartitionForDay(day)
ptw.pt.mustAddRows(lrPart)
ptw.decRef()
PutLogRows(lrPart)
}
}
var tooSmallTimestampLogger = logger.WithThrottler("too_small_timestamp", 5*time.Second)
var tooBigTimestampLogger = logger.WithThrottler("too_big_timestamp", 5*time.Second)
const nsecPerDay = 24 * 3600 * 1e9
// TimeFormatter implements fmt.Stringer for timestamp in nanoseconds
type TimeFormatter int64
// String returns human-readable representation for tf.
func (tf *TimeFormatter) String() string {
ts := int64(*tf)
t := time.Unix(0, ts).UTC()
return t.Format(time.RFC3339)
}
func (s *Storage) getPartitionForDay(day int64) *partitionWrapper {
s.partitionsLock.Lock()
// Search for the partition using binary search
ptws := s.partitions
n := sort.Search(len(ptws), func(i int) bool {
return ptws[i].day >= day
})
var ptw *partitionWrapper
if n < len(ptws) {
ptw = ptws[n]
if ptw.day != day {
ptw = nil
}
}
if ptw == nil {
// Missing partition for the given day. Create it.
fname := time.Unix(0, day*nsecPerDay).UTC().Format(partitionNameFormat)
partitionPath := filepath.Join(s.path, partitionsDirname, fname)
mustCreatePartition(partitionPath)
pt := mustOpenPartition(s, partitionPath)
ptw = newPartitionWrapper(pt, day)
if n == len(ptws) {
ptws = append(ptws, ptw)
} else {
ptws = append(ptws[:n+1], ptws[n:]...)
ptws[n] = ptw
}
s.partitions = ptws
}
s.ptwHot = ptw
ptw.incRef()
s.partitionsLock.Unlock()
return ptw
}
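A quick arithmetic example (not part of the commit) of the per-day mapping used above: nsecPerDay is 86_400_000_000_000, so a nanosecond timestamp maps to its partition directory as shown below. The function name and the concrete timestamp are illustrative.

// Editor's sketch, not part of the commit.
func examplePartitionName() string {
    ts := int64(1687212000) * 1e9 // 2023-06-19T22:00:00Z in nanoseconds
    day := ts / nsecPerDay        // 19527
    return time.Unix(0, day*nsecPerDay).UTC().Format(partitionNameFormat) // "20230619"
}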
// UpdateStats updates ss for the given s.
func (s *Storage) UpdateStats(ss *StorageStats) {
ss.RowsDroppedTooBigTimestamp += atomic.LoadUint64(&s.rowsDroppedTooBigTimestamp)
ss.RowsDroppedTooSmallTimestamp += atomic.LoadUint64(&s.rowsDroppedTooSmallTimestamp)
s.partitionsLock.Lock()
ss.PartitionsCount += uint64(len(s.partitions))
for _, ptw := range s.partitions {
ptw.pt.updateStats(&ss.PartitionStats)
}
s.partitionsLock.Unlock()
}
func (s *Storage) debugFlush() {
s.partitionsLock.Lock()
ptws := append([]*partitionWrapper{}, s.partitions...)
for _, ptw := range ptws {
ptw.incRef()
}
s.partitionsLock.Unlock()
for _, ptw := range ptws {
ptw.pt.debugFlush()
ptw.decRef()
}
}
func durationToDays(d time.Duration) int64 {
return int64(d / (time.Hour * 24))
}
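An end-to-end usage sketch (not part of the commit) tying the public Storage API together; GetLogRows, PutLogRows, TenantID and LogRows.MustAdd are defined elsewhere in this package, while the function name, data path, retention and field values are made up.

// Editor's sketch, not part of the commit.
func exampleStorageUsage() {
    s := MustOpenStorage("victoria-logs-data", &StorageConfig{
        Retention: 7 * 24 * time.Hour,
    })
    lr := GetLogRows(nil, nil)
    tenantID := TenantID{AccountID: 0, ProjectID: 0}
    fields := []Field{
        {Name: "_msg", Value: "connection established"},
        {Name: "level", Value: "info"},
    }
    lr.MustAdd(tenantID, time.Now().UnixNano(), fields)
    s.MustAddRows(lr)
    PutLogRows(lr)

    var ss StorageStats
    s.UpdateStats(&ss)
    s.MustClose()
}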


@@ -0,0 +1,602 @@
package logstorage
import (
"math"
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
)
// genericSearchOptions contains options used for search.
type genericSearchOptions struct {
// tenantIDs must contain the list of tenantIDs for the search.
tenantIDs []TenantID
// filter is the filter to use for the search
filter filter
// resultColumnNames contains the names of the columns to return in the result.
resultColumnNames []string
}
type searchOptions struct {
// Optional sorted list of tenantIDs for the search.
// If it is empty, then the search is performed by streamIDs
tenantIDs []TenantID
// Optional sorted list of streamIDs for the search.
// If it is empty, then the search is performed by tenantIDs
streamIDs []streamID
// minTimestamp is the minimum timestamp for the search
minTimestamp int64
// maxTimestamp is the maximum timestamp for the search
maxTimestamp int64
// filter is the filter to use for the search
filter filter
// resultColumnNames contains the names of the columns to return in the result
resultColumnNames []string
}
// RunQuery runs the given q and calls processBlock for results
func (s *Storage) RunQuery(tenantIDs []TenantID, q *Query, stopCh <-chan struct{}, processBlock func(columns []BlockColumn)) {
resultColumnNames := q.getResultColumnNames()
so := &genericSearchOptions{
tenantIDs: tenantIDs,
filter: q.f,
resultColumnNames: resultColumnNames,
}
workersCount := cgroup.AvailableCPUs()
s.search(workersCount, so, stopCh, func(workerID uint, br *blockResult) {
brs := getBlockRows()
cs := brs.cs
for i, columnName := range resultColumnNames {
cs = append(cs, BlockColumn{
Name: columnName,
Values: br.getColumnValues(i),
})
}
processBlock(cs)
brs.cs = cs
putBlockRows(brs)
})
}
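A usage sketch (not part of the commit) for RunQuery; ParseQuery and TenantID are defined elsewhere in this package, the function name is hypothetical, and the query text follows the syntax exercised by the parser tests above.

// Editor's sketch, not part of the commit.
func exampleRunQuery(s *Storage, tenantID TenantID) {
    q, err := ParseQuery(`_time:[-1h,now] error`)
    if err != nil {
        panic(err)
    }
    // processBlock may be called concurrently from multiple worker goroutines.
    s.RunQuery([]TenantID{tenantID}, q, nil, func(columns []BlockColumn) {
        for _, c := range columns {
            _ = c.Name   // requested column name
            _ = c.Values // values of the matching rows within the block
        }
    })
}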
type blockRows struct {
cs []BlockColumn
}
func (brs *blockRows) reset() {
cs := brs.cs
for i := range cs {
cs[i].reset()
}
brs.cs = cs[:0]
}
func getBlockRows() *blockRows {
v := blockRowsPool.Get()
if v == nil {
return &blockRows{}
}
return v.(*blockRows)
}
func putBlockRows(brs *blockRows) {
brs.reset()
blockRowsPool.Put(brs)
}
var blockRowsPool sync.Pool
// BlockColumn is a single column of a block of data
type BlockColumn struct {
// Name is the column name
Name string
// Values is column values
Values []string
}
func (c *BlockColumn) reset() {
c.Name = ""
c.Values = nil
}
// The number of blocks to search at once by a single worker
//
// This number must be increased on systems with many CPU cores in order to amortize
// the overhead for passing the blockSearchWork to worker goroutines.
const blockSearchWorksPerBatch = 64
// searchResultFunc must process br.
//
// The callback is called at the worker with the given workerID.
type searchResultFunc func(workerID uint, br *blockResult)
// search searches for the matching rows according to so.
//
// It calls f for each found matching block.
func (s *Storage) search(workersCount int, so *genericSearchOptions, stopCh <-chan struct{}, processBlockResult searchResultFunc) {
// Spin up workers
var wg sync.WaitGroup
workCh := make(chan []*blockSearchWork, workersCount)
wg.Add(workersCount)
for i := 0; i < workersCount; i++ {
go func(workerID uint) {
bs := getBlockSearch()
for bsws := range workCh {
for _, bsw := range bsws {
bs.search(bsw)
if bs.br.RowsCount() > 0 {
processBlockResult(workerID, &bs.br)
}
}
}
putBlockSearch(bs)
wg.Done()
}(uint(i))
}
// Obtain common time filter from so.filter
tf, f := getCommonTimeFilter(so.filter)
// Select partitions according to the selected time range
s.partitionsLock.Lock()
ptws := s.partitions
minDay := tf.minTimestamp / nsecPerDay
n := sort.Search(len(ptws), func(i int) bool {
return ptws[i].day >= minDay
})
ptws = ptws[n:]
maxDay := tf.maxTimestamp / nsecPerDay
n = sort.Search(len(ptws), func(i int) bool {
return ptws[i].day > maxDay
})
ptws = ptws[:n]
for _, ptw := range ptws {
ptw.incRef()
}
s.partitionsLock.Unlock()
// Obtain common streamFilter from f
var sf *StreamFilter
sf, f = getCommonStreamFilter(f)
// Apply search to matching partitions
var pws []*partWrapper
for _, ptw := range ptws {
pws = ptw.pt.search(pws, tf, sf, f, so, workCh, stopCh)
}
// Wait until workers finish their work
close(workCh)
wg.Wait()
// Decrement references to parts
for _, pw := range pws {
pw.decRef()
}
// Decrement references to partitions
for _, ptw := range ptws {
ptw.decRef()
}
}
func (pt *partition) search(pwsDst []*partWrapper, tf *timeFilter, sf *StreamFilter, f filter, so *genericSearchOptions,
workCh chan<- []*blockSearchWork, stopCh <-chan struct{},
) []*partWrapper {
tenantIDs := so.tenantIDs
var streamIDs []streamID
if sf != nil {
streamIDs = pt.idb.searchStreamIDs(tenantIDs, sf)
tenantIDs = nil
}
if hasStreamFilters(f) {
f = initStreamFilters(tenantIDs, pt.idb, f)
}
soInternal := &searchOptions{
tenantIDs: tenantIDs,
streamIDs: streamIDs,
minTimestamp: tf.minTimestamp,
maxTimestamp: tf.maxTimestamp,
filter: f,
resultColumnNames: so.resultColumnNames,
}
return pt.ddb.search(pwsDst, soInternal, workCh, stopCh)
}
func hasStreamFilters(f filter) bool {
switch t := f.(type) {
case *andFilter:
return hasStreamFiltersInList(t.filters)
case *orFilter:
return hasStreamFiltersInList(t.filters)
case *notFilter:
return hasStreamFilters(t.f)
case *streamFilter:
return true
default:
return false
}
}
func hasStreamFiltersInList(filters []filter) bool {
for _, f := range filters {
if hasStreamFilters(f) {
return true
}
}
return false
}
func initStreamFilters(tenantIDs []TenantID, idb *indexdb, f filter) filter {
switch t := f.(type) {
case *andFilter:
return &andFilter{
filters: initStreamFiltersList(tenantIDs, idb, t.filters),
}
case *orFilter:
return &orFilter{
filters: initStreamFiltersList(tenantIDs, idb, t.filters),
}
case *notFilter:
return &notFilter{
f: initStreamFilters(tenantIDs, idb, t.f),
}
case *streamFilter:
return &streamFilter{
f: t.f,
tenantIDs: tenantIDs,
idb: idb,
}
default:
return t
}
}
func initStreamFiltersList(tenantIDs []TenantID, idb *indexdb, filters []filter) []filter {
result := make([]filter, len(filters))
for i, f := range filters {
result[i] = initStreamFilters(tenantIDs, idb, f)
}
return result
}
func (ddb *datadb) search(pwsDst []*partWrapper, so *searchOptions, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) []*partWrapper {
// Select parts with data for the given time range
ddb.partsLock.Lock()
pwsDstLen := len(pwsDst)
pwsDst = appendPartsInTimeRange(pwsDst, ddb.inmemoryParts, so.minTimestamp, so.maxTimestamp)
pwsDst = appendPartsInTimeRange(pwsDst, ddb.fileParts, so.minTimestamp, so.maxTimestamp)
pws := pwsDst[pwsDstLen:]
for _, pw := range pws {
pw.incRef()
}
ddb.partsLock.Unlock()
// Apply search to matching parts
for _, pw := range pws {
pw.p.search(so, workCh, stopCh)
}
return pwsDst
}
func (p *part) search(so *searchOptions, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
bhss := getBlockHeaders()
if len(so.tenantIDs) > 0 {
p.searchByTenantIDs(so, bhss, workCh, stopCh)
} else {
p.searchByStreamIDs(so, bhss, workCh, stopCh)
}
putBlockHeaders(bhss)
}
func getBlockHeaders() *blockHeaders {
v := blockHeadersPool.Get()
if v == nil {
return &blockHeaders{}
}
return v.(*blockHeaders)
}
func putBlockHeaders(bhss *blockHeaders) {
bhss.reset()
blockHeadersPool.Put(bhss)
}
var blockHeadersPool sync.Pool
type blockHeaders struct {
bhs []blockHeader
}
func (bhss *blockHeaders) reset() {
bhs := bhss.bhs
for i := range bhs {
bhs[i].reset()
}
bhss.bhs = bhs[:0]
}
func (p *part) searchByTenantIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
// it is assumed that tenantIDs are sorted
tenantIDs := so.tenantIDs
bsws := make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
scheduleBlockSearch := func(bh *blockHeader) bool {
// Do not use a pool for blockSearchWork, since it would be returned to the pool
// from another goroutine, which may run on another CPU core.
// This means it would be put into another per-CPU pool, which may result
// in a slowdown related to memory synchronization between CPU cores.
// This slowdown grows with the number of CPU cores.
bsw := newBlockSearchWork(p, so, bh)
bsws = append(bsws, bsw)
if len(bsws) < cap(bsws) {
return true
}
select {
case <-stopCh:
return false
case workCh <- bsws:
bsws = make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
return true
}
}
// it is assumed that ibhs are sorted
ibhs := p.indexBlockHeaders
for len(ibhs) > 0 && len(tenantIDs) > 0 {
select {
case <-stopCh:
return
default:
}
// locate tenantID equal or bigger than the tenantID in ibhs[0]
tenantID := &tenantIDs[0]
if tenantID.less(&ibhs[0].streamID.tenantID) {
tenantID = &ibhs[0].streamID.tenantID
n := sort.Search(len(tenantIDs), func(i int) bool {
return !tenantIDs[i].less(tenantID)
})
if n == len(tenantIDs) {
tenantIDs = nil
break
}
tenantID = &tenantIDs[n]
tenantIDs = tenantIDs[n:]
}
// locate indexBlockHeader with equal or bigger tenantID than the given tenantID
n := 0
if ibhs[0].streamID.tenantID.less(tenantID) {
n = sort.Search(len(ibhs), func(i int) bool {
return !ibhs[i].streamID.tenantID.less(tenantID)
})
if n == len(ibhs) || n > 0 && ibhs[n].streamID.tenantID.equal(tenantID) {
// The end of ibhs[n-1] may contain blocks for the given tenantID, so move it backwards
n--
}
}
ibh := &ibhs[n]
ibhs = ibhs[n+1:]
if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
// Skip the ibh, since it doesn't contain entries in the requested time range
continue
}
bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)
bhs := bhss.bhs
for len(bhs) > 0 {
// search for blocks with the given tenantID
n = sort.Search(len(bhs), func(i int) bool {
return !bhs[i].streamID.tenantID.less(tenantID)
})
bhs = bhs[n:]
for len(bhs) > 0 && bhs[0].streamID.tenantID.equal(tenantID) {
bh := &bhs[0]
bhs = bhs[1:]
th := &bh.timestampsHeader
if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
continue
}
if !scheduleBlockSearch(bh) {
return
}
}
if len(bhs) == 0 {
break
}
// search for the next tenantID, which can potentially match tenantID from bhs[0]
tenantID = &bhs[0].streamID.tenantID
n = sort.Search(len(tenantIDs), func(i int) bool {
return !tenantIDs[i].less(tenantID)
})
if n == len(tenantIDs) {
tenantIDs = nil
break
}
tenantID = &tenantIDs[n]
tenantIDs = tenantIDs[n:]
}
}
// Flush the remaining work
if len(bsws) > 0 {
workCh <- bsws
}
}
func (p *part) searchByStreamIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
// it is assumed that streamIDs are sorted
streamIDs := so.streamIDs
bsws := make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
scheduleBlockSearch := func(bh *blockHeader) bool {
// Do not use a pool for blockSearchWork, since it would be returned to the pool
// from another goroutine, which may run on another CPU core.
// This means it would be put into another per-CPU pool, which may result
// in a slowdown related to memory synchronization between CPU cores.
// This slowdown grows with the number of CPU cores.
bsw := newBlockSearchWork(p, so, bh)
bsws = append(bsws, bsw)
if len(bsws) < cap(bsws) {
return true
}
select {
case <-stopCh:
return false
case workCh <- bsws:
bsws = make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
return true
}
}
// it is assumed that ibhs are sorted
ibhs := p.indexBlockHeaders
for len(ibhs) > 0 && len(streamIDs) > 0 {
select {
case <-stopCh:
return
default:
}
// locate streamID equal or bigger than the streamID in ibhs[0]
streamID := &streamIDs[0]
if streamID.less(&ibhs[0].streamID) {
streamID = &ibhs[0].streamID
n := sort.Search(len(streamIDs), func(i int) bool {
return !streamIDs[i].less(streamID)
})
if n == len(streamIDs) {
streamIDs = nil
break
}
streamID = &streamIDs[n]
streamIDs = streamIDs[n:]
}
// locate indexBlockHeader with equal or bigger streamID than the given streamID
n := 0
if ibhs[0].streamID.less(streamID) {
n = sort.Search(len(ibhs), func(i int) bool {
return !ibhs[i].streamID.less(streamID)
})
if n == len(ibhs) || n > 0 && ibhs[n].streamID.equal(streamID) {
// The end of ibhs[n-1] may contain blocks for the given streamID, so move it backwards
n--
}
}
ibh := &ibhs[n]
ibhs = ibhs[n+1:]
if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
// Skip the ibh, since it doesn't contain entries in the requested time range
continue
}
bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)
bhs := bhss.bhs
for len(bhs) > 0 {
// search for blocks with the given streamID
n = sort.Search(len(bhs), func(i int) bool {
return !bhs[i].streamID.less(streamID)
})
bhs = bhs[n:]
for len(bhs) > 0 && bhs[0].streamID.equal(streamID) {
bh := &bhs[0]
bhs = bhs[1:]
th := &bh.timestampsHeader
if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
continue
}
if !scheduleBlockSearch(bh) {
return
}
}
if len(bhs) == 0 {
break
}
// search for the next streamID, which can potentially match streamID from bhs[0]
streamID = &bhs[0].streamID
n = sort.Search(len(streamIDs), func(i int) bool {
return !streamIDs[i].less(streamID)
})
if n == len(streamIDs) {
streamIDs = nil
break
}
streamID = &streamIDs[n]
streamIDs = streamIDs[n:]
}
}
// Flush the remaining work
if len(bsws) > 0 {
workCh <- bsws
}
}
func appendPartsInTimeRange(dst, src []*partWrapper, minTimestamp, maxTimestamp int64) []*partWrapper {
for _, pw := range src {
if maxTimestamp < pw.p.ph.MinTimestamp || minTimestamp > pw.p.ph.MaxTimestamp {
continue
}
dst = append(dst, pw)
}
return dst
}
func getCommonStreamFilter(f filter) (*StreamFilter, filter) {
switch t := f.(type) {
case *andFilter:
filters := t.filters
for i, filter := range filters {
sf, ok := filter.(*streamFilter)
if ok && !sf.f.isEmpty() {
// Remove sf from the list of filters, since the stream filtering is performed via the separately returned StreamFilter.
af := &andFilter{
filters: append(filters[:i:i], filters[i+1:]...),
}
return sf.f, af
}
}
case *streamFilter:
return t.f, &noopFilter{}
}
return nil, f
}
func getCommonTimeFilter(f filter) (*timeFilter, filter) {
switch t := f.(type) {
case *andFilter:
for _, filter := range t.filters {
tf, ok := filter.(*timeFilter)
if ok {
// The tf must remain in the returned filter in order to properly filter out rows outside the selected time range
return tf, f
}
}
case *timeFilter:
return t, f
}
return allTimeFilter, f
}
var allTimeFilter = &timeFilter{
minTimestamp: math.MinInt64,
maxTimestamp: math.MaxInt64,
}
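A sketch (not part of the commit) of how search extracts these common filters before scheduling work; the function name and query text are illustrative, and q.f is the parsed filter referenced by RunQuery above.

// Editor's sketch, not part of the commit.
func exampleCommonFilters(q *Query) {
    // For a query like `_time:[-1h,now] _stream:{job="foo"} error`,
    // the time filter stays inside the remaining filter, while the stream
    // filter is pulled out so streamIDs can be resolved via indexdb.
    tf, f := getCommonTimeFilter(q.f)
    sf, f := getCommonStreamFilter(f)
    _, _, _ = tf, sf, f
}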


@@ -0,0 +1,663 @@
package logstorage
import (
"fmt"
"regexp"
"sync/atomic"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
func TestStorageRunQuery(t *testing.T) {
const path = "TestStorageRunQuery"
const tenantsCount = 11
const streamsPerTenant = 3
const blocksPerStream = 5
const rowsPerBlock = 7
sc := &StorageConfig{
Retention: 24 * time.Hour,
}
s := MustOpenStorage(path, sc)
// fill the storage with data
var allTenantIDs []TenantID
baseTimestamp := time.Now().UnixNano() - 3600*1e9
var fields []Field
streamTags := []string{
"job",
"instance",
}
for i := 0; i < tenantsCount; i++ {
tenantID := TenantID{
AccountID: uint32(i),
ProjectID: uint32(10*i + 1),
}
allTenantIDs = append(allTenantIDs, tenantID)
for j := 0; j < streamsPerTenant; j++ {
streamIDValue := fmt.Sprintf("stream_id=%d", j)
for k := 0; k < blocksPerStream; k++ {
lr := GetLogRows(streamTags, nil)
for m := 0; m < rowsPerBlock; m++ {
timestamp := baseTimestamp + int64(m)*1e9 + int64(k)
// Append stream fields
fields = append(fields[:0], Field{
Name: "job",
Value: "foobar",
}, Field{
Name: "instance",
Value: fmt.Sprintf("host-%d:234", j),
})
// append the remaining fields
fields = append(fields, Field{
Name: "_msg",
Value: fmt.Sprintf("log message %d at block %d", m, k),
})
fields = append(fields, Field{
Name: "source-file",
Value: "/foo/bar/baz",
})
fields = append(fields, Field{
Name: "tenant.id",
Value: tenantID.String(),
})
fields = append(fields, Field{
Name: "stream-id",
Value: streamIDValue,
})
lr.MustAdd(tenantID, timestamp, fields)
}
s.MustAddRows(lr)
PutLogRows(lr)
}
}
}
s.debugFlush()
// run tests on the storage data
t.Run("missing-tenant", func(t *testing.T) {
q := mustParseQuery(`"log message"`)
tenantID := TenantID{
AccountID: 0,
ProjectID: 0,
}
processBlock := func(columns []BlockColumn) {
panic(fmt.Errorf("unexpected match"))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
})
t.Run("missing-message-text", func(t *testing.T) {
q := mustParseQuery(`foobar`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
processBlock := func(columns []BlockColumn) {
panic(fmt.Errorf("unexpected match"))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
})
t.Run("matching-tenant-id", func(t *testing.T) {
q := mustParseQuery(`tenant.id:*`)
for i := 0; i < tenantsCount; i++ {
tenantID := TenantID{
AccountID: uint32(i),
ProjectID: uint32(10*i + 1),
}
expectedTenantID := tenantID.String()
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
hasTenantIDColumn := false
var columnNames []string
for _, c := range columns {
if c.Name == "tenant.id" {
hasTenantIDColumn = true
if len(c.Values) == 0 {
panic(fmt.Errorf("unexpected zero rows"))
}
for _, v := range c.Values {
if v != expectedTenantID {
panic(fmt.Errorf("unexpected tenant.id; got %s; want %s", v, expectedTenantID))
}
}
}
columnNames = append(columnNames, c.Name)
}
if !hasTenantIDColumn {
panic(fmt.Errorf("missing tenant.id column among columns: %q", columnNames))
}
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
}
}
})
t.Run("matching-multiple-tenant-ids", func(t *testing.T) {
q := mustParseQuery(`"log message"`)
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
s.RunQuery(allTenantIDs, q, nil, processBlock)
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-in-filter", func(t *testing.T) {
q := mustParseQuery(`source-file:in(foobar,/foo/bar/baz)`)
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
s.RunQuery(allTenantIDs, q, nil, processBlock)
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("stream-filter-mismatch", func(t *testing.T) {
q := mustParseQuery(`_stream:{job="foobar",instance=~"host-.+:2345"} log`)
processBlock := func(columns []BlockColumn) {
panic(fmt.Errorf("unexpected match"))
}
s.RunQuery(allTenantIDs, q, nil, processBlock)
})
t.Run("matching-stream-id", func(t *testing.T) {
for i := 0; i < streamsPerTenant; i++ {
q := mustParseQuery(fmt.Sprintf(`log _stream:{job="foobar",instance="host-%d:234"} AND stream-id:*`, i))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
expectedStreamID := fmt.Sprintf("stream_id=%d", i)
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
hasStreamIDColumn := false
var columnNames []string
for _, c := range columns {
if c.Name == "stream-id" {
hasStreamIDColumn = true
if len(c.Values) == 0 {
panic(fmt.Errorf("unexpected zero rows"))
}
for _, v := range c.Values {
if v != expectedStreamID {
panic(fmt.Errorf("unexpected stream-id; got %s; want %s", v, expectedStreamID))
}
}
}
columnNames = append(columnNames, c.Name)
}
if !hasStreamIDColumn {
panic(fmt.Errorf("missing stream-id column among columns: %q", columnNames))
}
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
expectedRowsCount := blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows for stream %d; got %d; want %d", i, rowsCount, expectedRowsCount)
}
}
})
t.Run("matching-multiple-stream-ids-with-re-filter", func(t *testing.T) {
q := mustParseQuery(`_msg:log _stream:{job="foobar",instance=~"host-[^:]+:234"} and re("message [02] at")`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream * 2
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-time-range", func(t *testing.T) {
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
q := mustParseQuery(fmt.Sprintf(`_time:[%f,%f]`, float64(minTimestamp)/1e9, float64(maxTimestamp)/1e9))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-stream-id-with-time-range", func(t *testing.T) {
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
q := mustParseQuery(fmt.Sprintf(`_time:[%f,%f] _stream:{job="foobar",instance="host-1:234"}`, float64(minTimestamp)/1e9, float64(maxTimestamp)/1e9))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
rowsCount := uint32(0)
processBlock := func(columns []BlockColumn) {
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
expectedRowsCount := blocksPerStream
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-stream-id-missing-time-range", func(t *testing.T) {
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
q := mustParseQuery(fmt.Sprintf(`_stream:{job="foobar",instance="host-1:234"} _time:[%d, %d)`, minTimestamp/1e9, maxTimestamp/1e9))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
processBlock := func(columns []BlockColumn) {
panic(fmt.Errorf("unexpected match"))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
})
t.Run("missing-time-range", func(t *testing.T) {
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
q := mustParseQuery(fmt.Sprintf(`_time:[%d, %d)`, minTimestamp/1e9, maxTimestamp/1e9))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
processBlock := func(columns []BlockColumn) {
panic(fmt.Errorf("unexpected match"))
}
tenantIDs := []TenantID{tenantID}
s.RunQuery(tenantIDs, q, nil, processBlock)
})
// Close the storage and delete its data
s.MustClose()
fs.MustRemoveAll(path)
}
func mustParseQuery(query string) *Query {
q, err := ParseQuery(query)
if err != nil {
panic(fmt.Errorf("BUG: cannot parse %s: %s", query, err))
}
return q
}
func TestStorageSearch(t *testing.T) {
const path = "TestStorageSearch"
const tenantsCount = 11
const streamsPerTenant = 3
const blocksPerStream = 5
const rowsPerBlock = 7
sc := &StorageConfig{
Retention: 24 * time.Hour,
}
s := MustOpenStorage(path, sc)
// fill the storage with data.
var allTenantIDs []TenantID
baseTimestamp := time.Now().UnixNano() - 3600*1e9
var fields []Field
streamTags := []string{
"job",
"instance",
}
for i := 0; i < tenantsCount; i++ {
tenantID := TenantID{
AccountID: uint32(i),
ProjectID: uint32(10*i + 1),
}
allTenantIDs = append(allTenantIDs, tenantID)
for j := 0; j < streamsPerTenant; j++ {
for k := 0; k < blocksPerStream; k++ {
lr := GetLogRows(streamTags, nil)
for m := 0; m < rowsPerBlock; m++ {
timestamp := baseTimestamp + int64(m)*1e9 + int64(k)
// Append stream fields
fields = append(fields[:0], Field{
Name: "job",
Value: "foobar",
}, Field{
Name: "instance",
Value: fmt.Sprintf("host-%d:234", j),
})
// append the remaining fields
fields = append(fields, Field{
Name: "_msg",
Value: fmt.Sprintf("log message %d at block %d", m, k),
})
fields = append(fields, Field{
Name: "source-file",
Value: "/foo/bar/baz",
})
lr.MustAdd(tenantID, timestamp, fields)
}
s.MustAddRows(lr)
PutLogRows(lr)
}
}
}
s.debugFlush()
// run tests on the filled storage
const workersCount = 3
getBaseFilter := func(minTimestamp, maxTimestamp int64, sf *StreamFilter) filter {
var filters []filter
filters = append(filters, &timeFilter{
minTimestamp: minTimestamp,
maxTimestamp: maxTimestamp,
})
if sf != nil {
filters = append(filters, &streamFilter{
f: sf,
})
}
return &andFilter{
filters: filters,
}
}
t.Run("missing-tenant-smaller-than-existing", func(t *testing.T) {
tenantID := TenantID{
AccountID: 0,
ProjectID: 0,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
processBlock := func(workerID uint, br *blockResult) {
panic(fmt.Errorf("unexpected match"))
}
s.search(workersCount, so, nil, processBlock)
})
t.Run("missing-tenant-bigger-than-existing", func(t *testing.T) {
tenantID := TenantID{
AccountID: tenantsCount + 1,
ProjectID: 0,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
processBlock := func(workerID uint, br *blockResult) {
panic(fmt.Errorf("unexpected match"))
}
s.search(workersCount, so, nil, processBlock)
})
t.Run("missing-tenant-middle", func(t *testing.T) {
tenantID := TenantID{
AccountID: 1,
ProjectID: 0,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
processBlock := func(workerID uint, br *blockResult) {
panic(fmt.Errorf("unexpected match"))
}
s.search(workersCount, so, nil, processBlock)
})
t.Run("matching-tenant-id", func(t *testing.T) {
for i := 0; i < tenantsCount; i++ {
tenantID := TenantID{
AccountID: uint32(i),
ProjectID: uint32(10*i + 1),
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
if !br.streamID.tenantID.equal(&tenantID) {
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
}
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
}
}
})
t.Run("matching-multiple-tenant-ids", func(t *testing.T) {
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
so := &genericSearchOptions{
tenantIDs: allTenantIDs,
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("stream-filter-mismatch", func(t *testing.T) {
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-.+:2345"}`)
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
so := &genericSearchOptions{
tenantIDs: allTenantIDs,
filter: f,
resultColumnNames: []string{"_msg"},
}
processBlock := func(workerID uint, br *blockResult) {
panic(fmt.Errorf("unexpected match"))
}
s.search(workersCount, so, nil, processBlock)
})
t.Run("matching-stream-id", func(t *testing.T) {
for i := 0; i < streamsPerTenant; i++ {
sf := mustNewStreamFilter(fmt.Sprintf(`{job="foobar",instance="host-%d:234"}`, i))
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
if !br.streamID.tenantID.equal(&tenantID) {
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
}
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
}
})
t.Run("matching-multiple-stream-ids", func(t *testing.T) {
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-[^:]+:234"}`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
if !br.streamID.tenantID.equal(&tenantID) {
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
}
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-multiple-stream-ids-with-re-filter", func(t *testing.T) {
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-[^:]+:234"}`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
minTimestamp := baseTimestamp
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
f = &andFilter{
filters: []filter{
f,
&regexpFilter{
fieldName: "_msg",
re: regexp.MustCompile("message [02] at "),
},
},
}
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
if !br.streamID.tenantID.equal(&tenantID) {
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
}
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := streamsPerTenant * blocksPerStream * 2
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-stream-id-smaller-time-range", func(t *testing.T) {
sf := mustNewStreamFilter(`{job="foobar",instance="host-1:234"}`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
rowsCount := uint32(0)
processBlock := func(workerID uint, br *blockResult) {
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
}
s.search(workersCount, so, nil, processBlock)
expectedRowsCount := blocksPerStream
if rowsCount != uint32(expectedRowsCount) {
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
}
})
t.Run("matching-stream-id-missing-time-range", func(t *testing.T) {
sf := mustNewStreamFilter(`{job="foobar",instance="host-1:234"}`)
tenantID := TenantID{
AccountID: 1,
ProjectID: 11,
}
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
so := &genericSearchOptions{
tenantIDs: []TenantID{tenantID},
filter: f,
resultColumnNames: []string{"_msg"},
}
processBlock := func(workerID uint, br *blockResult) {
panic(fmt.Errorf("unexpected match"))
}
s.search(workersCount, so, nil, processBlock)
})
s.MustClose()
fs.MustRemoveAll(path)
}
func mustNewStreamFilter(s string) *StreamFilter {
sf, err := newStreamFilter(s)
if err != nil {
panic(fmt.Errorf("unexpected error in newStreamFilter(%q): %s", s, err))
}
return sf
}


@ -0,0 +1,102 @@
package logstorage
import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
func TestStorageLifecycle(t *testing.T) {
const path = "TestStorageLifecycle"
for i := 0; i < 3; i++ {
cfg := &StorageConfig{}
s := MustOpenStorage(path, cfg)
s.MustClose()
}
fs.MustRemoveAll(path)
}
func TestStorageMustAddRows(t *testing.T) {
const path = "TestStorageMustAddRows"
var sStats StorageStats
cfg := &StorageConfig{}
s := MustOpenStorage(path, cfg)
// Try adding the same entry multiple times.
totalRowsCount := uint64(0)
for i := 0; i < 100; i++ {
lr := newTestLogRows(1, 1, 0)
lr.timestamps[0] = time.Now().UTC().UnixNano()
totalRowsCount += uint64(len(lr.timestamps))
s.MustAddRows(lr)
sStats.Reset()
s.UpdateStats(&sStats)
if n := sStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
}
}
s.MustClose()
// Re-open the storage and try writing data to it
s = MustOpenStorage(path, cfg)
sStats.Reset()
s.UpdateStats(&sStats)
if n := sStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
}
lr := newTestLogRows(3, 10, 0)
for i := range lr.timestamps {
lr.timestamps[i] = time.Now().UTC().UnixNano()
}
totalRowsCount += uint64(len(lr.timestamps))
s.MustAddRows(lr)
sStats.Reset()
s.UpdateStats(&sStats)
if n := sStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
}
s.MustClose()
// Re-open the storage with big retention and try writing data
// to different days in the past and in the future
cfg = &StorageConfig{
Retention: 365 * 24 * time.Hour,
FutureRetention: 365 * 24 * time.Hour,
}
s = MustOpenStorage(path, cfg)
lr = newTestLogRows(3, 10, 0)
now := time.Now().UTC().UnixNano() - int64(len(lr.timestamps)/2)*nsecPerDay
for i := range lr.timestamps {
lr.timestamps[i] = now
now += nsecPerDay
}
totalRowsCount += uint64(len(lr.timestamps))
s.MustAddRows(lr)
sStats.Reset()
s.UpdateStats(&sStats)
if n := sStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
}
s.MustClose()
// Make sure the stats are valid after re-opening the storage
s = MustOpenStorage(path, cfg)
sStats.Reset()
s.UpdateStats(&sStats)
if n := sStats.RowsCount(); n != totalRowsCount {
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
}
s.MustClose()
fs.MustRemoveAll(path)
}


@ -0,0 +1,90 @@
package logstorage
import (
"strconv"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
)
// StreamFilter is a filter for streams, e.g. `_stream:{...}`
type StreamFilter struct {
orFilters []*andStreamFilter
}
func (sf *StreamFilter) isEmpty() bool {
for _, af := range sf.orFilters {
if len(af.tagFilters) > 0 {
return false
}
}
return true
}
func (sf *StreamFilter) marshalForCacheKey(dst []byte) []byte {
dst = encoding.MarshalVarUint64(dst, uint64(len(sf.orFilters)))
for _, af := range sf.orFilters {
dst = encoding.MarshalVarUint64(dst, uint64(len(af.tagFilters)))
for _, f := range af.tagFilters {
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.tagName))
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.op))
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.value))
}
}
return dst
}
func (sf *StreamFilter) String() string {
a := make([]string, len(sf.orFilters))
for i := range a {
a[i] = sf.orFilters[i].String()
}
return "{" + strings.Join(a, " or ") + "}"
}
type andStreamFilter struct {
tagFilters []*streamTagFilter
}
func (af *andStreamFilter) String() string {
a := make([]string, len(af.tagFilters))
for i := range a {
a[i] = af.tagFilters[i].String()
}
return strings.Join(a, ",")
}
// streamTagFilter is a filter for `tagName op value`
type streamTagFilter struct {
// tagName is the name for the tag to filter
tagName string
// op is the operation, such as `=`, `!=`, `=~` or `!~`
op string
// value is the value to match the tag against
value string
regexpOnce sync.Once
regexp *regexutil.PromRegex
}
func (tf *streamTagFilter) getRegexp() *regexutil.PromRegex {
tf.regexpOnce.Do(tf.initRegexp)
return tf.regexp
}
func (tf *streamTagFilter) initRegexp() {
re, err := regexutil.NewPromRegex(tf.value)
if err != nil {
logger.Panicf("BUG: cannot parse regexp %q: %s", tf.value, err)
}
tf.regexp = re
}
func (tf *streamTagFilter) String() string {
return quoteTokenIfNeeded(tf.tagName) + tf.op + strconv.Quote(tf.value)
}
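// Illustrative sketch, not part of the original file: a StreamFilter is an OR of
// AND-ed tag filters, and String() renders it back in the `{...}` form used in queries.
// exampleStreamFilterString is a hypothetical helper; quoteTokenIfNeeded is defined
// elsewhere in this package and is assumed to leave simple tokens unquoted.
func exampleStreamFilterString() {
	sf := &StreamFilter{
		orFilters: []*andStreamFilter{
			{
				tagFilters: []*streamTagFilter{
					{tagName: "job", op: "=", value: "foobar"},
					{tagName: "instance", op: "=~", value: "host-.+:234"},
				},
			},
		},
	}
	_ = sf.String() // {job="foobar",instance=~"host-.+:234"}
}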


@ -0,0 +1,69 @@
package logstorage
import (
"fmt"
)
// streamID is an internal id of a log stream.
//
// Blocks are ordered by streamID inside parts.
type streamID struct {
// tenantID is a tenant id for the given stream.
// It is located at the beginning of streamID in order
// to physically group blocks for the same tenants on the storage.
tenantID TenantID
// id is an internal id, which uniquely identifies the stream within the tenant by its labels.
// It is calculated as a hash of canonically sorted stream labels.
//
// Streams with identical sets of labels, which belong to distinct tenants, have the same id.
id u128
}
// reset resets sid for subsequent re-use
func (sid *streamID) reset() {
*sid = streamID{}
}
// String returns human-readable representation for sid.
func (sid *streamID) String() string {
return fmt.Sprintf("(tenant_id=%s, id=%s)", &sid.tenantID, &sid.id)
}
// less returns true if sid is less than a.
func (sid *streamID) less(a *streamID) bool {
if !sid.tenantID.equal(&a.tenantID) {
return sid.tenantID.less(&a.tenantID)
}
return sid.id.less(&a.id)
}
// equal returns true if sid equals a.
func (sid *streamID) equal(a *streamID) bool {
if !sid.tenantID.equal(&a.tenantID) {
return false
}
return sid.id.equal(&a.id)
}
// marshal appends the marshaled sid to dst and returns the result
func (sid *streamID) marshal(dst []byte) []byte {
dst = sid.tenantID.marshal(dst)
dst = sid.id.marshal(dst)
return dst
}
// unmarshal unmarshals sid from src and returns the tail from src.
func (sid *streamID) unmarshal(src []byte) ([]byte, error) {
srcOrig := src
tail, err := sid.tenantID.unmarshal(src)
if err != nil {
return srcOrig, err
}
src = tail
tail, err = sid.id.unmarshal(src)
if err != nil {
return srcOrig, err
}
return tail, nil
}
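// Illustrative sketch, not part of the original file: streamIDs compare by tenantID first
// and then by the 128-bit id, and marshal into a fixed 24-byte encoding (8 bytes of TenantID
// followed by 16 bytes of u128). exampleStreamIDOrdering is a hypothetical helper.
func exampleStreamIDOrdering() {
	a := streamID{
		tenantID: TenantID{AccountID: 1, ProjectID: 2},
		id:       u128{hi: 10, lo: 20},
	}
	b := streamID{
		tenantID: TenantID{AccountID: 2},
	}
	if !a.less(&b) {
		panic("expected a < b, since tenantID is compared before id")
	}
	data := a.marshal(nil) // 24 bytes
	var c streamID
	if _, err := c.unmarshal(data); err != nil || !c.equal(&a) {
		panic("unexpected streamID marshal/unmarshal roundtrip result")
	}
}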


@ -0,0 +1,172 @@
package logstorage
import (
"reflect"
"testing"
)
func TestStreamIDMarshalUnmarshal(t *testing.T) {
f := func(sid *streamID, marshaledLen int) {
t.Helper()
data := sid.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected length of marshaled streamID; got %d; want %d", len(data), marshaledLen)
}
var sid2 streamID
tail, err := sid2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error on unmarshal(%s): %s", sid, err)
}
if len(tail) != 0 {
t.Fatalf("unexpected non-empty tail on unmarshal(%s): %X", sid, tail)
}
if !reflect.DeepEqual(sid, &sid2) {
t.Fatalf("unexpected result on unmarshal; got %s; want %s", &sid2, sid)
}
s1 := sid.String()
s2 := sid2.String()
if s1 != s2 {
t.Fatalf("unexpected string result on unmarshal; got %s; want %s", s2, s1)
}
}
f(&streamID{}, 24)
f(&streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
lo: 89,
hi: 344334,
},
}, 24)
}
func TestStreamIDUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var sid streamID
tail, err := sid.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
f([]byte("1234567890"))
}
func TestStreamIDLessEqual(t *testing.T) {
// compare equal values
sid1 := &streamID{}
sid2 := &streamID{}
if sid1.less(sid2) {
t.Fatalf("less for equal values must return false")
}
if sid2.less(sid1) {
t.Fatalf("less for equal values must return false")
}
if !sid1.equal(sid2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid1, sid2)
}
if !sid2.equal(sid1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid2, sid1)
}
sid1 = &streamID{
tenantID: TenantID{
AccountID: 1,
ProjectID: 2,
},
id: u128{
hi: 123,
lo: 456,
},
}
sid2 = &streamID{
tenantID: TenantID{
AccountID: 1,
ProjectID: 2,
},
id: u128{
hi: 123,
lo: 456,
},
}
if sid1.less(sid2) {
t.Fatalf("less for equal values must return false")
}
if sid2.less(sid1) {
t.Fatalf("less for equal values must return false")
}
if !sid1.equal(sid2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid1, sid2)
}
if !sid2.equal(sid1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid2, sid1)
}
// compare unequal values
sid1 = &streamID{
id: u128{
lo: 456,
},
}
sid2 = &streamID{
id: u128{
hi: 123,
},
}
if !sid1.less(sid2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", sid1, sid2)
}
if sid2.less(sid1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", sid2, sid1)
}
if sid1.equal(sid2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", sid1, sid2)
}
sid1 = &streamID{
id: u128{
hi: 123,
lo: 456,
},
}
sid2 = &streamID{
tenantID: TenantID{
AccountID: 123,
},
}
if !sid1.less(sid2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", sid1, sid2)
}
if sid2.less(sid1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", sid2, sid1)
}
if sid1.equal(sid2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", sid1, sid2)
}
}
func TestStreamIDReset(t *testing.T) {
sid := &streamID{
tenantID: TenantID{
AccountID: 123,
ProjectID: 456,
},
id: u128{
hi: 234,
lo: 9843,
},
}
sid.reset()
sidZero := &streamID{}
if !reflect.DeepEqual(sid, sidZero) {
t.Fatalf("non-zero streamID after reset(): %s", sid)
}
}


@ -0,0 +1,298 @@
package logstorage
import (
"bytes"
"fmt"
"sort"
"strconv"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// GetStreamTags returns a StreamTags from pool.
func GetStreamTags() *StreamTags {
v := streamTagsPool.Get()
if v == nil {
return &StreamTags{}
}
return v.(*StreamTags)
}
// PutStreamTags returns st to the pool.
func PutStreamTags(st *StreamTags) {
st.Reset()
streamTagsPool.Put(st)
}
var streamTagsPool sync.Pool
// StreamTags contains stream tags.
type StreamTags struct {
// buf holds all the data backed by tags
buf []byte
// tags contains added tags.
tags []streamTag
}
// Reset resets st for re-use
func (st *StreamTags) Reset() {
st.buf = st.buf[:0]
tags := st.tags
for i := range tags {
t := &tags[i]
t.Name = nil
t.Value = nil
}
st.tags = tags[:0]
}
// String returns string representation of st.
func (st *StreamTags) String() string {
b := st.marshalString(nil)
return string(b)
}
func (st *StreamTags) marshalString(dst []byte) []byte {
dst = append(dst, '{')
tags := st.tags
if len(tags) > 0 {
dst = tags[0].marshalString(dst)
tags = tags[1:]
for i := range tags {
dst = append(dst, ',')
dst = tags[i].marshalString(dst)
}
}
dst = append(dst, '}')
return dst
}
// Add adds (name:value) tag to st.
func (st *StreamTags) Add(name, value string) {
if len(name) == 0 || len(value) == 0 {
return
}
buf := st.buf
bufLen := len(buf)
buf = append(buf, name...)
bName := buf[bufLen:]
bufLen = len(buf)
buf = append(buf, value...)
bValue := buf[bufLen:]
st.buf = buf
st.tags = append(st.tags, streamTag{
Name: bName,
Value: bValue,
})
}
// MarshalCanonical marshals st in a canonical way
func (st *StreamTags) MarshalCanonical(dst []byte) []byte {
sort.Sort(st)
tags := st.tags
dst = encoding.MarshalVarUint64(dst, uint64(len(tags)))
for i := range tags {
tag := &tags[i]
dst = encoding.MarshalBytes(dst, tag.Name)
dst = encoding.MarshalBytes(dst, tag.Value)
}
return dst
}
// UnmarshalCanonical unmarshals st from src marshaled with MarshalCanonical.
func (st *StreamTags) UnmarshalCanonical(src []byte) ([]byte, error) {
st.Reset()
srcOrig := src
tail, n, err := encoding.UnmarshalVarUint64(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal tags len: %w", err)
}
src = tail
for i := uint64(0); i < n; i++ {
tail, name, err := encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal tag name: %w", err)
}
src = tail
tail, value, err := encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal tag value: %w", err)
}
src = tail
sName := bytesutil.ToUnsafeString(name)
sValue := bytesutil.ToUnsafeString(value)
st.Add(sName, sValue)
}
return src, nil
}
func getStreamTagsString(streamTagsCanonical []byte) string {
st := GetStreamTags()
mustUnmarshalStreamTags(st, streamTagsCanonical)
s := st.String()
PutStreamTags(st)
return s
}
func mustUnmarshalStreamTags(dst *StreamTags, src []byte) {
tail, err := dst.UnmarshalCanonical(src)
if err != nil {
logger.Panicf("FATAL: cannot unmarshal StreamTags from value obtained from cache: %s", err)
}
if len(tail) > 0 {
logger.Panicf("FATAL: unexpected tail left after unmarshaling StreamTags; len(tail)=%d; tail=%q", len(tail), tail)
}
}
// Len returns the number of tags in st.
func (st *StreamTags) Len() int {
return len(st.tags)
}
// Less returns true if tag i is smaller than the tag j.
func (st *StreamTags) Less(i, j int) bool {
tags := st.tags
return tags[i].less(&tags[j])
}
// Swap swaps i and j tags
func (st *StreamTags) Swap(i, j int) {
tags := st.tags
tags[i], tags[j] = tags[j], tags[i]
}
// streamTag represents a (name:value) tag for stream.
type streamTag struct {
Name []byte
Value []byte
}
func (tag *streamTag) marshalString(dst []byte) []byte {
dst = append(dst, tag.Name...)
dst = append(dst, '=')
dst = strconv.AppendQuote(dst, bytesutil.ToUnsafeString(tag.Value))
return dst
}
// reset resets the tag.
func (tag *streamTag) reset() {
tag.Name = tag.Name[:0]
tag.Value = tag.Value[:0]
}
func (tag *streamTag) equal(t *streamTag) bool {
return string(tag.Name) == string(t.Name) && string(tag.Value) == string(t.Value)
}
func (tag *streamTag) less(t *streamTag) bool {
if string(tag.Name) != string(t.Name) {
return string(tag.Name) < string(t.Name)
}
return string(tag.Value) < string(t.Value)
}
func (tag *streamTag) indexdbMarshal(dst []byte) []byte {
dst = marshalTagValue(dst, tag.Name)
dst = marshalTagValue(dst, tag.Value)
return dst
}
func (tag *streamTag) indexdbUnmarshal(src []byte) ([]byte, error) {
var err error
src, tag.Name, err = unmarshalTagValue(tag.Name[:0], src)
if err != nil {
return src, fmt.Errorf("cannot unmarshal key: %w", err)
}
src, tag.Value, err = unmarshalTagValue(tag.Value[:0], src)
if err != nil {
return src, fmt.Errorf("cannot unmarshal value: %w", err)
}
return src, nil
}
const (
escapeChar = 0
tagSeparatorChar = 1
kvSeparatorChar = 2
)
func marshalTagValue(dst, src []byte) []byte {
n1 := bytes.IndexByte(src, escapeChar)
n2 := bytes.IndexByte(src, tagSeparatorChar)
n3 := bytes.IndexByte(src, kvSeparatorChar)
if n1 < 0 && n2 < 0 && n3 < 0 {
// Fast path.
dst = append(dst, src...)
dst = append(dst, tagSeparatorChar)
return dst
}
// Slow path.
for _, ch := range src {
switch ch {
case escapeChar:
dst = append(dst, escapeChar, '0')
case tagSeparatorChar:
dst = append(dst, escapeChar, '1')
case kvSeparatorChar:
dst = append(dst, escapeChar, '2')
default:
dst = append(dst, ch)
}
}
dst = append(dst, tagSeparatorChar)
return dst
}
func unmarshalTagValue(dst, src []byte) ([]byte, []byte, error) {
n := bytes.IndexByte(src, tagSeparatorChar)
if n < 0 {
return src, dst, fmt.Errorf("cannot find the end of tag value")
}
b := src[:n]
src = src[n+1:]
for {
n := bytes.IndexByte(b, escapeChar)
if n < 0 {
dst = append(dst, b...)
return src, dst, nil
}
dst = append(dst, b[:n]...)
b = b[n+1:]
if len(b) == 0 {
return src, dst, fmt.Errorf("missing escaped char")
}
switch b[0] {
case '0':
dst = append(dst, escapeChar)
case '1':
dst = append(dst, tagSeparatorChar)
case '2':
dst = append(dst, kvSeparatorChar)
default:
return src, dst, fmt.Errorf("unsupported escaped char: %c", b[0])
}
b = b[1:]
}
}
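// Illustrative sketch, not part of the original file: tags are accumulated with Add(),
// sorted and serialized with MarshalCanonical() and restored with UnmarshalCanonical(),
// while marshalTagValue()/unmarshalTagValue() escape the reserved bytes 0, 1 and 2 so raw
// tag values can be embedded into indexdb entries. exampleStreamTagsRoundtrip is a
// hypothetical helper.
func exampleStreamTagsRoundtrip() {
	st := GetStreamTags()
	st.Add("job", "foobar")
	st.Add("instance", "host-1:234")
	data := st.MarshalCanonical(nil) // tags are sorted by (name, value) before marshaling
	PutStreamTags(st)

	st2 := GetStreamTags()
	defer PutStreamTags(st2)
	if _, err := st2.UnmarshalCanonical(data); err != nil {
		panic(err)
	}
	_ = st2.String() // {instance="host-1:234",job="foobar"}
}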


@ -0,0 +1,91 @@
package logstorage
import (
"fmt"
"net/http"
"strconv"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
// TenantID is an id of a tenant for log streams.
//
// Each log stream is associated with a single TenantID.
type TenantID struct {
// AccountID is the id of the account for the log stream.
AccountID uint32
// ProjectID is the id of the project for the log stream.
ProjectID uint32
}
// Reset resets tid.
func (tid *TenantID) Reset() {
tid.AccountID = 0
tid.ProjectID = 0
}
// String returns human-readable representation of tid
func (tid *TenantID) String() string {
return fmt.Sprintf("{accountID=%d,projectID=%d}", tid.AccountID, tid.ProjectID)
}
// equal returns true if tid equals a.
func (tid *TenantID) equal(a *TenantID) bool {
return tid.AccountID == a.AccountID && tid.ProjectID == a.ProjectID
}
// less returns true if tid is less than a.
func (tid *TenantID) less(a *TenantID) bool {
if tid.AccountID != a.AccountID {
return tid.AccountID < a.AccountID
}
return tid.ProjectID < a.ProjectID
}
// marshal appends the marshaled tid to dst and returns the result
func (tid *TenantID) marshal(dst []byte) []byte {
dst = encoding.MarshalUint32(dst, tid.AccountID)
dst = encoding.MarshalUint32(dst, tid.ProjectID)
return dst
}
// unmarshal unmarshals tid from src and returns the remaining tail.
func (tid *TenantID) unmarshal(src []byte) ([]byte, error) {
if len(src) < 8 {
return src, fmt.Errorf("cannot unmarshal tenantID from %d bytes; need at least 8 bytes", len(src))
}
tid.AccountID = encoding.UnmarshalUint32(src[:4])
tid.ProjectID = encoding.UnmarshalUint32(src[4:])
return src[8:], nil
}
// GetTenantIDFromRequest returns tenantID from r.
func GetTenantIDFromRequest(r *http.Request) (TenantID, error) {
var tenantID TenantID
accountID, err := getUint32FromHeader(r, "AccountID")
if err != nil {
return tenantID, err
}
projectID, err := getUint32FromHeader(r, "ProjectID")
if err != nil {
return tenantID, err
}
tenantID.AccountID = accountID
tenantID.ProjectID = projectID
return tenantID, nil
}
func getUint32FromHeader(r *http.Request, headerName string) (uint32, error) {
s := r.Header.Get(headerName)
if len(s) == 0 {
return 0, nil
}
n, err := strconv.ParseUint(s, 10, 32)
if err != nil {
return 0, fmt.Errorf("cannot parse %s header %q: %w", headerName, s, err)
}
return uint32(n), nil
}
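// Illustrative sketch, not part of the original file: the tenant is taken from the optional
// AccountID and ProjectID request headers; missing headers default to zero values.
// exampleGetTenantIDFromRequest is a hypothetical helper and the URL is arbitrary.
func exampleGetTenantIDFromRequest() {
	req, err := http.NewRequest("GET", "http://localhost/select/logsql/query", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("AccountID", "12")
	req.Header.Set("ProjectID", "34")
	tenantID, err := GetTenantIDFromRequest(req)
	if err != nil {
		panic(err)
	}
	_ = tenantID // {accountID=12,projectID=34}
}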


@ -0,0 +1,124 @@
package logstorage
import (
"reflect"
"testing"
)
func TestTenantIDMarshalUnmarshal(t *testing.T) {
f := func(tid *TenantID) {
t.Helper()
data := tid.marshal(nil)
var tid2 TenantID
tail, err := tid2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error at unmarshal(%s): %s", tid, err)
}
if len(tail) != 0 {
t.Fatalf("unexpected non-empty tail after unmarshal(%s): %X", tid, tail)
}
if !reflect.DeepEqual(tid, &tid2) {
t.Fatalf("unexpected value after unmarshal; got %s; want %s", &tid2, tid)
}
s1 := tid.String()
s2 := tid2.String()
if s1 != s2 {
t.Fatalf("unexpected string value after unmarshal; got %s; want %s", s2, s1)
}
}
f(&TenantID{})
f(&TenantID{
AccountID: 123,
ProjectID: 456,
})
}
func TestTenantIDUnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var tid TenantID
tail, err := tid.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("abc"))
}
func TestTenantIDLessEqual(t *testing.T) {
// compare equal values
tid1 := &TenantID{}
tid2 := &TenantID{}
if tid1.less(tid2) {
t.Fatalf("less for equal values must return false")
}
if tid2.less(tid1) {
t.Fatalf("less for equal values must return false")
}
if !tid1.equal(tid2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid1, tid2)
}
if !tid2.equal(tid1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid2, tid1)
}
tid1 = &TenantID{
AccountID: 123,
ProjectID: 456,
}
tid2 = &TenantID{
AccountID: 123,
ProjectID: 456,
}
if tid1.less(tid2) {
t.Fatalf("less for equal values must return false")
}
if tid2.less(tid1) {
t.Fatalf("less for equal values must return false")
}
if !tid1.equal(tid2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid1, tid2)
}
if !tid2.equal(tid1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid2, tid1)
}
// compare unequal values
tid1 = &TenantID{
ProjectID: 456,
}
tid2 = &TenantID{
AccountID: 123,
}
if !tid1.less(tid2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", tid1, tid2)
}
if tid2.less(tid1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", tid2, tid1)
}
if tid1.equal(tid2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", tid1, tid2)
}
tid1 = &TenantID{
AccountID: 123,
}
tid2 = &TenantID{
AccountID: 123,
ProjectID: 456,
}
if !tid1.less(tid2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", tid1, tid2)
}
if tid2.less(tid1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", tid2, tid1)
}
if tid1.equal(tid2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", tid1, tid2)
}
}

153
lib/logstorage/tokenizer.go Normal file

@ -0,0 +1,153 @@
package logstorage
import (
"sort"
"sync"
"unicode"
)
// tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
func tokenizeStrings(dst, a []string) []string {
t := getTokenizer()
m := t.m
for i, s := range a {
if i > 0 && s == a[i-1] {
// This string has already been tokenized
continue
}
tokenizeString(m, s)
}
dstLen := len(dst)
for k := range t.m {
dst = append(dst, k)
}
putTokenizer(t)
// Sort tokens with zero memory allocations
ss := getStringsSorter(dst[dstLen:])
sort.Sort(ss)
putStringsSorter(ss)
return dst
}
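// Illustrative sketch, not part of the original file: tokenizeStrings() splits the input
// into unicode letter/digit/underscore tokens, de-duplicates them via a map and returns
// them sorted. exampleTokenizeStrings is a hypothetical helper.
func exampleTokenizeStrings() {
	tokens := tokenizeStrings(nil, []string{
		"GET /foo/bar?x=1 HTTP/1.1",
		"GET /foo/baz HTTP/1.1",
	})
	_ = tokens // ["1", "GET", "HTTP", "bar", "baz", "foo", "x"]
}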
type tokenizer struct {
m map[string]struct{}
}
func (t *tokenizer) reset() {
m := t.m
for k := range m {
delete(m, k)
}
}
func tokenizeString(dst map[string]struct{}, s string) {
for len(s) > 0 {
// Search for the next token.
nextIdx := len(s)
for i, c := range s {
if isTokenRune(c) {
nextIdx = i
break
}
}
s = s[nextIdx:]
// Search for the end of the token
nextIdx = len(s)
for i, c := range s {
if !isTokenRune(c) {
nextIdx = i
break
}
}
token := s[:nextIdx]
if len(token) > 0 {
dst[token] = struct{}{}
}
s = s[nextIdx:]
}
}
func isTokenRune(c rune) bool {
return unicode.IsLetter(c) || unicode.IsDigit(c) || c == '_'
}
func getTokenizer() *tokenizer {
v := tokenizerPool.Get()
if v == nil {
return &tokenizer{
m: make(map[string]struct{}),
}
}
return v.(*tokenizer)
}
func putTokenizer(t *tokenizer) {
t.reset()
tokenizerPool.Put(t)
}
var tokenizerPool sync.Pool
type stringsSorter struct {
a []string
}
func (ss *stringsSorter) Len() int {
return len(ss.a)
}
func (ss *stringsSorter) Swap(i, j int) {
a := ss.a
a[i], a[j] = a[j], a[i]
}
func (ss *stringsSorter) Less(i, j int) bool {
a := ss.a
return a[i] < a[j]
}
func getStringsSorter(a []string) *stringsSorter {
v := stringsSorterPool.Get()
if v == nil {
return &stringsSorter{
a: a,
}
}
ss := v.(*stringsSorter)
ss.a = a
return ss
}
func putStringsSorter(ss *stringsSorter) {
ss.a = nil
stringsSorterPool.Put(ss)
}
var stringsSorterPool sync.Pool
type tokensBuf struct {
A []string
}
func (tb *tokensBuf) reset() {
a := tb.A
for i := range a {
a[i] = ""
}
tb.A = a[:0]
}
func getTokensBuf() *tokensBuf {
v := tokensBufPool.Get()
if v == nil {
return &tokensBuf{}
}
return v.(*tokensBuf)
}
func putTokensBuf(tb *tokensBuf) {
tb.reset()
tokensBufPool.Put(tb)
}
var tokensBufPool sync.Pool


@ -0,0 +1,29 @@
package logstorage
import (
"reflect"
"strings"
"testing"
)
func TestTokenizeStrings(t *testing.T) {
f := func(a, tokensExpected []string) {
t.Helper()
tokens := tokenizeStrings(nil, a)
if !reflect.DeepEqual(tokens, tokensExpected) {
t.Fatalf("unexpected tokens;\ngot\n%q\nwant\n%q", tokens, tokensExpected)
}
}
f(nil, nil)
f([]string{""}, nil)
f([]string{"foo"}, []string{"foo"})
f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"TaSte", "bar", "baz", "foo"})
f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"1234", "34", "AS", "f12", "теСТ"})
f(strings.Split(`
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
`, "\n"), []string{"01", "1", "12181", "13", "22034", "28", "2812", "36020", "38", "43", "45", "48", "497806", "Apr", "CMD", "CPU0", "CRON",
"Core", "above", "clock", "command", "cpu", "debian", "dev", "events", "kernel", "localhost", "null", "online", "root",
"sa1", "temperature", "threshold", "throttled", "total", "v", "whoopsie"})
}


@ -0,0 +1,19 @@
package logstorage
import (
"strings"
"testing"
)
func BenchmarkTokenizeStrings(b *testing.B) {
a := strings.Split(benchLogs, "\n")
b.ReportAllocs()
b.SetBytes(int64(len(benchLogs)))
b.RunParallel(func(pb *testing.PB) {
var tokens []string
for pb.Next() {
tokens = tokenizeStrings(tokens[:0], a)
}
})
}

50
lib/logstorage/u128.go Normal file

@ -0,0 +1,50 @@
package logstorage
import (
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)
// u128 is 128-bit uint number.
//
// It is used as a unique id of a stream.
type u128 struct {
hi uint64
lo uint64
}
// String returns human-readable representation of u.
func (u *u128) String() string {
return fmt.Sprintf("{hi=%d,lo=%d}", u.hi, u.lo)
}
// less returns true if u is less than a.
func (u *u128) less(a *u128) bool {
if u.hi != a.hi {
return u.hi < a.hi
}
return u.lo < a.lo
}
// equal returns true if u equals a.
func (u *u128) equal(a *u128) bool {
return u.hi == a.hi && u.lo == a.lo
}
// marshal appends the marshaled u to dst and returns the result.
func (u *u128) marshal(dst []byte) []byte {
dst = encoding.MarshalUint64(dst, u.hi)
dst = encoding.MarshalUint64(dst, u.lo)
return dst
}
// unmarshal unmarshals u from src and returns the tail.
func (u *u128) unmarshal(src []byte) ([]byte, error) {
if len(src) < 16 {
return src, fmt.Errorf("cannot unmarshal u128 from %d bytes; need at least 16 bytes", len(src))
}
u.hi = encoding.UnmarshalUint64(src[:8])
u.lo = encoding.UnmarshalUint64(src[8:])
return src[16:], nil
}
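// Illustrative sketch, not part of the original file: u128 values compare by the high
// 64 bits first, and marshal() writes hi before lo as two fixed-size uint64 values.
// exampleU128Ordering is a hypothetical helper.
func exampleU128Ordering() {
	a := u128{hi: 1}
	b := u128{lo: 123456789}
	if !b.less(&a) {
		panic("expected b < a: hi is compared before lo")
	}
}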

127
lib/logstorage/u128_test.go Normal file

@ -0,0 +1,127 @@
package logstorage
import (
"reflect"
"testing"
)
func TestU128MarshalUnmarshal(t *testing.T) {
f := func(u *u128, marshaledLen int) {
t.Helper()
data := u.marshal(nil)
if len(data) != marshaledLen {
t.Fatalf("unexpected length of marshaled u128; got %d; want %d", len(data), marshaledLen)
}
var u2 u128
tail, err := u2.unmarshal(data)
if err != nil {
t.Fatalf("unexpected error at unmarshal(%s): %s", u, err)
}
if len(tail) != 0 {
t.Fatalf("unexpected non-empty tail after unmarshal(%s): %X", u, tail)
}
if !reflect.DeepEqual(u, &u2) {
t.Fatalf("unexpected value obtained from unmarshal(%s); got %s; want %s", u, &u2, u)
}
s1 := u.String()
s2 := u2.String()
if s1 != s2 {
t.Fatalf("unexpected string representation after unmarshal; got %s; want %s", s2, s1)
}
}
f(&u128{}, 16)
f(&u128{
hi: 123,
lo: 456,
}, 16)
}
func TestU128UnmarshalFailure(t *testing.T) {
f := func(data []byte) {
t.Helper()
dataOrig := append([]byte{}, data...)
var u u128
tail, err := u.unmarshal(data)
if err == nil {
t.Fatalf("expecting non-nil error")
}
if string(tail) != string(dataOrig) {
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
}
}
f(nil)
f([]byte("foo"))
}
func TestU128LessEqual(t *testing.T) {
// compare equal values
u1 := &u128{}
u2 := &u128{}
if u1.less(u2) {
t.Fatalf("less for equal values must return false")
}
if u2.less(u1) {
t.Fatalf("less for equal values must return false")
}
if !u1.equal(u2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u1, u2)
}
if !u2.equal(u1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u2, u1)
}
u1 = &u128{
hi: 123,
lo: 456,
}
u2 = &u128{
hi: 123,
lo: 456,
}
if u1.less(u2) {
t.Fatalf("less for equal values must return false")
}
if u2.less(u1) {
t.Fatalf("less for equal values must return false")
}
if !u1.equal(u2) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u1, u2)
}
if !u2.equal(u1) {
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u2, u1)
}
// compare unequal values
u1 = &u128{
lo: 456,
}
u2 = &u128{
hi: 123,
}
if !u1.less(u2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", u1, u2)
}
if u2.less(u1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", u2, u1)
}
if u1.equal(u2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", u1, u2)
}
u1 = &u128{
hi: 123,
}
u2 = &u128{
hi: 123,
lo: 456,
}
if !u1.less(u2) {
t.Fatalf("unexpected result for less(%s, %s); got false; want true", u1, u2)
}
if u2.less(u1) {
t.Fatalf("unexpected result for less(%s, %s); got true; want false", u2, u1)
}
if u1.equal(u2) {
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", u1, u2)
}
}


@ -0,0 +1,742 @@
package logstorage
import (
"fmt"
"math"
"math/bits"
"strconv"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// valueType is the type of values stored in every column block.
type valueType byte
const (
// valueTypeUnknown is used for determining whether the value type is unknown.
valueTypeUnknown = valueType(0)
// default encoding for column blocks. Strings are stored as is.
valueTypeString = valueType(1)
// column blocks with a small number of unique values are encoded as dict.
valueTypeDict = valueType(2)
// uint values up to 2^8-1 are encoded into valueTypeUint8.
// Every value occupies a single byte.
valueTypeUint8 = valueType(3)
// uint values up to 2^16-1 are encoded into valueTypeUint16.
// Every value occupies 2 bytes.
valueTypeUint16 = valueType(4)
// uint values up to 2^32-1 are encoded into valueTypeUint32.
// Every value occupies 4 bytes.
valueTypeUint32 = valueType(5)
// uint values up to 2^64-1 are encoded into valueTypeUint64.
// Every value occupies 8 bytes.
valueTypeUint64 = valueType(6)
// floating-point values are encoded into valueTypeFloat64.
valueTypeFloat64 = valueType(7)
// column blocks with ipv4 addresses are encoded as 4-byte strings.
valueTypeIPv4 = valueType(8)
// column blocks with ISO8601 timestamps are encoded into valueTypeTimestampISO8601.
// These timestamps are commonly used by Logstash.
valueTypeTimestampISO8601 = valueType(9)
)
type valuesEncoder struct {
// buf contains data for values.
buf []byte
// values contains encoded values.
values []string
}
func (ve *valuesEncoder) reset() {
ve.buf = ve.buf[:0]
vs := ve.values
for i := range vs {
vs[i] = ""
}
ve.values = vs[:0]
}
// encode encodes values to ve.values and returns the encoded value type with min/max encoded values.
func (ve *valuesEncoder) encode(values []string, dict *valuesDict) (valueType, uint64, uint64) {
ve.reset()
if len(values) == 0 {
return valueTypeString, 0, 0
}
var vt valueType
var minValue, maxValue uint64
// Try dict encoding at first, since it gives the highest speedup during querying.
// It also usually gives the best compression, since every value is encoded as a single byte.
ve.buf, ve.values, vt = tryDictEncoding(ve.buf[:0], ve.values[:0], values, dict)
if vt != valueTypeUnknown {
return vt, 0, 0
}
ve.buf, ve.values, vt, minValue, maxValue = tryUintEncoding(ve.buf[:0], ve.values[:0], values)
if vt != valueTypeUnknown {
return vt, minValue, maxValue
}
ve.buf, ve.values, vt, minValue, maxValue = tryFloat64Encoding(ve.buf[:0], ve.values[:0], values)
if vt != valueTypeUnknown {
return vt, minValue, maxValue
}
ve.buf, ve.values, vt, minValue, maxValue = tryIPv4Encoding(ve.buf[:0], ve.values[:0], values)
if vt != valueTypeUnknown {
return vt, minValue, maxValue
}
ve.buf, ve.values, vt, minValue, maxValue = tryTimestampISO8601Encoding(ve.buf[:0], ve.values[:0], values)
if vt != valueTypeUnknown {
return vt, minValue, maxValue
}
// Fall back to default encoding, e.g. leave values as is.
ve.values = append(ve.values[:0], values...)
return valueTypeString, 0, 0
}
func getValuesEncoder() *valuesEncoder {
v := valuesEncoderPool.Get()
if v == nil {
return &valuesEncoder{}
}
return v.(*valuesEncoder)
}
func putValuesEncoder(ve *valuesEncoder) {
ve.reset()
valuesEncoderPool.Put(ve)
}
var valuesEncoderPool sync.Pool
type valuesDecoder struct {
buf []byte
}
func (vd *valuesDecoder) reset() {
vd.buf = vd.buf[:0]
}
// decodeInplace decodes values encoded with the given vt and the given dict inplace.
//
// the decoded values remain valid until vd.reset() is called.
func (vd *valuesDecoder) decodeInplace(values []string, vt valueType, dict *valuesDict) error {
// do not reset vd.buf, since it may contain previously decoded data,
// which must be preserved until reset() call.
dstBuf := vd.buf
switch vt {
case valueTypeString:
// nothing to do - values are already decoded.
case valueTypeUint8:
for i, v := range values {
if len(v) != 1 {
return fmt.Errorf("unexpected value length for uint8; got %d; want 1", len(v))
}
n := uint64(v[0])
dstLen := len(dstBuf)
dstBuf = strconv.AppendUint(dstBuf, n, 10)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeUint16:
for i, v := range values {
if len(v) != 2 {
return fmt.Errorf("unexpected value length for uint16; got %d; want 2", len(v))
}
b := bytesutil.ToUnsafeBytes(v)
n := uint64(encoding.UnmarshalUint16(b))
dstLen := len(dstBuf)
dstBuf = strconv.AppendUint(dstBuf, n, 10)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeUint32:
for i, v := range values {
if len(v) != 4 {
return fmt.Errorf("unexpected value length for uint32; got %d; want 4", len(v))
}
b := bytesutil.ToUnsafeBytes(v)
n := uint64(encoding.UnmarshalUint32(b))
dstLen := len(dstBuf)
dstBuf = strconv.AppendUint(dstBuf, n, 10)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeUint64:
for i, v := range values {
if len(v) != 8 {
return fmt.Errorf("unexpected value length for uint64; got %d; want 8", len(v))
}
b := bytesutil.ToUnsafeBytes(v)
n := encoding.UnmarshalUint64(b)
dstLen := len(dstBuf)
dstBuf = strconv.AppendUint(dstBuf, n, 10)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeDict:
dictValues := dict.values
for i, v := range values {
id := int(v[0])
if id >= len(dictValues) {
return fmt.Errorf("unexpected dictionary id: %d; it must be smaller than %d", id, len(dictValues))
}
values[i] = dictValues[id]
}
case valueTypeIPv4:
for i, v := range values {
if len(v) != 4 {
return fmt.Errorf("unexpected value length for ipv4; got %d; want 4", len(v))
}
dstLen := len(dstBuf)
dstBuf = toIPv4String(dstBuf, v)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeTimestampISO8601:
for i, v := range values {
if len(v) != 8 {
return fmt.Errorf("unexpected value length for iso8601 timestamp; got %d; want 8", len(v))
}
dstLen := len(dstBuf)
dstBuf = toTimestampISO8601String(dstBuf, v)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
case valueTypeFloat64:
for i, v := range values {
if len(v) != 8 {
return fmt.Errorf("unexpected value length for float64; got %d; want 8", len(v))
}
dstLen := len(dstBuf)
dstBuf = toFloat64String(dstBuf, v)
values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
}
default:
return fmt.Errorf("unknown valueType=%d", vt)
}
vd.buf = dstBuf
return nil
}
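// Illustrative sketch, not part of the original file: decodeInplace() converts the compact
// per-block representation back into human-readable strings; for valueTypeUint8 every value
// is a single byte holding the number itself, so no dict is needed.
// exampleDecodeUint8Values is a hypothetical helper.
func exampleDecodeUint8Values() {
	vd := getValuesDecoder()
	defer putValuesDecoder(vd)
	values := []string{"\x05", "\xff"}
	if err := vd.decodeInplace(values, valueTypeUint8, nil); err != nil {
		panic(err)
	}
	_ = values // ["5", "255"]
}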
func toTimestampISO8601String(dst []byte, v string) []byte {
b := bytesutil.ToUnsafeBytes(v)
n := encoding.UnmarshalUint64(b)
t := time.Unix(0, int64(n)).UTC()
dst = t.AppendFormat(dst, iso8601Timestamp)
return dst
}
func toIPv4String(dst []byte, v string) []byte {
dst = strconv.AppendUint(dst, uint64(v[0]), 10)
dst = append(dst, '.')
dst = strconv.AppendUint(dst, uint64(v[1]), 10)
dst = append(dst, '.')
dst = strconv.AppendUint(dst, uint64(v[2]), 10)
dst = append(dst, '.')
dst = strconv.AppendUint(dst, uint64(v[3]), 10)
return dst
}
func toFloat64String(dst []byte, v string) []byte {
b := bytesutil.ToUnsafeBytes(v)
n := encoding.UnmarshalUint64(b)
f := math.Float64frombits(n)
dst = strconv.AppendFloat(dst, f, 'g', -1, 64)
return dst
}
func getValuesDecoder() *valuesDecoder {
v := valuesDecoderPool.Get()
if v == nil {
return &valuesDecoder{}
}
return v.(*valuesDecoder)
}
func putValuesDecoder(vd *valuesDecoder) {
vd.reset()
valuesDecoderPool.Put(vd)
}
var valuesDecoderPool sync.Pool
func tryTimestampISO8601Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
u64s := encoding.GetUint64s(len(srcValues))
defer encoding.PutUint64s(u64s)
a := u64s.A
var minValue, maxValue uint64
for i, v := range srcValues {
n, ok := tryParseTimestampISO8601(v)
if !ok {
return dstBuf, dstValues, valueTypeUnknown, 0, 0
}
a[i] = n
if i == 0 || n < minValue {
minValue = n
}
if i == 0 || n > maxValue {
maxValue = n
}
}
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint64(dstBuf, n)
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeTimestampISO8601, minValue, maxValue
}
func tryParseTimestampISO8601(s string) (uint64, bool) {
// Do not parse timestamps with a timezone, since they cannot be converted back
// to the same string representation in the general case.
// This may break search.
if len(s) != len("2006-01-02T15:04:05.000Z") {
return 0, false
}
// Parse year
if s[len("YYYY")] != '-' {
return 0, false
}
yearStr := s[:len("YYYY")]
n, ok := tryParseUint64(yearStr)
if !ok || n > 3000 {
return 0, false
}
year := int(n)
s = s[len("YYYY")+1:]
// Parse month
if s[len("MM")] != '-' {
return 0, false
}
monthStr := s[:len("MM")]
n, ok = tryParseUint64(monthStr)
if !ok || n < 1 || n > 12 {
return 0, false
}
month := time.Month(n)
s = s[len("MM")+1:]
// Parse day
if s[len("DD")] != 'T' {
return 0, false
}
dayStr := s[:len("DD")]
n, ok = tryParseUint64(dayStr)
if !ok || n < 1 || n > 31 {
return 0, false
}
day := int(n)
s = s[len("DD")+1:]
// Parse hour
if s[len("HH")] != ':' {
return 0, false
}
hourStr := s[:len("HH")]
n, ok = tryParseUint64(hourStr)
if !ok || n > 23 {
return 0, false
}
hour := int(n)
s = s[len("HH")+1:]
// Parse minute
if s[len("MM")] != ':' {
return 0, false
}
minuteStr := s[:len("MM")]
n, ok = tryParseUint64(minuteStr)
if !ok || n > 59 {
return 0, false
}
minute := int(n)
s = s[len("MM")+1:]
// Parse second
if s[len("SS")] != '.' {
return 0, false
}
secondStr := s[:len("SS")]
n, ok = tryParseUint64(secondStr)
if !ok || n > 59 {
return 0, false
}
second := int(n)
s = s[len("SS")+1:]
// Parse millisecond
tzDelimiter := s[len("000")]
if tzDelimiter != 'Z' {
return 0, false
}
millisecondStr := s[:len("000")]
n, ok = tryParseUint64(millisecondStr)
if !ok || n > 999 {
return 0, false
}
millisecond := int(n)
s = s[len("000")+1:]
if len(s) != 0 {
return 0, false
}
t := time.Date(year, month, day, hour, minute, second, millisecond*1e6, time.UTC)
ts := t.UnixNano()
return uint64(ts), true
}
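// tryParseUint64 parses s as a decimal unsigned integer containing at most 18 digits,
// so the result always fits into uint64 without overflow.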
func tryParseUint64(s string) (uint64, bool) {
if len(s) == 0 || len(s) > 18 {
return 0, false
}
n := uint64(0)
for i := 0; i < len(s); i++ {
ch := s[i]
if ch < '0' || ch > '9' {
return 0, false
}
n *= 10
n += uint64(ch - '0')
}
return n, true
}
const iso8601Timestamp = "2006-01-02T15:04:05.000Z"
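// tryIPv4Encoding attempts to encode srcValues as IPv4 addresses.
//
// On success it appends the 4-byte encoded addresses to dstBuf and dstValues and returns
// them together with valueTypeIPv4 and the minimum and maximum addresses.
// Otherwise it returns valueTypeUnknown.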
func tryIPv4Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
u32s := encoding.GetUint32s(len(srcValues))
defer encoding.PutUint32s(u32s)
a := u32s.A
var minValue, maxValue uint32
for i, v := range srcValues {
n, ok := tryParseIPv4(v)
if !ok {
return dstBuf, dstValues, valueTypeUnknown, 0, 0
}
a[i] = n
if i == 0 || n < minValue {
minValue = n
}
if i == 0 || n > maxValue {
maxValue = n
}
}
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint32(dstBuf, n)
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeIPv4, uint64(minValue), uint64(maxValue)
}
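// tryParseIPv4 parses s as a dotted-decimal IPv4 address and returns it as uint32
// with the first octet in the most significant byte, e.g. "1.2.3.4" => 0x01020304.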
func tryParseIPv4(s string) (uint32, bool) {
if len(s) < len("1.1.1.1") || len(s) > len("255.255.255.255") || strings.Count(s, ".") != 3 {
// Fast path - the entry isn't IPv4
return 0, false
}
var octets [4]byte
var v uint64
var ok bool
// Parse octet 1
n := strings.IndexByte(s, '.')
if n <= 0 || n > 3 {
return 0, false
}
v, ok = tryParseUint64(s[:n])
if !ok || v > 255 {
return 0, false
}
octets[0] = byte(v)
s = s[n+1:]
// Parse octet 2
n = strings.IndexByte(s, '.')
if n <= 0 || n > 3 {
return 0, false
}
v, ok = tryParseUint64(s[:n])
if !ok || v > 255 {
return 0, false
}
octets[1] = byte(v)
s = s[n+1:]
// Parse octet 3
n = strings.IndexByte(s, '.')
if n <= 0 || n > 3 {
return 0, false
}
v, ok = tryParseUint64(s[:n])
if !ok || v > 255 {
return 0, false
}
octets[2] = byte(v)
s = s[n+1:]
// Parse octet 4
v, ok = tryParseUint64(s)
if !ok || v > 255 {
return 0, false
}
octets[3] = byte(v)
ipv4 := encoding.UnmarshalUint32(octets[:])
return ipv4, true
}
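// tryFloat64Encoding attempts to encode srcValues as float64 numbers.
//
// On success it appends the 8-byte encoded IEEE 754 bits to dstBuf and dstValues and
// returns them together with valueTypeFloat64 and the minimum and maximum values
// converted via math.Float64bits. Otherwise it returns valueTypeUnknown.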
func tryFloat64Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
u64s := encoding.GetUint64s(len(srcValues))
defer encoding.PutUint64s(u64s)
a := u64s.A
var minValue, maxValue float64
for i, v := range srcValues {
f, ok := tryParseFloat64(v)
if !ok {
return dstBuf, dstValues, valueTypeUnknown, 0, 0
}
a[i] = math.Float64bits(f)
if i == 0 || f < minValue {
minValue = f
}
if i == 0 || f > maxValue {
maxValue = f
}
}
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint64(dstBuf, n)
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
minValueU64 := math.Float64bits(minValue)
maxValueU64 := math.Float64bits(maxValue)
return dstBuf, dstValues, valueTypeFloat64, minValueU64, maxValueU64
}
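// tryParseFloat64 parses s as a decimal number with an optional leading minus
// and a single optional dot, so the result can be formatted back to the original string.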
func tryParseFloat64(s string) (float64, bool) {
if len(s) == 0 || len(s) > 20 {
return 0, false
}
// Allow only decimal digits, minus and a dot.
// Do not allow scientific notation (for example, 1.23E+05),
// since it cannot be converted back to the same string form.
minus := s[0] == '-'
if minus {
s = s[1:]
}
n := strings.IndexByte(s, '.')
if n < 0 {
// fast path - there are no dots
n, ok := tryParseUint64(s)
if !ok {
return 0, false
}
f := float64(n)
if minus {
f = -f
}
return f, true
}
if n == 0 || n == len(s)-1 {
// Do not allow dots at the beginning and at the end of s,
// since they cannot be converted back to the same string form.
return 0, false
}
sInt := s[:n]
sFrac := s[n+1:]
nInt, ok := tryParseUint64(sInt)
if !ok {
return 0, false
}
nFrac, ok := tryParseUint64(sFrac)
if !ok {
return 0, false
}
f := math.FMA(float64(nFrac), math.Pow10(-len(sFrac)), float64(nInt))
if minus {
f = -f
}
return f, true
}
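// tryUintEncoding attempts to encode srcValues as unsigned integers.
//
// It picks the smallest of valueTypeUint8, valueTypeUint16, valueTypeUint32 and
// valueTypeUint64 capable of holding the maximum parsed value.
// If some value cannot be parsed, it returns valueTypeUnknown.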
func tryUintEncoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
u64s := encoding.GetUint64s(len(srcValues))
defer encoding.PutUint64s(u64s)
a := u64s.A
var minValue, maxValue uint64
for i, v := range srcValues {
n, ok := tryParseUint64(v)
if !ok {
return dstBuf, dstValues, valueTypeUnknown, 0, 0
}
a[i] = n
if i == 0 || n < minValue {
minValue = n
}
if i == 0 || n > maxValue {
maxValue = n
}
}
minBitSize := bits.Len64(maxValue)
if minBitSize <= 8 {
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = append(dstBuf, byte(n))
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeUint8, minValue, maxValue
}
if minBitSize <= 16 {
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint16(dstBuf, uint16(n))
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeUint16, minValue, maxValue
}
if minBitSize <= 32 {
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint32(dstBuf, uint32(n))
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeUint32, minValue, maxValue
}
for _, n := range a {
dstLen := len(dstBuf)
dstBuf = encoding.MarshalUint64(dstBuf, n)
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeUint64, minValue, maxValue
}
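// tryDictEncoding attempts to encode srcValues via the given dict.
//
// Every encoded value is a single byte - the id of the original value inside dict.
// If dict cannot hold all the distinct values, it is reset and valueTypeUnknown is returned.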
func tryDictEncoding(dstBuf []byte, dstValues, srcValues []string, dict *valuesDict) ([]byte, []string, valueType) {
dict.reset()
dstBufOrig := dstBuf
dstValuesOrig := dstValues
for _, v := range srcValues {
id, ok := dict.getOrAdd(v)
if !ok {
dict.reset()
return dstBufOrig, dstValuesOrig, valueTypeUnknown
}
dstLen := len(dstBuf)
dstBuf = append(dstBuf, id)
v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
dstValues = append(dstValues, v)
}
return dstBuf, dstValues, valueTypeDict
}
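// valuesDict holds the distinct values for valueTypeDict encoding.
// The id of a value is its index inside values.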
type valuesDict struct {
values []string
}
func (vd *valuesDict) reset() {
vs := vd.values
for i := range vs {
vs[i] = ""
}
vd.values = vs[:0]
}
func (vd *valuesDict) copyFrom(src *valuesDict) {
vd.reset()
vd.values = append(vd.values[:0], src.values...)
}
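// getOrAdd returns the id for k, adding k to vd if it is missing.
//
// It returns ok=false if k cannot be added without exceeding maxDictLen entries
// or maxDictSizeBytes total bytes.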
func (vd *valuesDict) getOrAdd(k string) (byte, bool) {
if len(k) > maxDictSizeBytes {
return 0, false
}
vs := vd.values
dictSizeBytes := 0
for i, v := range vs {
if k == v {
return byte(i), true
}
dictSizeBytes += len(v)
}
if len(vs) >= maxDictLen || dictSizeBytes+len(k) > maxDictSizeBytes {
return 0, false
}
vs = append(vs, k)
vd.values = vs
return byte(len(vs) - 1), true
}
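// marshal appends the marshaled vd to dst and returns the result.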
func (vd *valuesDict) marshal(dst []byte) []byte {
values := vd.values
if len(values) > maxDictLen {
logger.Panicf("BUG: valuesDict may contain max %d items; got %d items", maxDictLen, len(values))
}
dst = append(dst, byte(len(values)))
for _, v := range values {
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(v))
}
return dst
}
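// unmarshal unmarshals vd from src and returns the remaining tail.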
func (vd *valuesDict) unmarshal(src []byte) ([]byte, error) {
vd.reset()
srcOrig := src
if len(src) < 1 {
return srcOrig, fmt.Errorf("cannot unmarshal dict len from 0 bytes; need at least 1 byte")
}
dictLen := int(src[0])
src = src[1:]
for i := 0; i < dictLen; i++ {
tail, data, err := encoding.UnmarshalBytes(src)
if err != nil {
return srcOrig, fmt.Errorf("cannot unmarshal value %d out of %d from dict: %w", i, dictLen, err)
}
src = tail
// Do not use bytesutil.InternBytes(data) here, since it works slower than string(data) in production
v := string(data)
vd.values = append(vd.values, v)
}
return src, nil
}
// maxDictSizeBytes is the maximum total length of all the values stored in the valuesDict
const maxDictSizeBytes = 256
// maxDictLen is the maximum number of entries in the valuesDict.
//
// It shouldn't exceed 255, since the dict len is marshaled into a single byte.
const maxDictLen = 8

View file

@@ -0,0 +1,228 @@
package logstorage
import (
"fmt"
"math"
"reflect"
"testing"
)
func TestValuesEncoder(t *testing.T) {
f := func(values []string, expectedValueType valueType, expectedMinValue, expectedMaxValue uint64) {
t.Helper()
ve := getValuesEncoder()
var dict valuesDict
vt, minValue, maxValue := ve.encode(values, &dict)
if vt != expectedValueType {
t.Fatalf("unexpected value type; got %d; want %d", vt, expectedValueType)
}
if minValue != expectedMinValue {
t.Fatalf("unexpected minValue; got %d; want %d", minValue, expectedMinValue)
}
if maxValue != expectedMaxValue {
t.Fatalf("unexpected maxValue; got %d; want %d", maxValue, expectedMaxValue)
}
encodedValues := append([]string{}, ve.values...)
putValuesEncoder(ve)
vd := getValuesDecoder()
if err := vd.decodeInplace(encodedValues, vt, &dict); err != nil {
t.Fatalf("unexpected error in decodeInplace(): %s", err)
}
if len(values) == 0 {
values = []string{}
}
if !reflect.DeepEqual(values, encodedValues) {
t.Fatalf("unexpected values decoded\ngot\n%q\nwant\n%q", encodedValues, values)
}
putValuesDecoder(vd)
}
// An empty values list
f(nil, valueTypeString, 0, 0)
// string values
values := make([]string, maxDictLen+1)
for i := range values {
values[i] = fmt.Sprintf("value_%d", i)
}
f(values, valueTypeString, 0, 0)
// dict values
f([]string{"foobar"}, valueTypeDict, 0, 0)
f([]string{"foo", "bar"}, valueTypeDict, 0, 0)
f([]string{"1", "2foo"}, valueTypeDict, 0, 0)
// uint8 values
for i := range values {
values[i] = fmt.Sprintf("%d", i+1)
}
f(values, valueTypeUint8, 1, uint64(len(values)))
// uint16 values
for i := range values {
values[i] = fmt.Sprintf("%d", (i+1)<<8)
}
f(values, valueTypeUint16, 1<<8, uint64(len(values)<<8))
// uint32 values
for i := range values {
values[i] = fmt.Sprintf("%d", (i+1)<<16)
}
f(values, valueTypeUint32, 1<<16, uint64(len(values)<<16))
// uint64 values
for i := range values {
values[i] = fmt.Sprintf("%d", (i+1)<<32)
}
f(values, valueTypeUint64, 1<<32, uint64(len(values)<<32))
// ipv4 values
for i := range values {
values[i] = fmt.Sprintf("1.2.3.%d", i)
}
f(values, valueTypeIPv4, 16909056, 16909064)
// iso8601 timestamps
for i := range values {
values[i] = fmt.Sprintf("2011-04-19T03:44:01.%03dZ", i)
}
f(values, valueTypeTimestampISO8601, 1303184641000000000, 1303184641008000000)
// float64 values
for i := range values {
values[i] = fmt.Sprintf("%g", math.Sqrt(float64(i+1)))
}
f(values, valueTypeFloat64, 4607182418800017408, 4613937818241073152)
}
func TestTryParseIPv4(t *testing.T) {
f := func(s string, nExpected uint32, okExpected bool) {
t.Helper()
n, ok := tryParseIPv4(s)
if n != nExpected {
t.Fatalf("unexpected n; got %d; want %d", n, nExpected)
}
if ok != okExpected {
t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
}
}
f("", 0, false)
f("foo", 0, false)
f("a.b.c.d", 0, false)
f("1.2.3.4", 0x01020304, true)
f("255.255.255.255", 0xffffffff, true)
f("0.0.0.0", 0, true)
f("127.0.0.1", 0x7f000001, true)
f("127.0.0.x", 0, false)
f("127.0.x.0", 0, false)
f("127.x.0.0", 0, false)
f("x.0.0.0", 0, false)
f("127.127.127.256", 0, false)
f("127.127.256.127", 0, false)
f("127.256.127.127", 0, false)
f("256.127.127.127", 0, false)
f("-1.127.127.127", 0, false)
f("127.-1.127.127", 0, false)
f("127.127.-1.127", 0, false)
f("127.127.127.-1", 0, false)
}
func TestTryParseTimestampISO8601(t *testing.T) {
f := func(s string, timestampExpected uint64, okExpected bool) {
t.Helper()
timestamp, ok := tryParseTimestampISO8601(s)
if timestamp != timestampExpected {
t.Fatalf("unexpected timestamp; got %d; want %d", timestamp, timestampExpected)
}
if ok != okExpected {
t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
}
}
f("2023-01-15T23:45:51.123Z", 1673826351123000000, true)
// Invalid or missing milliseconds
f("2023-01-15T22:15:51.12345Z", 0, false)
f("2023-01-15T22:15:51.12Z", 0, false)
f("2023-01-15T22:15:51Z", 0, false)
// Missing Z
f("2023-01-15T23:45:51.123", 0, false)
// Invalid timestamp
f("foo", 0, false)
f("2023-01-15T23:45:51.123Zxyabcd", 0, false)
f("2023-01-15T23:45:51.123Z01:00", 0, false)
// timestamp with timezone
f("2023-01-16T00:45:51.123+01:00", 0, false)
}
func TestTryParseFloat64(t *testing.T) {
f := func(s string, valueExpected float64, okExpected bool) {
t.Helper()
value, ok := tryParseFloat64(s)
if value != valueExpected {
t.Fatalf("unexpected value; got %v; want %v", value, valueExpected)
}
if ok != okExpected {
t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
}
}
f("0", 0, true)
f("1234567890", 1234567890, true)
f("-1.234567", -1.234567, true)
// Empty value
f("", 0, false)
// Plus in the value isn't allowed, since it cannot be converted back to the same string representation
f("+123", 0, false)
// A dot at the beginning or at the end of the value isn't allowed, since it cannot be converted back to the same string representation
f(".123", 0, false)
f("123.", 0, false)
// Multiple dots aren't allowed
f("123.434.55", 0, false)
// Invalid dots
f("-.123", 0, false)
f(".", 0, false)
// Scientific notation isn't allowed, since it cannot be converted back to the same string representation
f("12e5", 0, false)
// Minus in the middle of string isn't allowed
f("12-5", 0, false)
}
func TestTryParseUint64(t *testing.T) {
f := func(s string, valueExpected uint64, okExpected bool) {
t.Helper()
value, ok := tryParseUint64(s)
if value != valueExpected {
t.Fatalf("unexpected value; got %d; want %d", value, valueExpected)
}
if ok != okExpected {
t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
}
}
f("0", 0, true)
f("123456789012345678", 123456789012345678, true)
// empty value
f("", 0, false)
// too big value
f("1234567890123456789", 0, false)
// invalid value
f("foo", 0, false)
}

View file

@@ -0,0 +1,98 @@
package logstorage
import (
"fmt"
"testing"
)
func BenchmarkTryParseTimestampISO8601(b *testing.B) {
a := []string{
"2023-01-15T23:45:51.123Z",
"2023-02-15T23:45:51.123Z",
"2023-02-15T23:45:51.123+01:00",
"2023-02-15T22:45:51.123-10:30",
"2023-02-15T22:45:51.000Z",
}
b.SetBytes(int64(len(a)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
for _, s := range a {
_, ok := tryParseTimestampISO8601(s)
if !ok {
panic(fmt.Errorf("cannot parse timestamp %q", s))
}
}
}
})
}
func BenchmarkTryParseIPv4(b *testing.B) {
a := []string{
"1.2.3.4",
"127.0.0.1",
"255.255.255.255",
"192.43.234.22",
"32.34.54.198",
}
b.SetBytes(int64(len(a)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
for _, s := range a {
_, ok := tryParseIPv4(s)
if !ok {
panic(fmt.Errorf("cannot parse ipv4 %q", s))
}
}
}
})
}
func BenchmarkTryParseUint64(b *testing.B) {
a := []string{
"1234",
"483932",
"28494",
"90012",
"889111",
}
b.SetBytes(int64(len(a)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
for _, s := range a {
_, ok := tryParseUint64(s)
if !ok {
panic(fmt.Errorf("cannot parse uint %q", s))
}
}
}
})
}
func BenchmarkTryParseFloat64(b *testing.B) {
a := []string{
"1.234",
"4.545",
"456.5645",
"-123.434",
"434.322",
}
b.SetBytes(int64(len(a)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
for _, s := range a {
_, ok := tryParseFloat64(s)
if !ok {
panic(fmt.Errorf("cannot parse float64 %q", s))
}
}
}
})
}