mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00

app/victoria-logs: initial code release

parent aeac39cfd1
commit 87b66db47d
82 changed files with 31486 additions and 1 deletion
Makefile (62 lines changed)

@@ -21,6 +21,7 @@ include package/release/Makefile

all: \
    victoria-metrics-prod \
    victoria-logs-prod \
    vmagent-prod \
    vmalert-prod \
    vmauth-prod \

@@ -33,6 +34,7 @@ clean:

publish: docker-scan \
    publish-victoria-metrics \
    publish-victoria-logs \
    publish-vmagent \
    publish-vmalert \
    publish-vmauth \

@@ -42,6 +44,7 @@ publish: docker-scan \

package: \
    package-victoria-metrics \
    package-victoria-logs \
    package-vmagent \
    package-vmalert \
    package-vmauth \

@@ -178,6 +181,7 @@ publish-release:

release: \
    release-victoria-metrics \
    release-victoria-logs \
    release-vmutils

release-victoria-metrics: \

@@ -191,7 +195,6 @@ release-victoria-metrics: \

    release-victoria-metrics-openbsd-amd64 \
    release-victoria-metrics-windows-amd64

# adds i386 arch
release-victoria-metrics-linux-386:
    GOOS=linux GOARCH=386 $(MAKE) release-victoria-metrics-goos-goarch

@@ -238,6 +241,63 @@ release-victoria-metrics-windows-goarch: victoria-metrics-windows-$(GOARCH)-prod

    cd bin && rm -rf \
        victoria-metrics-windows-$(GOARCH)-prod.exe

release-victoria-logs: \
    release-victoria-logs-linux-386 \
    release-victoria-logs-linux-amd64 \
    release-victoria-logs-linux-arm \
    release-victoria-logs-linux-arm64 \
    release-victoria-logs-darwin-amd64 \
    release-victoria-logs-darwin-arm64 \
    release-victoria-logs-freebsd-amd64 \
    release-victoria-logs-openbsd-amd64 \
    release-victoria-logs-windows-amd64

release-victoria-logs-linux-386:
    GOOS=linux GOARCH=386 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-linux-amd64:
    GOOS=linux GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-linux-arm:
    GOOS=linux GOARCH=arm $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-linux-arm64:
    GOOS=linux GOARCH=arm64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-darwin-amd64:
    GOOS=darwin GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-darwin-arm64:
    GOOS=darwin GOARCH=arm64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-freebsd-amd64:
    GOOS=freebsd GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-openbsd-amd64:
    GOOS=openbsd GOARCH=amd64 $(MAKE) release-victoria-logs-goos-goarch

release-victoria-logs-windows-amd64:
    GOARCH=amd64 $(MAKE) release-victoria-logs-windows-goarch

release-victoria-logs-goos-goarch: victoria-logs-$(GOOS)-$(GOARCH)-prod
    cd bin && \
        tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
            victoria-logs-$(GOOS)-$(GOARCH)-prod \
        && sha256sum victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
            victoria-logs-$(GOOS)-$(GOARCH)-prod \
            | sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > victoria-logs-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
    cd bin && rm -rf victoria-logs-$(GOOS)-$(GOARCH)-prod

release-victoria-logs-windows-goarch: victoria-logs-windows-$(GOARCH)-prod
    cd bin && \
        zip victoria-logs-windows-$(GOARCH)-$(PKG_TAG).zip \
            victoria-logs-windows-$(GOARCH)-prod.exe \
        && sha256sum victoria-logs-windows-$(GOARCH)-$(PKG_TAG).zip \
            victoria-logs-windows-$(GOARCH)-prod.exe \
            > victoria-logs-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
    cd bin && rm -rf \
        victoria-logs-windows-$(GOARCH)-prod.exe

release-vmutils: \
    release-vmutils-linux-386 \
    release-vmutils-linux-amd64 \
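As a usage sketch (not part of the diff): the new release targets are invoked from the repository root like the existing victoria-metrics ones; `PKG_TAG` is assumed to be supplied by the shared build tooling, which is outside this diff.

```bash
# Build release archives for a single platform, or for all platforms listed above.
make release-victoria-logs-linux-amd64
make release-victoria-logs

# The tar/sha256sum rules above produce artifacts under bin/, e.g.
#   victoria-logs-linux-amd64-<PKG_TAG>.tar.gz
#   victoria-logs-linux-amd64-<PKG_TAG>_checksums.txt
ls bin/
```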
app/victoria-logs/Makefile (new file, 103 lines)

@@ -0,0 +1,103 @@

# All these commands must run from repository root.

victoria-logs:
    APP_NAME=victoria-logs $(MAKE) app-local

victoria-logs-race:
    APP_NAME=victoria-logs RACE=-race $(MAKE) app-local

victoria-logs-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker

victoria-logs-pure-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-pure

victoria-logs-linux-amd64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-amd64

victoria-logs-linux-arm-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-arm

victoria-logs-linux-arm64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-arm64

victoria-logs-linux-ppc64le-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-ppc64le

victoria-logs-linux-386-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-linux-386

victoria-logs-darwin-amd64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-darwin-amd64

victoria-logs-darwin-arm64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-darwin-arm64

victoria-logs-freebsd-amd64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-freebsd-amd64

victoria-logs-openbsd-amd64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-openbsd-amd64

victoria-logs-windows-amd64-prod:
    APP_NAME=victoria-logs $(MAKE) app-via-docker-windows-amd64

package-victoria-logs:
    APP_NAME=victoria-logs $(MAKE) package-via-docker

package-victoria-logs-pure:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-pure

package-victoria-logs-amd64:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-amd64

package-victoria-logs-arm:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-arm

package-victoria-logs-arm64:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-arm64

package-victoria-logs-ppc64le:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-ppc64le

package-victoria-logs-386:
    APP_NAME=victoria-logs $(MAKE) package-via-docker-386

publish-victoria-logs:
    APP_NAME=victoria-logs $(MAKE) publish-via-docker

victoria-logs-linux-amd64:
    APP_NAME=victoria-logs CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch

victoria-logs-linux-arm:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch

victoria-logs-linux-arm64:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch

victoria-logs-linux-ppc64le:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch

victoria-logs-linux-s390x:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch

victoria-logs-linux-386:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch

victoria-logs-darwin-amd64:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch

victoria-logs-darwin-arm64:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch

victoria-logs-freebsd-amd64:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch

victoria-logs-openbsd-amd64:
    APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch

victoria-logs-windows-amd64:
    GOARCH=amd64 APP_NAME=victoria-logs $(MAKE) app-local-windows-goarch

victoria-logs-pure:
    APP_NAME=victoria-logs $(MAKE) app-local-pure
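A hedged sketch of how these per-app targets are typically used from the repository root; the app-local/app-via-docker helpers they delegate to live in the shared Makefiles, not in this diff.

```bash
# Local build of the victoria-logs binary (CGO settings per target above).
make victoria-logs

# Local build with the Go race detector enabled.
make victoria-logs-race

# Reproducible production build inside Docker, then Docker image packaging/publishing.
make victoria-logs-prod
make package-victoria-logs
make publish-victoria-logs
```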
app/victoria-logs/deployment/Dockerfile (new file, 8 lines)

@@ -0,0 +1,8 @@

ARG base_image
FROM $base_image

EXPOSE 8428

ENTRYPOINT ["/victoria-logs-prod"]
ARG src_binary
COPY $src_binary ./victoria-logs-prod
app/victoria-logs/main.go (new file, 102 lines)

@@ -0,0 +1,102 @@

package main

import (
    "flag"
    "fmt"
    "net/http"
    "os"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert"
    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlselect"
    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
)

var (
    httpListenAddr   = flag.String("httpListenAddr", ":9428", "TCP address to listen for http connections. See also -httpListenAddr.useProxyProtocol")
    useProxyProtocol = flag.Bool("httpListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted at -httpListenAddr . "+
        "See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt . "+
        "With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing")
    gogc = flag.Int("gogc", 100, "GOGC to use. See https://tip.golang.org/doc/gc-guide")
)

func main() {
    // Write flags and help message to stdout, since it is easier to grep or pipe.
    flag.CommandLine.SetOutput(os.Stdout)
    flag.Usage = usage
    envflag.Parse()
    cgroup.SetGOGC(*gogc)
    buildinfo.Init()
    logger.Init()
    pushmetrics.Init()

    logger.Infof("starting VictoriaLogs at %q...", *httpListenAddr)
    startTime := time.Now()

    vlstorage.Init()
    vlselect.Init()
    vlinsert.Init()

    go httpserver.Serve(*httpListenAddr, *useProxyProtocol, requestHandler)
    logger.Infof("started VictoriaLogs in %.3f seconds; see https://docs.victoriametrics.com/VictoriaLogs/", time.Since(startTime).Seconds())

    sig := procutil.WaitForSigterm()
    logger.Infof("received signal %s", sig)

    logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
    startTime = time.Now()
    if err := httpserver.Stop(*httpListenAddr); err != nil {
        logger.Fatalf("cannot stop the webservice: %s", err)
    }
    logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())

    vlinsert.Stop()
    vlselect.Stop()
    vlstorage.Stop()

    fs.MustStopDirRemover()

    logger.Infof("the VictoriaLogs has been stopped in %.3f seconds", time.Since(startTime).Seconds())
}

func requestHandler(w http.ResponseWriter, r *http.Request) bool {
    if r.URL.Path == "/" {
        if r.Method != http.MethodGet {
            return false
        }
        w.Header().Add("Content-Type", "text/html; charset=utf-8")
        fmt.Fprintf(w, "<h2>Single-node VictoriaLogs</h2></br>")
        fmt.Fprintf(w, "See docs at <a href='https://docs.victoriametrics.com/VictoriaLogs/'>https://docs.victoriametrics.com/VictoriaLogs/</a></br>")
        fmt.Fprintf(w, "Useful endpoints:</br>")
        httpserver.WriteAPIHelp(w, [][2]string{
            {"metrics", "available service metrics"},
            {"flags", "command-line flags"},
        })
        return true
    }
    if vlinsert.RequestHandler(w, r) {
        return true
    }
    if vlselect.RequestHandler(w, r) {
        return true
    }
    return false
}

func usage() {
    const s = `
victoria-logs is a log management and analytics service.

See the docs at https://docs.victoriametrics.com/VictoriaLogs/
`
    flagutil.Usage(s)
}
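A quick smoke test, assuming the binary was built with `make victoria-logs`; the flag names and the `/metrics` and `/flags` endpoints come straight from the code above, while the concrete values are illustrative.

```bash
# Start VictoriaLogs on the default port and with the default GOGC value.
bin/victoria-logs -httpListenAddr=:9428 -gogc=100

# The root page lists useful endpoints; metrics and flags are exposed via httpserver.
curl http://localhost:9428/
curl http://localhost:9428/metrics
curl http://localhost:9428/flags
```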
app/victoria-logs/multiarch/Dockerfile (new file, 12 lines)

@@ -0,0 +1,12 @@

# See https://medium.com/on-docker/use-multi-stage-builds-to-inject-ca-certs-ad1e8f01de1b
ARG certs_image
ARG root_image
FROM $certs_image as certs
RUN apk update && apk upgrade && apk --update --no-cache add ca-certificates

FROM $root_image
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
EXPOSE 8428
ENTRYPOINT ["/victoria-logs-prod"]
ARG TARGETARCH
COPY victoria-logs-linux-${TARGETARCH}-prod ./victoria-logs-prod
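An illustrative build of this multi-arch image; the concrete `certs_image` and `root_image` values and the `bin/` build context are assumptions, since the real values are supplied by the shared packaging targets (BuildKit/buildx sets `TARGETARCH`).

```bash
# Assumes bin/ already contains victoria-logs-linux-<arch>-prod binaries built beforehand.
docker buildx build \
  --platform linux/amd64 \
  --build-arg certs_image=alpine:3.18 \
  --build-arg root_image=scratch \
  -f app/victoria-logs/multiarch/Dockerfile \
  -t victoria-logs:local \
  bin/
```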
app/vlinsert/elasticsearch/bulk_response.qtpl (new file, 20 lines)

@@ -0,0 +1,20 @@

{% stripspace %}

{% func BulkResponse(n int, tookMs int64) %}
{
    "took":{%dl tookMs %},
    "errors":false,
    "items":[
        {% for i := 0; i < n; i++ %}
            {
                "create":{
                    "status":201
                }
            }
            {% if i+1 < n %},{% endif %}
        {% endfor %}
    ]
}
{% endfunc %}

{% endstripspace %}
app/vlinsert/elasticsearch/bulk_response.qtpl.go (new file, 69 lines)

@@ -0,0 +1,69 @@

// Code generated by qtc from "bulk_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.

//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
package elasticsearch

//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
import (
    qtio422016 "io"

    qt422016 "github.com/valyala/quicktemplate"
)

//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
var (
    _ = qtio422016.Copy
    _ = qt422016.AcquireByteBuffer
)

//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
func StreamBulkResponse(qw422016 *qt422016.Writer, n int, tookMs int64) {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:3
    qw422016.N().S(`{"took":`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:5
    qw422016.N().DL(tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:5
    qw422016.N().S(`,"errors":false,"items":[`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:8
    for i := 0; i < n; i++ {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:8
        qw422016.N().S(`{"create":{"status":201}}`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
        if i+1 < n {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
            qw422016.N().S(`,`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:14
        }
//line app/vlinsert/elasticsearch/bulk_response.qtpl:15
    }
//line app/vlinsert/elasticsearch/bulk_response.qtpl:15
    qw422016.N().S(`]}`)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}

//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
func WriteBulkResponse(qq422016 qtio422016.Writer, n int, tookMs int64) {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    StreamBulkResponse(qw422016, n, tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    qt422016.ReleaseWriter(qw422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}

//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
func BulkResponse(n int, tookMs int64) string {
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    qb422016 := qt422016.AcquireByteBuffer()
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    WriteBulkResponse(qb422016, n, tookMs)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    qs422016 := string(qb422016.B)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    qt422016.ReleaseByteBuffer(qb422016)
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
    return qs422016
//line app/vlinsert/elasticsearch/bulk_response.qtpl:18
}
app/vlinsert/elasticsearch/elasticsearch.go (new file, 410 lines)

@@ -0,0 +1,410 @@

package elasticsearch

import (
    "bufio"
    "errors"
    "fmt"
    "io"
    "math"
    "net/http"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
    "github.com/VictoriaMetrics/metrics"
    "github.com/valyala/fastjson"
)

var (
    maxLineSizeBytes = flagutil.NewBytes("insert.maxLineSizeBytes", 256*1024, "The maximum size of a single line, which can be read by /insert/* handlers")
)

// RequestHandler processes ElasticSearch insert requests
func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
    w.Header().Add("Content-Type", "application/json")
    // This header is needed for Logstash
    w.Header().Set("X-Elastic-Product", "Elasticsearch")

    if strings.HasPrefix(path, "/_ilm/policy") {
        // Return fake response for ElasticSearch ilm request.
        fmt.Fprintf(w, `{}`)
        return true
    }
    if strings.HasPrefix(path, "/_index_template") {
        // Return fake response for ElasticSearch index template request.
        fmt.Fprintf(w, `{}`)
        return true
    }
    if strings.HasPrefix(path, "/_ingest") {
        // Return fake response for ElasticSearch ingest pipeline request.
        // See: https://www.elastic.co/guide/en/elasticsearch/reference/8.8/put-pipeline-api.html
        fmt.Fprintf(w, `{}`)
        return true
    }
    if strings.HasPrefix(path, "/_nodes") {
        // Return fake response for ElasticSearch nodes discovery request.
        // See: https://www.elastic.co/guide/en/elasticsearch/reference/8.8/cluster.html
        fmt.Fprintf(w, `{}`)
        return true
    }
    switch path {
    case "/":
        switch r.Method {
        case http.MethodGet:
            // Return fake response for ElasticSearch ping request.
            // See the latest available version for ElasticSearch at https://github.com/elastic/elasticsearch/releases
            fmt.Fprintf(w, `{
            "version": {
                "number": "8.8.0"
            }
        }`)
        case http.MethodHead:
            // Return empty response for Logstash ping request.
        }

        return true
    case "/_license":
        // Return fake response for ElasticSearch license request.
        fmt.Fprintf(w, `{
            "license": {
                "uid": "cbff45e7-c553-41f7-ae4f-9205eabd80xx",
                "type": "oss",
                "status": "active",
                "expiry_date_in_millis" : 4000000000000
            }
        }`)
        return true
    case "/_bulk":
        startTime := time.Now()
        bulkRequestsTotal.Inc()

        // Extract tenantID
        tenantID, err := logstorage.GetTenantIDFromRequest(r)
        if err != nil {
            httpserver.Errorf(w, r, "%s", err)
            return true
        }

        // Extract time field name from _time_field query arg
        var timeField = "_time"
        if tf := r.FormValue("_time_field"); tf != "" {
            timeField = tf
        }

        // Extract message field name from _msg_field query arg
        var msgField = ""
        if msgf := r.FormValue("_msg_field"); msgf != "" {
            msgField = msgf
        }

        // Extract stream field names from _stream_fields query arg
        var streamFields []string
        if sfs := r.FormValue("_stream_fields"); sfs != "" {
            streamFields = strings.Split(sfs, ",")
        }

        // Extract field names, which must be ignored
        var ignoreFields []string
        if ifs := r.FormValue("ignore_fields"); ifs != "" {
            ignoreFields = strings.Split(ifs, ",")
        }

        lr := logstorage.GetLogRows(streamFields, ignoreFields)
        processLogMessage := func(timestamp int64, fields []logstorage.Field) {
            lr.MustAdd(tenantID, timestamp, fields)
            if lr.NeedFlush() {
                vlstorage.MustAddRows(lr)
                lr.Reset()
            }
        }

        isGzip := r.Header.Get("Content-Encoding") == "gzip"
        n, err := readBulkRequest(r.Body, isGzip, timeField, msgField, processLogMessage)
        if err != nil {
            logger.Warnf("cannot decode log message #%d in /_bulk request: %s", n, err)
            return true
        }
        vlstorage.MustAddRows(lr)
        logstorage.PutLogRows(lr)

        tookMs := time.Since(startTime).Milliseconds()
        bw := bufferedwriter.Get(w)
        defer bufferedwriter.Put(bw)
        WriteBulkResponse(bw, n, tookMs)
        _ = bw.Flush()
        return true
    default:
        return false
    }
}

var bulkRequestsTotal = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)

func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
    processLogMessage func(timestamp int64, fields []logstorage.Field),
) (int, error) {
    // See https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html

    if isGzip {
        zr, err := common.GetGzipReader(r)
        if err != nil {
            return 0, fmt.Errorf("cannot read gzipped _bulk request: %w", err)
        }
        defer common.PutGzipReader(zr)
        r = zr
    }

    wcr := writeconcurrencylimiter.GetReader(r)
    defer writeconcurrencylimiter.PutReader(wcr)

    lb := lineBufferPool.Get()
    defer lineBufferPool.Put(lb)

    lb.B = bytesutil.ResizeNoCopyNoOverallocate(lb.B, maxLineSizeBytes.IntN())
    sc := bufio.NewScanner(wcr)
    sc.Buffer(lb.B, len(lb.B))

    n := 0
    nCheckpoint := 0
    for {
        ok, err := readBulkLine(sc, timeField, msgField, processLogMessage)
        wcr.DecConcurrency()
        if err != nil || !ok {
            rowsIngestedTotal.Add(n - nCheckpoint)
            return n, err
        }
        n++
        if batchSize := n - nCheckpoint; n >= 1000 {
            rowsIngestedTotal.Add(batchSize)
            nCheckpoint = n
        }
    }
}

var lineBufferPool bytesutil.ByteBufferPool

var rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)

func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
    processLogMessage func(timestamp int64, fields []logstorage.Field),
) (bool, error) {
    // Decode command, must be "create" or "index"
    if !sc.Scan() {
        if err := sc.Err(); err != nil {
            if errors.Is(err, bufio.ErrTooLong) {
                return false, fmt.Errorf(`cannot read "create" or "index" command, since its size exceeds -insert.maxLineSizeBytes=%d`, maxLineSizeBytes.IntN())
            }
            return false, err
        }
        return false, nil
    }
    line := sc.Bytes()
    p := parserPool.Get()
    v, err := p.ParseBytes(line)
    if err != nil {
        return false, fmt.Errorf(`cannot parse "create" or "index" command: %w`, err)
    }
    if v.GetObject("create") == nil && v.GetObject("index") == nil {
        return false, fmt.Errorf(`unexpected command %q; expected "create" or "index"`, v)
    }
    parserPool.Put(p)

    // Decode log message
    if !sc.Scan() {
        if err := sc.Err(); err != nil {
            if errors.Is(err, bufio.ErrTooLong) {
                return false, fmt.Errorf("cannot read log message, since its size exceeds -insert.maxLineSizeBytes=%d", maxLineSizeBytes.IntN())
            }
            return false, err
        }
        return false, fmt.Errorf(`missing log message after the "create" or "index" command`)
    }
    line = sc.Bytes()
    pctx := getParserCtx()
    if err := pctx.parseLogMessage(line); err != nil {
        invalidJSONLineLogger.Warnf("cannot parse json-encoded log entry: %s", err)
        return true, nil
    }

    timestamp, err := extractTimestampFromFields(timeField, pctx.fields)
    if err != nil {
        invalidTimestampLogger.Warnf("skipping the log entry because cannot parse timestamp: %s", err)
        return true, nil
    }
    updateMessageFieldName(msgField, pctx.fields)
    processLogMessage(timestamp, pctx.fields)
    putParserCtx(pctx)
    return true, nil
}

var parserPool fastjson.ParserPool

var (
    invalidTimestampLogger = logger.WithThrottler("invalidTimestampLogger", 5*time.Second)
    invalidJSONLineLogger  = logger.WithThrottler("invalidJSONLineLogger", 5*time.Second)
)

func extractTimestampFromFields(timeField string, fields []logstorage.Field) (int64, error) {
    for i := range fields {
        f := &fields[i]
        if f.Name != timeField {
            continue
        }
        timestamp, err := parseElasticsearchTimestamp(f.Value)
        if err != nil {
            return 0, err
        }
        f.Value = ""
        return timestamp, nil
    }
    return time.Now().UnixNano(), nil
}

func updateMessageFieldName(msgField string, fields []logstorage.Field) {
    if msgField == "" {
        return
    }
    for i := range fields {
        f := &fields[i]
        if f.Name == msgField {
            f.Name = "_msg"
            return
        }
    }
}

type parserCtx struct {
    p         fastjson.Parser
    buf       []byte
    prefixBuf []byte
    fields    []logstorage.Field
}

func (pctx *parserCtx) reset() {
    pctx.buf = pctx.buf[:0]
    pctx.prefixBuf = pctx.prefixBuf[:0]

    fields := pctx.fields
    for i := range fields {
        lf := &fields[i]
        lf.Name = ""
        lf.Value = ""
    }
    pctx.fields = fields[:0]
}

func getParserCtx() *parserCtx {
    v := parserCtxPool.Get()
    if v == nil {
        return &parserCtx{}
    }
    return v.(*parserCtx)
}

func putParserCtx(pctx *parserCtx) {
    pctx.reset()
    parserCtxPool.Put(pctx)
}

var parserCtxPool sync.Pool

func (pctx *parserCtx) parseLogMessage(msg []byte) error {
    s := bytesutil.ToUnsafeString(msg)
    v, err := pctx.p.Parse(s)
    if err != nil {
        return fmt.Errorf("cannot parse json: %w", err)
    }
    if t := v.Type(); t != fastjson.TypeObject {
        return fmt.Errorf("expecting json dictionary; got %s", t)
    }
    pctx.reset()
    pctx.fields, pctx.buf, pctx.prefixBuf = appendLogFields(pctx.fields, pctx.buf, pctx.prefixBuf, v)
    return nil
}

func appendLogFields(dst []logstorage.Field, dstBuf, prefixBuf []byte, v *fastjson.Value) ([]logstorage.Field, []byte, []byte) {
    o := v.GetObject()
    o.Visit(func(k []byte, v *fastjson.Value) {
        t := v.Type()
        switch t {
        case fastjson.TypeNull:
            // Skip nulls
        case fastjson.TypeObject:
            // Flatten nested JSON objects.
            // For example, {"foo":{"bar":"baz"}} is converted to {"foo.bar":"baz"}
            prefixLen := len(prefixBuf)
            prefixBuf = append(prefixBuf, k...)
            prefixBuf = append(prefixBuf, '.')
            dst, dstBuf, prefixBuf = appendLogFields(dst, dstBuf, prefixBuf, v)
            prefixBuf = prefixBuf[:prefixLen]
        case fastjson.TypeArray, fastjson.TypeNumber, fastjson.TypeTrue, fastjson.TypeFalse:
            // Convert JSON arrays, numbers, true and false values to their string representation
            dstBufLen := len(dstBuf)
            dstBuf = v.MarshalTo(dstBuf)
            value := dstBuf[dstBufLen:]
            dst, dstBuf = appendLogField(dst, dstBuf, prefixBuf, k, value)
        case fastjson.TypeString:
            // Decode JSON strings
            dstBufLen := len(dstBuf)
            dstBuf = append(dstBuf, v.GetStringBytes()...)
            value := dstBuf[dstBufLen:]
            dst, dstBuf = appendLogField(dst, dstBuf, prefixBuf, k, value)
        default:
            logger.Panicf("BUG: unexpected JSON type: %s", t)
        }
    })
    return dst, dstBuf, prefixBuf
}

func appendLogField(dst []logstorage.Field, dstBuf, prefixBuf, k, value []byte) ([]logstorage.Field, []byte) {
    dstBufLen := len(dstBuf)
    dstBuf = append(dstBuf, prefixBuf...)
    dstBuf = append(dstBuf, k...)
    name := dstBuf[dstBufLen:]

    dst = append(dst, logstorage.Field{
        Name:  bytesutil.ToUnsafeString(name),
        Value: bytesutil.ToUnsafeString(value),
    })
    return dst, dstBuf
}

func parseElasticsearchTimestamp(s string) (int64, error) {
    if len(s) < len("YYYY-MM-DD") || s[len("YYYY")] != '-' {
        // Try parsing timestamp in milliseconds
        n, err := strconv.ParseInt(s, 10, 64)
        if err != nil {
            return 0, fmt.Errorf("cannot parse timestamp in milliseconds from %q: %w", s, err)
        }
        if n > int64(math.MaxInt64)/1e6 {
            return 0, fmt.Errorf("too big timestamp in milliseconds: %d; mustn't exceed %d", n, int64(math.MaxInt64)/1e6)
        }
        if n < int64(math.MinInt64)/1e6 {
            return 0, fmt.Errorf("too small timestamp in milliseconds: %d; must be bigger than %d", n, int64(math.MinInt64)/1e6)
        }
        n *= 1e6
        return n, nil
    }
    if len(s) == len("YYYY-MM-DD") {
        t, err := time.Parse("2006-01-02", s)
        if err != nil {
            return 0, fmt.Errorf("cannot parse date %q: %w", s, err)
        }
        return t.UnixNano(), nil
    }
    t, err := time.Parse(time.RFC3339, s)
    if err != nil {
        return 0, fmt.Errorf("cannot parse timestamp %q: %w", s, err)
    }
    return t.UnixNano(), nil
}
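A hedged ingestion example against the `/_bulk` handler above (reachable as `/insert/elasticsearch/_bulk`, see `app/vlinsert/main.go` below). The `_time_field`, `_msg_field` and `_stream_fields` query args are read by the handler; the sample log fields and host/port are illustrative.

```bash
curl -X POST 'http://localhost:9428/insert/elasticsearch/_bulk?_time_field=@timestamp&_msg_field=message&_stream_fields=host,app' \
  --data-binary $'{"create":{}}\n{"@timestamp":"2023-06-06T04:48:11.735Z","host":"web-1","app":"nginx","message":"GET /index.html 200"}\n{"create":{}}\n{"@timestamp":"2023-06-06T04:48:12.735Z","host":"web-1","app":"nginx","message":"GET /missing 404"}\n'
# The handler replies with the BulkResponse template, e.g. {"took":1,"errors":false,"items":[...]}.
```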
app/vlinsert/elasticsearch/elasticsearch_test.go (new file, 97 lines)

@@ -0,0 +1,97 @@

package elasticsearch

import (
    "bytes"
    "compress/gzip"
    "fmt"
    "reflect"
    "strings"
    "testing"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)

func TestReadBulkRequest(t *testing.T) {
    f := func(data, timeField, msgField string, rowsExpected int, timestampsExpected []int64, resultExpected string) {
        t.Helper()

        var timestamps []int64
        var result string
        processLogMessage := func(timestamp int64, fields []logstorage.Field) {
            timestamps = append(timestamps, timestamp)

            a := make([]string, len(fields))
            for i, f := range fields {
                a[i] = fmt.Sprintf("%q:%q", f.Name, f.Value)
            }
            s := "{" + strings.Join(a, ",") + "}\n"
            result += s
        }

        // Read the request without compression
        r := bytes.NewBufferString(data)
        rows, err := readBulkRequest(r, false, timeField, msgField, processLogMessage)
        if err != nil {
            t.Fatalf("unexpected error: %s", err)
        }
        if rows != rowsExpected {
            t.Fatalf("unexpected rows read; got %d; want %d", rows, rowsExpected)
        }

        if !reflect.DeepEqual(timestamps, timestampsExpected) {
            t.Fatalf("unexpected timestamps;\ngot\n%d\nwant\n%d", timestamps, timestampsExpected)
        }
        if result != resultExpected {
            t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
        }

        // Read the request with compression
        timestamps = nil
        result = ""
        compressedData := compressData(data)
        r = bytes.NewBufferString(compressedData)
        rows, err = readBulkRequest(r, true, timeField, msgField, processLogMessage)
        if err != nil {
            t.Fatalf("unexpected error: %s", err)
        }
        if rows != rowsExpected {
            t.Fatalf("unexpected rows read; got %d; want %d", rows, rowsExpected)
        }

        if !reflect.DeepEqual(timestamps, timestampsExpected) {
            t.Fatalf("unexpected timestamps;\ngot\n%d\nwant\n%d", timestamps, timestampsExpected)
        }
        if result != resultExpected {
            t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
        }
    }

    data := `{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:11.735Z","log":{"offset":71770,"file":{"path":"/var/log/auth.log"}},"message":"foobar"}
{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:12.735Z","message":"baz"}
{"create":{"_index":"filebeat-8.8.0"}}
{"message":"xyz","@timestamp":"2023-06-06T04:48:13.735Z","x":"y"}
`
    timeField := "@timestamp"
    msgField := "message"
    rowsExpected := 3
    timestampsExpected := []int64{1686026891735000000, 1686026892735000000, 1686026893735000000}
    resultExpected := `{"@timestamp":"","log.offset":"71770","log.file.path":"/var/log/auth.log","_msg":"foobar"}
{"@timestamp":"","_msg":"baz"}
{"_msg":"xyz","@timestamp":"","x":"y"}
`
    f(data, timeField, msgField, rowsExpected, timestampsExpected, resultExpected)
}

func compressData(s string) string {
    var bb bytes.Buffer
    zw := gzip.NewWriter(&bb)
    if _, err := zw.Write([]byte(s)); err != nil {
        panic(fmt.Errorf("unexpected error when compressing data: %s", err))
    }
    if err := zw.Close(); err != nil {
        panic(fmt.Errorf("unexpected error when closing gzip writer: %s", err))
    }
    return bb.String()
}
app/vlinsert/elasticsearch/elasticsearch_timing_test.go (new file, 50 lines)

@@ -0,0 +1,50 @@

package elasticsearch

import (
    "bytes"
    "fmt"
    "testing"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)

func BenchmarkReadBulkRequest(b *testing.B) {
    b.Run("gzip:off", func(b *testing.B) {
        benchmarkReadBulkRequest(b, false)
    })
    b.Run("gzip:on", func(b *testing.B) {
        benchmarkReadBulkRequest(b, true)
    })
}

func benchmarkReadBulkRequest(b *testing.B, isGzip bool) {
    data := `{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:11.735Z","log":{"offset":71770,"file":{"path":"/var/log/auth.log"}},"message":"foobar"}
{"create":{"_index":"filebeat-8.8.0"}}
{"@timestamp":"2023-06-06T04:48:12.735Z","message":"baz"}
{"create":{"_index":"filebeat-8.8.0"}}
{"message":"xyz","@timestamp":"2023-06-06T04:48:13.735Z","x":"y"}
`
    if isGzip {
        data = compressData(data)
    }
    dataBytes := bytesutil.ToUnsafeBytes(data)

    timeField := "@timestamp"
    msgField := "message"
    processLogMessage := func(timestmap int64, fields []logstorage.Field) {}

    b.ReportAllocs()
    b.SetBytes(int64(len(data)))
    b.RunParallel(func(pb *testing.PB) {
        r := &bytes.Reader{}
        for pb.Next() {
            r.Reset(dataBytes)
            _, err := readBulkRequest(r, isGzip, timeField, msgField, processLogMessage)
            if err != nil {
                panic(fmt.Errorf("unexpected error: %s", err))
            }
        }
    })
}
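The new parser can be exercised locally with the standard Go tooling; the test and benchmark names below match the files above.

```bash
go test ./app/vlinsert/elasticsearch/ -run TestReadBulkRequest
go test ./app/vlinsert/elasticsearch/ -bench BenchmarkReadBulkRequest -benchmem
```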
app/vlinsert/main.go (new file, 34 lines)

@@ -0,0 +1,34 @@

package vlinsert

import (
    "net/http"
    "strings"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/elasticsearch"
)

// Init initializes vlinsert
func Init() {
}

// Stop stops vlinsert
func Stop() {
}

// RequestHandler handles insert requests for VictoriaLogs
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
    path := r.URL.Path
    if !strings.HasPrefix(path, "/insert/") {
        return false
    }
    path = strings.TrimPrefix(path, "/insert")
    path = strings.ReplaceAll(path, "//", "/")

    switch {
    case strings.HasPrefix(path, "/elasticsearch/"):
        path = strings.TrimPrefix(path, "/elasticsearch")
        return elasticsearch.RequestHandler(path, w, r)
    default:
        return false
    }
}
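Because this handler strips the `/insert/elasticsearch` prefix before delegating, the fake ElasticSearch ping implemented in `elasticsearch.go` is reachable at the path below; this is what ElasticSearch-compatible log shippers use to detect the server (a sketch, host and port illustrative).

```bash
curl http://localhost:9428/insert/elasticsearch/
# => {"version":{"number":"8.8.0"}}
```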
app/vlselect/logsql/logsql.go (new file, 53 lines)

@@ -0,0 +1,53 @@

package logsql

import (
    "net/http"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bufferedwriter"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)

// ProcessQueryRequest handles /select/logsql/query request
func ProcessQueryRequest(w http.ResponseWriter, r *http.Request, stopCh <-chan struct{}) {
    // Extract tenantID
    tenantID, err := logstorage.GetTenantIDFromRequest(r)
    if err != nil {
        httpserver.Errorf(w, r, "%s", err)
        return
    }

    qStr := r.FormValue("query")
    q, err := logstorage.ParseQuery(qStr)
    if err != nil {
        httpserver.Errorf(w, r, "cannot parse query [%s]: %s", qStr, err)
        return
    }
    w.Header().Set("Content-Type", "application/stream+json; charset=utf-8")

    bw := bufferedwriter.Get(w)
    defer bufferedwriter.Put(bw)

    tenantIDs := []logstorage.TenantID{tenantID}
    vlstorage.RunQuery(tenantIDs, q, stopCh, func(columns []logstorage.BlockColumn) {
        if len(columns) == 0 {
            return
        }
        rowsCount := len(columns[0].Values)

        bb := blockResultPool.Get()
        for rowIdx := 0; rowIdx < rowsCount; rowIdx++ {
            WriteJSONRow(bb, columns, rowIdx)
        }
        // Do not check for error here, since the only valid error is when the client
        // closes the connection during Write() call. There is no need in logging this error,
        // since it may be too verbose and it doesn't give any actionable info.
        _, _ = bw.Write(bb.B)
        blockResultPool.Put(bb)
    })
    _ = bw.Flush()
}

var blockResultPool bytesutil.ByteBufferPool
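A hedged querying example for the endpoint above: the handler reads the LogsQL expression from the `query` form value and streams one JSON object per matching log entry (`application/stream+json`). `error` is an illustrative word filter; see the LogsQL docs for the full syntax.

```bash
curl http://localhost:9428/select/logsql/query -d 'query=error'
# Each response line is a JSON object built by WriteJSONRow from the returned columns.
```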
app/vlselect/logsql/query_response.qtpl (new file, 20 lines)

@@ -0,0 +1,20 @@

{% import (
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
) %}

{% stripspace %}

// JSONRow creates JSON row from the given fields.
{% func JSONRow(columns []logstorage.BlockColumn, rowIdx int) %}
{
    {% code c := &columns[0] %}
    {%q= c.Name %}:{%q= c.Values[rowIdx] %}
    {% code columns = columns[1:] %}
    {% for colIdx := range columns %}
        {% code c := &columns[colIdx] %}
        ,{%q= c.Name %}:{%q= c.Values[rowIdx] %}
    {% endfor %}
}{% newline %}
{% endfunc %}

{% endstripspace %}
app/vlselect/logsql/query_response.qtpl.go (new file, 90 lines)

@@ -0,0 +1,90 @@

// Code generated by qtc from "query_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.

//line app/vlselect/logsql/query_response.qtpl:1
package logsql

//line app/vlselect/logsql/query_response.qtpl:1
import (
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
)

// JSONRow creates JSON row from the given fields.

//line app/vlselect/logsql/query_response.qtpl:8
import (
    qtio422016 "io"

    qt422016 "github.com/valyala/quicktemplate"
)

//line app/vlselect/logsql/query_response.qtpl:8
var (
    _ = qtio422016.Copy
    _ = qt422016.AcquireByteBuffer
)

//line app/vlselect/logsql/query_response.qtpl:8
func StreamJSONRow(qw422016 *qt422016.Writer, columns []logstorage.BlockColumn, rowIdx int) {
//line app/vlselect/logsql/query_response.qtpl:8
    qw422016.N().S(`{`)
//line app/vlselect/logsql/query_response.qtpl:10
    c := &columns[0]

//line app/vlselect/logsql/query_response.qtpl:11
    qw422016.N().Q(c.Name)
//line app/vlselect/logsql/query_response.qtpl:11
    qw422016.N().S(`:`)
//line app/vlselect/logsql/query_response.qtpl:11
    qw422016.N().Q(c.Values[rowIdx])
//line app/vlselect/logsql/query_response.qtpl:12
    columns = columns[1:]

//line app/vlselect/logsql/query_response.qtpl:13
    for colIdx := range columns {
//line app/vlselect/logsql/query_response.qtpl:14
        c := &columns[colIdx]

//line app/vlselect/logsql/query_response.qtpl:14
        qw422016.N().S(`,`)
//line app/vlselect/logsql/query_response.qtpl:15
        qw422016.N().Q(c.Name)
//line app/vlselect/logsql/query_response.qtpl:15
        qw422016.N().S(`:`)
//line app/vlselect/logsql/query_response.qtpl:15
        qw422016.N().Q(c.Values[rowIdx])
//line app/vlselect/logsql/query_response.qtpl:16
    }
//line app/vlselect/logsql/query_response.qtpl:16
    qw422016.N().S(`}`)
//line app/vlselect/logsql/query_response.qtpl:17
    qw422016.N().S(`
`)
//line app/vlselect/logsql/query_response.qtpl:18
}

//line app/vlselect/logsql/query_response.qtpl:18
func WriteJSONRow(qq422016 qtio422016.Writer, columns []logstorage.BlockColumn, rowIdx int) {
//line app/vlselect/logsql/query_response.qtpl:18
    qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vlselect/logsql/query_response.qtpl:18
    StreamJSONRow(qw422016, columns, rowIdx)
//line app/vlselect/logsql/query_response.qtpl:18
    qt422016.ReleaseWriter(qw422016)
//line app/vlselect/logsql/query_response.qtpl:18
}

//line app/vlselect/logsql/query_response.qtpl:18
func JSONRow(columns []logstorage.BlockColumn, rowIdx int) string {
//line app/vlselect/logsql/query_response.qtpl:18
    qb422016 := qt422016.AcquireByteBuffer()
//line app/vlselect/logsql/query_response.qtpl:18
    WriteJSONRow(qb422016, columns, rowIdx)
//line app/vlselect/logsql/query_response.qtpl:18
    qs422016 := string(qb422016.B)
//line app/vlselect/logsql/query_response.qtpl:18
    qt422016.ReleaseByteBuffer(qb422016)
//line app/vlselect/logsql/query_response.qtpl:18
    return qs422016
//line app/vlselect/logsql/query_response.qtpl:18
}
app/vlselect/main.go (new file, 140 lines)

@@ -0,0 +1,140 @@

package vlselect

import (
    "flag"
    "fmt"
    "net/http"
    "strings"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/app/vlselect/logsql"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
    "github.com/VictoriaMetrics/metrics"
)

var (
    maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+
        "It shouldn't be high, since a single request can saturate all the CPU cores, while many concurrently executed requests may require high amounts of memory. "+
        "See also -search.maxQueueDuration")
    maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the search request waits for execution when -search.maxConcurrentRequests "+
        "limit is reached; see also -search.maxQueryDuration")
    maxQueryDuration = flag.Duration("search.maxQueryDuration", time.Second*30, "The maximum duration for query execution")
)

func getDefaultMaxConcurrentRequests() int {
    n := cgroup.AvailableCPUs()
    if n <= 4 {
        n *= 2
    }
    if n > 16 {
        // A single request can saturate all the CPU cores, so there is no sense
        // in allowing higher number of concurrent requests - they will just contend
        // for unavailable CPU time.
        n = 16
    }
    return n
}

// Init initializes vlselect
func Init() {
    concurrencyLimitCh = make(chan struct{}, *maxConcurrentRequests)
}

// Stop stops vlselect
func Stop() {
}

var concurrencyLimitCh chan struct{}

var (
    concurrencyLimitReached = metrics.NewCounter(`vl_concurrent_select_limit_reached_total`)
    concurrencyLimitTimeout = metrics.NewCounter(`vl_concurrent_select_limit_timeout_total`)

    _ = metrics.NewGauge(`vl_concurrent_select_capacity`, func() float64 {
        return float64(cap(concurrencyLimitCh))
    })
    _ = metrics.NewGauge(`vl_concurrent_select_current`, func() float64 {
        return float64(len(concurrencyLimitCh))
    })
)

// RequestHandler handles select requests for VictoriaLogs
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
    path := r.URL.Path
    if !strings.HasPrefix(path, "/select/") {
        return false
    }
    path = strings.TrimPrefix(path, "/select")
    path = strings.ReplaceAll(path, "//", "/")

    // Limit the number of concurrent queries.
    startTime := time.Now()
    stopCh := r.Context().Done()
    select {
    case concurrencyLimitCh <- struct{}{}:
        defer func() { <-concurrencyLimitCh }()
    default:
        // Sleep for a while until giving up. This should resolve short bursts in requests.
        concurrencyLimitReached.Inc()
        d := getMaxQueryDuration(r)
        if d > *maxQueueDuration {
            d = *maxQueueDuration
        }
        t := timerpool.Get(d)
        select {
        case concurrencyLimitCh <- struct{}{}:
            timerpool.Put(t)
            defer func() { <-concurrencyLimitCh }()
        case <-stopCh:
            timerpool.Put(t)
            remoteAddr := httpserver.GetQuotedRemoteAddr(r)
            requestURI := httpserver.GetRequestURI(r)
            logger.Infof("client has cancelled the request after %.3f seconds: remoteAddr=%s, requestURI: %q",
                time.Since(startTime).Seconds(), remoteAddr, requestURI)
            return true
        case <-t.C:
            timerpool.Put(t)
            concurrencyLimitTimeout.Inc()
            err := &httpserver.ErrorWithStatusCode{
                Err: fmt.Errorf("couldn't start executing the request in %.3f seconds, since -search.maxConcurrentRequests=%d concurrent requests "+
                    "are executed. Possible solutions: to reduce query load; to add more compute resources to the server; "+
                    "to increase -search.maxQueueDuration=%s; to increase -search.maxQueryDuration; to increase -search.maxConcurrentRequests",
                    d.Seconds(), *maxConcurrentRequests, maxQueueDuration),
                StatusCode: http.StatusServiceUnavailable,
            }
            httpserver.Errorf(w, r, "%s", err)
            return true
        }
    }

    switch {
    case path == "/logsql/query":
        logsqlQueryRequests.Inc()
        httpserver.EnableCORS(w, r)
        logsql.ProcessQueryRequest(w, r, stopCh)
        return true
    default:
        return false
    }
}

// getMaxQueryDuration returns the maximum duration for query from r.
func getMaxQueryDuration(r *http.Request) time.Duration {
    dms, err := httputils.GetDuration(r, "timeout", 0)
    if err != nil {
        dms = 0
    }
    d := time.Duration(dms) * time.Millisecond
    if d <= 0 || d > *maxQueryDuration {
        d = *maxQueryDuration
    }
    return d
}

var (
    logsqlQueryRequests = metrics.NewCounter(`vl_http_requests_total{path="/select/logsql/query"}`)
)
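An illustrative tuning of the limits defined above; the values are examples only. The per-request `timeout` query arg read in getMaxQueryDuration can further lower the effective deadline for a single query.

```bash
bin/victoria-logs \
  -search.maxConcurrentRequests=8 \
  -search.maxQueueDuration=30s \
  -search.maxQueryDuration=1m
```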
app/vlstorage/main.go (new file, 149 lines)

@@ -0,0 +1,149 @@

package vlstorage

import (
    "flag"
    "fmt"
    "sync"
    "time"

    "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
    "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
    "github.com/VictoriaMetrics/metrics"
)

var (
    retentionPeriod = flagutil.NewDuration("retentionPeriod", "7d", "Log entries with timestamps older than now-retentionPeriod are automatically deleted; "+
        "log entries with timestamps outside the retention are also rejected during data ingestion; the minimum supported retention is 1d (one day); "+
        "see https://docs.victoriametrics.com/VictoriaLogs/#retention")
    futureRetention = flagutil.NewDuration("futureRetention", "2d", "Log entries with timestamps bigger than now+futureRetention are rejected during data ingestion; "+
        "see https://docs.victoriametrics.com/VictoriaLogs/#retention")
    storageDataPath = flag.String("storageDataPath", "victoria-logs-data", "Path to directory with the VictoriaLogs data; "+
        "see https://docs.victoriametrics.com/VictoriaLogs/#storage")
    inmemoryDataFlushInterval = flag.Duration("inmemoryDataFlushInterval", 5*time.Second, "The interval for guaranteed saving of in-memory data to disk. "+
        "The saved data survives unclean shutdown such as OOM crash, hardware reset, SIGKILL, etc. "+
        "Bigger intervals may help increasing lifetime of flash storage with limited write cycles (e.g. Raspberry PI). "+
        "Smaller intervals increase disk IO load. Minimum supported value is 1s")
    logNewStreams = flag.Bool("logNewStreams", false, "Whether to log creation of new streams; this can be useful for debugging of high cardinality issues with log streams; "+
        "see https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields ; see also -logIngestedRows")
    logIngestedRows = flag.Bool("logIngestedRows", false, "Whether to log all the ingested log entries; this can be useful for debugging of data ingestion; "+
        "see https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion ; see also -logNewStreams")
)

// Init initializes vlstorage.
//
// Stop must be called when vlstorage is no longer needed
func Init() {
    if strg != nil {
        logger.Panicf("BUG: Init() has been already called")
    }

    if retentionPeriod.Msecs < 24*3600*1000 {
        logger.Fatalf("-retentionPeriod cannot be smaller than a day; got %s", retentionPeriod)
    }
    cfg := &logstorage.StorageConfig{
        Retention:       time.Millisecond * time.Duration(retentionPeriod.Msecs),
        FlushInterval:   *inmemoryDataFlushInterval,
        FutureRetention: time.Millisecond * time.Duration(futureRetention.Msecs),
        LogNewStreams:   *logNewStreams,
        LogIngestedRows: *logIngestedRows,
    }
    strg = logstorage.MustOpenStorage(*storageDataPath, cfg)
    storageMetrics = initStorageMetrics(strg)
    metrics.RegisterSet(storageMetrics)
}

// Stop stops vlstorage.
func Stop() {
    metrics.UnregisterSet(storageMetrics)
    storageMetrics = nil

    strg.MustClose()
    strg = nil
}

var strg *logstorage.Storage
var storageMetrics *metrics.Set

// MustAddRows adds lr to vlstorage
func MustAddRows(lr *logstorage.LogRows) {
    strg.MustAddRows(lr)
}

// RunQuery runs the given q and calls processBlock for the returned data blocks
func RunQuery(tenantIDs []logstorage.TenantID, q *logstorage.Query, stopCh <-chan struct{}, processBlock func(columns []logstorage.BlockColumn)) {
    strg.RunQuery(tenantIDs, q, stopCh, processBlock)
}

func initStorageMetrics(strg *logstorage.Storage) *metrics.Set {
    ssCache := &logstorage.StorageStats{}
    var ssCacheLock sync.Mutex
    var lastUpdateTime time.Time

    m := func() *logstorage.StorageStats {
        ssCacheLock.Lock()
        defer ssCacheLock.Unlock()
        if time.Since(lastUpdateTime) < time.Second {
            return ssCache
        }
        var ss logstorage.StorageStats
        strg.UpdateStats(&ss)
        ssCache = &ss
        lastUpdateTime = time.Now()
        return ssCache
    }

    ms := metrics.NewSet()

    ms.NewGauge(fmt.Sprintf(`vl_free_disk_space_bytes{path=%q}`, *storageDataPath), func() float64 {
        return float64(fs.MustGetFreeSpace(*storageDataPath))
    })

    ms.NewGauge(`vl_rows{type="inmemory"}`, func() float64 {
        return float64(m().InmemoryRowsCount)
    })
    ms.NewGauge(`vl_rows{type="file"}`, func() float64 {
        return float64(m().FileRowsCount)
    })
    ms.NewGauge(`vl_parts{type="inmemory"}`, func() float64 {
        return float64(m().InmemoryParts)
    })
    ms.NewGauge(`vl_parts{type="file"}`, func() float64 {
        return float64(m().FileParts)
    })
    ms.NewGauge(`vl_blocks{type="inmemory"}`, func() float64 {
        return float64(m().InmemoryBlocks)
    })
    ms.NewGauge(`vl_blocks{type="file"}`, func() float64 {
        return float64(m().FileBlocks)
    })
    ms.NewGauge(`vl_partitions`, func() float64 {
        return float64(m().PartitionsCount)
    })
    ms.NewGauge(`vl_streams_created_total`, func() float64 {
        return float64(m().StreamsCreatedTotal)
    })

    ms.NewGauge(`vl_compressed_data_size_bytes{type="inmemory"}`, func() float64 {
        return float64(m().CompressedInmemorySize)
    })
    ms.NewGauge(`vl_compressed_data_size_bytes{type="file"}`, func() float64 {
        return float64(m().CompressedFileSize)
    })
    ms.NewGauge(`vl_uncompressed_data_size_bytes{type="inmemory"}`, func() float64 {
        return float64(m().UncompressedInmemorySize)
    })
    ms.NewGauge(`vl_uncompressed_data_size_bytes{type="file"}`, func() float64 {
        return float64(m().UncompressedFileSize)
    })

    ms.NewGauge(`vlinsert_rows_dropped_total{reason="too_big_timestamp"}`, func() float64 {
        return float64(m().RowsDroppedTooBigTimestamp)
    })
    ms.NewGauge(`vlinsert_rows_dropped_total{reason="too_small_timestamp"}`, func() float64 {
        return float64(m().RowsDroppedTooSmallTimestamp)
    })

    return ms
}
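An illustrative use of the storage flags defined above, plus a way to watch the gauges registered by initStorageMetrics; paths and values are examples only.

```bash
bin/victoria-logs \
  -storageDataPath=/var/lib/victoria-logs \
  -retentionPeriod=30d \
  -logNewStreams

curl -s http://localhost:9428/metrics | grep -E '^vl_(rows|parts|free_disk_space_bytes)'
```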
docs/VictoriaLogs/LogsQL.md (new file, 1087 lines)

File diff suppressed because it is too large.
481
docs/VictoriaLogs/README.md
Normal file
481
docs/VictoriaLogs/README.md
Normal file
|
@ -0,0 +1,481 @@
|
|||
# VictoriaLogs
|
||||
|
||||
VictoriaLogs is log management and log analytics system from [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/).
|
||||
|
||||
It provides the following key features:
|
||||
|
||||
- VictoriaLogs can accept logs from popular log collectors, which support
|
||||
[ElasticSearch data ingestion format](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html). See [these docs](#data-ingestion).
|
||||
[Grafana Loki data ingestion format](https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki) will be supported in the near future -
|
||||
see [the Roadmap](https://docs.victoriametrics.com/VictoriaLogs/Roadmap.html).
|
||||
- VictoriaLogs is much easier to set up and operate compared to ElasticSearch and Grafana Loki. See [these docs](#operation).
|
||||
- VictoriaLogs provides an easy yet powerful query language with full-text search capabilities across
|
||||
all the [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) -
|
||||
see [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html).
|
||||
- VictoriaLogs can be seamlessly combined with good old Unix tools for log analysis such as `grep`, `less`, `sort`, `jq`, etc.
|
||||
See [these docs](#querying-via-command-line) for details.
|
||||
- VictoriaLogs' capacity and performance scale linearly with the available resources (CPU, RAM, disk IO, disk space).
|
||||
It runs smoothly on both a Raspberry Pi and a beefy server with hundreds of CPU cores and terabytes of RAM.
|
||||
- VictoriaLogs can handle much bigger data volumes than ElasticSearch and Grafana Loki when running on comparable hardware.
|
||||
A single-node VictoriaLogs instance can substitute a large ElasticSearch cluster.
|
||||
|
||||
## Operation
|
||||
|
||||
### How to run VictoriaLogs
|
||||
|
||||
Check out the VictoriaLogs source code. It is located in the VictoriaMetrics repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
|
||||
cd VictoriaMetrics
|
||||
```
|
||||
|
||||
Then build VictoriaLogs. The build command requires [Go 1.20](https://golang.org/doc/install).
|
||||
|
||||
```bash
|
||||
make victoria-logs
|
||||
```
|
||||
|
||||
Then run the built binary:
|
||||
|
||||
```bash
|
||||
bin/victoria-logs
|
||||
```
|
||||
|
||||
VictoriaLogs is ready to [receive logs](#data-ingestion) and [query logs](#querying) at the TCP port `9428` now!
|
||||
It has no external dependencies, so it may run in various environments without additional setup and configuration.
|
||||
VictoriaLogs automatically adapts to the available CPU and RAM resources. It also automatically creates
|
||||
the needed indexes during [data ingestion](#data-ingestion).
|
||||
|
||||
It is possible to change the TCP port via `-httpListenAddr` command-line flag. For example, the following command
|
||||
starts VictoriaLogs, which accepts incoming requests at port `9200` (aka ElasticSearch HTTP API port):
|
||||
|
||||
```bash
|
||||
/path/to/victoria-logs -httpListenAddr=:9200
|
||||
```
|
||||
|
||||
VictoriaLogs stores the ingested data to the `victoria-logs-data` directory by default. The directory can be changed
|
||||
via `-storageDataPath` command-line flag. See [these docs](#storage) for details.
|
||||
|
||||
By default VictoriaLogs stores log entries with timestamps in the time range `[now-7d, now]`, while dropping logs outside the given time range.
|
||||
I.e., it uses a retention of 7 days. Read [these docs](#retention) on how to control the retention for the [ingested](#data-ingestion) logs.
|
||||
|
||||
It is recommended to set up monitoring of VictoriaLogs according to [these docs](#monitoring).
|
||||
|
||||
### Data ingestion
|
||||
|
||||
VictoriaLogs supports the following data ingestion techniques:
|
||||
|
||||
- Via [Filebeat](https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-overview.html). See [these docs](#filebeat-setup).
|
||||
- Via [Logstash](https://www.elastic.co/guide/en/logstash/current/introduction.html). See [these docs](#logstash-setup).
|
||||
|
||||
The ingested log entries can be queried according to [these docs](#querying).
|
||||
|
||||
#### Data ingestion troubleshooting
|
||||
|
||||
VictoriaLogs provides the following command-line flags, which can help debugging data ingestion issues:
|
||||
|
||||
- `-logNewStreams` - if this flag is passed to VictoriaLogs, then it logs all the newly
|
||||
registered [log streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields).
|
||||
This may help debugging [high cardinality issues](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality).
|
||||
- `-logIngestedRows` - if this flag is passed to VictoriaLogs, then it logs all the ingested
|
||||
[log entries](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model).
|
||||
|
||||
VictoriaLogs exposes various [metrics](#monitoring), which may help debugging data ingestion issues:
|
||||
|
||||
- `vl_rows_ingested_total` - the number of ingested [log entries](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model)
|
||||
since the last VictoriaLogs restart. If this number increases over time, then logs are successfully ingested into VictoriaLogs.
|
||||
The ingested logs can be inspected in the VictoriaLogs logs by passing the `-logIngestedRows` command-line flag to it.
|
||||
- `vl_streams_created_total` - the number of created [log streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields)
|
||||
since the last VictoriaLogs restart. If this metric grows rapidly during extended periods of time, then this may lead
|
||||
to [high cardinality issues](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality).
|
||||
The newly created log streams can be inspected in the VictoriaLogs logs by passing the `-logNewStreams` command-line flag to it.
|
||||
|
||||
#### Filebeat setup
|
||||
|
||||
Specify the [`output.elasticsearch`](https://www.elastic.co/guide/en/beats/filebeat/current/elasticsearch-output.html) section in the `filebeat.yml`
|
||||
for sending the collected logs to VictoriaLogs:
|
||||
|
||||
```yml
|
||||
output.elasticsearch:
|
||||
hosts: ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters:
|
||||
_msg_field: "message"
|
||||
_time_field: "@timestamp"
|
||||
_stream_fields: "host.hostname,log.file.path"
|
||||
```
|
||||
|
||||
Substitute the `localhost:9428` address inside `hosts` section with the real TCP address of VictoriaLogs.
|
||||
|
||||
The `_msg_field` parameter must contain the field name with the log message generated by Filebeat. This is usually the `message` field.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) for details.
|
||||
|
||||
The `_time_field` parameter must contain the field name with the log timestamp generated by Filebeat. This is usually the `@timestamp` field.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field) for details.
|
||||
|
||||
It is recommended to specify a comma-separated list of field names, which uniquely identify every log stream collected by Filebeat, in the `_stream_fields` parameter.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) for details.
|
||||
|
||||
If some [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) aren't needed,
|
||||
then VictoriaLogs can be instructed to ignore them during data ingestion - just pass the `ignore_fields` parameter with a comma-separated list of fields to ignore.
|
||||
For example, the following config instructs VictoriaLogs to ignore `log.offset` and `event.original` fields in the ingested logs:
|
||||
|
||||
```yml
|
||||
output.elasticsearch:
|
||||
hosts: ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters:
|
||||
_msg_field: "message"
|
||||
_time_field: "@timestamp"
|
||||
_stream_fields: "host.name,log.file.path"
|
||||
ignore_fields: "log.offset,event.original"
|
||||
```
|
||||
|
||||
When Filebeat ingests logs into VictoriaLogs at a high rate, it may be necessary to tune the `worker` and `bulk_max_size` options.
|
||||
For example, the following config is optimized for higher than usual ingestion rate:
|
||||
|
||||
```yml
|
||||
output.elasticsearch:
|
||||
hosts: ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters:
|
||||
_msg_field: "message"
|
||||
_time_field: "@timestamp"
|
||||
_stream_fields: "host.name,log.file.path"
|
||||
worker: 8
|
||||
bulk_max_size: 1000
|
||||
```
|
||||
|
||||
If Filebeat sends logs to VictoriaLogs in another datacenter, then it may be useful to enable data compression via the `compression_level` option.
|
||||
This usually allows saving network bandwidth and costs by up to 5 times:
|
||||
|
||||
```yml
|
||||
output.elasticsearch:
|
||||
hosts: ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters:
|
||||
_msg_field: "message"
|
||||
_time_field: "@timestamp"
|
||||
_stream_fields: "host.name,log.file.path"
|
||||
compression_level: 1
|
||||
```
|
||||
|
||||
By default the ingested logs are stored in the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy).
|
||||
If you need to store logs in another tenant, then specify the needed tenant via the `headers` option in the `output.elasticsearch` section.
|
||||
For example, the following `filebeat.yml` config instructs Filebeat to store the data to `(AccountID=12, ProjectID=34)` tenant:
|
||||
|
||||
```yml
|
||||
output.elasticsearch:
|
||||
hosts: ["http://localhost:9428/insert/elasticsearch/"]
|
||||
headers:
|
||||
AccountID: 12
|
||||
ProjectID: 34
|
||||
parameters:
|
||||
_msg_field: "message"
|
||||
_time_field: "@timestamp"
|
||||
_stream_fields: "host.name,log.file.path"
|
||||
```
|
||||
|
||||
The ingested log entries can be queried according to [these docs](#querying).
|
||||
|
||||
See also [data ingestion troubleshooting](#data-ingestion-troubleshooting) docs.
|
||||
|
||||
#### Logstash setup
|
||||
|
||||
Specify [`output.elasticsearch`](https://www.elastic.co/guide/en/logstash/current/plugins-outputs-elasticsearch.html) section in the `logstash.conf` file
|
||||
for sending the collected logs to VictoriaLogs:
|
||||
|
||||
```conf
|
||||
output {
|
||||
elasticsearch {
|
||||
hosts => ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters => {
|
||||
"_msg_field" => "message"
|
||||
"_time_field" => "@timestamp"
|
||||
"_stream_fields" => "host.name,process.name"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Substitute `localhost:9428` address inside `hosts` with the real TCP address of VictoriaLogs.
|
||||
|
||||
The `_msg_field` parameter must contain the field name with the log message generated by Logstash. This is usually the `message` field.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) for details.
|
||||
|
||||
The `_time_field` parameter must contain the field name with the log timestamp generated by Logstash. This is usually the `@timestamp` field.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field) for details.
|
||||
|
||||
It is recommended to specify a comma-separated list of field names, which uniquely identify every log stream collected by Logstash, in the `_stream_fields` parameter.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) for details.
|
||||
|
||||
If some [log fields](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model) aren't needed,
|
||||
then VictoriaLogs can be instructed to ignore them during data ingestion - just pass the `ignore_fields` parameter with a comma-separated list of fields to ignore.
|
||||
For example, the following config instructs VictoriaLogs to ignore `log.offset` and `event.original` fields in the ingested logs:
|
||||
|
||||
```conf
|
||||
output {
|
||||
elasticsearch {
|
||||
hosts => ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters => {
|
||||
"_msg_field" => "message"
|
||||
"_time_field" => "@timestamp"
|
||||
"_stream_fields" => "host.hostname,process.name"
|
||||
"ignore_fields" => "log.offset,event.original"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If Logstash sends logs to VictoriaLogs in another datacenter, then it may be useful to enable data compression via the `http_compression => true` option.
|
||||
This usually allows saving network bandwidth and costs by up to 5 times:
|
||||
|
||||
```conf
|
||||
output {
|
||||
elasticsearch {
|
||||
hosts => ["http://localhost:9428/insert/elasticsearch/"]
|
||||
parameters => {
|
||||
"_msg_field" => "message"
|
||||
"_time_field" => "@timestamp"
|
||||
"_stream_fields" => "host.hostname,process.name"
|
||||
}
|
||||
http_compression => true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
By default the ingested logs are stored in the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy).
|
||||
If you need to store logs in another tenant, then specify the needed tenant via `custom_headers` in the `output.elasticsearch` section.
|
||||
For example, the following `logstash.conf` config instructs Logstash to store the data to `(AccountID=12, ProjectID=34)` tenant:
|
||||
|
||||
```conf
|
||||
output {
|
||||
elasticsearch {
|
||||
hosts => ["http://localhost:9428/insert/elasticsearch/"]
|
||||
custom_headers => {
|
||||
"AccountID" => "1"
|
||||
"ProjectID" => "2"
|
||||
}
|
||||
parameters => {
|
||||
"_msg_field" => "message"
|
||||
"_time_field" => "@timestamp"
|
||||
"_stream_fields" => "host.hostname,process.name"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The ingested log entries can be queried according to [these docs](#querying).
|
||||
|
||||
See also [data ingestion troubleshooting](#data-ingestion-troubleshooting) docs.
|
||||
|
||||
### Querying
|
||||
|
||||
VictoriaLogs can be queried at the `/select/logsql/query` endpoint. The [LogsQL](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html)
|
||||
query must be passed via `query` argument. For example, the following query returns all the log entries with the `error` word:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error'
|
||||
```
|
||||
|
||||
The `query` argument can be passed either in the request url itself (aka HTTP GET request) or via request body
|
||||
with the `x-www-form-urlencoded` encoding (aka HTTP POST request). The HTTP POST is useful for sending long queries
|
||||
when they do not fit the maximum url length of the used clients and proxies.
|
||||
|
||||
See [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) for details on what can be passed to the `query` arg.
|
||||
The `query` arg must be properly encoded with [percent encoding](https://en.wikipedia.org/wiki/URL_encoding) when passing it to `curl`
|
||||
or similar tools.
|
||||
|
||||
The `/select/logsql/query` endpoint returns [a stream of JSON lines](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON),
|
||||
where each line contains JSON-encoded log entry in the form `{field1="value1",...,fieldN="valueN"}`.
|
||||
Example response:
|
||||
|
||||
```
|
||||
{"_msg":"error: disconnect from 19.54.37.22: Auth fail [preauth]","_stream":"{}","_time":"2023-01-01T13:32:13Z"}
|
||||
{"_msg":"some other error","_stream":"{}","_time":"2023-01-01T13:32:15Z"}
|
||||
```
|
||||
|
||||
The matching lines are sent to the response stream as soon as they are found in VictoriaLogs storage.
|
||||
This means that the returned response may contain billions of lines for queries matching too many log entries.
|
||||
The response can be interrupted at any time by closing the connection to VictoriaLogs server.
|
||||
This allows post-processing the returned lines at the client side with the usual Unix commands such as `grep`, `jq`, `less`, `head`, etc.
|
||||
See [these docs](#querying-via-command-line) for more details.
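
The same stream can also be consumed programmatically from any HTTP client. Below is a minimal Go sketch (an illustrative client-side example, not part of VictoriaLogs; the address and the query are assumptions) that reads the returned JSON lines one by one:

```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"net/url"
)

func main() {
	// Query all the log entries with the `error` word.
	// http.PostForm percent-encodes the `query` arg automatically.
	resp, err := http.PostForm("http://localhost:9428/select/logsql/query", url.Values{
		"query": {"error"},
	})
	if err != nil {
		log.Fatalf("cannot execute query: %s", err)
	}
	defer resp.Body.Close()

	// The response is a stream of JSON lines - one log entry per line.
	// Lines are processed as soon as they arrive from VictoriaLogs.
	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("cannot read the response stream: %s", err)
	}
}
```

Closing the response body before the stream ends closes the connection, so VictoriaLogs cancels the query in the same way as described above for `curl`.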
|
||||
|
||||
The returned lines aren't sorted by default, since sorting disables the ability to send matching log entries to response stream as soon as they are found.
|
||||
Query results can be sorted either at VictoriaLogs side according [to these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#sorting)
|
||||
or at client side with the usual `sort` command according to [these docs](#querying-via-command-line).
|
||||
|
||||
By default the `(AccountID=0, ProjectID=0)` [tenant](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#multitenancy) is queried.
|
||||
If you need to query another tenant, then specify the needed tenant via HTTP request headers. For example, the following query searches
|
||||
for log messages at `(AccountID=12, ProjectID=34)` tenant:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -H 'AccountID: 12' -H 'ProjectID: 34' -d 'query=error'
|
||||
```
|
||||
|
||||
The number of requests to `/select/logsql/query` can be [monitored](#monitoring) with `vl_http_requests_total{path="/select/logsql/query"}` metric.
|
||||
|
||||
#### Querying via command-line
|
||||
|
||||
VictoriaLogs provides good integration with `curl` and other command-line tools because of the following features:
|
||||
|
||||
- VictoriaLogs sends the matching log entries to the response stream as soon as they are found.
|
||||
This allows forwarding the response stream to arbitrary [Unix pipes](https://en.wikipedia.org/wiki/Pipeline_(Unix)).
|
||||
- VictoriaLogs automatically adjusts query execution speed to the speed of the client, which reads the response stream.
|
||||
For example, if the response stream is piped to `less` command, then the query is suspended
|
||||
until the `less` command reads the next block from the response stream.
|
||||
- VictoriaLogs automatically cancels query execution when the client closes the response stream.
|
||||
For example, if the query response is piped to `head` command, then VictoriaLogs stops executing the query
|
||||
when the `head` command closes the response stream.
|
||||
|
||||
These features allow executing queries from the command-line interface, which potentially select billions of rows,
|
||||
without the risk of high resource usage (CPU, RAM, disk IO) at VictoriaLogs server.
|
||||
|
||||
For example, the following query can return a very big number of matching log entries (e.g. billions) if VictoriaLogs contains
|
||||
many log messages with the `error` [word](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#word):
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error'
|
||||
```
|
||||
|
||||
If the command returns "never-ending" response, then just press `ctrl+C` at any time in order to cancel the query.
|
||||
VictoriaLogs notices that the response stream is closed, so it cancels the query and instantly stops consuming CPU, RAM and disk IO for this query.
|
||||
|
||||
Then just use the `head` command for investigating the returned log messages and narrowing down the query:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error' | head -10
|
||||
```
|
||||
|
||||
The `head -10` command reads only the first 10 log messages from the response and then closes the response stream.
|
||||
This automatically cancels the query at VictoriaLogs side, so it stops consuming CPU, RAM and disk IO resources.
|
||||
|
||||
Sometimes it may be more convenient to use the `less` command instead of `head` during the investigation of the returned response:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error' | less
|
||||
```
|
||||
|
||||
The `less` command reads the response stream on demand, when the user scrolls down the output.
|
||||
VictoriaLogs suspends query execution when `less` stops reading the response stream.
|
||||
It doesn't consume CPU and disk IO resources during this time. It resumes query execution
|
||||
when the `less` continues reading the response stream.
|
||||
|
||||
Suppose that the initial investigation of the returned query results helped determine that the needed log messages contain
|
||||
`cannot open file` [phrase](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#phrase-filter).
|
||||
Then the query can be narrowed down to `error AND "cannot open file"`
|
||||
(see [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#logical-filter) about `AND` operator).
|
||||
Then run the updated command in order to continue the investigation:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error AND "cannot open file"' | head
|
||||
```
|
||||
|
||||
Note that the `query` arg must be properly encoded with [percent encoding](https://en.wikipedia.org/wiki/URL_encoding) when passing it to `curl`
|
||||
or similar tools.
|
||||
|
||||
The `pipe the query to "head" or "less" -> investigate the results -> refine the query` iteration
|
||||
can be repeated multiple times until the needed log messages are found.
|
||||
|
||||
The returned VictoriaLogs query response can be post-processed with any combination of Unix commands,
|
||||
which are usually used for log analysis - `grep`, `jq`, `awk`, `sort`, `uniq`, `wc`, etc.
|
||||
|
||||
For example, the following command uses `wc -l` Unix command for counting the number of log messages
|
||||
with the `error` [word](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#word)
|
||||
received from [streams](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) with `app="nginx"` field
|
||||
during the last 5 minutes:
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=_stream:{app="nginx"} AND _time:[now-5m,now] AND error' | wc -l
|
||||
```
|
||||
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter) about `_stream` filter,
|
||||
[these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#time-filter) about `_time` filter
|
||||
and [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#logical-filter) about `AND` operator.
|
||||
|
||||
The following example shows how to sort query results by the [`_time` field](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field):
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=error' | jq -r '._time + " " + ._msg' | sort | less
|
||||
```
|
||||
|
||||
This command uses `jq` for extracting [`_time`](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#time-field)
|
||||
and [`_msg`](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#message-field) fields from the returned results,
|
||||
and piping them to `sort` command.
|
||||
|
||||
Note that the `sort` command needs to read the whole response stream before returning the sorted results. So the command above
|
||||
can take non-trivial amounts of time if the `query` returns too many results. The solution is to narrow down the `query`
|
||||
before sorting the results. See [these tips](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#performance-tips)
|
||||
on how to narrow down query results.
|
||||
|
||||
The following example calculates stats on the number of log messages received during the last 5 minutes
|
||||
grouped by `log.level` [field](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#data-model):
|
||||
|
||||
```bash
|
||||
curl http://localhost:9428/select/logsql/query -d 'query=_time:[now-5m,now] log.level:*' | jq -r '."log.level"' | sort | uniq -c
|
||||
```
|
||||
|
||||
The query selects all the log messages with non-empty `log.level` field via ["any value" filter](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#any-value-filter),
|
||||
then pipes them to `jq` command, which extracts the `log.level` field value from the returned JSON stream, then the extracted `log.level` values
|
||||
are sorted with `sort` command and, finally, they are passed to `uniq -c` command for calculating the needed stats.
|
||||
|
||||
See also:
|
||||
|
||||
- [Key concepts](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html).
|
||||
- [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html).
|
||||
|
||||
|
||||
### Monitoring
|
||||
|
||||
VictoriaLogs exposes internal metrics in Prometheus exposition format at `http://localhost:9428/metrics` page.
|
||||
It is recommended to set up monitoring of these metrics via VictoriaMetrics
|
||||
(see [these docs](https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter)),
|
||||
vmagent (see [these docs](https://docs.victoriametrics.com/vmagent.html#how-to-collect-metrics-in-prometheus-format)) or via Prometheus.
|
||||
|
||||
VictoriaLogs emits its own logs to stdout. It is recommended to investigate these logs during troubleshooting.
|
||||
|
||||
|
||||
### Retention
|
||||
|
||||
By default VictoriaLogs stores log entries with timestamps in the time range `[now-7d, now]`, while dropping logs outside the given time range.
|
||||
I.e., it uses a retention of 7 days. The retention can be configured with the `-retentionPeriod` command-line flag.
|
||||
This flag accepts values starting from `1d` (one day) up to `100y` (100 years). See [these docs](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-durations)
|
||||
for the supported duration formats.
|
||||
|
||||
For example, the following command starts VictoriaLogs with the retention of 8 weeks:
|
||||
|
||||
```bash
|
||||
/path/to/victoria-logs -retentionPeriod=8w
|
||||
```
|
||||
|
||||
VictoriaLogs stores the [ingested](#data-ingestion) logs in per-day partition directories. It automatically drops partition directories
|
||||
outside the configured retention.
|
||||
|
||||
VictoriaLogs automatically drops logs at [data ingestion](#data-ingestion) stage if they have timestamps outside the configured retention.
|
||||
A sample of dropped logs is logged with `WARN` message in order to simplify troubleshooting.
|
||||
The `vlinsert_rows_dropped_total` [metric](#monitoring) is incremented each time an ingested log entry is dropped because its timestamp is outside the retention.
|
||||
It is recommended to set up the following alerting rule at [vmalert](https://docs.victoriametrics.com/vmalert.html) in order to be notified
|
||||
when logs with wrong timestamps are ingested into VictoriaLogs:
|
||||
|
||||
```metricsql
|
||||
rate(vlinsert_rows_dropped_total[5m]) > 0
|
||||
```
|
||||
|
||||
By default VictoriaLogs doesn't accept log entries with timestamps bigger than `now+2d`, i.e. 2 days in the future.
|
||||
If you need to accept logs with bigger timestamps, then specify the desired "future retention" via the `-futureRetention` command-line flag.
|
||||
This flag accepts values starting from `1d`. See [these docs](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-durations)
|
||||
for the supported duration formats.
|
||||
|
||||
For example, the following command starts VictoriaLogs, which accepts logs with timestamps up to a year in the future:
|
||||
|
||||
```bash
|
||||
/path/to/victoria-logs -futureRetention=1y
|
||||
```
|
||||
|
||||
### Storage
|
||||
|
||||
VictoriaLogs stores all its data in a single directory - `victoria-logs-data`. The path to the directory can be changed via `-storageDataPath` command-line flag.
|
||||
For example, the following command starts VictoriaLogs, which stores the data at `/var/lib/victoria-logs`:
|
||||
|
||||
```bash
|
||||
/path/to/victoria-logs -storageDataPath=/var/lib/victoria-logs
|
||||
```
|
||||
|
||||
VictoriaLogs automatically creates the `-storageDataPath` directory on the first run if it is missing.
|
37
docs/VictoriaLogs/Roadmap.md
Normal file
37
docs/VictoriaLogs/Roadmap.md
Normal file
|
@ -0,0 +1,37 @@
|
|||
# VictoriaLogs roadmap
|
||||
|
||||
The VictoriaLogs Preview is ready for evaluation in production. It is recommended to run it alongside the existing solutions
|
||||
such as ElasticSearch and Grafana Loki, and to compare their resource usage and usability.
|
||||
It isn't recommended to migrate from existing solutions to VictoriaLogs Preview yet.
|
||||
|
||||
The following functionality is available in VictoriaLogs Preview:
|
||||
|
||||
- [Data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
- [Querying](https://docs.victoriametrics.com/VictoriaLogs/#querying).
|
||||
- [Querying via command-line](https://docs.victoriametrics.com/VictoriaLogs/#querying-via-command-line).
|
||||
|
||||
See [operation docs](https://docs.victoriametrics.com/VictoriaLogs/#operation) for details.
|
||||
|
||||
The following functionality is planned in the future versions of VictoriaLogs:
|
||||
|
||||
- Support for [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) from popular log collectors and formats:
|
||||
- Promtail (aka Grafana Loki)
|
||||
- Vector.dev
|
||||
- Fluentbit
|
||||
- Fluentd
|
||||
- Syslog
|
||||
- Add missing functionality to [LogsQL](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html):
|
||||
- [Stream context](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-context).
|
||||
- [Transformation functions](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#transformations).
|
||||
- [Post-filtering](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#post-filters).
|
||||
- [Stats calculations](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stats).
|
||||
- [Sorting](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#sorting).
|
||||
- [Limiters](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#limiters).
|
||||
- The ability to use subqueries inside [in()](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#multi-exact-filter) function.
|
||||
- Live tailing for [LogsQL filters](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#filters) aka `tail -f`.
|
||||
- Web UI with the following abilities:
|
||||
- Explore the ingested logs.
|
||||
- Build graphs over time for the ingested logs.
|
||||
- Ability to make instant snapshots and backups in the way [similar to VictoriaMetrics](https://docs.victoriametrics.com/#how-to-work-with-snapshots).
|
||||
- Cluster version of VictoriaLogs.
|
||||
- Ability to store data to object storage (such as S3, GCS, Minio).
|
219
docs/VictoriaLogs/keyConcepts.md
Normal file
219
docs/VictoriaLogs/keyConcepts.md
Normal file
|
@ -0,0 +1,219 @@
|
|||
# VictoriaLogs key concepts
|
||||
|
||||
## Data model
|
||||
|
||||
VictoriaLogs works with structured logs. Every log entry may contain an arbitrary number of `key=value` pairs (aka fields).
|
||||
A single log entry can be expressed as a single-level [JSON](https://www.json.org/json-en.html) object with string keys and values.
|
||||
For example:
|
||||
|
||||
```json
|
||||
{
|
||||
"job": "my-app",
|
||||
"instance": "host123:4567",
|
||||
"level": "error",
|
||||
"client_ip": "1.2.3.4",
|
||||
"trace_id": "1234-56789-abcdef",
|
||||
"_msg": "failed to serve the client request"
|
||||
}
|
||||
```
|
||||
|
||||
VictoriaLogs automatically transforms multi-level JSON (aka nested JSON) into single-level JSON
|
||||
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) according to the following rules:
|
||||
|
||||
- Nested dictionaries are flattened by concatenating dictionary keys with `.` char. For example, the following multi-level JSON
|
||||
is transformed into the following single-level JSON:
|
||||
|
||||
```json
|
||||
{
|
||||
"host": {
|
||||
"name": "foobar"
|
||||
"os": {
|
||||
"version": "1.2.3"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"host.name": "foobar",
|
||||
"host.os.version": "1.2.3"
|
||||
}
|
||||
```
|
||||
|
||||
- Arrays, numbers and boolean values are converted into strings. This simplifies [full-text search](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) over such values.
|
||||
For example, the following JSON with an array, a number and a boolean value is converted into the following JSON with string values:
|
||||
|
||||
```json
|
||||
{
|
||||
"tags": ["foo", "bar"],
|
||||
"offset": 12345,
|
||||
"is_error": false
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"tags": "[\"foo\", \"bar\"]",
|
||||
"offset": "12345",
|
||||
"is_error": "false"
|
||||
}
|
||||
```
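
For illustration, the following Go sketch implements the flattening and stringification rules described above. It is a minimal example only and isn't the actual VictoriaLogs implementation:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// flatten converts a multi-level JSON object into a single-level map:
// nested keys are joined with ".", while arrays, numbers and booleans
// are converted into strings.
func flatten(prefix string, src map[string]any, dst map[string]string) {
	for k, v := range src {
		key := k
		if prefix != "" {
			key = prefix + "." + k
		}
		switch x := v.(type) {
		case map[string]any:
			flatten(key, x, dst)
		case string:
			dst[key] = x
		default:
			b, _ := json.Marshal(x)
			dst[key] = string(b)
		}
	}
}

func main() {
	data := `{"host":{"name":"foobar","os":{"version":"1.2.3"}},"tags":["foo","bar"],"offset":12345,"is_error":false}`
	var src map[string]any
	if err := json.Unmarshal([]byte(data), &src); err != nil {
		panic(err)
	}
	dst := make(map[string]string)
	flatten("", src, dst)
	fmt.Println(dst)
}
```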
|
||||
|
||||
Both label name and label value may contain arbitrary chars. Such chars must be encoded
|
||||
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
|
||||
according to [JSON string encoding](https://www.rfc-editor.org/rfc/rfc7159.html#section-7).
|
||||
Unicode chars must be encoded with [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoding:
|
||||
|
||||
```json
|
||||
{
|
||||
"label with whitepsace": "value\nwith\nnewlines",
|
||||
"Поле": "价值",
|
||||
}
|
||||
```
|
||||
|
||||
VictoriaLogs automatically indexes all the fields in all the [ingested](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) logs.
|
||||
This enables [full-text search](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) across all the fields.
|
||||
|
||||
VictoriaLogs supports the following field types:
|
||||
|
||||
* [`_msg` field](#message-field)
|
||||
* [`_time` field](#time-field)
|
||||
* [`_stream` fields](#stream-fields)
|
||||
* [other fields](#other-fields)
|
||||
|
||||
|
||||
### Message field
|
||||
|
||||
Every ingested [log entry](#data-model) must contain at least a `_msg` field with the actual log message. For example, this is the minimal
|
||||
log entry, which can be ingested into VictoriaLogs:
|
||||
|
||||
```json
|
||||
{
|
||||
"_msg": "some log message"
|
||||
}
|
||||
```
|
||||
|
||||
If the actual log message is stored in a field other than `_msg`, then it is possible to specify the real log message field
|
||||
via `_msg_field` query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
For example, if log message is located in the `event.original` field, then specify `_msg_field=event.original` query arg
|
||||
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
|
||||
### Time field
|
||||
|
||||
The ingested [log entries](#data-model) may contain `_time` field with the timestamp of the ingested log entry.
|
||||
For example:
|
||||
|
||||
```json
|
||||
{
|
||||
"_msg": "some log message",
|
||||
"_time": "2023-04-12T06:38:11.095Z"
|
||||
}
|
||||
```
|
||||
|
||||
If the actual timestamp is stored in a field other than `_time`, then it is possible to specify the real timestamp
|
||||
field via `_time_field` query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
For example, if timestamp is located in the `event.created` field, then specify `_time_field=event.created` query arg
|
||||
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
|
||||
If `_time` field is missing, then the data ingestion time is used as log entry timestamp.
|
||||
|
||||
The log entry timestamp allows quickly narrowing down the search to a particular time range.
|
||||
See [these docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#time-filter) for details.
|
||||
|
||||
### Stream fields
|
||||
|
||||
Some [structured logging](#data-model) fields may uniquely identify the application instance, which generates log entries.
|
||||
This may be either a single field such as `instance=host123:456` or a set of fields such as
|
||||
`(datacenter=..., env=..., job=..., instance=...)` or
|
||||
`(kubernetes.namespace=..., kubernetes.node.name=..., kubernetes.pod.name=..., kubernetes.container.name=...)`.
|
||||
|
||||
Log entries received from a single application instance form a log stream in VictoriaLogs.
|
||||
VictoriaLogs optimizes storing and querying of individual log streams. This provides the following benefits:
|
||||
|
||||
- Reduced disk space usage, since a log stream from a single application instance is usually compressed better
|
||||
than a mixed log stream from multiple distinct applications.
|
||||
|
||||
- Increased query performance, since VictoriaLogs needs to scan lower amounts of data
|
||||
when [searching by stream labels](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter).
|
||||
|
||||
VictoriaLogs cannot automatically determine which fields uniquely identify every log stream,
|
||||
so it stores all the received log entries in a single default stream - `{}`.
|
||||
This may lead to not-so-optimal resource usage and query performance.
|
||||
|
||||
Therefore it is recommended to specify stream-level fields via the `_stream_fields` query arg
|
||||
during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion).
|
||||
For example, if logs from Kubernetes containers have the following fields:
|
||||
|
||||
```json
|
||||
{
|
||||
"kubernetes.namespace": "some-namespace",
|
||||
"kubernetes.node.name": "some-node",
|
||||
"kubernetes.pod.name": "some-pod",
|
||||
"kubernetes.container.name": "some-container",
|
||||
"_msg": "some log message"
|
||||
}
|
||||
```
|
||||
|
||||
then specify `_stream_fields=kubernetes.namespace,kubernetes.node.name,kubernetes.pod.name,kubernetes.container.name`
|
||||
query arg during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion) in order to properly store
|
||||
per-container logs into distinct streams.
|
||||
|
||||
#### How to determine which fields must be associated with log streams?
|
||||
|
||||
[Log streams](#stream-fields) can be associated with fields, which simultaneously meet the following conditions:
|
||||
|
||||
- Fields, which remain constant across log entries received from a single application instance.
|
||||
- Fields, which uniquely identify the application instance. For example, `instance`, `host`, `container`, etc.
|
||||
|
||||
Sometimes a single application instance may generate multiple log streams and store them into distinct log files.
|
||||
In this case it is OK to associate the log stream with filepath fields such as `log.file.path` in addition to instance-specific fields.
|
||||
|
||||
Structured logs may contain a big number of fields, which do not change across log entries received from a single application instance.
|
||||
There is no need to associate all these fields with a log stream - it is enough to associate only those fields, which uniquely identify
|
||||
the application instance across all the ingested logs. Additionally, some fields such as `datacenter`, `environment`, `namespace`, `job` or `app`,
|
||||
can be associated with log stream in order to optimize searching by these fields with [stream filtering](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html#stream-filter).
|
||||
|
||||
Never associate log streams with fields, which may change across log entries of the same application instance. See [these docs](#high-cardinality) for details.
|
||||
|
||||
#### High cardinality
|
||||
|
||||
Some fields in the [ingested logs](#data-model) may contain a big number of unique values across log entries.
|
||||
For example, fields with names such as `ip`, `user_id` or `trace_id` tend to contain a big number of unique values.
|
||||
VictoriaLogs works perfectly with such fields unless they are associated with [log streams](#stream-fields).
|
||||
|
||||
Never associate high-cardinality fields with [log streams](#stream-fields), since this may result
|
||||
in the following issues:
|
||||
|
||||
- Performance degradation during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
|
||||
and [querying](https://docs.victoriametrics.com/VictoriaLogs/#querying)
|
||||
- Increased memory usage
|
||||
- Increased CPU usage
|
||||
- Increased disk space usage
|
||||
- Increased disk read / write IO
|
||||
|
||||
VictoriaLogs exposes `vl_streams_created_total` [metric](https://docs.victoriametrics.com/VictoriaLogs/#monitoring),
|
||||
which shows the number of created streams since the last VictoriaLogs restart. If this metric grows at a rapid rate
|
||||
during a long period of time, then there is a high chance of the high cardinality issues mentioned above.
|
||||
VictoriaLogs can log all the newly registered streams when `-logNewStreams` command-line flag is passed to it.
|
||||
This can help narrow down and eliminate high-cardinality fields from [log streams](#stream-fields).
|
||||
|
||||
### Other fields
|
||||
|
||||
The rest of [structured logging](#data-model) fields are optional. They can be used for simplifying and optimizing search queries.
|
||||
For example, it is usually faster to search over a dedicated `trace_id` field instead of searching for the `trace_id` inside long log message.
|
||||
E.g. the `trace_id:XXXX-YYYY-ZZZZ` query usually works faster than the `_msg:"trace_id=XXXX-YYYY-ZZZZ"` query.
|
||||
|
||||
See [LogsQL docs](https://docs.victoriametrics.com/VictoriaLogs/LogsQL.html) for more details.
|
||||
|
||||
## Multitenancy
|
||||
|
||||
VictoriaLogs supports multitenancy. A tenant is identified by an `(AccountID, ProjectID)` pair, where `AccountID` and `ProjectID` are arbitrary 32-bit unsigned integers.
|
||||
The `AccountID` and `ProjectID` fields can be set during [data ingestion](https://docs.victoriametrics.com/VictoriaLogs/#data-ingestion)
|
||||
and [querying](https://docs.victoriametrics.com/VictoriaLogs/#querying) via `AccountID` and `ProjectID` request headers.
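
For example, a query against the `(AccountID=12, ProjectID=34)` tenant may look like the following Go sketch (an illustrative client-side example; the address and tenant values are assumptions):

```go
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
)

func main() {
	req, err := http.NewRequest("POST", "http://localhost:9428/select/logsql/query",
		strings.NewReader("query=error"))
	if err != nil {
		log.Fatalf("cannot create request: %s", err)
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	// Select the needed tenant via request headers.
	req.Header.Set("AccountID", "12")
	req.Header.Set("ProjectID", "34")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatalf("cannot execute request: %s", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("cannot read response: %s", err)
	}
	fmt.Printf("%s", body)
}
```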
|
||||
|
||||
If `AccountID` and/or `ProjectID` request headers aren't set, then the default `0` value is used.
|
||||
|
||||
VictoriaLogs has very low overhead for per-tenant management, so it is OK to have thousands of tenants in a single VictoriaLogs instance.
|
||||
|
||||
VictoriaLogs doesn't perform per-tenant authorization. Use [vmauth](https://docs.victoriametrics.com/vmauth.html) or similar tools for per-tenant authorization.
|
31
lib/logstorage/arena.go
Normal file
31
lib/logstorage/arena.go
Normal file
|
@ -0,0 +1,31 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
)
|
||||
|
||||
type arena struct {
|
||||
b []byte
|
||||
}
|
||||
|
||||
func (a *arena) reset() {
|
||||
a.b = a.b[:0]
|
||||
}
|
||||
|
||||
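// copyBytes appends a copy of b to the arena and returns the copy.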
func (a *arena) copyBytes(b []byte) []byte {
|
||||
ab := a.b
|
||||
abLen := len(ab)
|
||||
ab = append(ab, b...)
|
||||
result := ab[abLen:]
|
||||
a.b = ab
|
||||
return result
|
||||
}
|
||||
|
||||
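// newBytes grows the arena by size bytes and returns the newly added region.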
func (a *arena) newBytes(size int) []byte {
|
||||
ab := a.b
|
||||
abLen := len(ab)
|
||||
ab = bytesutil.ResizeWithCopyMayOverallocate(ab, abLen+size)
|
||||
result := ab[abLen:]
|
||||
a.b = ab
|
||||
return result
|
||||
}
|
650
lib/logstorage/block.go
Normal file
650
lib/logstorage/block.go
Normal file
|
@ -0,0 +1,650 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// block represents a block of log entries.
|
||||
type block struct {
|
||||
// timestamps contains timestamps for log entries.
|
||||
timestamps []int64
|
||||
|
||||
// columns contains values for fields seen in log entries.
|
||||
columns []column
|
||||
|
||||
// constColumns contains fields with constant values across all the block entries.
|
||||
constColumns []Field
|
||||
}
|
||||
|
||||
func (b *block) reset() {
|
||||
b.timestamps = b.timestamps[:0]
|
||||
|
||||
cs := b.columns
|
||||
for i := range cs {
|
||||
cs[i].reset()
|
||||
}
|
||||
b.columns = cs[:0]
|
||||
|
||||
ccs := b.constColumns
|
||||
for i := range ccs {
|
||||
ccs[i].Reset()
|
||||
}
|
||||
b.constColumns = ccs[:0]
|
||||
}
|
||||
|
||||
// uncompressedSizeBytes returns the total size of the original log entries stored in b.
|
||||
//
|
||||
// It is supposed that every log entry has the following format:
|
||||
//
|
||||
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
|
||||
func (b *block) uncompressedSizeBytes() uint64 {
|
||||
rowsCount := uint64(b.Len())
|
||||
|
||||
// Take into account timestamps
|
||||
n := rowsCount * uint64(len(time.RFC3339Nano))
|
||||
|
||||
// Take into account columns
|
||||
cs := b.columns
|
||||
for i := range cs {
|
||||
c := &cs[i]
|
||||
nameLen := uint64(len(c.name))
|
||||
if nameLen == 0 {
|
||||
nameLen = uint64(len("_msg"))
|
||||
}
|
||||
for _, v := range c.values {
|
||||
if len(v) > 0 {
|
||||
n += nameLen + 2 + uint64(len(v))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Take into account constColumns
|
||||
ccs := b.constColumns
|
||||
for i := range ccs {
|
||||
cc := &ccs[i]
|
||||
nameLen := uint64(len(cc.Name))
|
||||
if nameLen == 0 {
|
||||
nameLen = uint64(len("_msg"))
|
||||
}
|
||||
n += rowsCount * (2 + nameLen + uint64(len(cc.Value)))
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// uncompressedRowsSizeBytes returns the size of the uncompressed rows.
|
||||
//
|
||||
// It is supposed that every row has the following format:
|
||||
//
|
||||
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
|
||||
func uncompressedRowsSizeBytes(rows [][]Field) uint64 {
|
||||
n := uint64(0)
|
||||
for _, fields := range rows {
|
||||
n += uncompressedRowSizeBytes(fields)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// uncompressedRowSizeBytes returns the size of uncompressed row.
|
||||
//
|
||||
// It is supposed that the row has the following format:
|
||||
//
|
||||
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
|
||||
func uncompressedRowSizeBytes(fields []Field) uint64 {
|
||||
n := uint64(len(time.RFC3339Nano)) // log timestamp
|
||||
for _, f := range fields {
|
||||
nameLen := len(f.Name)
|
||||
if nameLen == 0 {
|
||||
nameLen = len("_msg")
|
||||
}
|
||||
n += uint64(2 + nameLen + len(f.Value))
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// column contains values for the given field name seen in log entries.
|
||||
type column struct {
|
||||
// name is the field name
|
||||
name string
|
||||
|
||||
// values is the values seen for the given log entries.
|
||||
values []string
|
||||
}
|
||||
|
||||
func (c *column) reset() {
|
||||
c.name = ""
|
||||
|
||||
values := c.values
|
||||
for i := range values {
|
||||
values[i] = ""
|
||||
}
|
||||
c.values = values[:0]
|
||||
}
|
||||
|
||||
func (c *column) areSameValues() bool {
|
||||
values := c.values
|
||||
if len(values) < 2 {
|
||||
return true
|
||||
}
|
||||
value := values[0]
|
||||
for _, v := range values[1:] {
|
||||
if value != v {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *column) resizeValues(valuesLen int) []string {
|
||||
values := c.values
|
||||
if n := valuesLen - cap(values); n > 0 {
|
||||
values = append(values[:cap(values)], make([]string, n)...)
|
||||
}
|
||||
values = values[:valuesLen]
|
||||
c.values = values
|
||||
return values
|
||||
}
|
||||
|
||||
// mustWriteTo writes c to sw and updates ch accordingly.
|
||||
func (c *column) mustWriteTo(ch *columnHeader, sw *streamWriters) {
|
||||
ch.reset()
|
||||
|
||||
valuesWriter := &sw.fieldValuesWriter
|
||||
bloomFilterWriter := &sw.fieldBloomFilterWriter
|
||||
if c.name == "" {
|
||||
valuesWriter = &sw.messageValuesWriter
|
||||
bloomFilterWriter = &sw.messageBloomFilterWriter
|
||||
}
|
||||
|
||||
ch.name = c.name
|
||||
|
||||
// encode values
|
||||
ve := getValuesEncoder()
|
||||
ch.valueType, ch.minValue, ch.maxValue = ve.encode(c.values, &ch.valuesDict)
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
defer longTermBufPool.Put(bb)
|
||||
|
||||
// marshal values
|
||||
bb.B = marshalStringsBlock(bb.B[:0], ve.values)
|
||||
putValuesEncoder(ve)
|
||||
ch.valuesSize = uint64(len(bb.B))
|
||||
if ch.valuesSize > maxValuesBlockSize {
|
||||
logger.Panicf("BUG: too valuesSize: %d bytes; mustn't exceed %d bytes", ch.valuesSize, maxValuesBlockSize)
|
||||
}
|
||||
ch.valuesOffset = valuesWriter.bytesWritten
|
||||
valuesWriter.MustWrite(bb.B)
|
||||
|
||||
// create and marshal bloom filter for c.values
|
||||
if ch.valueType != valueTypeDict {
|
||||
tokensBuf := getTokensBuf()
|
||||
tokensBuf.A = tokenizeStrings(tokensBuf.A[:0], c.values)
|
||||
bb.B = bloomFilterMarshal(bb.B[:0], tokensBuf.A)
|
||||
putTokensBuf(tokensBuf)
|
||||
} else {
|
||||
// there is no need to encode the bloom filter for the dictionary type,
|
||||
// since it isn't used during querying - all the dictionary values are available in ch.valuesDict
|
||||
bb.B = bb.B[:0]
|
||||
}
|
||||
ch.bloomFilterSize = uint64(len(bb.B))
|
||||
if ch.bloomFilterSize > maxBloomFilterBlockSize {
|
||||
logger.Panicf("BUG: too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", ch.bloomFilterSize, maxBloomFilterBlockSize)
|
||||
}
|
||||
ch.bloomFilterOffset = bloomFilterWriter.bytesWritten
|
||||
bloomFilterWriter.MustWrite(bb.B)
|
||||
}
|
||||
|
||||
func (b *block) assertValid() {
|
||||
// Check that timestamps are in ascending order
|
||||
timestamps := b.timestamps
|
||||
for i := 1; i < len(timestamps); i++ {
|
||||
if timestamps[i-1] > timestamps[i] {
|
||||
logger.Panicf("BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d",
|
||||
timestamps[i-1], timestamps[i])
|
||||
}
|
||||
}
|
||||
|
||||
// Check that the number of items in each column matches the number of items in the block.
|
||||
itemsCount := len(timestamps)
|
||||
columns := b.columns
|
||||
for _, c := range columns {
|
||||
if len(c.values) != itemsCount {
|
||||
logger.Panicf("BUG: unexpected number of values for column %q: got %d; want %d", c.name, len(c.values), itemsCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MustInitFromRows initializes b from the given timestamps and rows.
|
||||
//
|
||||
// It is expected that timestamps are sorted.
|
||||
func (b *block) MustInitFromRows(timestamps []int64, rows [][]Field) {
|
||||
b.reset()
|
||||
|
||||
assertTimestampsSorted(timestamps)
|
||||
b.timestamps = append(b.timestamps, timestamps...)
|
||||
b.mustInitFromRows(rows)
|
||||
b.sortColumnsByName()
|
||||
}
|
||||
|
||||
func (b *block) mustInitFromRows(rows [][]Field) {
|
||||
rowsLen := len(rows)
|
||||
if rowsLen == 0 {
|
||||
// Nothing to do
|
||||
return
|
||||
}
|
||||
|
||||
if areSameFieldsInRows(rows) {
|
||||
// Fast path - all the log entries have the same fields
|
||||
fields := rows[0]
|
||||
for i := range fields {
|
||||
f := &fields[i]
|
||||
if areSameValuesForColumn(rows, i) {
|
||||
cc := b.extendConstColumns()
|
||||
cc.Name = f.Name
|
||||
cc.Value = f.Value
|
||||
} else {
|
||||
c := b.extendColumns()
|
||||
c.name = f.Name
|
||||
values := c.resizeValues(rowsLen)
|
||||
for j := range rows {
|
||||
values[j] = rows[j][i].Value
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Slow path - log entries contain different set of fields
|
||||
|
||||
// Determine indexes for columns
|
||||
columnIdxs := getColumnIdxs()
|
||||
for i := range rows {
|
||||
fields := rows[i]
|
||||
for j := range fields {
|
||||
name := fields[j].Name
|
||||
if _, ok := columnIdxs[name]; !ok {
|
||||
columnIdxs[name] = len(columnIdxs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize columns
|
||||
cs := b.resizeColumns(len(columnIdxs))
|
||||
for name, idx := range columnIdxs {
|
||||
c := &cs[idx]
|
||||
c.name = name
|
||||
c.resizeValues(rowsLen)
|
||||
}
|
||||
|
||||
// Write rows to block
|
||||
for i := range rows {
|
||||
for _, f := range rows[i] {
|
||||
idx := columnIdxs[f.Name]
|
||||
cs[idx].values[i] = f.Value
|
||||
}
|
||||
}
|
||||
putColumnIdxs(columnIdxs)
|
||||
|
||||
// Detect const columns
|
||||
for i := len(cs) - 1; i >= 0; i-- {
|
||||
c := &cs[i]
|
||||
if !c.areSameValues() {
|
||||
continue
|
||||
}
|
||||
cc := b.extendConstColumns()
|
||||
cc.Name = c.name
|
||||
cc.Value = c.values[0]
|
||||
|
||||
c.reset()
|
||||
if i < len(cs)-1 {
|
||||
swapColumns(c, &cs[len(cs)-1])
|
||||
}
|
||||
cs = cs[:len(cs)-1]
|
||||
}
|
||||
b.columns = cs
|
||||
}
|
||||
|
||||
func swapColumns(a, b *column) {
|
||||
*a, *b = *b, *a
|
||||
}
|
||||
|
||||
func areSameValuesForColumn(rows [][]Field, colIdx int) bool {
|
||||
if len(rows) < 2 {
|
||||
return true
|
||||
}
|
||||
value := rows[0][colIdx].Value
|
||||
rows = rows[1:]
|
||||
for i := range rows {
|
||||
if value != rows[i][colIdx].Value {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func assertTimestampsSorted(timestamps []int64) {
|
||||
for i := range timestamps {
|
||||
if i > 0 && timestamps[i-1] > timestamps[i] {
|
||||
logger.Panicf("BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d",
|
||||
timestamps[i-1], timestamps[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *block) extendConstColumns() *Field {
|
||||
ccs := b.constColumns
|
||||
if cap(ccs) > len(ccs) {
|
||||
ccs = ccs[:len(ccs)+1]
|
||||
} else {
|
||||
ccs = append(ccs, Field{})
|
||||
}
|
||||
b.constColumns = ccs
|
||||
return &ccs[len(ccs)-1]
|
||||
}
|
||||
|
||||
func (b *block) extendColumns() *column {
|
||||
cs := b.columns
|
||||
if cap(cs) > len(cs) {
|
||||
cs = cs[:len(cs)+1]
|
||||
} else {
|
||||
cs = append(cs, column{})
|
||||
}
|
||||
b.columns = cs
|
||||
return &cs[len(cs)-1]
|
||||
}
|
||||
|
||||
func (b *block) resizeColumns(columnsLen int) []column {
|
||||
cs := b.columns[:0]
|
||||
if n := columnsLen - cap(cs); n > 0 {
|
||||
cs = append(cs[:cap(cs)], make([]column, n)...)
|
||||
}
|
||||
cs = cs[:columnsLen]
|
||||
b.columns = cs
|
||||
return cs
|
||||
}
|
||||
|
||||
func (b *block) sortColumnsByName() {
|
||||
if len(b.columns)+len(b.constColumns) > maxColumnsPerBlock {
|
||||
logger.Panicf("BUG: too big number of columns detected in the block: %d; the number of columns mustn't exceed %d",
|
||||
len(b.columns)+len(b.constColumns), maxColumnsPerBlock)
|
||||
}
|
||||
|
||||
cs := getColumnsSorter()
|
||||
cs.columns = b.columns
|
||||
sort.Sort(cs)
|
||||
putColumnsSorter(cs)
|
||||
|
||||
ccs := getConstColumnsSorter()
|
||||
ccs.columns = b.constColumns
|
||||
sort.Sort(ccs)
|
||||
putConstColumnsSorter(ccs)
|
||||
}
|
||||
|
||||
// Len returns the number of log entries in b.
|
||||
func (b *block) Len() int {
|
||||
return len(b.timestamps)
|
||||
}
|
||||
|
||||
// InitFromBlockData unmarshals bd to b.
|
||||
//
|
||||
// sbu and vd are used as a temporary storage for unmarshaled column values.
|
||||
//
|
||||
// The b becomes outdated after sbu or vd is reset.
|
||||
func (b *block) InitFromBlockData(bd *blockData, sbu *stringsBlockUnmarshaler, vd *valuesDecoder) error {
|
||||
b.reset()
|
||||
|
||||
if bd.rowsCount > maxRowsPerBlock {
|
||||
return fmt.Errorf("too many entries found in the block: %d; mustn't exceed %d", bd.rowsCount, maxRowsPerBlock)
|
||||
}
|
||||
rowsCount := int(bd.rowsCount)
|
||||
|
||||
// unmarshal timestamps
|
||||
td := &bd.timestampsData
|
||||
var err error
|
||||
b.timestamps, err = encoding.UnmarshalTimestamps(b.timestamps[:0], td.data, td.marshalType, td.minTimestamp, rowsCount)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal timestamps: %w", err)
|
||||
}
|
||||
|
||||
// unmarshal columns
|
||||
cds := bd.columnsData
|
||||
cs := b.resizeColumns(len(cds))
|
||||
for i := range cds {
|
||||
cd := &cds[i]
|
||||
c := &cs[i]
|
||||
c.name = cd.name
|
||||
c.values, err = sbu.unmarshal(c.values[:0], cd.valuesData, uint64(rowsCount))
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal column %d: %w", i, err)
|
||||
}
|
||||
if err = vd.decodeInplace(c.values, cd.valueType, &cd.valuesDict); err != nil {
|
||||
return fmt.Errorf("cannot decode column values: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// unmarshal constColumns
|
||||
b.constColumns = append(b.constColumns[:0], bd.constColumns...)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// mustWriteTo writes b with the given sid to sw and updates bh accordingly
|
||||
func (b *block) mustWriteTo(sid *streamID, bh *blockHeader, sw *streamWriters) {
|
||||
// Do not store the version used for encoding directly in the block data, since:
|
||||
// - all the blocks in the same part use the same encoding
|
||||
// - the block encoding version can be put in metadata file for the part (aka metadataFilename)
|
||||
|
||||
b.assertValid()
|
||||
bh.reset()
|
||||
|
||||
bh.streamID = *sid
|
||||
bh.uncompressedSizeBytes = b.uncompressedSizeBytes()
|
||||
bh.rowsCount = uint64(b.Len())
|
||||
|
||||
// Marshal timestamps
|
||||
mustWriteTimestampsTo(&bh.timestampsHeader, b.timestamps, sw)
|
||||
|
||||
// Marshal columns
|
||||
cs := b.columns
|
||||
csh := getColumnsHeader()
|
||||
chs := csh.resizeColumnHeaders(len(cs))
|
||||
for i := range cs {
|
||||
cs[i].mustWriteTo(&chs[i], sw)
|
||||
}
|
||||
csh.constColumns = append(csh.constColumns[:0], b.constColumns...)
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = csh.marshal(bb.B)
|
||||
putColumnsHeader(csh)
|
||||
bh.columnsHeaderOffset = sw.columnsHeaderWriter.bytesWritten
|
||||
bh.columnsHeaderSize = uint64(len(bb.B))
|
||||
if bh.columnsHeaderSize > maxColumnsHeaderSize {
|
||||
logger.Panicf("BUG: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", bh.columnsHeaderSize, maxColumnsHeaderSize)
|
||||
}
|
||||
sw.columnsHeaderWriter.MustWrite(bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
|
||||
// appendRows appends log entries from b to dst.
|
||||
func (b *block) appendRows(dst *rows) {
|
||||
// copy timestamps
|
||||
dst.timestamps = append(dst.timestamps, b.timestamps...)
|
||||
|
||||
// copy columns
|
||||
fieldsBuf := dst.fieldsBuf
|
||||
ccs := b.constColumns
|
||||
cs := b.columns
|
||||
for i := range b.timestamps {
|
||||
fieldsLen := len(fieldsBuf)
|
||||
// copy const columns
|
||||
for j := range ccs {
|
||||
cc := &ccs[j]
|
||||
fieldsBuf = append(fieldsBuf, Field{
|
||||
Name: cc.Name,
|
||||
Value: cc.Value,
|
||||
})
|
||||
}
|
||||
// copy other columns
|
||||
for j := range cs {
|
||||
c := &cs[j]
|
||||
value := c.values[i]
|
||||
if len(value) == 0 {
|
||||
continue
|
||||
}
|
||||
fieldsBuf = append(fieldsBuf, Field{
|
||||
Name: c.name,
|
||||
Value: value,
|
||||
})
|
||||
}
|
||||
dst.rows = append(dst.rows, fieldsBuf[fieldsLen:])
|
||||
}
|
||||
dst.fieldsBuf = fieldsBuf
|
||||
}
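// Editorial sketch, not part of the original commit: for every timestamp appendRows
// emits the const columns first and then the non-empty per-row column values.
// A hypothetical block with one const column and one regular column:
func exampleAppendRowsSketch() {
	b := &block{
		timestamps:   []int64{1, 2, 3},
		columns:      []column{{name: "level", values: []string{"info", "", "warn"}}},
		constColumns: []Field{{Name: "host", Value: "a"}},
	}
	var rs rows
	b.appendRows(&rs)
	// rs.rows now contains 3 rows; the second row holds only the const "host" field,
	// since the empty "level" value was skipped above.
	_ = rs
}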
|
||||
|
||||
func areSameFieldsInRows(rows [][]Field) bool {
|
||||
if len(rows) < 2 {
|
||||
return true
|
||||
}
|
||||
fields := rows[0]
|
||||
rows = rows[1:]
|
||||
for i := range rows {
|
||||
leFields := rows[i]
|
||||
if len(fields) != len(leFields) {
|
||||
return false
|
||||
}
|
||||
for j := range leFields {
|
||||
if leFields[j].Name != fields[j].Name {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
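// Editorial sketch, not part of the original commit: areSameFieldsInRows reports whether
// every row carries exactly the same field names in the same order, e.g.:
func exampleAreSameFieldsInRowsSketch() bool {
	rs := [][]Field{
		{{Name: "host", Value: "a"}, {Name: "level", Value: "info"}},
		{{Name: "host", Value: "b"}, {Name: "level", Value: "warn"}},
	}
	return areSameFieldsInRows(rs) // true: the field names and their order match
}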
|
||||
|
||||
var columnIdxsPool sync.Pool
|
||||
|
||||
func getColumnIdxs() map[string]int {
|
||||
v := columnIdxsPool.Get()
|
||||
if v == nil {
|
||||
return make(map[string]int)
|
||||
}
|
||||
return v.(map[string]int)
|
||||
}
|
||||
|
||||
func putColumnIdxs(m map[string]int) {
|
||||
for k := range m {
|
||||
delete(m, k)
|
||||
}
|
||||
columnIdxsPool.Put(m)
|
||||
}
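// Editorial sketch, not part of the original commit: getColumnIdxs/putColumnIdxs follow
// the sync.Pool pattern used throughout this file - take a cleared object from the pool,
// use it, then return it so the allocation can be reused by subsequent callers:
func exampleColumnIdxsPoolingSketch(names []string) {
	m := getColumnIdxs()
	for i, name := range names {
		m[name] = i
	}
	// ... look up column indexes by name via m ...
	putColumnIdxs(m) // clears m and returns it to the pool
}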
|
||||
|
||||
func getBlock() *block {
|
||||
v := blockPool.Get()
|
||||
if v == nil {
|
||||
return &block{}
|
||||
}
|
||||
return v.(*block)
|
||||
}
|
||||
|
||||
func putBlock(b *block) {
|
||||
b.reset()
|
||||
blockPool.Put(b)
|
||||
}
|
||||
|
||||
var blockPool sync.Pool
|
||||
|
||||
type columnsSorter struct {
|
||||
columns []column
|
||||
}
|
||||
|
||||
func (cs *columnsSorter) reset() {
|
||||
cs.columns = nil
|
||||
}
|
||||
|
||||
func (cs *columnsSorter) Len() int {
|
||||
return len(cs.columns)
|
||||
}
|
||||
|
||||
func (cs *columnsSorter) Less(i, j int) bool {
|
||||
columns := cs.columns
|
||||
return columns[i].name < columns[j].name
|
||||
}
|
||||
|
||||
func (cs *columnsSorter) Swap(i, j int) {
|
||||
columns := cs.columns
|
||||
columns[i], columns[j] = columns[j], columns[i]
|
||||
}
|
||||
|
||||
func getColumnsSorter() *columnsSorter {
|
||||
v := columnsSorterPool.Get()
|
||||
if v == nil {
|
||||
return &columnsSorter{}
|
||||
}
|
||||
return v.(*columnsSorter)
|
||||
}
|
||||
|
||||
func putColumnsSorter(cs *columnsSorter) {
|
||||
cs.reset()
|
||||
columnsSorterPool.Put(cs)
|
||||
}
|
||||
|
||||
var columnsSorterPool sync.Pool
|
||||
|
||||
type constColumnsSorter struct {
|
||||
columns []Field
|
||||
}
|
||||
|
||||
func (ccs *constColumnsSorter) reset() {
|
||||
ccs.columns = nil
|
||||
}
|
||||
|
||||
func (ccs *constColumnsSorter) Len() int {
|
||||
return len(ccs.columns)
|
||||
}
|
||||
|
||||
func (ccs *constColumnsSorter) Less(i, j int) bool {
|
||||
columns := ccs.columns
|
||||
return columns[i].Name < columns[j].Name
|
||||
}
|
||||
|
||||
func (ccs *constColumnsSorter) Swap(i, j int) {
|
||||
columns := ccs.columns
|
||||
columns[i], columns[j] = columns[j], columns[i]
|
||||
}
|
||||
|
||||
func getConstColumnsSorter() *constColumnsSorter {
|
||||
v := constColumnsSorterPool.Get()
|
||||
if v == nil {
|
||||
return &constColumnsSorter{}
|
||||
}
|
||||
return v.(*constColumnsSorter)
|
||||
}
|
||||
|
||||
func putConstColumnsSorter(ccs *constColumnsSorter) {
|
||||
ccs.reset()
|
||||
constColumnsSorterPool.Put(ccs)
|
||||
}
|
||||
|
||||
var constColumnsSorterPool sync.Pool
|
||||
|
||||
// mustWriteTimestampsTo writes timestamps to sw and updates th accordingly
|
||||
func mustWriteTimestampsTo(th *timestampsHeader, timestamps []int64, sw *streamWriters) {
|
||||
th.reset()
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B, th.marshalType, th.minTimestamp = encoding.MarshalTimestamps(bb.B[:0], timestamps, 64)
|
||||
if len(bb.B) > maxTimestampsBlockSize {
|
||||
logger.Panicf("BUG: too big block with timestamps: %d bytes; the maximum supported size is %d bytes", len(bb.B), maxTimestampsBlockSize)
|
||||
}
|
||||
th.maxTimestamp = timestamps[len(timestamps)-1]
|
||||
th.blockOffset = sw.timestampsWriter.bytesWritten
|
||||
th.blockSize = uint64(len(bb.B))
|
||||
sw.timestampsWriter.MustWrite(bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
}
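// Editorial sketch, not part of the original commit: the timestamps block written above
// is decoded back in block.InitFromBlockData via encoding.UnmarshalTimestamps. A round
// trip with the same helpers (64 requests full 64-bit precision; assumes a non-empty
// timestamps slice) could look like this:
func exampleTimestampsRoundTripSketch(timestamps []int64) ([]int64, error) {
	data, marshalType, minTimestamp := encoding.MarshalTimestamps(nil, timestamps, 64)
	return encoding.UnmarshalTimestamps(nil, data, marshalType, minTimestamp, len(timestamps))
}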
|
lib/logstorage/block_data.go (new file, 383 lines)
@@ -0,0 +1,383 @@
package logstorage
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// blockData contains packed data for a single block.
|
||||
//
|
||||
// The main purpose of this struct is to reduce the work needed during background merge of parts.
|
||||
// If the block is full, then the blockData can be written to the destination part
|
||||
// without the need to unpack it.
|
||||
type blockData struct {
|
||||
// streamID is id of the stream for the data
|
||||
streamID streamID
|
||||
|
||||
// uncompressedSizeBytes is the original (uncompressed) size of log entries stored in the block
|
||||
uncompressedSizeBytes uint64
|
||||
|
||||
// rowsCount is the number of log entries in the block
|
||||
rowsCount uint64
|
||||
|
||||
// timestampsData contains the encoded timestamps data for the block
|
||||
timestampsData timestampsData
|
||||
|
||||
// columnsData contains packed per-column data.
|
||||
columnsData []columnData
|
||||
|
||||
// constColumns contains data for const columns across the block.
|
||||
constColumns []Field
|
||||
|
||||
// a is used for storing byte slices for timestamps and columns.
|
||||
//
|
||||
// It reduces fragmentation for them.
|
||||
a arena
|
||||
}
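// Editorial sketch, not part of the original commit: the doc comment above describes the
// intended merge fast path - a packed blockData can be copied to the destination part
// as-is, and only blocks that still need merging are unpacked into rows. A hypothetical
// merge step (the real "is the block full?" criterion may differ) could look like:
func mergeBlockDataSketch(bd *blockData, bh *blockHeader, sw *streamWriters,
	dst *rows, sbu *stringsBlockUnmarshaler, vd *valuesDecoder) error {
	if bd.rowsCount >= maxRowsPerBlock {
		// fast path: write the packed block without unpacking it
		bd.mustWriteTo(bh, sw)
		return nil
	}
	// slow path: unpack into rows, so they can be merged with rows from other blocks
	return bd.unmarshalRows(dst, sbu, vd)
}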
|
||||
|
||||
// reset resets bd for subsequent re-use
|
||||
func (bd *blockData) reset() {
|
||||
bd.streamID.reset()
|
||||
bd.uncompressedSizeBytes = 0
|
||||
bd.rowsCount = 0
|
||||
bd.timestampsData.reset()
|
||||
|
||||
cds := bd.columnsData
|
||||
for i := range cds {
|
||||
cds[i].reset()
|
||||
}
|
||||
bd.columnsData = cds[:0]
|
||||
|
||||
ccs := bd.constColumns
|
||||
for i := range ccs {
|
||||
ccs[i].Reset()
|
||||
}
|
||||
bd.constColumns = ccs[:0]
|
||||
|
||||
bd.a.reset()
|
||||
}
|
||||
|
||||
func (bd *blockData) resizeColumnsData(columnsDataLen int) []columnData {
|
||||
cds := bd.columnsData
|
||||
if n := columnsDataLen - cap(cds); n > 0 {
|
||||
cds = append(cds[:cap(cds)], make([]columnData, n)...)
|
||||
}
|
||||
cds = cds[:columnsDataLen]
|
||||
bd.columnsData = cds
|
||||
return cds
|
||||
}
|
||||
|
||||
// copyFrom copies src to bd.
|
||||
func (bd *blockData) copyFrom(src *blockData) {
|
||||
bd.reset()
|
||||
|
||||
bd.streamID = src.streamID
|
||||
bd.uncompressedSizeBytes = src.uncompressedSizeBytes
|
||||
bd.rowsCount = src.rowsCount
|
||||
bd.timestampsData.copyFrom(&src.timestampsData, &bd.a)
|
||||
|
||||
cdsSrc := src.columnsData
|
||||
cds := bd.resizeColumnsData(len(cdsSrc))
|
||||
for i := range cds {
|
||||
cds[i].copyFrom(&cdsSrc[i], &bd.a)
|
||||
}
|
||||
bd.columnsData = cds
|
||||
|
||||
bd.constColumns = append(bd.constColumns[:0], src.constColumns...)
|
||||
}
|
||||
|
||||
// unmarshalRows appends the log entries unmarshaled from bd to dst.
//
// The returned log entries remain valid only as long as sbu and vd are valid.
|
||||
func (bd *blockData) unmarshalRows(dst *rows, sbu *stringsBlockUnmarshaler, vd *valuesDecoder) error {
|
||||
b := getBlock()
|
||||
defer putBlock(b)
|
||||
|
||||
if err := b.InitFromBlockData(bd, sbu, vd); err != nil {
|
||||
return err
|
||||
}
|
||||
b.appendRows(dst)
|
||||
return nil
|
||||
}
|
||||
|
||||
// mustWriteTo writes bd with the given sid to sw and updates bh accordingly
|
||||
func (bd *blockData) mustWriteTo(bh *blockHeader, sw *streamWriters) {
|
||||
// Do not store the version used for encoding directly in the block data, since:
|
||||
// - all the blocks in the same part use the same encoding
|
||||
// - the block encoding version can be put in metadata file for the part (aka metadataFilename)
|
||||
|
||||
bh.reset()
|
||||
|
||||
bh.streamID = bd.streamID
|
||||
bh.uncompressedSizeBytes = bd.uncompressedSizeBytes
|
||||
bh.rowsCount = bd.rowsCount
|
||||
|
||||
// Marshal timestamps
|
||||
bd.timestampsData.mustWriteTo(&bh.timestampsHeader, sw)
|
||||
|
||||
// Marshal columns
|
||||
cds := bd.columnsData
|
||||
csh := getColumnsHeader()
|
||||
chs := csh.resizeColumnHeaders(len(cds))
|
||||
for i := range cds {
|
||||
cds[i].mustWriteTo(&chs[i], sw)
|
||||
}
|
||||
csh.constColumns = append(csh.constColumns[:0], bd.constColumns...)
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = csh.marshal(bb.B)
|
||||
putColumnsHeader(csh)
|
||||
bh.columnsHeaderOffset = sw.columnsHeaderWriter.bytesWritten
|
||||
bh.columnsHeaderSize = uint64(len(bb.B))
|
||||
if bh.columnsHeaderSize > maxColumnsHeaderSize {
|
||||
logger.Panicf("BUG: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", bh.columnsHeaderSize, maxColumnsHeaderSize)
|
||||
}
|
||||
sw.columnsHeaderWriter.MustWrite(bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
|
||||
// mustReadFrom reads block data associated with bh from sr to bd.
|
||||
func (bd *blockData) mustReadFrom(bh *blockHeader, sr *streamReaders) {
|
||||
bd.reset()
|
||||
|
||||
bd.streamID = bh.streamID
|
||||
bd.uncompressedSizeBytes = bh.uncompressedSizeBytes
|
||||
bd.rowsCount = bh.rowsCount
|
||||
|
||||
// Read timestamps
|
||||
bd.timestampsData.mustReadFrom(&bh.timestampsHeader, sr, &bd.a)
|
||||
|
||||
// Read columns
|
||||
if bh.columnsHeaderOffset != sr.columnsHeaderReader.bytesRead {
|
||||
logger.Panicf("FATAL: %s: unexpected columnsHeaderOffset=%d; must equal to the number of bytes read: %d",
|
||||
sr.columnsHeaderReader.Path(), bh.columnsHeaderOffset, sr.columnsHeaderReader.bytesRead)
|
||||
}
|
||||
columnsHeaderSize := bh.columnsHeaderSize
|
||||
if columnsHeaderSize > maxColumnsHeaderSize {
|
||||
logger.Panicf("BUG: %s: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes", sr.columnsHeaderReader.Path(), columnsHeaderSize, maxColumnsHeaderSize)
|
||||
}
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(columnsHeaderSize))
|
||||
sr.columnsHeaderReader.MustReadFull(bb.B)
|
||||
|
||||
csh := getColumnsHeader()
|
||||
if err := csh.unmarshal(bb.B); err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal columnsHeader: %s", sr.columnsHeaderReader.Path(), err)
|
||||
}
|
||||
longTermBufPool.Put(bb)
|
||||
chs := csh.columnHeaders
|
||||
cds := bd.resizeColumnsData(len(chs))
|
||||
for i := range chs {
|
||||
cds[i].mustReadFrom(&chs[i], sr, &bd.a)
|
||||
}
|
||||
bd.constColumns = append(bd.constColumns[:0], csh.constColumns...)
|
||||
putColumnsHeader(csh)
|
||||
}
|
||||
|
||||
// timestampsData contains the encoded timestamps data.
|
||||
type timestampsData struct {
|
||||
// data contains packed timestamps data.
|
||||
data []byte
|
||||
|
||||
// marshalType is the marshal type for timestamps
|
||||
marshalType encoding.MarshalType
|
||||
|
||||
// minTimestamp is the minimum timestamp in the timestamps data
|
||||
minTimestamp int64
|
||||
|
||||
// maxTimestamp is the maximum timestamp in the timestamps data
|
||||
maxTimestamp int64
|
||||
}
|
||||
|
||||
// reset resets td for subsequent re-use
|
||||
func (td *timestampsData) reset() {
|
||||
td.data = nil
|
||||
td.marshalType = 0
|
||||
td.minTimestamp = 0
|
||||
td.maxTimestamp = 0
|
||||
}
|
||||
|
||||
// copyFrom copies src to td.
|
||||
func (td *timestampsData) copyFrom(src *timestampsData, a *arena) {
|
||||
td.reset()
|
||||
|
||||
td.data = a.copyBytes(src.data)
|
||||
td.marshalType = src.marshalType
|
||||
td.minTimestamp = src.minTimestamp
|
||||
td.maxTimestamp = src.maxTimestamp
|
||||
}
|
||||
|
||||
// mustWriteTo writes td to sw and updates th accordingly
|
||||
func (td *timestampsData) mustWriteTo(th *timestampsHeader, sw *streamWriters) {
|
||||
th.reset()
|
||||
|
||||
th.marshalType = td.marshalType
|
||||
th.minTimestamp = td.minTimestamp
|
||||
th.maxTimestamp = td.maxTimestamp
|
||||
th.blockOffset = sw.timestampsWriter.bytesWritten
|
||||
th.blockSize = uint64(len(td.data))
|
||||
if th.blockSize > maxTimestampsBlockSize {
|
||||
logger.Panicf("BUG: too big timestampsHeader.blockSize: %d bytes; mustn't exceed %d bytes", th.blockSize, maxTimestampsBlockSize)
|
||||
}
|
||||
sw.timestampsWriter.MustWrite(td.data)
|
||||
}
|
||||
|
||||
// mustReadFrom reads timestamps data associated with th from sr to td.
|
||||
func (td *timestampsData) mustReadFrom(th *timestampsHeader, sr *streamReaders, a *arena) {
|
||||
td.reset()
|
||||
|
||||
td.marshalType = th.marshalType
|
||||
td.minTimestamp = th.minTimestamp
|
||||
td.maxTimestamp = th.maxTimestamp
|
||||
|
||||
timestampsReader := &sr.timestampsReader
|
||||
if th.blockOffset != timestampsReader.bytesRead {
|
||||
logger.Panicf("FATAL: %s: unexpected timestampsHeader.blockOffset=%d; must equal to the number of bytes read: %d",
|
||||
timestampsReader.Path(), th.blockOffset, timestampsReader.bytesRead)
|
||||
}
|
||||
timestampsBlockSize := th.blockSize
|
||||
if timestampsBlockSize > maxTimestampsBlockSize {
|
||||
logger.Panicf("FATAL: %s: too big timestamps block with %d bytes; the maximum supported block size is %d bytes",
|
||||
timestampsReader.Path(), timestampsBlockSize, maxTimestampsBlockSize)
|
||||
}
|
||||
td.data = a.newBytes(int(timestampsBlockSize))
|
||||
timestampsReader.MustReadFull(td.data)
|
||||
}
|
||||
|
||||
// columnData contains packed data for a single column.
|
||||
type columnData struct {
|
||||
// name is the column name
|
||||
name string
|
||||
|
||||
// valueType is the type of values stored in valuesData
|
||||
valueType valueType
|
||||
|
||||
// minValue is the minimum encoded uint* or float64 value in the columnHeader
|
||||
//
|
||||
// It is used for fast detection of whether the given columnHeader contains values in the given range
|
||||
minValue uint64
|
||||
|
||||
// maxValue is the maximum encoded uint* or float64 value in the columnHeader
|
||||
//
|
||||
// It is used for fast detection of whether the given columnHeader contains values in the given range
|
||||
maxValue uint64
|
||||
|
||||
// valuesDict contains unique values for valueType = valueTypeDict
|
||||
valuesDict valuesDict
|
||||
|
||||
// valuesData contains packed values data for the given column
|
||||
valuesData []byte
|
||||
|
||||
// bloomFilterData contains packed bloomFilter data for the given column
|
||||
bloomFilterData []byte
|
||||
}
|
||||
|
||||
// reset resets cd for subsequent re-use
|
||||
func (cd *columnData) reset() {
|
||||
cd.name = ""
|
||||
cd.valueType = 0
|
||||
|
||||
cd.minValue = 0
|
||||
cd.maxValue = 0
|
||||
cd.valuesDict.reset()
|
||||
|
||||
cd.valuesData = nil
|
||||
cd.bloomFilterData = nil
|
||||
}
|
||||
|
||||
// copyFrom copies src to cd.
|
||||
func (cd *columnData) copyFrom(src *columnData, a *arena) {
|
||||
cd.reset()
|
||||
|
||||
cd.name = src.name
|
||||
cd.valueType = src.valueType
|
||||
|
||||
cd.minValue = src.minValue
|
||||
cd.maxValue = src.maxValue
|
||||
cd.valuesDict.copyFrom(&src.valuesDict)
|
||||
|
||||
cd.valuesData = a.copyBytes(src.valuesData)
|
||||
cd.bloomFilterData = a.copyBytes(src.bloomFilterData)
|
||||
}
|
||||
|
||||
// mustWriteTo writes cd to sw and updates ch accordingly.
|
||||
func (cd *columnData) mustWriteTo(ch *columnHeader, sw *streamWriters) {
|
||||
ch.reset()
|
||||
|
||||
valuesWriter := &sw.fieldValuesWriter
|
||||
bloomFilterWriter := &sw.fieldBloomFilterWriter
|
||||
if cd.name == "" {
|
||||
valuesWriter = &sw.messageValuesWriter
|
||||
bloomFilterWriter = &sw.messageBloomFilterWriter
|
||||
}
|
||||
|
||||
ch.name = cd.name
|
||||
ch.valueType = cd.valueType
|
||||
|
||||
ch.minValue = cd.minValue
|
||||
ch.maxValue = cd.maxValue
|
||||
ch.valuesDict.copyFrom(&cd.valuesDict)
|
||||
|
||||
// marshal values
|
||||
ch.valuesSize = uint64(len(cd.valuesData))
|
||||
if ch.valuesSize > maxValuesBlockSize {
|
||||
logger.Panicf("BUG: too big valuesSize: %d bytes; mustn't exceed %d bytes", ch.valuesSize, maxValuesBlockSize)
|
||||
}
|
||||
ch.valuesOffset = valuesWriter.bytesWritten
|
||||
valuesWriter.MustWrite(cd.valuesData)
|
||||
|
||||
// marshal bloom filter
|
||||
ch.bloomFilterSize = uint64(len(cd.bloomFilterData))
|
||||
if ch.bloomFilterSize > maxBloomFilterBlockSize {
|
||||
logger.Panicf("BUG: too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", ch.bloomFilterSize, maxBloomFilterBlockSize)
|
||||
}
|
||||
ch.bloomFilterOffset = bloomFilterWriter.bytesWritten
|
||||
bloomFilterWriter.MustWrite(cd.bloomFilterData)
|
||||
}
|
||||
|
||||
// mustReadFrom reads columns data associated with ch from sr to cd.
|
||||
func (cd *columnData) mustReadFrom(ch *columnHeader, sr *streamReaders, a *arena) {
|
||||
cd.reset()
|
||||
|
||||
valuesReader := &sr.fieldValuesReader
|
||||
bloomFilterReader := &sr.fieldBloomFilterReader
|
||||
if ch.name == "" {
|
||||
valuesReader = &sr.messageValuesReader
|
||||
bloomFilterReader = &sr.messageBloomFilterReader
|
||||
}
|
||||
|
||||
cd.name = ch.name
|
||||
cd.valueType = ch.valueType
|
||||
|
||||
cd.minValue = ch.minValue
|
||||
cd.maxValue = ch.maxValue
|
||||
cd.valuesDict.copyFrom(&ch.valuesDict)
|
||||
|
||||
// read values
|
||||
if ch.valuesOffset != valuesReader.bytesRead {
|
||||
logger.Panicf("FATAL: %s: unexpected columnHeader.valuesOffset=%d; must equal to the number of bytes read: %d",
|
||||
valuesReader.Path(), ch.valuesOffset, valuesReader.bytesRead)
|
||||
}
|
||||
valuesSize := ch.valuesSize
|
||||
if valuesSize > maxValuesBlockSize {
|
||||
logger.Panicf("FATAL: %s: values block size cannot exceed %d bytes; got %d bytes", valuesReader.Path(), maxValuesBlockSize, valuesSize)
|
||||
}
|
||||
cd.valuesData = a.newBytes(int(valuesSize))
|
||||
valuesReader.MustReadFull(cd.valuesData)
|
||||
|
||||
// read bloom filter
|
||||
// The bloom filter is missing for valueTypeDict columns.
|
||||
if ch.valueType != valueTypeDict {
|
||||
if ch.bloomFilterOffset != bloomFilterReader.bytesRead {
|
||||
logger.Panicf("FATAL: %s: unexpected columnHeader.bloomFilterOffset=%d; must equal to the number of bytes read: %d",
|
||||
bloomFilterReader.Path(), ch.bloomFilterOffset, bloomFilterReader.bytesRead)
|
||||
}
|
||||
bloomFilterSize := ch.bloomFilterSize
|
||||
if bloomFilterSize > maxBloomFilterBlockSize {
|
||||
logger.Panicf("FATAL: %s: bloom filter block size cannot exceed %d bytes; got %d bytes", bloomFilterReader.Path(), maxBloomFilterBlockSize, bloomFilterSize)
|
||||
}
|
||||
cd.bloomFilterData = a.newBytes(int(bloomFilterSize))
|
||||
bloomFilterReader.MustReadFull(cd.bloomFilterData)
|
||||
}
|
||||
}
|
lib/logstorage/block_data_test.go (new file, 106 lines)
@@ -0,0 +1,106 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
)
|
||||
|
||||
func TestBlockDataReset(t *testing.T) {
|
||||
bd := &blockData{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 432,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 2344,
|
||||
rowsCount: 134,
|
||||
timestampsData: timestampsData{
|
||||
data: []byte("foo"),
|
||||
marshalType: encoding.MarshalTypeDeltaConst,
|
||||
minTimestamp: 1234,
|
||||
maxTimestamp: 23443,
|
||||
},
|
||||
columnsData: []columnData{
|
||||
{
|
||||
name: "foo",
|
||||
valueType: valueTypeUint16,
|
||||
valuesData: []byte("aaa"),
|
||||
bloomFilterData: []byte("bsdf"),
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
bd.reset()
|
||||
bdZero := &blockData{
|
||||
columnsData: []columnData{},
|
||||
constColumns: []Field{},
|
||||
}
|
||||
if !reflect.DeepEqual(bd, bdZero) {
|
||||
t.Fatalf("unexpected non-zero blockData after reset: %v", bd)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlockDataCopyFrom(t *testing.T) {
|
||||
f := func(bd *blockData) {
|
||||
t.Helper()
|
||||
var bd2 blockData
|
||||
bd2.copyFrom(bd)
|
||||
bd2.a.b = nil
|
||||
if !reflect.DeepEqual(bd, &bd2) {
|
||||
t.Fatalf("unexpected blockData copy\ngot\n%v\nwant\n%v", &bd2, bd)
|
||||
}
|
||||
|
||||
// Try copying it again to the same destination
|
||||
bd2.copyFrom(bd)
|
||||
bd2.a.b = nil
|
||||
if !reflect.DeepEqual(bd, &bd2) {
|
||||
t.Fatalf("unexpected blockData copy to the same destination\ngot\n%v\nwant\n%v", &bd2, bd)
|
||||
}
|
||||
}
|
||||
f(&blockData{})
|
||||
|
||||
bd := &blockData{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 432,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 8943,
|
||||
rowsCount: 134,
|
||||
timestampsData: timestampsData{
|
||||
data: []byte("foo"),
|
||||
marshalType: encoding.MarshalTypeDeltaConst,
|
||||
minTimestamp: 1234,
|
||||
maxTimestamp: 23443,
|
||||
},
|
||||
columnsData: []columnData{
|
||||
{
|
||||
name: "foo",
|
||||
valueType: valueTypeUint16,
|
||||
valuesData: []byte("aaa"),
|
||||
bloomFilterData: []byte("bsdf"),
|
||||
},
|
||||
{
|
||||
name: "bar",
|
||||
valuesData: []byte("aaa"),
|
||||
bloomFilterData: []byte("bsdf"),
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "foobar",
|
||||
Value: "baz",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(bd)
|
||||
}
|
lib/logstorage/block_header.go (new file, 766 lines)
@@ -0,0 +1,766 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// blockHeader contains information about a single block.
|
||||
//
|
||||
// blockHeader is stored in the indexFilename file.
|
||||
type blockHeader struct {
|
||||
// streamID is a stream id for entries in the block
|
||||
streamID streamID
|
||||
|
||||
// uncompressedSizeBytes is the original (uncompressed) size of log entries stored in the block
|
||||
uncompressedSizeBytes uint64
|
||||
|
||||
// rowsCount is the number of log entries stored in the block
|
||||
rowsCount uint64
|
||||
|
||||
// timestampsHeader contains information about timestamps for log entries in the block
|
||||
timestampsHeader timestampsHeader
|
||||
|
||||
// columnsHeaderOffset is the offset of columnsHeader at columnsHeaderFilename
|
||||
columnsHeaderOffset uint64
|
||||
|
||||
// columnsHeaderSize is the size of columnsHeader at columnsHeaderFilename
|
||||
columnsHeaderSize uint64
|
||||
}
|
||||
|
||||
// reset resets bh, so it can be re-used.
|
||||
func (bh *blockHeader) reset() {
|
||||
bh.streamID.reset()
|
||||
bh.uncompressedSizeBytes = 0
|
||||
bh.rowsCount = 0
|
||||
bh.timestampsHeader.reset()
|
||||
bh.columnsHeaderOffset = 0
|
||||
bh.columnsHeaderSize = 0
|
||||
}
|
||||
|
||||
func (bh *blockHeader) copyFrom(src *blockHeader) {
|
||||
bh.reset()
|
||||
|
||||
bh.streamID = src.streamID
|
||||
bh.uncompressedSizeBytes = src.uncompressedSizeBytes
|
||||
bh.rowsCount = src.rowsCount
|
||||
bh.timestampsHeader.copyFrom(&src.timestampsHeader)
|
||||
bh.columnsHeaderOffset = src.columnsHeaderOffset
|
||||
bh.columnsHeaderSize = src.columnsHeaderSize
|
||||
}
|
||||
|
||||
// marshal appends the marshaled bh to dst and returns the result.
|
||||
func (bh *blockHeader) marshal(dst []byte) []byte {
|
||||
// Do not store the version used for encoding directly in the block header, since:
|
||||
// - all the block headers in the same part use the same encoding
|
||||
// - the block header encoding version can be put in metadata file for the part (aka metadataFilename)
|
||||
|
||||
dst = bh.streamID.marshal(dst)
|
||||
dst = encoding.MarshalVarUint64(dst, bh.uncompressedSizeBytes)
|
||||
dst = encoding.MarshalVarUint64(dst, bh.rowsCount)
|
||||
dst = bh.timestampsHeader.marshal(dst)
|
||||
dst = encoding.MarshalVarUint64(dst, bh.columnsHeaderOffset)
|
||||
dst = encoding.MarshalVarUint64(dst, bh.columnsHeaderSize)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshal unmarshals bh from src and returns the remaining tail.
|
||||
func (bh *blockHeader) unmarshal(src []byte) ([]byte, error) {
|
||||
bh.reset()
|
||||
|
||||
srcOrig := src
|
||||
|
||||
// unmarshal bh.streamID
|
||||
tail, err := bh.streamID.unmarshal(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal streamID: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
// unmarshal bh.uncompressedSizeBytes
|
||||
tail, n, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal uncompressedSizeBytes: %w", err)
|
||||
}
|
||||
bh.uncompressedSizeBytes = n
|
||||
src = tail
|
||||
|
||||
// unmarshal bh.rowsCount
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal rowsCount: %w", err)
|
||||
}
|
||||
if n > maxRowsPerBlock {
|
||||
return srcOrig, fmt.Errorf("too big value for rowsCount: %d; mustn't exceed %d", n, maxRowsPerBlock)
|
||||
}
|
||||
bh.rowsCount = n
|
||||
src = tail
|
||||
|
||||
// unmarshal bh.timestampsHeader
|
||||
tail, err = bh.timestampsHeader.unmarshal(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal timestampsHeader: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
// unmarshal columnsHeaderOffset
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal columnsHeaderOffset: %w", err)
|
||||
}
|
||||
bh.columnsHeaderOffset = n
|
||||
src = tail
|
||||
|
||||
// unmarshal columnsHeaderSize
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal columnsHeaderSize: %w", err)
|
||||
}
|
||||
if n > maxColumnsHeaderSize {
|
||||
return srcOrig, fmt.Errorf("too big value for columnsHeaderSize: %d; mustn't exceed %d", n, maxColumnsHeaderSize)
|
||||
}
|
||||
bh.columnsHeaderSize = n
|
||||
src = tail
|
||||
|
||||
return src, nil
|
||||
}
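// Editorial sketch, not part of the original commit: the numeric blockHeader fields are
// stored with variable-length encoding, so small values occupy a single byte. Round trip:
func exampleVarUint64RoundTripSketch(v uint64) (uint64, error) {
	buf := encoding.MarshalVarUint64(nil, v)
	_, n, err := encoding.UnmarshalVarUint64(buf)
	return n, err // n == v when err == nil
}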
|
||||
|
||||
func getBlockHeader() *blockHeader {
|
||||
v := blockHeaderPool.Get()
|
||||
if v == nil {
|
||||
return &blockHeader{}
|
||||
}
|
||||
return v.(*blockHeader)
|
||||
}
|
||||
|
||||
func putBlockHeader(bh *blockHeader) {
|
||||
bh.reset()
|
||||
blockHeaderPool.Put(bh)
|
||||
}
|
||||
|
||||
var blockHeaderPool sync.Pool
|
||||
|
||||
// unmarshalBlockHeaders appends blockHeader entries unmarshaled from src to dst and returns the result.
|
||||
func unmarshalBlockHeaders(dst []blockHeader, src []byte) ([]blockHeader, error) {
|
||||
dstOrig := dst
|
||||
for len(src) > 0 {
|
||||
if len(dst) < cap(dst) {
|
||||
dst = dst[:len(dst)+1]
|
||||
} else {
|
||||
dst = append(dst, blockHeader{})
|
||||
}
|
||||
bh := &dst[len(dst)-1]
|
||||
tail, err := bh.unmarshal(src)
|
||||
if err != nil {
|
||||
return dstOrig, fmt.Errorf("cannot unmarshal blockHeader entries: %w", err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
if err := validateBlockHeaders(dst[len(dstOrig):]); err != nil {
|
||||
return dstOrig, err
|
||||
}
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
func validateBlockHeaders(bhs []blockHeader) error {
|
||||
for i := 1; i < len(bhs); i++ {
|
||||
bhCurr := &bhs[i]
|
||||
bhPrev := &bhs[i-1]
|
||||
if bhCurr.streamID.less(&bhPrev.streamID) {
|
||||
return fmt.Errorf("unexpected blockHeader with smaller streamID=%s after bigger streamID=%s at position %d", &bhCurr.streamID, &bhPrev.streamID, i)
|
||||
}
|
||||
if !bhCurr.streamID.equal(&bhPrev.streamID) {
|
||||
continue
|
||||
}
|
||||
thCurr := bhCurr.timestampsHeader
|
||||
thPrev := bhPrev.timestampsHeader
|
||||
if thCurr.minTimestamp < thPrev.minTimestamp {
|
||||
return fmt.Errorf("unexpected blockHeader with smaller timestamp=%d after bigger timestamp=%d at position %d", thCurr.minTimestamp, thPrev.minTimestamp, i)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
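// Editorial sketch, not part of the original commit: block headers must be ordered by
// streamID and, within a single stream, by non-decreasing minTimestamp. For example:
func exampleValidateBlockHeadersOrderingSketch(sid streamID) error {
	bhs := []blockHeader{
		{streamID: sid, timestampsHeader: timestampsHeader{minTimestamp: 100}},
		{streamID: sid, timestampsHeader: timestampsHeader{minTimestamp: 50}},
	}
	// The second header has a smaller minTimestamp for the same stream,
	// so validateBlockHeaders returns a non-nil error here.
	return validateBlockHeaders(bhs)
}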
|
||||
|
||||
func resetBlockHeaders(bhs []blockHeader) []blockHeader {
|
||||
for i := range bhs {
|
||||
bhs[i].reset()
|
||||
}
|
||||
return bhs[:0]
|
||||
}
|
||||
|
||||
func getColumnsHeader() *columnsHeader {
|
||||
v := columnsHeaderPool.Get()
|
||||
if v == nil {
|
||||
return &columnsHeader{}
|
||||
}
|
||||
return v.(*columnsHeader)
|
||||
}
|
||||
|
||||
func putColumnsHeader(csh *columnsHeader) {
|
||||
csh.reset()
|
||||
columnsHeaderPool.Put(csh)
|
||||
}
|
||||
|
||||
var columnsHeaderPool sync.Pool
|
||||
|
||||
// columnsHeader contains information about columns in a single block.
|
||||
//
|
||||
// columnsHeader is stored in the columnsHeaderFilename file.
|
||||
type columnsHeader struct {
|
||||
// columnHeaders contains the information about every column seen in the block.
|
||||
columnHeaders []columnHeader
|
||||
|
||||
// constColumns contain fields with constant values across all the block entries.
|
||||
constColumns []Field
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) reset() {
|
||||
chs := csh.columnHeaders
|
||||
for i := range chs {
|
||||
chs[i].reset()
|
||||
}
|
||||
csh.columnHeaders = chs[:0]
|
||||
|
||||
ccs := csh.constColumns
|
||||
for i := range ccs {
|
||||
ccs[i].Reset()
|
||||
}
|
||||
csh.constColumns = ccs[:0]
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) getConstColumnValue(name string) string {
|
||||
if name == "_msg" {
|
||||
name = ""
|
||||
}
|
||||
ccs := csh.constColumns
|
||||
for i := range ccs {
|
||||
cc := &ccs[i]
|
||||
if cc.Name == name {
|
||||
return cc.Value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) getColumnHeader(name string) *columnHeader {
|
||||
if name == "_msg" {
|
||||
name = ""
|
||||
}
|
||||
chs := csh.columnHeaders
|
||||
for i := range chs {
|
||||
ch := &chs[i]
|
||||
if ch.name == name {
|
||||
return ch
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
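// Editorial sketch, not part of the original commit: both lookups above treat "_msg" as
// an alias for the unnamed message column, so callers may use either name:
func exampleMsgAliasSketch(csh *columnsHeader) {
	chByAlias := csh.getColumnHeader("_msg")
	chByEmptyName := csh.getColumnHeader("")
	// Both calls return the same column header (or nil if the block has no message column).
	_, _ = chByAlias, chByEmptyName
}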
|
||||
|
||||
func (csh *columnsHeader) resizeConstColumns(columnsLen int) []Field {
|
||||
ccs := csh.constColumns
|
||||
if n := columnsLen - cap(ccs); n > 0 {
|
||||
ccs = append(ccs[:cap(ccs)], make([]Field, n)...)
|
||||
}
|
||||
ccs = ccs[:columnsLen]
|
||||
csh.constColumns = ccs
|
||||
return ccs
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) resizeColumnHeaders(columnHeadersLen int) []columnHeader {
|
||||
chs := csh.columnHeaders
|
||||
if n := columnHeadersLen - cap(chs); n > 0 {
|
||||
chs = append(chs[:cap(chs)], make([]columnHeader, n)...)
|
||||
}
|
||||
chs = chs[:columnHeadersLen]
|
||||
csh.columnHeaders = chs
|
||||
return chs
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) marshal(dst []byte) []byte {
|
||||
chs := csh.columnHeaders
|
||||
dst = encoding.MarshalVarUint64(dst, uint64(len(chs)))
|
||||
for i := range chs {
|
||||
dst = chs[i].marshal(dst)
|
||||
}
|
||||
|
||||
ccs := csh.constColumns
|
||||
dst = encoding.MarshalVarUint64(dst, uint64(len(ccs)))
|
||||
for i := range ccs {
|
||||
dst = ccs[i].marshal(dst)
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (csh *columnsHeader) unmarshal(src []byte) error {
|
||||
csh.reset()
|
||||
|
||||
// unmarshal columnHeaders
|
||||
tail, n, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal columnHeaders len: %w", err)
|
||||
}
|
||||
if n > maxColumnsPerBlock {
|
||||
return fmt.Errorf("too many column headers: %d; mustn't exceed %d", n, maxColumnsPerBlock)
|
||||
}
|
||||
src = tail
|
||||
chs := csh.resizeColumnHeaders(int(n))
|
||||
for i := range chs {
|
||||
tail, err = chs[i].unmarshal(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal columnHeader %d out of %d columnHeaders: %w", i, len(chs), err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
csh.columnHeaders = chs
|
||||
|
||||
// unmarshal constColumns
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal constColumns len: %w", err)
|
||||
}
|
||||
if n+uint64(len(csh.columnHeaders)) > maxColumnsPerBlock {
|
||||
return fmt.Errorf("too many columns: %d; mustn't exceed %d", n+uint64(len(csh.columnHeaders)), maxColumnsPerBlock)
|
||||
}
|
||||
src = tail
|
||||
ccs := csh.resizeConstColumns(int(n))
|
||||
for i := range ccs {
|
||||
tail, err = ccs[i].unmarshal(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal constColumn %d out of %d columns: %w", i, len(ccs), err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
|
||||
// Verify that the src is empty
|
||||
if len(src) > 0 {
|
||||
return fmt.Errorf("unexpected non-empty tail left after unmarshaling columnsHeader: len(tail)=%d", len(src))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// columnHeader contains information about the values which belong to a single label in a single block.
//
// The main column with an empty name is stored in messageValuesFilename,
// while the rest of the columns are stored in fieldValuesFilename.
// This allows minimizing disk read IO when filtering by non-message columns.
//
// Every block column also contains a bloom filter for all the tokens stored in the column.
// This bloom filter is used for quickly determining whether the given block may contain the given tokens.
//
// Tokens in the bloom filter depend on valueType:
//
// - valueTypeString stores lowercased tokens seen in all the values
// - valueTypeDict doesn't store anything in the bloom filter, since all the encoded values
//   are available directly in the valuesDict field
// - valueTypeUint8, valueTypeUint16, valueTypeUint32 and valueTypeUint64 store encoded uint values
// - valueTypeFloat64 stores encoded float64 values
// - valueTypeIPv4 stores IPs encoded into uint32
// - valueTypeTimestampISO8601 stores timestamps encoded into uint64
//
// The bloom filter for the main column with an empty name is stored in messageBloomFilename,
// while bloom filters for the rest of the columns are stored in fieldBloomFilename.
|
||||
type columnHeader struct {
|
||||
// name contains column name aka label name
|
||||
name string
|
||||
|
||||
// valueType is the type of values stored in the block
|
||||
valueType valueType
|
||||
|
||||
// minValue is the minimum encoded value for uint*, ipv4, timestamp and float64 value in the columnHeader
|
||||
//
|
||||
// It is used for fast detection of whether the given columnHeader contains values in the given range
|
||||
minValue uint64
|
||||
|
||||
// maxValue is the maximum encoded value for uint*, ipv4, timestamp and float64 value in the columnHeader
|
||||
//
|
||||
// It is used for fast detection of whether the given columnHeader contains values in the given range
|
||||
maxValue uint64
|
||||
|
||||
// valuesDict contains unique values for valueType = valueTypeDict
|
||||
valuesDict valuesDict
|
||||
|
||||
// valuesOffset contains the offset of the block in either messageValuesFilename or fieldValuesFilename
|
||||
valuesOffset uint64
|
||||
|
||||
// valuesSize contains the size of the block in either messageValuesFilename or fieldValuesFilename
|
||||
valuesSize uint64
|
||||
|
||||
// bloomFilterOffset contains the offset of the bloom filter in either messageBloomFilename or fieldBloomFilename
|
||||
bloomFilterOffset uint64
|
||||
|
||||
// bloomFilterSize contains the size of the bloom filter in either messageBloomFilename or fieldBloomFilename
|
||||
bloomFilterSize uint64
|
||||
}
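// Editorial sketch, not part of the original commit: minValue/maxValue allow skipping
// blocks cheaply during queries. For uint-encoded columns a filter could do roughly:
func uintColumnMayContainSketch(ch *columnHeader, v uint64) bool {
	// Note: for valueTypeFloat64 the bounds are stored via math.Float64bits and must be
	// converted back with math.Float64frombits before comparing (see marshal/unmarshal below).
	return v >= ch.minValue && v <= ch.maxValue
}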
|
||||
|
||||
// reset resets ch
|
||||
func (ch *columnHeader) reset() {
|
||||
ch.name = ""
|
||||
ch.valueType = 0
|
||||
|
||||
ch.minValue = 0
|
||||
ch.maxValue = 0
|
||||
ch.valuesDict.reset()
|
||||
|
||||
ch.valuesOffset = 0
|
||||
ch.valuesSize = 0
|
||||
|
||||
ch.bloomFilterOffset = 0
|
||||
ch.bloomFilterSize = 0
|
||||
}
|
||||
|
||||
// marshal appends marshaled ch to dst and returns the result.
|
||||
func (ch *columnHeader) marshal(dst []byte) []byte {
|
||||
// check minValue/maxValue
|
||||
if ch.valueType == valueTypeFloat64 {
|
||||
minValue := math.Float64frombits(ch.minValue)
|
||||
maxValue := math.Float64frombits(ch.maxValue)
|
||||
if minValue > maxValue {
|
||||
logger.Panicf("BUG: minValue=%g must be smaller than maxValue=%g", minValue, maxValue)
|
||||
}
|
||||
} else {
|
||||
if ch.minValue > ch.maxValue {
|
||||
logger.Panicf("BUG: minValue=%d must be smaller than maxValue=%d", ch.minValue, ch.maxValue)
|
||||
}
|
||||
}
|
||||
|
||||
// Encode common fields - ch.name and ch.valueType
|
||||
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(ch.name))
|
||||
dst = append(dst, byte(ch.valueType))
|
||||
|
||||
// Encode other fields depending on ch.valueType
|
||||
switch ch.valueType {
|
||||
case valueTypeString:
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeDict:
|
||||
dst = ch.valuesDict.marshal(dst)
|
||||
dst = ch.marshalValues(dst)
|
||||
case valueTypeUint8:
|
||||
dst = append(dst, byte(ch.minValue))
|
||||
dst = append(dst, byte(ch.maxValue))
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeUint16:
|
||||
dst = encoding.MarshalUint16(dst, uint16(ch.minValue))
|
||||
dst = encoding.MarshalUint16(dst, uint16(ch.maxValue))
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeUint32:
|
||||
dst = encoding.MarshalUint32(dst, uint32(ch.minValue))
|
||||
dst = encoding.MarshalUint32(dst, uint32(ch.maxValue))
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeUint64:
|
||||
dst = encoding.MarshalUint64(dst, ch.minValue)
|
||||
dst = encoding.MarshalUint64(dst, ch.maxValue)
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeFloat64:
|
||||
// float64 values are encoded as uint64 via math.Float64bits()
|
||||
dst = encoding.MarshalUint64(dst, ch.minValue)
|
||||
dst = encoding.MarshalUint64(dst, ch.maxValue)
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeIPv4:
|
||||
dst = encoding.MarshalUint32(dst, uint32(ch.minValue))
|
||||
dst = encoding.MarshalUint32(dst, uint32(ch.maxValue))
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
case valueTypeTimestampISO8601:
|
||||
// timestamps are encoded in nanoseconds
|
||||
dst = encoding.MarshalUint64(dst, ch.minValue)
|
||||
dst = encoding.MarshalUint64(dst, ch.maxValue)
|
||||
dst = ch.marshalValuesAndBloomFilters(dst)
|
||||
default:
|
||||
logger.Panicf("BUG: unknown valueType=%d", ch.valueType)
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func (ch *columnHeader) marshalValuesAndBloomFilters(dst []byte) []byte {
|
||||
dst = ch.marshalValues(dst)
|
||||
dst = ch.marshalBloomFilters(dst)
|
||||
return dst
|
||||
}
|
||||
|
||||
func (ch *columnHeader) marshalValues(dst []byte) []byte {
|
||||
dst = encoding.MarshalVarUint64(dst, ch.valuesOffset)
|
||||
dst = encoding.MarshalVarUint64(dst, ch.valuesSize)
|
||||
return dst
|
||||
}
|
||||
|
||||
func (ch *columnHeader) marshalBloomFilters(dst []byte) []byte {
|
||||
dst = encoding.MarshalVarUint64(dst, ch.bloomFilterOffset)
|
||||
dst = encoding.MarshalVarUint64(dst, ch.bloomFilterSize)
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshal unmarshals ch from src and returns the tail left after unmarshaling.
|
||||
func (ch *columnHeader) unmarshal(src []byte) ([]byte, error) {
|
||||
ch.reset()
|
||||
|
||||
srcOrig := src
|
||||
|
||||
// Unmarshal column name
|
||||
tail, data, err := encoding.UnmarshalBytes(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal column name: %w", err)
|
||||
}
|
||||
// Do not use bytesutil.InternBytes(data) here, since it is slower than string(data) in production
|
||||
ch.name = string(data)
|
||||
src = tail
|
||||
|
||||
// Unmarshal value type
|
||||
if len(src) < 1 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal valueType from 0 bytes for column %q; need at least 1 byte", ch.name)
|
||||
}
|
||||
ch.valueType = valueType(src[0])
|
||||
src = src[1:]
|
||||
|
||||
// Unmarshal the rest of data depending on valueType
|
||||
switch ch.valueType {
|
||||
case valueTypeString:
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeString for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeDict:
|
||||
tail, err = ch.valuesDict.unmarshal(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal dict at valueTypeDict for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
tail, err = ch.unmarshalValues(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values at valueTypeDict for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeUint8:
|
||||
if len(src) < 2 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint8 from %d bytes for column %q; need at least 2 bytes", len(src), ch.name)
|
||||
}
|
||||
ch.minValue = uint64(src[0])
|
||||
ch.maxValue = uint64(src[1])
|
||||
src = src[2:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint8 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeUint16:
|
||||
if len(src) < 4 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint16 from %d bytes for column %q; need at least 4 bytes", len(src), ch.name)
|
||||
}
|
||||
ch.minValue = uint64(encoding.UnmarshalUint16(src))
|
||||
ch.maxValue = uint64(encoding.UnmarshalUint16(src[2:]))
|
||||
src = src[4:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint16 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeUint32:
|
||||
if len(src) < 8 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint32 from %d bytes for column %q; need at least 8 bytes", len(src), ch.name)
|
||||
}
|
||||
ch.minValue = uint64(encoding.UnmarshalUint32(src))
|
||||
ch.maxValue = uint64(encoding.UnmarshalUint32(src[4:]))
|
||||
src = src[8:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint32 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeUint64:
|
||||
if len(src) < 16 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeUint64 from %d bytes for column %q; need at least 16 bytes", len(src), ch.name)
|
||||
}
|
||||
ch.minValue = encoding.UnmarshalUint64(src)
|
||||
ch.maxValue = encoding.UnmarshalUint64(src[8:])
|
||||
src = src[16:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeUint64 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeFloat64:
|
||||
if len(src) < 16 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeFloat64 from %d bytes for column %q; need at least 16 bytes", len(src), ch.name)
|
||||
}
|
||||
// min and max values must be converted to real values with math.Float64frombits() during querying.
|
||||
ch.minValue = encoding.UnmarshalUint64(src)
|
||||
ch.maxValue = encoding.UnmarshalUint64(src[8:])
|
||||
src = src[16:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeFloat64 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeIPv4:
|
||||
if len(src) < 8 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeIPv4 from %d bytes for column %q; need at least 8 bytes", len(src), ch.name)
|
||||
}
|
||||
ch.minValue = uint64(encoding.UnmarshalUint32(src))
|
||||
ch.maxValue = uint64(encoding.UnmarshalUint32(src[4:]))
|
||||
src = src[8:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeIPv4 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
case valueTypeTimestampISO8601:
|
||||
if len(src) < 16 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal min/max values at valueTypeTimestampISO8601 from %d bytes for column %q; need at least 16 bytes",
|
||||
len(src), ch.name)
|
||||
}
|
||||
ch.minValue = encoding.UnmarshalUint64(src)
|
||||
ch.maxValue = encoding.UnmarshalUint64(src[8:])
|
||||
src = src[16:]
|
||||
|
||||
tail, err = ch.unmarshalValuesAndBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values and bloom filters at valueTypeTimestampISO8601 for column %q: %w", ch.name, err)
|
||||
}
|
||||
src = tail
|
||||
default:
|
||||
return srcOrig, fmt.Errorf("unexpected valueType=%d for column %q", ch.valueType, ch.name)
|
||||
}
|
||||
|
||||
return src, nil
|
||||
}
|
||||
|
||||
func (ch *columnHeader) unmarshalValuesAndBloomFilters(src []byte) ([]byte, error) {
|
||||
srcOrig := src
|
||||
|
||||
tail, err := ch.unmarshalValues(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal values: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
tail, err = ch.unmarshalBloomFilters(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal bloom filters: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
return src, nil
|
||||
}
|
||||
|
||||
func (ch *columnHeader) unmarshalValues(src []byte) ([]byte, error) {
|
||||
srcOrig := src
|
||||
|
||||
tail, n, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal valuesOffset: %w", err)
|
||||
}
|
||||
ch.valuesOffset = n
|
||||
src = tail
|
||||
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal valuesSize: %w", err)
|
||||
}
|
||||
if n > maxValuesBlockSize {
|
||||
return srcOrig, fmt.Errorf("too big valuesSize: %d bytes; mustn't exceed %d bytes", n, maxValuesBlockSize)
|
||||
}
|
||||
ch.valuesSize = n
|
||||
src = tail
|
||||
|
||||
return src, nil
|
||||
}
|
||||
|
||||
func (ch *columnHeader) unmarshalBloomFilters(src []byte) ([]byte, error) {
|
||||
srcOrig := src
|
||||
|
||||
tail, n, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal bloomFilterOffset: %w", err)
|
||||
}
|
||||
ch.bloomFilterOffset = n
|
||||
src = tail
|
||||
|
||||
tail, n, err = encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal bloomFilterSize: %w", err)
|
||||
}
|
||||
if n > maxBloomFilterBlockSize {
|
||||
return srcOrig, fmt.Errorf("too big bloomFilterSize: %d bytes; mustn't exceed %d bytes", n, maxBloomFilterBlockSize)
|
||||
}
|
||||
ch.bloomFilterSize = n
|
||||
src = tail
|
||||
|
||||
return src, nil
|
||||
}
|
||||
|
||||
// timestampsHeader contains the information about timestamps block.
|
||||
type timestampsHeader struct {
|
||||
// blockOffset is an offset of timestamps block inside timestampsFilename file
|
||||
blockOffset uint64
|
||||
|
||||
// blockSize is the size of the timestamps block inside timestampsFilename file
|
||||
blockSize uint64
|
||||
|
||||
// minTimestamp is the minimum timestamp seen in the block
|
||||
minTimestamp int64
|
||||
|
||||
// maxTimestamp is the maximum timestamp seen in the block
|
||||
maxTimestamp int64
|
||||
|
||||
// marshalType is the type used for encoding the timestamps block
|
||||
marshalType encoding.MarshalType
|
||||
}
|
||||
|
||||
// reset resets th, so it can be reused
|
||||
func (th *timestampsHeader) reset() {
|
||||
th.blockOffset = 0
|
||||
th.blockSize = 0
|
||||
th.minTimestamp = 0
|
||||
th.maxTimestamp = 0
|
||||
th.marshalType = 0
|
||||
}
|
||||
|
||||
func (th *timestampsHeader) copyFrom(src *timestampsHeader) {
|
||||
th.blockOffset = src.blockOffset
|
||||
th.blockSize = src.blockSize
|
||||
th.minTimestamp = src.minTimestamp
|
||||
th.maxTimestamp = src.maxTimestamp
|
||||
th.marshalType = src.marshalType
|
||||
}
|
||||
|
||||
// marshal appends marshaled th to dst and returns the result.
|
||||
func (th *timestampsHeader) marshal(dst []byte) []byte {
|
||||
dst = encoding.MarshalUint64(dst, th.blockOffset)
|
||||
dst = encoding.MarshalUint64(dst, th.blockSize)
|
||||
dst = encoding.MarshalUint64(dst, uint64(th.minTimestamp))
|
||||
dst = encoding.MarshalUint64(dst, uint64(th.maxTimestamp))
|
||||
dst = append(dst, byte(th.marshalType))
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshal unmarshals th from src and returns the tail left after the unmarshaling.
|
||||
func (th *timestampsHeader) unmarshal(src []byte) ([]byte, error) {
|
||||
th.reset()
|
||||
|
||||
if len(src) < 33 {
|
||||
return src, fmt.Errorf("cannot unmarshal timestampsHeader from %d bytes; need at least 33 bytes", len(src))
|
||||
}
|
||||
|
||||
th.blockOffset = encoding.UnmarshalUint64(src)
|
||||
th.blockSize = encoding.UnmarshalUint64(src[8:])
|
||||
th.minTimestamp = int64(encoding.UnmarshalUint64(src[16:]))
|
||||
th.maxTimestamp = int64(encoding.UnmarshalUint64(src[24:]))
|
||||
th.marshalType = encoding.MarshalType(src[32])
|
||||
|
||||
return src[33:], nil
|
||||
}
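// Editorial sketch, not part of the original commit: timestampsHeader uses a fixed-size
// encoding - four 8-byte fields plus a single byte for the marshal type:
const timestampsHeaderEncodedSizeSketch = 8 + 8 + 8 + 8 + 1 // = 33 bytes, matching the check above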
|
lib/logstorage/block_header_test.go (new file, 454 lines)
@@ -0,0 +1,454 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
)
|
||||
|
||||
func TestBlockHeaderMarshalUnmarshal(t *testing.T) {
|
||||
f := func(bh *blockHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := bh.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected lengths of the marshaled blockHeader; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
bh2 := &blockHeader{}
|
||||
tail, err := bh2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error in unmarshal: %s", err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
t.Fatalf("unexpected non-empty tail after unmarshal: %X", tail)
|
||||
}
|
||||
if !reflect.DeepEqual(bh, bh2) {
|
||||
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", bh2, bh)
|
||||
}
|
||||
}
|
||||
f(&blockHeader{}, 61)
|
||||
f(&blockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
lo: 3443,
|
||||
hi: 23434,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 4344,
|
||||
rowsCount: 1234,
|
||||
timestampsHeader: timestampsHeader{
|
||||
blockOffset: 13234,
|
||||
blockSize: 8843,
|
||||
minTimestamp: -4334,
|
||||
maxTimestamp: 23434,
|
||||
marshalType: encoding.MarshalTypeNearestDelta2,
|
||||
},
|
||||
columnsHeaderOffset: 4384,
|
||||
columnsHeaderSize: 894,
|
||||
}, 65)
|
||||
}
|
||||
|
||||
func TestColumnsHeaderMarshalUnmarshal(t *testing.T) {
|
||||
f := func(csh *columnsHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := csh.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected lengths of the marshaled columnsHeader; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
csh2 := &columnsHeader{}
|
||||
err := csh2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error in unmarshal: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(csh, csh2) {
|
||||
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", csh2, csh)
|
||||
}
|
||||
}
|
||||
f(&columnsHeader{}, 2)
|
||||
f(&columnsHeader{
|
||||
columnHeaders: []columnHeader{
|
||||
{
|
||||
name: "foobar",
|
||||
valueType: valueTypeString,
|
||||
valuesOffset: 12345,
|
||||
valuesSize: 23434,
|
||||
bloomFilterOffset: 89843,
|
||||
bloomFilterSize: 8934,
|
||||
},
|
||||
{
|
||||
name: "message",
|
||||
valueType: valueTypeUint16,
|
||||
minValue: 123,
|
||||
maxValue: 456,
|
||||
valuesOffset: 3412345,
|
||||
valuesSize: 234434,
|
||||
bloomFilterOffset: 83,
|
||||
bloomFilterSize: 34,
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}, 50)
|
||||
}
|
||||
|
||||
func TestBlockHeaderUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
bh := getBlockHeader()
|
||||
defer putBlockHeader(bh)
|
||||
tail, err := bh.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail;\ngot\n%q\nwant\n%q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
|
||||
bh := blockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
lo: 3443,
|
||||
hi: 23434,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 4344,
|
||||
rowsCount: 1234,
|
||||
timestampsHeader: timestampsHeader{
|
||||
blockOffset: 13234,
|
||||
blockSize: 8843,
|
||||
minTimestamp: -4334,
|
||||
maxTimestamp: 23434,
|
||||
marshalType: encoding.MarshalTypeNearestDelta2,
|
||||
},
|
||||
columnsHeaderOffset: 4384,
|
||||
columnsHeaderSize: 894,
|
||||
}
|
||||
data := bh.marshal(nil)
|
||||
for len(data) > 0 {
|
||||
data = data[:len(data)-1]
|
||||
f(data)
|
||||
}
|
||||
}
|
||||
|
||||
func TestColumnsHeaderUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
csh := getColumnsHeader()
|
||||
defer putColumnsHeader(csh)
|
||||
err := csh.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
|
||||
csh := columnsHeader{
|
||||
columnHeaders: []columnHeader{
|
||||
{
|
||||
name: "foobar",
|
||||
valueType: valueTypeString,
|
||||
valuesOffset: 12345,
|
||||
valuesSize: 23434,
|
||||
bloomFilterOffset: 89843,
|
||||
bloomFilterSize: 8934,
|
||||
},
|
||||
{
|
||||
name: "message",
|
||||
valueType: valueTypeUint16,
|
||||
minValue: 123,
|
||||
maxValue: 456,
|
||||
valuesOffset: 3412345,
|
||||
valuesSize: 234434,
|
||||
bloomFilterOffset: 83,
|
||||
bloomFilterSize: 34,
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
data := csh.marshal(nil)
|
||||
for len(data) > 0 {
|
||||
data = data[:len(data)-1]
|
||||
f(data)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlockHeaderReset(t *testing.T) {
|
||||
bh := &blockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
lo: 3443,
|
||||
hi: 23434,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 8984,
|
||||
rowsCount: 1234,
|
||||
timestampsHeader: timestampsHeader{
|
||||
blockOffset: 13234,
|
||||
blockSize: 8843,
|
||||
minTimestamp: -4334,
|
||||
maxTimestamp: 23434,
|
||||
marshalType: encoding.MarshalTypeNearestDelta2,
|
||||
},
|
||||
columnsHeaderOffset: 12332,
|
||||
columnsHeaderSize: 234,
|
||||
}
|
||||
bh.reset()
|
||||
bhZero := &blockHeader{}
|
||||
if !reflect.DeepEqual(bh, bhZero) {
|
||||
t.Fatalf("unexpected non-zero blockHeader after reset: %v", bh)
|
||||
}
|
||||
}
|
||||
|
||||
func TestColumnsHeaderReset(t *testing.T) {
|
||||
csh := &columnsHeader{
|
||||
columnHeaders: []columnHeader{
|
||||
{
|
||||
name: "foobar",
|
||||
valueType: valueTypeString,
|
||||
valuesOffset: 12345,
|
||||
valuesSize: 23434,
|
||||
bloomFilterOffset: 89843,
|
||||
bloomFilterSize: 8934,
|
||||
},
|
||||
{
|
||||
name: "message",
|
||||
valueType: valueTypeUint16,
|
||||
minValue: 123,
|
||||
maxValue: 456,
|
||||
valuesOffset: 3412345,
|
||||
valuesSize: 234434,
|
||||
bloomFilterOffset: 83,
|
||||
bloomFilterSize: 34,
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
csh.reset()
|
||||
cshZero := &columnsHeader{
|
||||
columnHeaders: []columnHeader{},
|
||||
constColumns: []Field{},
|
||||
}
|
||||
if !reflect.DeepEqual(csh, cshZero) {
|
||||
t.Fatalf("unexpected non-zero columnsHeader after reset: %v", csh)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarshalUnmarshalBlockHeaders(t *testing.T) {
|
||||
f := func(bhs []blockHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
var data []byte
|
||||
for i := range bhs {
|
||||
data = bhs[i].marshal(data)
|
||||
}
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected length for marshaled blockHeader entries; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
bhs2, err := unmarshalBlockHeaders(nil, data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error when unmarshaling blockHeader entries: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(bhs, bhs2) {
|
||||
t.Fatalf("unexpected blockHeader entries unmarshaled\ngot\n%v\nwant\n%v", bhs2, bhs)
|
||||
}
|
||||
}
|
||||
f(nil, 0)
|
||||
f([]blockHeader{{}}, 61)
|
||||
f([]blockHeader{
|
||||
{},
|
||||
{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
lo: 3443,
|
||||
hi: 23434,
|
||||
},
|
||||
},
|
||||
uncompressedSizeBytes: 89894,
|
||||
rowsCount: 1234,
|
||||
timestampsHeader: timestampsHeader{
|
||||
blockOffset: 13234,
|
||||
blockSize: 8843,
|
||||
minTimestamp: -4334,
|
||||
maxTimestamp: 23434,
|
||||
marshalType: encoding.MarshalTypeNearestDelta2,
|
||||
},
|
||||
columnsHeaderOffset: 12332,
|
||||
columnsHeaderSize: 234,
|
||||
},
|
||||
}, 127)
|
||||
}
|
||||
|
||||
func TestColumnHeaderMarshalUnmarshal(t *testing.T) {
|
||||
f := func(ch *columnHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := ch.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected marshaled length of columnHeader; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
var ch2 columnHeader
|
||||
tail, err := ch2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error in umarshal(%v): %s", ch, err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
t.Fatalf("unexpected non-empty tail after unmarshal(%v): %X", ch, tail)
|
||||
}
|
||||
if !reflect.DeepEqual(ch, &ch2) {
|
||||
t.Fatalf("unexpected columnHeader after unmarshal;\ngot\n%v\nwant\n%v", &ch2, ch)
|
||||
}
|
||||
}
|
||||
f(&columnHeader{
|
||||
name: "foo",
|
||||
valueType: valueTypeUint8,
|
||||
}, 11)
|
||||
ch := &columnHeader{
|
||||
name: "foobar",
|
||||
valueType: valueTypeDict,
|
||||
|
||||
valuesOffset: 12345,
|
||||
valuesSize: 254452,
|
||||
}
|
||||
ch.valuesDict.getOrAdd("abc")
|
||||
f(ch, 18)
|
||||
}
|
||||
|
||||
func TestColumnHeaderUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var ch columnHeader
|
||||
tail, err := ch.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
|
||||
ch := &columnHeader{
|
||||
name: "abc",
|
||||
valueType: valueTypeUint16,
|
||||
bloomFilterSize: 3244,
|
||||
}
|
||||
data := ch.marshal(nil)
|
||||
f(data[:len(data)-1])
|
||||
}
|
||||
|
||||
func TestColumnHeaderReset(t *testing.T) {
|
||||
ch := &columnHeader{
|
||||
name: "foobar",
|
||||
valueType: valueTypeUint16,
|
||||
|
||||
valuesOffset: 12345,
|
||||
valuesSize: 254452,
|
||||
|
||||
bloomFilterOffset: 34898234,
|
||||
bloomFilterSize: 873434,
|
||||
}
|
||||
ch.valuesDict.getOrAdd("abc")
|
||||
ch.reset()
|
||||
chZero := &columnHeader{}
|
||||
chZero.valuesDict.values = []string{}
|
||||
if !reflect.DeepEqual(ch, chZero) {
|
||||
t.Fatalf("unexpected non-zero columnHeader after reset: %v", ch)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTimestampsHeaderMarshalUnmarshal(t *testing.T) {
|
||||
f := func(th *timestampsHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := th.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected length of marshaled timestampsHeader; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
var th2 timestampsHeader
|
||||
tail, err := th2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error in unmarshal(%v): %s", th, err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
t.Fatalf("unexpected non-nil tail after unmarshal(%v): %X", th, tail)
|
||||
}
|
||||
if !reflect.DeepEqual(th, &th2) {
|
||||
t.Fatalf("unexpected timestampsHeader after unmarshal; got\n%v\nwant\n%v", &th2, th)
|
||||
}
|
||||
}
|
||||
f(&timestampsHeader{}, 33)
|
||||
|
||||
f(&timestampsHeader{
|
||||
blockOffset: 12345,
|
||||
blockSize: 3424834,
|
||||
minTimestamp: -123443,
|
||||
maxTimestamp: 234343,
|
||||
marshalType: encoding.MarshalTypeZSTDNearestDelta,
|
||||
}, 33)
|
||||
}
|
||||
|
||||
func TestTimestampsHeaderUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var th timestampsHeader
|
||||
tail, err := th.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
}
|
||||
|
||||
func TestTimestampsHeaderReset(t *testing.T) {
|
||||
th := &timestampsHeader{
|
||||
blockOffset: 12345,
|
||||
blockSize: 3424834,
|
||||
minTimestamp: -123443,
|
||||
maxTimestamp: 234343,
|
||||
marshalType: encoding.MarshalTypeZSTDNearestDelta,
|
||||
}
|
||||
th.reset()
|
||||
thZero := &timestampsHeader{}
|
||||
if !reflect.DeepEqual(th, thZero) {
|
||||
t.Fatalf("unexpected non-zero timestampsHeader after reset: %v", th)
|
||||
}
|
||||
}
|
645
lib/logstorage/block_search.go
Normal file
|
@ -0,0 +1,645 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
type blockSearchWork struct {
|
||||
// p is the part where the block belongs to.
|
||||
p *part
|
||||
|
||||
// so contains search options for the block search
|
||||
so *searchOptions
|
||||
|
||||
// bh is the header of the block to search.
|
||||
bh blockHeader
|
||||
}
|
||||
|
||||
func newBlockSearchWork(p *part, so *searchOptions, bh *blockHeader) *blockSearchWork {
|
||||
var bsw blockSearchWork
|
||||
bsw.p = p
|
||||
bsw.so = so
|
||||
bsw.bh.copyFrom(bh)
|
||||
return &bsw
|
||||
}
|
||||
|
||||
func getBlockSearch() *blockSearch {
|
||||
v := blockSearchPool.Get()
|
||||
if v == nil {
|
||||
return &blockSearch{}
|
||||
}
|
||||
return v.(*blockSearch)
|
||||
}
|
||||
|
||||
func putBlockSearch(bs *blockSearch) {
|
||||
bs.reset()
|
||||
blockSearchPool.Put(bs)
|
||||
}
|
||||
|
||||
var blockSearchPool sync.Pool
|
||||
|
||||
type blockSearch struct {
|
||||
// bsw is the actual work to be performed on the block pointed to by bsw.bh
|
||||
bsw *blockSearchWork
|
||||
|
||||
// br contains result for the search in the block after search() call
|
||||
br blockResult
|
||||
|
||||
// timestampsCache contains cached timestamps for the given block.
|
||||
timestampsCache *encoding.Int64s
|
||||
|
||||
// bloomFilterCache contains cached bloom filters for requested columns in the given block
|
||||
bloomFilterCache map[string]*bloomFilter
|
||||
|
||||
// valuesCache contains cached values for requested columns in the given block
|
||||
valuesCache map[string]*stringBucket
|
||||
|
||||
// sbu is used for unmarshaling local columns
|
||||
sbu stringsBlockUnmarshaler
|
||||
|
||||
// csh is the columnsHeader associated with the given block
|
||||
csh columnsHeader
|
||||
}
|
||||
|
||||
func (bs *blockSearch) reset() {
|
||||
bs.bsw = nil
|
||||
bs.br.reset()
|
||||
|
||||
if bs.timestampsCache != nil {
|
||||
encoding.PutInt64s(bs.timestampsCache)
|
||||
bs.timestampsCache = nil
|
||||
}
|
||||
|
||||
bloomFilterCache := bs.bloomFilterCache
|
||||
for k, bf := range bloomFilterCache {
|
||||
putBloomFilter(bf)
|
||||
delete(bloomFilterCache, k)
|
||||
}
|
||||
|
||||
valuesCache := bs.valuesCache
|
||||
for k, values := range valuesCache {
|
||||
putStringBucket(values)
|
||||
delete(valuesCache, k)
|
||||
}
|
||||
|
||||
bs.sbu.reset()
|
||||
bs.csh.reset()
|
||||
}
|
||||
|
||||
func (bs *blockSearch) partPath() string {
|
||||
return bs.bsw.p.path
|
||||
}
|
||||
|
||||
func (bs *blockSearch) search(bsw *blockSearchWork) {
|
||||
bs.reset()
|
||||
|
||||
bs.bsw = bsw
|
||||
|
||||
bs.csh.initFromBlockHeader(bsw.p, &bsw.bh)
|
||||
|
||||
// search rows matching the given filter
|
||||
bm := getFilterBitmap(int(bsw.bh.rowsCount))
|
||||
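// All the bits are set initially, so every row is considered matching; the filter below clears the bits for non-matching rows.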
bm.setBits()
|
||||
bs.bsw.so.filter.apply(bs, bm)
|
||||
|
||||
bs.br.mustInit(bs, bm)
|
||||
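// Stop early if the filter matched no rows in the block.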
if bm.isZero() {
|
||||
putFilterBitmap(bm)
|
||||
return
|
||||
}
|
||||
|
||||
// fetch the requested columns to bs.br.
|
||||
for _, columnName := range bs.bsw.so.resultColumnNames {
|
||||
switch columnName {
|
||||
case "_stream":
|
||||
bs.br.addStreamColumn(bs)
|
||||
case "_time":
|
||||
bs.br.addTimeColumn(bs)
|
||||
default:
|
||||
v := bs.csh.getConstColumnValue(columnName)
|
||||
if v != "" {
|
||||
bs.br.addConstColumn(v)
|
||||
continue
|
||||
}
|
||||
ch := bs.csh.getColumnHeader(columnName)
|
||||
if ch == nil {
|
||||
bs.br.addConstColumn("")
|
||||
} else {
|
||||
bs.br.addColumn(bs, ch, bm)
|
||||
}
|
||||
}
|
||||
}
|
||||
putFilterBitmap(bm)
|
||||
}
|
||||
|
||||
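// initFromBlockHeader reads the columnsHeader data for bh from p and unmarshals it into csh.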
func (csh *columnsHeader) initFromBlockHeader(p *part, bh *blockHeader) {
|
||||
bb := longTermBufPool.Get()
|
||||
columnsHeaderSize := bh.columnsHeaderSize
|
||||
if columnsHeaderSize > maxColumnsHeaderSize {
|
||||
logger.Panicf("FATAL: %s: columns header size cannot exceed %d bytes; got %d bytes", p.path, maxColumnsHeaderSize, columnsHeaderSize)
|
||||
}
|
||||
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(columnsHeaderSize))
|
||||
p.columnsHeaderFile.MustReadAt(bb.B, int64(bh.columnsHeaderOffset))
|
||||
|
||||
if err := csh.unmarshal(bb.B); err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal columns header: %s", p.path, err)
|
||||
}
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
|
||||
// getBloomFilterForColumn returns bloom filter for the given ch.
|
||||
//
|
||||
// The returned bloom filter belongs to bs, so it becomes invalid after bs reset.
|
||||
func (bs *blockSearch) getBloomFilterForColumn(ch *columnHeader) *bloomFilter {
|
||||
bf := bs.bloomFilterCache[ch.name]
|
||||
if bf != nil {
|
||||
return bf
|
||||
}
|
||||
|
||||
p := bs.bsw.p
|
||||
|
||||
bloomFilterFile := p.fieldBloomFilterFile
|
||||
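// An empty column name denotes the message column, whose data is stored in the message* files.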
if ch.name == "" {
|
||||
bloomFilterFile = p.messageBloomFilterFile
|
||||
}
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bloomFilterSize := ch.bloomFilterSize
|
||||
if bloomFilterSize > maxBloomFilterBlockSize {
|
||||
logger.Panicf("FATAL: %s: bloom filter block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxBloomFilterBlockSize, bloomFilterSize)
|
||||
}
|
||||
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(bloomFilterSize))
|
||||
bloomFilterFile.MustReadAt(bb.B, int64(ch.bloomFilterOffset))
|
||||
bf = getBloomFilter()
|
||||
if err := bf.unmarshal(bb.B); err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal bloom filter: %s", bs.partPath(), err)
|
||||
}
|
||||
longTermBufPool.Put(bb)
|
||||
|
||||
if bs.bloomFilterCache == nil {
|
||||
bs.bloomFilterCache = make(map[string]*bloomFilter)
|
||||
}
|
||||
bs.bloomFilterCache[ch.name] = bf
|
||||
return bf
|
||||
}
|
||||
|
||||
// getValuesForColumn returns block values for the given ch.
|
||||
//
|
||||
// The returned values belong to bs, so they become invalid after bs reset.
|
||||
func (bs *blockSearch) getValuesForColumn(ch *columnHeader) []string {
|
||||
values := bs.valuesCache[ch.name]
|
||||
if values != nil {
|
||||
return values.a
|
||||
}
|
||||
|
||||
p := bs.bsw.p
|
||||
|
||||
valuesFile := p.fieldValuesFile
|
||||
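// Values for the column with an empty name (the message column) are stored in the message* files.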
if ch.name == "" {
|
||||
valuesFile = p.messageValuesFile
|
||||
}
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
valuesSize := ch.valuesSize
|
||||
if valuesSize > maxValuesBlockSize {
|
||||
logger.Panicf("FATAL: %s: values block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxValuesBlockSize, valuesSize)
|
||||
}
|
||||
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(valuesSize))
|
||||
valuesFile.MustReadAt(bb.B, int64(ch.valuesOffset))
|
||||
|
||||
values = getStringBucket()
|
||||
var err error
|
||||
values.a, err = bs.sbu.unmarshal(values.a[:0], bb.B, bs.bsw.bh.rowsCount)
|
||||
longTermBufPool.Put(bb)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal column %q: %s", bs.partPath(), ch.name, err)
|
||||
}
|
||||
|
||||
if bs.valuesCache == nil {
|
||||
bs.valuesCache = make(map[string]*stringBucket)
|
||||
}
|
||||
bs.valuesCache[ch.name] = values
|
||||
return values.a
|
||||
}
|
||||
|
||||
// getTimestamps returns timestamps for the given bs.
|
||||
//
|
||||
// The returned timestamps belong to bs, so they become invalid after bs reset.
|
||||
func (bs *blockSearch) getTimestamps() []int64 {
|
||||
timestamps := bs.timestampsCache
|
||||
if timestamps != nil {
|
||||
return timestamps.A
|
||||
}
|
||||
|
||||
p := bs.bsw.p
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
th := &bs.bsw.bh.timestampsHeader
|
||||
blockSize := th.blockSize
|
||||
if blockSize > maxTimestampsBlockSize {
|
||||
logger.Panicf("FATAL: %s: timestamps block size cannot exceed %d bytes; got %d bytes", bs.partPath(), maxTimestampsBlockSize, blockSize)
|
||||
}
|
||||
bb.B = bytesutil.ResizeNoCopyMayOverallocate(bb.B, int(blockSize))
|
||||
p.timestampsFile.MustReadAt(bb.B, int64(th.blockOffset))
|
||||
|
||||
rowsCount := int(bs.bsw.bh.rowsCount)
|
||||
timestamps = encoding.GetInt64s(rowsCount)
|
||||
var err error
|
||||
timestamps.A, err = encoding.UnmarshalTimestamps(timestamps.A[:0], bb.B, th.marshalType, th.minTimestamp, rowsCount)
|
||||
longTermBufPool.Put(bb)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal timestamps: %s", bs.partPath(), err)
|
||||
}
|
||||
bs.timestampsCache = timestamps
|
||||
return timestamps.A
|
||||
}
|
||||
|
||||
// mustReadBlockHeaders reads the block headers for ih from p, appends them to dst and returns the result.
|
||||
func (ih *indexBlockHeader) mustReadBlockHeaders(dst []blockHeader, p *part) []blockHeader {
|
||||
bbCompressed := longTermBufPool.Get()
|
||||
indexBlockSize := ih.indexBlockSize
|
||||
if indexBlockSize > maxIndexBlockSize {
|
||||
logger.Panicf("FATAL: %s: index block size cannot exceed %d bytes; got %d bytes", p.indexFile.Path(), maxIndexBlockSize, indexBlockSize)
|
||||
}
|
||||
bbCompressed.B = bytesutil.ResizeNoCopyMayOverallocate(bbCompressed.B, int(indexBlockSize))
|
||||
p.indexFile.MustReadAt(bbCompressed.B, int64(ih.indexBlockOffset))
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
var err error
|
||||
bb.B, err = encoding.DecompressZSTD(bb.B, bbCompressed.B)
|
||||
longTermBufPool.Put(bbCompressed)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot decompress indexBlock read at offset %d with size %d: %s", p.indexFile.Path(), ih.indexBlockOffset, ih.indexBlockSize, err)
|
||||
}
|
||||
|
||||
dst, err = unmarshalBlockHeaders(dst, bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal block headers read at offset %d with size %d: %s", p.indexFile.Path(), ih.indexBlockOffset, ih.indexBlockSize, err)
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
type blockResult struct {
|
||||
buf []byte
|
||||
valuesBuf []string
|
||||
|
||||
// streamID is streamID for the given blockResult
|
||||
streamID streamID
|
||||
|
||||
// cs contains values for result columns
|
||||
cs []blockResultColumn
|
||||
|
||||
// timestamps contain timestamps for the selected log entries
|
||||
timestamps []int64
|
||||
}
|
||||
|
||||
func (br *blockResult) reset() {
|
||||
br.buf = br.buf[:0]
|
||||
|
||||
vb := br.valuesBuf
|
||||
for i := range vb {
|
||||
vb[i] = ""
|
||||
}
|
||||
br.valuesBuf = vb[:0]
|
||||
|
||||
br.streamID.reset()
|
||||
|
||||
cs := br.cs
|
||||
for i := range cs {
|
||||
cs[i].reset()
|
||||
}
|
||||
br.cs = cs[:0]
|
||||
|
||||
br.timestamps = br.timestamps[:0]
|
||||
}
|
||||
|
||||
func (br *blockResult) RowsCount() int {
|
||||
return len(br.timestamps)
|
||||
}
|
||||
|
||||
func (br *blockResult) mustInit(bs *blockSearch, bm *filterBitmap) {
|
||||
br.reset()
|
||||
|
||||
br.streamID = bs.bsw.bh.streamID
|
||||
|
||||
if !bm.isZero() {
|
||||
// Initialize timestamps, since they are used for determining the number of rows in br.RowsCount()
|
||||
srcTimestamps := bs.getTimestamps()
|
||||
dstTimestamps := br.timestamps[:0]
|
||||
bm.forEachSetBit(func(idx int) bool {
|
||||
ts := srcTimestamps[idx]
|
||||
dstTimestamps = append(dstTimestamps, ts)
|
||||
return true
|
||||
})
|
||||
br.timestamps = dstTimestamps
|
||||
}
|
||||
}
|
||||
|
||||
func (br *blockResult) addColumn(bs *blockSearch, ch *columnHeader, bm *filterBitmap) {
|
||||
buf := br.buf
|
||||
valuesBuf := br.valuesBuf
|
||||
valuesBufLen := len(valuesBuf)
|
||||
var dictValues []string
|
||||
|
||||
appendValue := func(v string) {
|
||||
bufLen := len(buf)
|
||||
buf = append(buf, v...)
|
||||
s := bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
valuesBuf = append(valuesBuf, s)
|
||||
}
|
||||
|
||||
switch ch.valueType {
|
||||
case valueTypeString:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeDict:
|
||||
dictValues = ch.valuesDict.values
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 1 {
|
||||
logger.Panicf("FATAL: %s: unexpected dict value size for column %q; got %d bytes; want 1 byte", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
dictIdx := v[0]
|
||||
if int(dictIdx) >= len(dictValues) {
|
||||
logger.Panicf("FATAL: %s: too big dict index for column %q: %d; should be smaller than %d", bs.partPath(), ch.name, dictIdx, len(dictValues))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeUint8:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 1 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for uint8 column %q; got %d bytes; want 1 byte", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeUint16:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 2 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for uint16 column %q; got %d bytes; want 2 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeUint32:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 4 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for uint32 column %q; got %d bytes; want 4 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeUint64:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 8 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for uint64 column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeFloat64:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 8 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for float64 column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeIPv4:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 4 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for ipv4 column %q; got %d bytes; want 4 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
case valueTypeTimestampISO8601:
|
||||
visitValues(bs, ch, bm, func(v string) bool {
|
||||
if len(v) != 8 {
|
||||
logger.Panicf("FATAL: %s: unexpected size for timestmap column %q; got %d bytes; want 8 bytes", bs.partPath(), ch.name, len(v))
|
||||
}
|
||||
appendValue(v)
|
||||
return true
|
||||
})
|
||||
default:
|
||||
logger.Panicf("FATAL: %s: unknown valueType=%d for column %q", bs.partPath(), ch.valueType, ch.name)
|
||||
}
|
||||
|
||||
encodedValues := valuesBuf[valuesBufLen:]
|
||||
|
||||
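// Copy dict values into buffers owned by br, so the resulting column doesn't reference ch-owned memory.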
valuesBufLen = len(valuesBuf)
|
||||
for _, v := range dictValues {
|
||||
appendValue(v)
|
||||
}
|
||||
dictValues = valuesBuf[valuesBufLen:]
|
||||
|
||||
br.cs = append(br.cs, blockResultColumn{
|
||||
valueType: ch.valueType,
|
||||
dictValues: dictValues,
|
||||
encodedValues: encodedValues,
|
||||
})
|
||||
br.buf = buf
|
||||
br.valuesBuf = valuesBuf
|
||||
}
|
||||
|
||||
func (br *blockResult) addTimeColumn(bs *blockSearch) {
|
||||
br.cs = append(br.cs, blockResultColumn{
|
||||
isTime: true,
|
||||
})
|
||||
}
|
||||
|
||||
func (br *blockResult) addStreamColumn(bs *blockSearch) {
|
||||
bb := bbPool.Get()
|
||||
bb.B = bs.bsw.p.pt.appendStreamTagsByStreamID(bb.B[:0], &br.streamID)
|
||||
if len(bb.B) > 0 {
|
||||
st := GetStreamTags()
|
||||
mustUnmarshalStreamTags(st, bb.B)
|
||||
bb.B = st.marshalString(bb.B[:0])
|
||||
PutStreamTags(st)
|
||||
}
|
||||
s := bytesutil.ToUnsafeString(bb.B)
|
||||
br.addConstColumn(s)
|
||||
bbPool.Put(bb)
|
||||
}
|
||||
|
||||
func (br *blockResult) addConstColumn(value string) {
|
||||
buf := br.buf
|
||||
bufLen := len(buf)
|
||||
buf = append(buf, value...)
|
||||
s := bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
br.buf = buf
|
||||
|
||||
valuesBuf := br.valuesBuf
|
||||
valuesBufLen := len(valuesBuf)
|
||||
valuesBuf = append(valuesBuf, s)
|
||||
br.valuesBuf = valuesBuf
|
||||
|
||||
br.cs = append(br.cs, blockResultColumn{
|
||||
isConst: true,
|
||||
valueType: valueTypeUnknown,
|
||||
encodedValues: valuesBuf[valuesBufLen:],
|
||||
})
|
||||
}
|
||||
|
||||
// getColumnValues returns values for the column with the given idx.
|
||||
//
|
||||
// The returned values are valid until br.reset() is called.
|
||||
func (br *blockResult) getColumnValues(idx int) []string {
|
||||
c := &br.cs[idx]
|
||||
if c.values != nil {
|
||||
return c.values
|
||||
}
|
||||
|
||||
buf := br.buf
|
||||
valuesBuf := br.valuesBuf
|
||||
valuesBufLen := len(valuesBuf)
|
||||
|
||||
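// A const column stores its single value in encodedValues[0] - replicate it for every selected row.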
if c.isConst {
|
||||
v := c.encodedValues[0]
|
||||
for range br.timestamps {
|
||||
valuesBuf = append(valuesBuf, v)
|
||||
}
|
||||
c.values = valuesBuf[valuesBufLen:]
|
||||
br.valuesBuf = valuesBuf
|
||||
return c.values
|
||||
}
|
||||
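// The _time column is materialized from br.timestamps in RFC3339Nano format.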
if c.isTime {
|
||||
for _, timestamp := range br.timestamps {
|
||||
t := time.Unix(0, timestamp).UTC()
|
||||
bufLen := len(buf)
|
||||
buf = t.AppendFormat(buf, time.RFC3339Nano)
|
||||
s := bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
valuesBuf = append(valuesBuf, s)
|
||||
}
|
||||
c.values = valuesBuf[valuesBufLen:]
|
||||
br.buf = buf
|
||||
br.valuesBuf = valuesBuf
|
||||
return c.values
|
||||
}
|
||||
|
||||
appendValue := func(v string) {
|
||||
bufLen := len(buf)
|
||||
buf = append(buf, v...)
|
||||
s := bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
valuesBuf = append(valuesBuf, s)
|
||||
}
|
||||
|
||||
switch c.valueType {
|
||||
case valueTypeString:
|
||||
c.values = c.encodedValues
|
||||
return c.values
|
||||
case valueTypeDict:
|
||||
dictValues := c.dictValues
|
||||
for _, v := range c.encodedValues {
|
||||
dictIdx := v[0]
|
||||
appendValue(dictValues[dictIdx])
|
||||
}
|
||||
case valueTypeUint8:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
n := uint64(v[0])
|
||||
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeUint16:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
b := bytesutil.ToUnsafeBytes(v)
|
||||
n := uint64(encoding.UnmarshalUint16(b))
|
||||
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeUint32:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
b := bytesutil.ToUnsafeBytes(v)
|
||||
n := uint64(encoding.UnmarshalUint32(b))
|
||||
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeUint64:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
b := bytesutil.ToUnsafeBytes(v)
|
||||
n := encoding.UnmarshalUint64(b)
|
||||
bb.B = strconv.AppendUint(bb.B[:0], n, 10)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeFloat64:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
bb.B = toFloat64String(bb.B[:0], v)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeIPv4:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
bb.B = toIPv4String(bb.B[:0], v)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
case valueTypeTimestampISO8601:
|
||||
bb := bbPool.Get()
|
||||
for _, v := range c.encodedValues {
|
||||
bb.B = toTimestampISO8601String(bb.B[:0], v)
|
||||
appendValue(bytesutil.ToUnsafeString(bb.B))
|
||||
}
|
||||
bbPool.Put(bb)
|
||||
default:
|
||||
logger.Panicf("BUG: unknown valueType=%d", c.valueType)
|
||||
}
|
||||
|
||||
c.values = valuesBuf[valuesBufLen:]
|
||||
br.buf = buf
|
||||
br.valuesBuf = valuesBuf
|
||||
|
||||
return c.values
|
||||
}
|
||||
|
||||
type blockResultColumn struct {
|
||||
// isConst is set to true if the column is const.
|
||||
//
|
||||
// The column value is stored in encodedValues[0]
|
||||
isConst bool
|
||||
|
||||
// isTime is set to true if the column contains _time values.
|
||||
//
|
||||
// The column values are stored in blockResult.timestamps
|
||||
isTime bool
|
||||
|
||||
// valueType is the type of non-const values
|
||||
valueType valueType
|
||||
|
||||
// dictValues contain dictionary values for valueTypeDict column
|
||||
dictValues []string
|
||||
|
||||
// encodedValues contain encoded values for non-const column
|
||||
encodedValues []string
|
||||
|
||||
// values contain decoded values after getColumnValues() call for the given column
|
||||
values []string
|
||||
}
|
||||
|
||||
func (c *blockResultColumn) reset() {
|
||||
c.isConst = false
|
||||
c.isTime = false
|
||||
c.valueType = valueTypeUnknown
|
||||
c.dictValues = nil
|
||||
c.encodedValues = nil
|
||||
c.values = nil
|
||||
}
|
288
lib/logstorage/block_stream_merger.go
Normal file
|
@ -0,0 +1,288 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// mustMergeBlockStreams merges bsrs to bsw and updates ph accordingly.
|
||||
//
|
||||
// Finalize() is guaranteed to be called on bsrs and bsw before returning from the func.
|
||||
func mustMergeBlockStreams(ph *partHeader, bsw *blockStreamWriter, bsrs []*blockStreamReader, stopCh <-chan struct{}) {
|
||||
bsm := getBlockStreamMerger()
|
||||
bsm.mustInit(bsw, bsrs)
|
||||
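// Repeatedly pick the reader with the smallest (streamID, minTimestamp) block and write its block to bsw.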
for len(bsm.readersHeap) > 0 {
|
||||
if needStop(stopCh) {
|
||||
break
|
||||
}
|
||||
bsr := bsm.readersHeap[0]
|
||||
bsm.mustWriteBlock(&bsr.blockData, bsw)
|
||||
if bsr.NextBlock() {
|
||||
heap.Fix(&bsm.readersHeap, 0)
|
||||
} else {
|
||||
heap.Pop(&bsm.readersHeap)
|
||||
}
|
||||
}
|
||||
bsm.mustFlushRows()
|
||||
putBlockStreamMerger(bsm)
|
||||
|
||||
bsw.Finalize(ph)
|
||||
mustCloseBlockStreamReaders(bsrs)
|
||||
}
|
||||
|
||||
// blockStreamMerger merges block streams
|
||||
type blockStreamMerger struct {
|
||||
// bsw is the block stream writer to write the merged blocks.
|
||||
bsw *blockStreamWriter
|
||||
|
||||
// bsrs contains the original readers passed to mustInit().
|
||||
// They are used by ReadersPaths()
|
||||
bsrs []*blockStreamReader
|
||||
|
||||
// readersHeap contains a heap of readers to read blocks to merge.
|
||||
readersHeap blockStreamReadersHeap
|
||||
|
||||
// streamID is the stream ID for the pending data.
|
||||
streamID streamID
|
||||
|
||||
// sbu is the unmarshaler for strings in rows and rowsTmp.
|
||||
sbu *stringsBlockUnmarshaler
|
||||
|
||||
// vd is the decoder for unmarshaled strings.
|
||||
vd *valuesDecoder
|
||||
|
||||
// bd is the pending blockData.
|
||||
// bd is unpacked into rows when needed.
|
||||
bd blockData
|
||||
|
||||
// rows is pending log entries.
|
||||
rows rows
|
||||
|
||||
// rowsTmp is temporary storage for log entries during merge.
|
||||
rowsTmp rows
|
||||
|
||||
// uncompressedRowsSizeBytes is the current size of uncompressed rows.
|
||||
//
|
||||
// It is used for flushing rows to blocks when their size reaches maxUncompressedBlockSize
|
||||
uncompressedRowsSizeBytes uint64
|
||||
}
|
||||
|
||||
func (bsm *blockStreamMerger) reset() {
|
||||
bsm.bsw = nil
|
||||
|
||||
rhs := bsm.readersHeap
|
||||
for i := range rhs {
|
||||
rhs[i] = nil
|
||||
}
|
||||
bsm.readersHeap = rhs[:0]
|
||||
|
||||
bsm.streamID.reset()
|
||||
bsm.resetRows()
|
||||
}
|
||||
|
||||
func (bsm *blockStreamMerger) resetRows() {
|
||||
if bsm.sbu != nil {
|
||||
putStringsBlockUnmarshaler(bsm.sbu)
|
||||
bsm.sbu = nil
|
||||
}
|
||||
if bsm.vd != nil {
|
||||
putValuesDecoder(bsm.vd)
|
||||
bsm.vd = nil
|
||||
}
|
||||
bsm.bd.reset()
|
||||
|
||||
bsm.rows.reset()
|
||||
bsm.rowsTmp.reset()
|
||||
|
||||
bsm.uncompressedRowsSizeBytes = 0
|
||||
}
|
||||
|
||||
func (bsm *blockStreamMerger) mustInit(bsw *blockStreamWriter, bsrs []*blockStreamReader) {
|
||||
bsm.reset()
|
||||
|
||||
bsm.bsw = bsw
|
||||
bsm.bsrs = bsrs
|
||||
|
||||
rsh := bsm.readersHeap[:0]
|
||||
for _, bsr := range bsrs {
|
||||
if bsr.NextBlock() {
|
||||
rsh = append(rsh, bsr)
|
||||
}
|
||||
}
|
||||
bsm.readersHeap = rsh
|
||||
heap.Init(&bsm.readersHeap)
|
||||
}
|
||||
|
||||
// mustWriteBlock writes bd to bsm
|
||||
func (bsm *blockStreamMerger) mustWriteBlock(bd *blockData, bsw *blockStreamWriter) {
|
||||
bsm.checkNextBlock(bd)
|
||||
switch {
|
||||
case !bd.streamID.equal(&bsm.streamID):
|
||||
// The bd contains another streamID.
|
||||
// Write the current log entries under the current streamID, then process the bd.
|
||||
bsm.mustFlushRows()
|
||||
bsm.streamID = bd.streamID
|
||||
if bd.uncompressedSizeBytes >= maxUncompressedBlockSize {
|
||||
// Fast path - write full bd to the output without extracting log entries from it.
|
||||
bsw.MustWriteBlockData(bd)
|
||||
} else {
|
||||
// Slow path - copy the bd to the curr bd.
|
||||
bsm.bd.copyFrom(bd)
|
||||
}
|
||||
case bd.uncompressedSizeBytes >= maxUncompressedBlockSize:
|
||||
// The bd contains the same streamID and it is full,
|
||||
// so it can be written next after the current log entries
|
||||
// without the need to merge the bd with the current log entries.
|
||||
// Write the current log entries and then the bd.
|
||||
bsm.mustFlushRows()
|
||||
bsw.MustWriteBlockData(bd)
|
||||
default:
|
||||
// The bd contains the same streamID and it isn't full,
|
||||
// so it must be merged with the current log entries.
|
||||
bsm.mustMergeRows(bd)
|
||||
}
|
||||
}
|
||||
|
||||
// checkNextBlock checks whether the bd can be written next after the current data.
|
||||
func (bsm *blockStreamMerger) checkNextBlock(bd *blockData) {
|
||||
if len(bsm.rows.timestamps) > 0 && bsm.bd.rowsCount > 0 {
|
||||
logger.Panicf("BUG: bsm.bd must be empty when bsm.rows isn't empty! got %d log entries in bsm.bd", bsm.bd.rowsCount)
|
||||
}
|
||||
if bd.streamID.less(&bsm.streamID) {
|
||||
logger.Panicf("FATAL: cannot merge %s: the streamID=%s for the next block is smaller than the streamID=%s for the current block",
|
||||
bsm.ReadersPaths(), &bd.streamID, &bsm.streamID)
|
||||
}
|
||||
if !bd.streamID.equal(&bsm.streamID) {
|
||||
return
|
||||
}
|
||||
// The streamID at bd equals the streamID at bsm. Check that the minTimestamp in bd is bigger than or equal to the minTimestamp at bsm.
|
||||
if bd.rowsCount == 0 {
|
||||
return
|
||||
}
|
||||
nextMinTimestamp := bd.timestampsData.minTimestamp
|
||||
if len(bsm.rows.timestamps) == 0 {
|
||||
if bsm.bd.rowsCount == 0 {
|
||||
return
|
||||
}
|
||||
minTimestamp := bsm.bd.timestampsData.minTimestamp
|
||||
if nextMinTimestamp < minTimestamp {
|
||||
logger.Panicf("FATAL: cannot merge %s: the next block's minTimestamp=%d is smaller than the minTimestamp=%d for the current block",
|
||||
bsm.ReadersPaths(), nextMinTimestamp, minTimestamp)
|
||||
}
|
||||
return
|
||||
}
|
||||
minTimestamp := bsm.rows.timestamps[0]
|
||||
if nextMinTimestamp < minTimestamp {
|
||||
logger.Panicf("FATAL: cannot merge %s: the next block's minTimestamp=%d is smaller than the minTimestamp=%d for log entries for the current block",
|
||||
bsm.ReadersPaths(), nextMinTimestamp, minTimestamp)
|
||||
}
|
||||
}
|
||||
|
||||
// ReadersPaths returns paths for input blockStreamReaders
|
||||
func (bsm *blockStreamMerger) ReadersPaths() string {
|
||||
paths := make([]string, len(bsm.bsrs))
|
||||
for i, bsr := range bsm.bsrs {
|
||||
paths[i] = bsr.Path()
|
||||
}
|
||||
return fmt.Sprintf("[%s]", strings.Join(paths, ","))
|
||||
}
|
||||
|
||||
// mustMergeRows merges the current log entries inside bsm with bd log entries.
|
||||
func (bsm *blockStreamMerger) mustMergeRows(bd *blockData) {
|
||||
if bsm.bd.rowsCount > 0 {
|
||||
// Unmarshal log entries from bsm.bd
|
||||
bsm.mustUnmarshalRows(&bsm.bd)
|
||||
bsm.bd.reset()
|
||||
}
|
||||
|
||||
// Unmarshal log entries from bd
|
||||
rowsLen := len(bsm.rows.timestamps)
|
||||
bsm.mustUnmarshalRows(bd)
|
||||
|
||||
// Merge unmarshaled log entries
|
||||
timestamps := bsm.rows.timestamps
|
||||
rows := bsm.rows.rows
|
||||
bsm.rowsTmp.mergeRows(timestamps[:rowsLen], timestamps[rowsLen:], rows[:rowsLen], rows[rowsLen:])
|
||||
bsm.rows, bsm.rowsTmp = bsm.rowsTmp, bsm.rows
|
||||
bsm.rowsTmp.reset()
|
||||
|
||||
if bsm.uncompressedRowsSizeBytes >= maxUncompressedBlockSize {
|
||||
bsm.mustFlushRows()
|
||||
}
|
||||
}
|
||||
|
||||
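// mustUnmarshalRows appends log entries from bd to bsm.rows and updates bsm.uncompressedRowsSizeBytes accordingly.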
func (bsm *blockStreamMerger) mustUnmarshalRows(bd *blockData) {
|
||||
rowsLen := len(bsm.rows.timestamps)
|
||||
if bsm.sbu == nil {
|
||||
bsm.sbu = getStringsBlockUnmarshaler()
|
||||
}
|
||||
if bsm.vd == nil {
|
||||
bsm.vd = getValuesDecoder()
|
||||
}
|
||||
if err := bd.unmarshalRows(&bsm.rows, bsm.sbu, bsm.vd); err != nil {
|
||||
logger.Panicf("FATAL: cannot merge %s: cannot unmarshal log entries from blockData: %s", bsm.ReadersPaths(), err)
|
||||
}
|
||||
bsm.uncompressedRowsSizeBytes += uncompressedRowsSizeBytes(bsm.rows.rows[rowsLen:])
|
||||
}
|
||||
|
||||
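// mustFlushRows writes the pending data (bsm.bd if no rows are buffered, bsm.rows otherwise) to bsm.bsw and resets the pending state.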
func (bsm *blockStreamMerger) mustFlushRows() {
|
||||
if len(bsm.rows.timestamps) == 0 {
|
||||
bsm.bsw.MustWriteBlockData(&bsm.bd)
|
||||
} else {
|
||||
bsm.bsw.MustWriteRows(&bsm.streamID, bsm.rows.timestamps, bsm.rows.rows)
|
||||
}
|
||||
bsm.resetRows()
|
||||
}
|
||||
|
||||
func getBlockStreamMerger() *blockStreamMerger {
|
||||
v := blockStreamMergerPool.Get()
|
||||
if v == nil {
|
||||
return &blockStreamMerger{}
|
||||
}
|
||||
return v.(*blockStreamMerger)
|
||||
}
|
||||
|
||||
func putBlockStreamMerger(bsm *blockStreamMerger) {
|
||||
bsm.reset()
|
||||
blockStreamMergerPool.Put(bsm)
|
||||
}
|
||||
|
||||
var blockStreamMergerPool sync.Pool
|
||||
|
||||
type blockStreamReadersHeap []*blockStreamReader
|
||||
|
||||
func (h *blockStreamReadersHeap) Len() int {
|
||||
return len(*h)
|
||||
}
|
||||
|
||||
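// Less orders readers by (streamID, minTimestamp) of their current block, matching the order verified by checkNextBlock().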
func (h *blockStreamReadersHeap) Less(i, j int) bool {
|
||||
x := *h
|
||||
a := &x[i].blockData
|
||||
b := &x[j].blockData
|
||||
if !a.streamID.equal(&b.streamID) {
|
||||
return a.streamID.less(&b.streamID)
|
||||
}
|
||||
return a.timestampsData.minTimestamp < b.timestampsData.minTimestamp
|
||||
}
|
||||
|
||||
func (h *blockStreamReadersHeap) Swap(i, j int) {
|
||||
x := *h
|
||||
x[i], x[j] = x[j], x[i]
|
||||
}
|
||||
|
||||
func (h *blockStreamReadersHeap) Push(v interface{}) {
|
||||
bsr := v.(*blockStreamReader)
|
||||
*h = append(*h, bsr)
|
||||
}
|
||||
|
||||
func (h *blockStreamReadersHeap) Pop() interface{} {
|
||||
x := *h
|
||||
bsr := x[len(x)-1]
|
||||
x[len(x)-1] = nil
|
||||
*h = x[:len(x)-1]
|
||||
return bsr
|
||||
}
|
383
lib/logstorage/block_stream_reader.go
Normal file
|
@ -0,0 +1,383 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
type readerWithStats struct {
|
||||
r filestream.ReadCloser
|
||||
bytesRead uint64
|
||||
}
|
||||
|
||||
func (r *readerWithStats) reset() {
|
||||
r.r = nil
|
||||
r.bytesRead = 0
|
||||
}
|
||||
|
||||
func (r *readerWithStats) init(rc filestream.ReadCloser) {
|
||||
r.reset()
|
||||
|
||||
r.r = rc
|
||||
}
|
||||
|
||||
// Path returns the path to r file
|
||||
func (r *readerWithStats) Path() string {
|
||||
return r.r.Path()
|
||||
}
|
||||
|
||||
// MustReadFull reads len(data) bytes from r into data.
|
||||
func (r *readerWithStats) MustReadFull(data []byte) {
|
||||
fs.MustReadData(r.r, data)
|
||||
r.bytesRead += uint64(len(data))
|
||||
}
|
||||
|
||||
func (r *readerWithStats) Read(p []byte) (int, error) {
|
||||
n, err := r.r.Read(p)
|
||||
r.bytesRead += uint64(n)
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (r *readerWithStats) MustClose() {
|
||||
r.r.MustClose()
|
||||
r.r = nil
|
||||
}
|
||||
|
||||
// streamReaders contains readers for blockStreamReader
|
||||
type streamReaders struct {
|
||||
metaindexReader readerWithStats
|
||||
indexReader readerWithStats
|
||||
columnsHeaderReader readerWithStats
|
||||
timestampsReader readerWithStats
|
||||
fieldValuesReader readerWithStats
|
||||
fieldBloomFilterReader readerWithStats
|
||||
messageValuesReader readerWithStats
|
||||
messageBloomFilterReader readerWithStats
|
||||
}
|
||||
|
||||
func (sr *streamReaders) reset() {
|
||||
sr.metaindexReader.reset()
|
||||
sr.indexReader.reset()
|
||||
sr.columnsHeaderReader.reset()
|
||||
sr.timestampsReader.reset()
|
||||
sr.fieldValuesReader.reset()
|
||||
sr.fieldBloomFilterReader.reset()
|
||||
sr.messageValuesReader.reset()
|
||||
sr.messageBloomFilterReader.reset()
|
||||
}
|
||||
|
||||
func (sr *streamReaders) init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader, fieldValuesReader, fieldBloomFilterReader,
|
||||
messageValuesReader, messageBloomFilterReader filestream.ReadCloser,
|
||||
) {
|
||||
sr.metaindexReader.init(metaindexReader)
|
||||
sr.indexReader.init(indexReader)
|
||||
sr.columnsHeaderReader.init(columnsHeaderReader)
|
||||
sr.timestampsReader.init(timestampsReader)
|
||||
sr.fieldValuesReader.init(fieldValuesReader)
|
||||
sr.fieldBloomFilterReader.init(fieldBloomFilterReader)
|
||||
sr.messageValuesReader.init(messageValuesReader)
|
||||
sr.messageBloomFilterReader.init(messageBloomFilterReader)
|
||||
}
|
||||
|
||||
func (sr *streamReaders) totalBytesRead() uint64 {
|
||||
n := uint64(0)
|
||||
n += sr.metaindexReader.bytesRead
|
||||
n += sr.indexReader.bytesRead
|
||||
n += sr.columnsHeaderReader.bytesRead
|
||||
n += sr.timestampsReader.bytesRead
|
||||
n += sr.fieldValuesReader.bytesRead
|
||||
n += sr.fieldBloomFilterReader.bytesRead
|
||||
n += sr.messageValuesReader.bytesRead
|
||||
n += sr.messageBloomFilterReader.bytesRead
|
||||
return n
|
||||
}
|
||||
|
||||
func (sr *streamReaders) MustClose() {
|
||||
sr.metaindexReader.MustClose()
|
||||
sr.indexReader.MustClose()
|
||||
sr.columnsHeaderReader.MustClose()
|
||||
sr.timestampsReader.MustClose()
|
||||
sr.fieldValuesReader.MustClose()
|
||||
sr.fieldBloomFilterReader.MustClose()
|
||||
sr.messageValuesReader.MustClose()
|
||||
sr.messageBloomFilterReader.MustClose()
|
||||
}
|
||||
|
||||
// blockStreamReader is used for reading blocks in streaming manner from a part.
|
||||
type blockStreamReader struct {
|
||||
// blockData contains the data for the last read block
|
||||
blockData blockData
|
||||
|
||||
// ph is the header for the part
|
||||
ph partHeader
|
||||
|
||||
// streamReaders contains data readers in stream mode
|
||||
streamReaders streamReaders
|
||||
|
||||
// indexBlockHeaders contains the list of all the indexBlockHeader entries for the part
|
||||
indexBlockHeaders []indexBlockHeader
|
||||
|
||||
// blockHeaders contains the list of blockHeader entries for the current indexBlockHeader pointed by nextIndexBlockIdx
|
||||
blockHeaders []blockHeader
|
||||
|
||||
// nextIndexBlockIdx is the index of the next item to read from indexBlockHeaders
|
||||
nextIndexBlockIdx int
|
||||
|
||||
// nextBlockIdx is the index of the next item to read from blockHeaders
|
||||
nextBlockIdx int
|
||||
|
||||
// globalUncompressedSizeBytes is the total size of log entries seen in the part
|
||||
globalUncompressedSizeBytes uint64
|
||||
|
||||
// globalRowsCount is the number of log entries seen in the part
|
||||
globalRowsCount uint64
|
||||
|
||||
// globalBlocksCount is the number of blocks seen in the part
|
||||
globalBlocksCount uint64
|
||||
|
||||
// sidLast is the stream id for the previously read block
|
||||
sidLast streamID
|
||||
|
||||
// minTimestampLast is the minimum timestamp for the previously read block
|
||||
minTimestampLast int64
|
||||
}
|
||||
|
||||
// reset resets bsr, so it can be re-used
|
||||
func (bsr *blockStreamReader) reset() {
|
||||
bsr.blockData.reset()
|
||||
bsr.ph.reset()
|
||||
bsr.streamReaders.reset()
|
||||
|
||||
ihs := bsr.indexBlockHeaders
|
||||
if len(ihs) > 10e3 {
|
||||
// The length of ihs is unbounded, so it is better to drop too long indexBlockHeaders in order to reduce memory usage
|
||||
ihs = nil
|
||||
}
|
||||
for i := range ihs {
|
||||
ihs[i].reset()
|
||||
}
|
||||
bsr.indexBlockHeaders = ihs[:0]
|
||||
|
||||
bhs := bsr.blockHeaders
|
||||
for i := range bhs {
|
||||
bhs[i].reset()
|
||||
}
|
||||
bsr.blockHeaders = bhs[:0]
|
||||
|
||||
bsr.nextIndexBlockIdx = 0
|
||||
bsr.nextBlockIdx = 0
|
||||
bsr.globalUncompressedSizeBytes = 0
|
||||
bsr.globalRowsCount = 0
|
||||
bsr.globalBlocksCount = 0
|
||||
|
||||
bsr.sidLast.reset()
|
||||
bsr.minTimestampLast = 0
|
||||
}
|
||||
|
||||
// Path returns part path for bsr (e.g. file path, url or in-memory reference)
|
||||
func (bsr *blockStreamReader) Path() string {
|
||||
path := bsr.streamReaders.metaindexReader.Path()
|
||||
return filepath.Dir(path)
|
||||
}
|
||||
|
||||
// MustInitFromInmemoryPart initializes bsr from mp.
|
||||
func (bsr *blockStreamReader) MustInitFromInmemoryPart(mp *inmemoryPart) {
|
||||
bsr.reset()
|
||||
|
||||
bsr.ph = mp.ph
|
||||
|
||||
// Initialize streamReaders
|
||||
metaindexReader := mp.metaindex.NewReader()
|
||||
indexReader := mp.index.NewReader()
|
||||
columnsHeaderReader := mp.columnsHeader.NewReader()
|
||||
timestampsReader := mp.timestamps.NewReader()
|
||||
fieldValuesReader := mp.fieldValues.NewReader()
|
||||
fieldBloomFilterReader := mp.fieldBloomFilter.NewReader()
|
||||
messageValuesReader := mp.messageValues.NewReader()
|
||||
messageBloomFilterReader := mp.messageBloomFilter.NewReader()
|
||||
|
||||
bsr.streamReaders.init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader,
|
||||
fieldValuesReader, fieldBloomFilterReader, messageValuesReader, messageBloomFilterReader)
|
||||
|
||||
// Read metaindex data
|
||||
bsr.indexBlockHeaders = mustReadIndexBlockHeaders(bsr.indexBlockHeaders[:0], &bsr.streamReaders.metaindexReader)
|
||||
}
|
||||
|
||||
// MustInitFromFilePart initializes bsr from file part at the given path.
|
||||
func (bsr *blockStreamReader) MustInitFromFilePart(path string) {
|
||||
bsr.reset()
|
||||
|
||||
// Files in the part are always read without OS cache pollution,
|
||||
// since they are usually deleted after the merge.
|
||||
const nocache = true
|
||||
|
||||
metaindexPath := filepath.Join(path, metaindexFilename)
|
||||
indexPath := filepath.Join(path, indexFilename)
|
||||
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
|
||||
timestampsPath := filepath.Join(path, timestampsFilename)
|
||||
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
|
||||
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
|
||||
messageValuesPath := filepath.Join(path, messageValuesFilename)
|
||||
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
|
||||
|
||||
bsr.ph.mustReadMetadata(path)
|
||||
|
||||
// Open data readers
|
||||
metaindexReader := filestream.MustOpen(metaindexPath, nocache)
|
||||
indexReader := filestream.MustOpen(indexPath, nocache)
|
||||
columnsHeaderReader := filestream.MustOpen(columnsHeaderPath, nocache)
|
||||
timestampsReader := filestream.MustOpen(timestampsPath, nocache)
|
||||
fieldValuesReader := filestream.MustOpen(fieldValuesPath, nocache)
|
||||
fieldBloomFilterReader := filestream.MustOpen(fieldBloomFilterPath, nocache)
|
||||
messageValuesReader := filestream.MustOpen(messageValuesPath, nocache)
|
||||
messageBloomFilterReader := filestream.MustOpen(messageBloomFilterPath, nocache)
|
||||
|
||||
// Initialize streamReaders
|
||||
bsr.streamReaders.init(metaindexReader, indexReader, columnsHeaderReader, timestampsReader,
|
||||
fieldValuesReader, fieldBloomFilterReader, messageValuesReader, messageBloomFilterReader)
|
||||
|
||||
// Read metaindex data
|
||||
bsr.indexBlockHeaders = mustReadIndexBlockHeaders(bsr.indexBlockHeaders[:0], &bsr.streamReaders.metaindexReader)
|
||||
}
|
||||
|
||||
// NextBlock reads the next block from bsr and puts it into bsr.blockData.
|
||||
//
|
||||
// false is returned if there are no other blocks.
|
||||
func (bsr *blockStreamReader) NextBlock() bool {
|
||||
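// Advance to the next index block until an unread blockHeader becomes available.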
for bsr.nextBlockIdx >= len(bsr.blockHeaders) {
|
||||
if !bsr.nextIndexBlock() {
|
||||
return false
|
||||
}
|
||||
}
|
||||
ih := &bsr.indexBlockHeaders[bsr.nextIndexBlockIdx-1]
|
||||
bh := &bsr.blockHeaders[bsr.nextBlockIdx]
|
||||
th := &bh.timestampsHeader
|
||||
|
||||
// Validate bh
|
||||
if bh.streamID.less(&bsr.sidLast) {
|
||||
logger.Panicf("FATAL: %s: blockHeader.streamID=%s cannot be smaller than the streamID from the previously read block: %s", bsr.Path(), &bh.streamID, &bsr.sidLast)
|
||||
}
|
||||
if bh.streamID.equal(&bsr.sidLast) && th.minTimestamp < bsr.minTimestampLast {
|
||||
logger.Panicf("FATAL: %s: timestamps.minTimestamp=%d cannot be smaller than the minTimestamp for the previously read block for the same streamID: %d",
|
||||
bsr.Path(), th.minTimestamp, bsr.minTimestampLast)
|
||||
}
|
||||
bsr.minTimestampLast = th.minTimestamp
|
||||
bsr.sidLast = bh.streamID
|
||||
if th.minTimestamp < ih.minTimestamp {
|
||||
logger.Panicf("FATAL: %s: timestampsHeader.minTimestamp=%d cannot be smaller than indexBlockHeader.minTimestamp=%d", bsr.Path(), th.minTimestamp, ih.minTimestamp)
|
||||
}
|
||||
if th.maxTimestamp > ih.maxTimestamp {
|
||||
logger.Panicf("FATAL: %s: timestampsHeader.maxTimestamp=%d cannot be bigger than indexBlockHeader.maxTimestamp=%d", bsr.Path(), th.maxTimestamp, ih.minTimestamp)
|
||||
}
|
||||
|
||||
// Read bsr.blockData
|
||||
bsr.blockData.mustReadFrom(bh, &bsr.streamReaders)
|
||||
|
||||
bsr.globalUncompressedSizeBytes += bh.uncompressedSizeBytes
|
||||
bsr.globalRowsCount += bh.rowsCount
|
||||
bsr.globalBlocksCount++
|
||||
if bsr.globalUncompressedSizeBytes > bsr.ph.UncompressedSizeBytes {
|
||||
logger.Panicf("FATAL: %s: too big size of entries read: %d; mustn't exceed partHeader.UncompressedSizeBytes=%d",
|
||||
bsr.Path(), bsr.globalUncompressedSizeBytes, bsr.ph.UncompressedSizeBytes)
|
||||
}
|
||||
if bsr.globalRowsCount > bsr.ph.RowsCount {
|
||||
logger.Panicf("FATAL: %s: too many log entries read so far: %d; mustn't exceed partHeader.RowsCount=%d", bsr.Path(), bsr.globalRowsCount, bsr.ph.RowsCount)
|
||||
}
|
||||
if bsr.globalBlocksCount > bsr.ph.BlocksCount {
|
||||
logger.Panicf("FATAL: %s: too many blocks read so far: %d; mustn't exceed partHeader.BlocksCount=%d", bsr.Path(), bsr.globalBlocksCount, bsr.ph.BlocksCount)
|
||||
}
|
||||
|
||||
// The block has been successfully read
|
||||
bsr.nextBlockIdx++
|
||||
return true
|
||||
}
|
||||
|
||||
func (bsr *blockStreamReader) nextIndexBlock() bool {
|
||||
// Advance to the next indexBlockHeader
|
||||
if bsr.nextIndexBlockIdx >= len(bsr.indexBlockHeaders) {
|
||||
// No more blocks left
|
||||
// Validate bsr.ph
|
||||
totalBytesRead := bsr.streamReaders.totalBytesRead()
|
||||
if bsr.ph.CompressedSizeBytes != totalBytesRead {
|
||||
logger.Panicf("FATAL: %s: partHeader.CompressedSizeBytes=%d must match the size of data read: %d", bsr.Path(), bsr.ph.CompressedSizeBytes, totalBytesRead)
|
||||
}
|
||||
if bsr.ph.UncompressedSizeBytes != bsr.globalUncompressedSizeBytes {
|
||||
logger.Panicf("FATAL: %s: partHeader.UncompressedSizeBytes=%d must match the size of entries read: %d",
|
||||
bsr.Path(), bsr.ph.UncompressedSizeBytes, bsr.globalUncompressedSizeBytes)
|
||||
}
|
||||
if bsr.ph.RowsCount != bsr.globalRowsCount {
|
||||
logger.Panicf("FATAL: %s: partHeader.RowsCount=%d must match the number of log entries read: %d", bsr.Path(), bsr.ph.RowsCount, bsr.globalRowsCount)
|
||||
}
|
||||
if bsr.ph.BlocksCount != bsr.globalBlocksCount {
|
||||
logger.Panicf("FATAL: %s: partHeader.BlocksCount=%d must match the number of blocks read: %d", bsr.Path(), bsr.ph.BlocksCount, bsr.globalBlocksCount)
|
||||
}
|
||||
return false
|
||||
}
|
||||
ih := &bsr.indexBlockHeaders[bsr.nextIndexBlockIdx]
|
||||
|
||||
// Validate ih
|
||||
metaindexReader := &bsr.streamReaders.metaindexReader
|
||||
if ih.minTimestamp < bsr.ph.MinTimestamp {
|
||||
logger.Panicf("FATAL: %s: indexBlockHeader.minTimestamp=%d cannot be smaller than partHeader.MinTimestamp=%d",
|
||||
metaindexReader.Path(), ih.minTimestamp, bsr.ph.MinTimestamp)
|
||||
}
|
||||
if ih.maxTimestamp > bsr.ph.MaxTimestamp {
|
||||
logger.Panicf("FATAL: %s: indexBlockHeader.maxTimestamp=%d cannot be bigger than partHeader.MaxTimestamp=%d",
|
||||
metaindexReader.Path(), ih.maxTimestamp, bsr.ph.MaxTimestamp)
|
||||
}
|
||||
|
||||
// Read indexBlock for the given ih
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = ih.mustReadNextIndexBlock(bb.B[:0], &bsr.streamReaders)
|
||||
bsr.blockHeaders = resetBlockHeaders(bsr.blockHeaders)
|
||||
var err error
|
||||
bsr.blockHeaders, err = unmarshalBlockHeaders(bsr.blockHeaders[:0], bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot unmarshal blockHeader entries: %s", bsr.streamReaders.indexReader.Path(), err)
|
||||
}
|
||||
|
||||
bsr.nextIndexBlockIdx++
|
||||
bsr.nextBlockIdx = 0
|
||||
return true
|
||||
}
|
||||
|
||||
// MustClose closes bsr.
|
||||
func (bsr *blockStreamReader) MustClose() {
|
||||
bsr.streamReaders.MustClose()
|
||||
bsr.reset()
|
||||
}
|
||||
|
||||
// getBlockStreamReader returns blockStreamReader.
|
||||
//
|
||||
// The returned blockStreamReader must be initialized via MustInitFromInmemoryPart() or MustInitFromFilePart().
|
||||
// Call putBlockStreamReader() when the returned blockStreamReader is no longer needed.
|
||||
func getBlockStreamReader() *blockStreamReader {
|
||||
v := blockStreamReaderPool.Get()
|
||||
if v == nil {
|
||||
v = &blockStreamReader{}
|
||||
}
|
||||
bsr := v.(*blockStreamReader)
|
||||
return bsr
|
||||
}
|
||||
|
||||
// putBlockStreamReader returns bsr to the pool.
|
||||
//
|
||||
// bsr cannot be used after returning to the pool.
|
||||
func putBlockStreamReader(bsr *blockStreamReader) {
|
||||
bsr.reset()
|
||||
blockStreamReaderPool.Put(bsr)
|
||||
}
|
||||
|
||||
var blockStreamReaderPool sync.Pool
|
||||
|
||||
// mustCloseBlockStreamReaders calls MustClose() on the given bsrs.
|
||||
func mustCloseBlockStreamReaders(bsrs []*blockStreamReader) {
|
||||
for _, bsr := range bsrs {
|
||||
bsr.MustClose()
|
||||
}
|
||||
}
|
362
lib/logstorage/block_stream_writer.go
Normal file
|
@ -0,0 +1,362 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// writerWithStats writes data to w and tracks the total amount of data written in bytesWritten.
|
||||
type writerWithStats struct {
|
||||
w filestream.WriteCloser
|
||||
bytesWritten uint64
|
||||
}
|
||||
|
||||
func (w *writerWithStats) reset() {
|
||||
w.w = nil
|
||||
w.bytesWritten = 0
|
||||
}
|
||||
|
||||
func (w *writerWithStats) init(wc filestream.WriteCloser) {
|
||||
w.reset()
|
||||
|
||||
w.w = wc
|
||||
}
|
||||
|
||||
func (w *writerWithStats) Path() string {
|
||||
return w.w.Path()
|
||||
}
|
||||
|
||||
func (w *writerWithStats) MustWrite(data []byte) {
|
||||
fs.MustWriteData(w.w, data)
|
||||
w.bytesWritten += uint64(len(data))
|
||||
}
|
||||
|
||||
// MustClose closes the underlying w.
|
||||
func (w *writerWithStats) MustClose() {
|
||||
w.w.MustClose()
|
||||
}
|
||||
|
||||
// streamWriters contains writers for blockStreamWriter
|
||||
type streamWriters struct {
|
||||
metaindexWriter writerWithStats
|
||||
indexWriter writerWithStats
|
||||
columnsHeaderWriter writerWithStats
|
||||
timestampsWriter writerWithStats
|
||||
fieldValuesWriter writerWithStats
|
||||
fieldBloomFilterWriter writerWithStats
|
||||
messageValuesWriter writerWithStats
|
||||
messageBloomFilterWriter writerWithStats
|
||||
}
|
||||
|
||||
func (sw *streamWriters) reset() {
|
||||
sw.metaindexWriter.reset()
|
||||
sw.indexWriter.reset()
|
||||
sw.columnsHeaderWriter.reset()
|
||||
sw.timestampsWriter.reset()
|
||||
sw.fieldValuesWriter.reset()
|
||||
sw.fieldBloomFilterWriter.reset()
|
||||
sw.messageValuesWriter.reset()
|
||||
sw.messageBloomFilterWriter.reset()
|
||||
}
|
||||
|
||||
func (sw *streamWriters) init(metaindexWriter, indexWriter, columnsHeaderWriter, timestampsWriter, fieldValuesWriter, fieldBloomFilterWriter,
|
||||
messageValuesWriter, messageBloomFilterWriter filestream.WriteCloser,
|
||||
) {
|
||||
sw.metaindexWriter.init(metaindexWriter)
|
||||
sw.indexWriter.init(indexWriter)
|
||||
sw.columnsHeaderWriter.init(columnsHeaderWriter)
|
||||
sw.timestampsWriter.init(timestampsWriter)
|
||||
sw.fieldValuesWriter.init(fieldValuesWriter)
|
||||
sw.fieldBloomFilterWriter.init(fieldBloomFilterWriter)
|
||||
sw.messageValuesWriter.init(messageValuesWriter)
|
||||
sw.messageBloomFilterWriter.init(messageBloomFilterWriter)
|
||||
}
|
||||
|
||||
func (sw *streamWriters) totalBytesWritten() uint64 {
|
||||
n := uint64(0)
|
||||
n += sw.metaindexWriter.bytesWritten
|
||||
n += sw.indexWriter.bytesWritten
|
||||
n += sw.columnsHeaderWriter.bytesWritten
|
||||
n += sw.timestampsWriter.bytesWritten
|
||||
n += sw.fieldValuesWriter.bytesWritten
|
||||
n += sw.fieldBloomFilterWriter.bytesWritten
|
||||
n += sw.messageValuesWriter.bytesWritten
|
||||
n += sw.messageBloomFilterWriter.bytesWritten
|
||||
return n
|
||||
}
|
||||
|
||||
func (sw *streamWriters) MustClose() {
|
||||
sw.metaindexWriter.MustClose()
|
||||
sw.indexWriter.MustClose()
|
||||
sw.columnsHeaderWriter.MustClose()
|
||||
sw.timestampsWriter.MustClose()
|
||||
sw.fieldValuesWriter.MustClose()
|
||||
sw.fieldBloomFilterWriter.MustClose()
|
||||
sw.messageValuesWriter.MustClose()
|
||||
sw.messageBloomFilterWriter.MustClose()
|
||||
}
|
||||
|
||||
// blockStreamWriter is used for writing blocks into the underlying storage in streaming manner.
|
||||
type blockStreamWriter struct {
|
||||
// streamWriters contains the writers for block data
|
||||
streamWriters streamWriters
|
||||
|
||||
// sidLast is the streamID for the last written block
|
||||
sidLast streamID
|
||||
|
||||
// sidFirst is the streamID for the first block in the current indexBlock
|
||||
sidFirst streamID
|
||||
|
||||
// minTimestampLast is the minimum timestamp seen for the last written block
|
||||
minTimestampLast int64
|
||||
|
||||
// minTimestamp is the minimum timestamp seen across written blocks for the current indexBlock
|
||||
minTimestamp int64
|
||||
|
||||
// maxTimestamp is the maximum timestamp seen across written blocks for the current indexBlock
|
||||
maxTimestamp int64
|
||||
|
||||
// hasWrittenBlocks is set to true if at least a single block is written to the current indexBlock
|
||||
hasWrittenBlocks bool
|
||||
|
||||
// globalUncompressedSizeBytes is the total size of all the log entries written via bsw
|
||||
globalUncompressedSizeBytes uint64
|
||||
|
||||
// globalRowsCount is the total number of log entries written via bsw
|
||||
globalRowsCount uint64
|
||||
|
||||
// globalBlocksCount is the total number of blocks written to bsw
|
||||
globalBlocksCount uint64
|
||||
|
||||
// globalMinTimestamp is the minimum timestamp seen across all the blocks written to bsw
|
||||
globalMinTimestamp int64
|
||||
|
||||
// globalMaxTimestamp is the maximum timestamp seen across all the blocks written to bsw
|
||||
globalMaxTimestamp int64
|
||||
|
||||
// indexBlockData contains marshaled blockHeader data, which isn't written yet to indexFilename
|
||||
indexBlockData []byte
|
||||
|
||||
// metaindexData contains marshaled indexBlockHeader data, which isn't written yet to metaindexFilename
|
||||
metaindexData []byte
|
||||
|
||||
// indexBlockHeader is used for marshaling the data to metaindexData
|
||||
indexBlockHeader indexBlockHeader
|
||||
}
|
||||
|
||||
// reset resets bsw for subsequent re-use.
|
||||
func (bsw *blockStreamWriter) reset() {
|
||||
bsw.streamWriters.reset()
|
||||
bsw.sidLast.reset()
|
||||
bsw.sidFirst.reset()
|
||||
bsw.minTimestampLast = 0
|
||||
bsw.minTimestamp = 0
|
||||
bsw.maxTimestamp = 0
|
||||
bsw.hasWrittenBlocks = false
|
||||
bsw.globalUncompressedSizeBytes = 0
|
||||
bsw.globalRowsCount = 0
|
||||
bsw.globalBlocksCount = 0
|
||||
bsw.globalMinTimestamp = 0
|
||||
bsw.globalMaxTimestamp = 0
|
||||
bsw.indexBlockData = bsw.indexBlockData[:0]
|
||||
|
||||
if len(bsw.metaindexData) > 1024*1024 {
|
||||
// The length of bsw.metaindexData is unbounded, so drop too long a buffer
// in order to conserve memory.
|
||||
bsw.metaindexData = nil
|
||||
} else {
|
||||
bsw.metaindexData = bsw.metaindexData[:0]
|
||||
}
|
||||
|
||||
bsw.indexBlockHeader.reset()
|
||||
}
|
||||
|
||||
// MustInitForInmemoryPart initializes bsw for writing data to mp
|
||||
func (bsw *blockStreamWriter) MustInitForInmemoryPart(mp *inmemoryPart) {
|
||||
bsw.reset()
|
||||
bsw.streamWriters.init(&mp.metaindex, &mp.index, &mp.columnsHeader, &mp.timestamps, &mp.fieldValues, &mp.fieldBloomFilter, &mp.messageValues, &mp.messageBloomFilter)
|
||||
}
|
||||
|
||||
// MustInitForFilePart initializes bsw for writing data to file part located at path.
|
||||
//
|
||||
// If nocache is true, then the written data doesn't go to the OS page cache.
|
||||
func (bsw *blockStreamWriter) MustInitForFilePart(path string, nocache bool) {
|
||||
bsw.reset()
|
||||
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
|
||||
metaindexPath := filepath.Join(path, metaindexFilename)
|
||||
indexPath := filepath.Join(path, indexFilename)
|
||||
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
|
||||
timestampsPath := filepath.Join(path, timestampsFilename)
|
||||
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
|
||||
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
|
||||
messageValuesPath := filepath.Join(path, messageValuesFilename)
|
||||
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
|
||||
|
||||
// Always cache the metaindex file, since it is re-read immediately after part creation
|
||||
metaindexWriter := filestream.MustCreate(metaindexPath, false)
|
||||
|
||||
indexWriter := filestream.MustCreate(indexPath, nocache)
|
||||
columnsHeaderWriter := filestream.MustCreate(columnsHeaderPath, nocache)
|
||||
timestampsWriter := filestream.MustCreate(timestampsPath, nocache)
|
||||
fieldValuesWriter := filestream.MustCreate(fieldValuesPath, nocache)
|
||||
fieldBloomFilterWriter := filestream.MustCreate(fieldBloomFilterPath, nocache)
|
||||
messageValuesWriter := filestream.MustCreate(messageValuesPath, nocache)
|
||||
messageBloomFilterWriter := filestream.MustCreate(messageBloomFilterPath, nocache)
|
||||
|
||||
bsw.streamWriters.init(metaindexWriter, indexWriter, columnsHeaderWriter, timestampsWriter,
|
||||
fieldValuesWriter, fieldBloomFilterWriter, messageValuesWriter, messageBloomFilterWriter)
|
||||
}
|
||||
|
||||
// MustWriteRows writes timestamps with rows under the given sid to bsw.
|
||||
//
|
||||
// timestamps must be sorted.
// sid must be bigger than or equal to the sid for the previously written rows.
|
||||
func (bsw *blockStreamWriter) MustWriteRows(sid *streamID, timestamps []int64, rows [][]Field) {
|
||||
if len(timestamps) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
b := getBlock()
|
||||
b.MustInitFromRows(timestamps, rows)
|
||||
bsw.MustWriteBlock(sid, b)
|
||||
putBlock(b)
|
||||
}
|
||||
|
||||
// MustWriteBlockData writes bd to bsw.
|
||||
//
|
||||
// The bd.streamID must be bigger than or equal to the streamID for the previously written blocks.
|
||||
func (bsw *blockStreamWriter) MustWriteBlockData(bd *blockData) {
|
||||
if bd.rowsCount == 0 {
|
||||
return
|
||||
}
|
||||
bsw.mustWriteBlockInternal(&bd.streamID, nil, bd)
|
||||
}
|
||||
|
||||
// MustWriteBlock writes b under the given sid to bsw.
|
||||
//
|
||||
// The sid must be bigger than or equal to the sid for the previously written blocks.
// The minimum timestamp in b must be bigger than or equal to the minimum timestamp written to the same sid.
|
||||
func (bsw *blockStreamWriter) MustWriteBlock(sid *streamID, b *block) {
|
||||
rowsCount := b.Len()
|
||||
if rowsCount == 0 {
|
||||
return
|
||||
}
|
||||
bsw.mustWriteBlockInternal(sid, b, nil)
|
||||
}
|
||||
|
||||
func (bsw *blockStreamWriter) mustWriteBlockInternal(sid *streamID, b *block, bd *blockData) {
|
||||
if sid.less(&bsw.sidLast) {
|
||||
logger.Panicf("BUG: the sid=%s cannot be smaller than the previously written sid=%s", sid, &bsw.sidLast)
|
||||
}
|
||||
hasWrittenBlocks := bsw.hasWrittenBlocks
|
||||
if !hasWrittenBlocks {
|
||||
bsw.sidFirst = *sid
|
||||
bsw.hasWrittenBlocks = true
|
||||
}
|
||||
isSeenSid := sid.equal(&bsw.sidLast)
|
||||
bsw.sidLast = *sid
|
||||
|
||||
bh := getBlockHeader()
|
||||
if b != nil {
|
||||
b.mustWriteTo(sid, bh, &bsw.streamWriters)
|
||||
} else {
|
||||
bd.mustWriteTo(bh, &bsw.streamWriters)
|
||||
}
|
||||
th := &bh.timestampsHeader
|
||||
if bsw.globalRowsCount == 0 || th.minTimestamp < bsw.globalMinTimestamp {
|
||||
bsw.globalMinTimestamp = th.minTimestamp
|
||||
}
|
||||
if bsw.globalRowsCount == 0 || th.maxTimestamp > bsw.globalMaxTimestamp {
|
||||
bsw.globalMaxTimestamp = th.maxTimestamp
|
||||
}
|
||||
if !hasWrittenBlocks || th.minTimestamp < bsw.minTimestamp {
|
||||
bsw.minTimestamp = th.minTimestamp
|
||||
}
|
||||
if !hasWrittenBlocks || th.maxTimestamp > bsw.maxTimestamp {
|
||||
bsw.maxTimestamp = th.maxTimestamp
|
||||
}
|
||||
if isSeenSid && th.minTimestamp < bsw.minTimestampLast {
|
||||
logger.Panicf("BUG: the block for sid=%s cannot contain timestamp smaller than %d, but it contains timestamp %d", sid, bsw.minTimestampLast, th.minTimestamp)
|
||||
}
|
||||
bsw.minTimestampLast = th.minTimestamp
|
||||
|
||||
bsw.globalUncompressedSizeBytes += bh.uncompressedSizeBytes
|
||||
bsw.globalRowsCount += bh.rowsCount
|
||||
bsw.globalBlocksCount++
|
||||
|
||||
// Marshal bh
|
||||
bsw.indexBlockData = bh.marshal(bsw.indexBlockData)
|
||||
putBlockHeader(bh)
|
||||
if len(bsw.indexBlockData) > maxUncompressedIndexBlockSize {
|
||||
bsw.mustFlushIndexBlock(bsw.indexBlockData)
|
||||
bsw.indexBlockData = bsw.indexBlockData[:0]
|
||||
}
|
||||
}
|
||||
|
||||
func (bsw *blockStreamWriter) mustFlushIndexBlock(data []byte) {
|
||||
if len(data) > 0 {
|
||||
bsw.indexBlockHeader.mustWriteIndexBlock(data, bsw.sidFirst, bsw.minTimestamp, bsw.maxTimestamp, &bsw.streamWriters)
|
||||
bsw.metaindexData = bsw.indexBlockHeader.marshal(bsw.metaindexData)
|
||||
}
|
||||
bsw.hasWrittenBlocks = false
|
||||
bsw.minTimestamp = 0
|
||||
bsw.maxTimestamp = 0
|
||||
bsw.sidFirst.reset()
|
||||
}
|
||||
|
||||
// Finalize() finalizes the data write process and updates ph with the finalized stats
|
||||
//
|
||||
// It closes the writers passed to MustInitForInmemoryPart() or MustInitForFilePart().
|
||||
//
|
||||
// bsw can be re-used after calling Finalize().
|
||||
func (bsw *blockStreamWriter) Finalize(ph *partHeader) {
|
||||
ph.UncompressedSizeBytes = bsw.globalUncompressedSizeBytes
|
||||
ph.RowsCount = bsw.globalRowsCount
|
||||
ph.BlocksCount = bsw.globalBlocksCount
|
||||
ph.MinTimestamp = bsw.globalMinTimestamp
|
||||
ph.MaxTimestamp = bsw.globalMaxTimestamp
|
||||
|
||||
bsw.mustFlushIndexBlock(bsw.indexBlockData)
|
||||
|
||||
// Write metaindex data
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = encoding.CompressZSTDLevel(bb.B[:0], bsw.metaindexData, 1)
|
||||
bsw.streamWriters.metaindexWriter.MustWrite(bb.B)
|
||||
if len(bb.B) < 1024*1024 {
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
|
||||
ph.CompressedSizeBytes = bsw.streamWriters.totalBytesWritten()
|
||||
|
||||
bsw.streamWriters.MustClose()
|
||||
bsw.reset()
|
||||
}
|
||||
|
||||
var longTermBufPool bytesutil.ByteBufferPool
|
||||
|
||||
// getBlockStreamWriter returns new blockStreamWriter from the pool.
|
||||
//
|
||||
// Return the blockStreamWriter to the pool by calling putBlockStreamWriter when it is no longer needed.
|
||||
func getBlockStreamWriter() *blockStreamWriter {
|
||||
v := blockStreamWriterPool.Get()
|
||||
if v == nil {
|
||||
return &blockStreamWriter{}
|
||||
}
|
||||
return v.(*blockStreamWriter)
|
||||
}
|
||||
|
||||
// putBlockStreamWriter returns bsw to the pool.
|
||||
func putBlockStreamWriter(bsw *blockStreamWriter) {
|
||||
bsw.reset()
|
||||
blockStreamWriterPool.Put(bsw)
|
||||
}
|
||||
|
||||
var blockStreamWriterPool sync.Pool
|
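A minimal usage sketch of the blockStreamWriter API above, assuming it runs inside the logstorage package; the part path, stream and field values are hypothetical:

func writeExamplePart() {
	bsw := getBlockStreamWriter()
	defer putBlockStreamWriter(bsw)

	// Write the data into a new file part directory, bypassing the OS page cache.
	bsw.MustInitForFilePart("/tmp/example-part", true)

	var sid streamID // the zero streamID is enough for a single-stream sketch
	timestamps := []int64{1000, 2000, 3000} // must be sorted
	rows := [][]Field{
		{{Name: "level", Value: "info"}, {Name: "msg", Value: "foo"}},
		{{Name: "level", Value: "info"}, {Name: "msg", Value: "bar"}},
		{{Name: "level", Value: "error"}, {Name: "msg", Value: "baz"}},
	}
	bsw.MustWriteRows(&sid, timestamps, rows)

	// Finalize() flushes the pending index data, fills ph with the part stats
	// and closes the underlying writers.
	var ph partHeader
	bsw.Finalize(&ph)
}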
179
lib/logstorage/block_test.go
Normal file
|
@ -0,0 +1,179 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBlockMustInitFromRows(t *testing.T) {
|
||||
f := func(timestamps []int64, rows [][]Field, bExpected *block) {
|
||||
t.Helper()
|
||||
b := getBlock()
|
||||
defer putBlock(b)
|
||||
|
||||
b.MustInitFromRows(timestamps, rows)
|
||||
if b.uncompressedSizeBytes() >= maxUncompressedBlockSize {
|
||||
t.Fatalf("expecting non-full block")
|
||||
}
|
||||
if !reflect.DeepEqual(b, bExpected) {
|
||||
t.Fatalf("unexpected block;\ngot\n%v\nwant\n%v", b, bExpected)
|
||||
}
|
||||
if n := b.Len(); n != len(timestamps) {
|
||||
t.Fatalf("unexpected block len; got %d; want %d", n, len(timestamps))
|
||||
}
|
||||
b.assertValid()
|
||||
}
|
||||
|
||||
// Empty log entries
|
||||
f(nil, nil, &block{})
|
||||
f([]int64{}, [][]Field{}, &block{})
|
||||
|
||||
// A single row
|
||||
timestamps := []int64{1234}
|
||||
rows := [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "msg",
|
||||
Value: "foo",
|
||||
},
|
||||
{
|
||||
Name: "level",
|
||||
Value: "error",
|
||||
},
|
||||
},
|
||||
}
|
||||
bExpected := &block{
|
||||
timestamps: []int64{1234},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "level",
|
||||
Value: "error",
|
||||
},
|
||||
{
|
||||
Name: "msg",
|
||||
Value: "foo",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestamps, rows, bExpected)
|
||||
|
||||
// Multiple log entries with the same set of fields
|
||||
timestamps = []int64{3, 5}
|
||||
rows = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "job",
|
||||
Value: "foo",
|
||||
},
|
||||
{
|
||||
Name: "instance",
|
||||
Value: "host1",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "job",
|
||||
Value: "foo",
|
||||
},
|
||||
{
|
||||
Name: "instance",
|
||||
Value: "host2",
|
||||
},
|
||||
},
|
||||
}
|
||||
bExpected = &block{
|
||||
timestamps: []int64{3, 5},
|
||||
columns: []column{
|
||||
{
|
||||
name: "instance",
|
||||
values: []string{"host1", "host2"},
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "job",
|
||||
Value: "foo",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestamps, rows, bExpected)
|
||||
|
||||
// Multiple log entries with distinct set of fields
|
||||
timestamps = []int64{3, 5, 10}
|
||||
rows = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "msg",
|
||||
Value: "foo",
|
||||
},
|
||||
{
|
||||
Name: "b",
|
||||
Value: "xyz",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "b",
|
||||
Value: "xyz",
|
||||
},
|
||||
{
|
||||
Name: "a",
|
||||
Value: "aaa",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "b",
|
||||
Value: "xyz",
|
||||
},
|
||||
},
|
||||
}
|
||||
bExpected = &block{
|
||||
timestamps: []int64{3, 5, 10},
|
||||
columns: []column{
|
||||
{
|
||||
name: "a",
|
||||
values: []string{"", "aaa", ""},
|
||||
},
|
||||
{
|
||||
name: "msg",
|
||||
values: []string{"foo", "", ""},
|
||||
},
|
||||
},
|
||||
constColumns: []Field{
|
||||
{
|
||||
Name: "b",
|
||||
Value: "xyz",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestamps, rows, bExpected)
|
||||
}
|
||||
|
||||
func TestBlockMustInitFromRowsFullBlock(t *testing.T) {
|
||||
const rowsCount = 2000
|
||||
timestamps := make([]int64, rowsCount)
|
||||
rows := make([][]Field, rowsCount)
|
||||
for i := range timestamps {
|
||||
fields := make([]Field, 10)
|
||||
for j := range fields {
|
||||
fields[j] = Field{
|
||||
Name: fmt.Sprintf("field_%d", j),
|
||||
Value: "very very looooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooong value",
|
||||
}
|
||||
}
|
||||
rows[i] = fields
|
||||
}
|
||||
|
||||
b := getBlock()
|
||||
defer putBlock(b)
|
||||
b.MustInitFromRows(timestamps, rows)
|
||||
if n := b.Len(); n != len(rows) {
|
||||
t.Fatalf("unexpected total log entries; got %d; want %d", n, len(rows))
|
||||
}
|
||||
if b.uncompressedSizeBytes() < maxUncompressedBlockSize {
|
||||
t.Fatalf("expecting full block")
|
||||
}
|
||||
b.assertValid()
|
||||
}
|
46
lib/logstorage/block_timing_test.go
Normal file
|
@ -0,0 +1,46 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkBlock_MustInitFromRows(b *testing.B) {
|
||||
for _, rowsPerBlock := range []int{1, 10, 100, 1000, 10000} {
|
||||
b.Run(fmt.Sprintf("rowsPerBlock_%d", rowsPerBlock), func(b *testing.B) {
|
||||
benchmarkBlockMustInitFromRows(b, rowsPerBlock)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkBlockMustInitFromRows(b *testing.B, rowsPerBlock int) {
|
||||
timestamps, rows := newTestRows(rowsPerBlock, 10)
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(len(timestamps)))
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
block := getBlock()
|
||||
defer putBlock(block)
|
||||
for pb.Next() {
|
||||
block.MustInitFromRows(timestamps, rows)
|
||||
if n := block.Len(); n != len(timestamps) {
|
||||
panic(fmt.Errorf("unexpected block length; got %d; want %d", n, len(timestamps)))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func newTestRows(rowsCount, fieldsPerRow int) ([]int64, [][]Field) {
|
||||
timestamps := make([]int64, rowsCount)
|
||||
rows := make([][]Field, rowsCount)
|
||||
for i := range timestamps {
|
||||
timestamps[i] = int64(i) * 1e9
|
||||
fields := make([]Field, fieldsPerRow)
|
||||
for j := range fields {
|
||||
f := &fields[j]
|
||||
f.Name = fmt.Sprintf("field_%d", j)
|
||||
f.Value = fmt.Sprintf("value_%d_%d", i, j)
|
||||
}
|
||||
rows[i] = fields
|
||||
}
|
||||
return timestamps, rows
|
||||
}
|
176
lib/logstorage/bloomfilter.go
Normal file
|
@ -0,0 +1,176 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/cespare/xxhash/v2"
|
||||
)
|
||||
|
||||
// bloomFilterHashesCount is the number of different hashes to use for bloom filter.
|
||||
const bloomFilterHashesCount = 6
|
||||
|
||||
// bloomFilterBitsPerItem is the number of bits to use per token.
|
||||
const bloomFilterBitsPerItem = 16
|
||||
|
||||
// bloomFilterMarshal appends marshaled bloom filter for tokens to dst and returns the result.
|
||||
func bloomFilterMarshal(dst []byte, tokens []string) []byte {
|
||||
bf := getBloomFilter()
|
||||
bf.mustInit(tokens)
|
||||
dst = bf.marshal(dst)
|
||||
putBloomFilter(bf)
|
||||
return dst
|
||||
}
|
||||
|
||||
type bloomFilter struct {
|
||||
bits []uint64
|
||||
}
|
||||
|
||||
func (bf *bloomFilter) reset() {
|
||||
bits := bf.bits
|
||||
for i := range bits {
|
||||
bits[i] = 0
|
||||
}
|
||||
bf.bits = bits[:0]
|
||||
}
|
||||
|
||||
// marshal appends marshaled bf to dst and returns the result.
|
||||
func (bf *bloomFilter) marshal(dst []byte) []byte {
|
||||
bits := bf.bits
|
||||
for _, word := range bits {
|
||||
dst = encoding.MarshalUint64(dst, word)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshal unmarshals bf from src.
|
||||
func (bf *bloomFilter) unmarshal(src []byte) error {
|
||||
if len(src)%8 != 0 {
|
||||
return fmt.Errorf("cannot unmarshal bloomFilter from src with size not multiple by 8; len(src)=%d", len(src))
|
||||
}
|
||||
bf.reset()
|
||||
wordsCount := len(src) / 8
|
||||
bits := bf.bits
|
||||
if n := wordsCount - cap(bits); n > 0 {
|
||||
bits = append(bits[:cap(bits)], make([]uint64, n)...)
|
||||
}
|
||||
bits = bits[:wordsCount]
|
||||
for i := range bits {
|
||||
bits[i] = encoding.UnmarshalUint64(src)
|
||||
src = src[8:]
|
||||
}
|
||||
bf.bits = bits
|
||||
return nil
|
||||
}
|
||||
|
||||
// mustInit initializes bf with the given tokens
|
||||
func (bf *bloomFilter) mustInit(tokens []string) {
|
||||
bitsCount := len(tokens) * bloomFilterBitsPerItem
|
||||
wordsCount := (bitsCount + 63) / 64
|
||||
bits := bf.bits
|
||||
if n := wordsCount - cap(bits); n > 0 {
|
||||
bits = append(bits[:cap(bits)], make([]uint64, n)...)
|
||||
}
|
||||
bits = bits[:wordsCount]
|
||||
bloomFilterAdd(bits, tokens)
|
||||
bf.bits = bits
|
||||
}
|
||||
|
||||
// bloomFilterAdd adds the given tokens to the bloom filter bits
|
||||
func bloomFilterAdd(bits []uint64, tokens []string) {
|
||||
maxBits := uint64(len(bits)) * 64
|
||||
var buf [8]byte
|
||||
hp := (*uint64)(unsafe.Pointer(&buf[0]))
|
||||
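// Derive bloomFilterHashesCount bit positions per token without computing independent
// hash functions: buf initially holds xxhash(token), and each iteration hashes buf
// again to obtain the next bit index and then increments the value stored in buf.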
for _, token := range tokens {
|
||||
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
|
||||
for i := 0; i < bloomFilterHashesCount; i++ {
|
||||
hi := xxhash.Sum64(buf[:])
|
||||
(*hp)++
|
||||
idx := hi % maxBits
|
||||
i := idx / 64
|
||||
j := idx % 64
|
||||
mask := uint64(1) << j
|
||||
w := bits[i]
|
||||
if (w & mask) == 0 {
|
||||
bits[i] = w | mask
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// containsAll returns true if bf contains all the given tokens.
|
||||
func (bf *bloomFilter) containsAll(tokens []string) bool {
|
||||
bits := bf.bits
|
||||
if len(bits) == 0 {
|
||||
return true
|
||||
}
|
||||
maxBits := uint64(len(bits)) * 64
|
||||
var buf [8]byte
|
||||
hp := (*uint64)(unsafe.Pointer(&buf[0]))
|
||||
for _, token := range tokens {
|
||||
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
|
||||
for i := 0; i < bloomFilterHashesCount; i++ {
|
||||
hi := xxhash.Sum64(buf[:])
|
||||
(*hp)++
|
||||
idx := hi % maxBits
|
||||
i := idx / 64
|
||||
j := idx % 64
|
||||
mask := uint64(1) << j
|
||||
w := bits[i]
|
||||
if (w & mask) == 0 {
|
||||
// The token is missing
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// containsAny returns true if bf contains at least a single token from the given tokens.
|
||||
func (bf *bloomFilter) containsAny(tokens []string) bool {
|
||||
bits := bf.bits
|
||||
if len(bits) == 0 {
|
||||
return true
|
||||
}
|
||||
maxBits := uint64(len(bits)) * 64
|
||||
var buf [8]byte
|
||||
hp := (*uint64)(unsafe.Pointer(&buf[0]))
|
||||
nextToken:
|
||||
for _, token := range tokens {
|
||||
*hp = xxhash.Sum64(bytesutil.ToUnsafeBytes(token))
|
||||
for i := 0; i < bloomFilterHashesCount; i++ {
|
||||
hi := xxhash.Sum64(buf[:])
|
||||
(*hp)++
|
||||
idx := hi % maxBits
|
||||
i := idx / 64
|
||||
j := idx % 64
|
||||
mask := uint64(1) << j
|
||||
w := bits[i]
|
||||
if (w & mask) == 0 {
|
||||
// The token is missing. Check the next token
|
||||
continue nextToken
|
||||
}
|
||||
}
|
||||
// It is likely the token exists in the bloom filter
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func getBloomFilter() *bloomFilter {
|
||||
v := bloomFilterPool.Get()
|
||||
if v == nil {
|
||||
return &bloomFilter{}
|
||||
}
|
||||
return v.(*bloomFilter)
|
||||
}
|
||||
|
||||
func putBloomFilter(bf *bloomFilter) {
|
||||
bf.reset()
|
||||
bloomFilterPool.Put(bf)
|
||||
}
|
||||
|
||||
var bloomFilterPool sync.Pool
|
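As a rough sanity check of the parameters above (an editorial sketch based on the standard Bloom filter approximation, not taken from this package): with bloomFilterBitsPerItem = 16 and bloomFilterHashesCount = 6 the expected false positive rate is roughly 0.09%, which is consistent with the 0.0011 threshold used in TestBloomFilterFalsePositive below.

package main

import (
	"fmt"
	"math"
)

func main() {
	const k = 6.0    // bloomFilterHashesCount
	const bpi = 16.0 // bloomFilterBitsPerItem, i.e. m/n bits per token
	// Classic approximation of the Bloom filter false positive rate: p = (1 - e^(-k*n/m))^k
	p := math.Pow(1-math.Exp(-k/bpi), k)
	fmt.Printf("expected false positive rate: %.5f\n", p) // ~0.00094
}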
84
lib/logstorage/bloomfilter_test.go
Normal file
|
@ -0,0 +1,84 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBloomFilter(t *testing.T) {
|
||||
f := func(tokens []string) {
|
||||
t.Helper()
|
||||
data := bloomFilterMarshal(nil, tokens)
|
||||
bf := getBloomFilter()
|
||||
defer putBloomFilter(bf)
|
||||
if err := bf.unmarshal(data); err != nil {
|
||||
t.Fatalf("unexpected error when unmarshaling bloom filter: %s", err)
|
||||
}
|
||||
for _, token := range tokens {
|
||||
if !bf.containsAny([]string{token}) {
|
||||
t.Fatalf("bloomFilterContains must return true for the added token %q", token)
|
||||
}
|
||||
}
|
||||
if !bf.containsAll(tokens) {
|
||||
t.Fatalf("bloomFilterContains must return true for the added tokens")
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]string{"foo"})
|
||||
f([]string{"foo", "bar", "baz"})
|
||||
|
||||
// 10k tokens
|
||||
tokens := make([]string, 10000)
|
||||
for i := range tokens {
|
||||
tokens[i] = fmt.Sprintf("token_%d", i)
|
||||
}
|
||||
f(tokens)
|
||||
}
|
||||
|
||||
func TestBloomFilterUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
bf := getBloomFilter()
|
||||
defer putBloomFilter(bf)
|
||||
if err := bf.unmarshal(data); err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
}
|
||||
f([]byte("a"))
|
||||
f([]byte("foo"))
|
||||
}
|
||||
|
||||
func TestBloomFilterUnmarshalGarbage(t *testing.T) {
|
||||
data := []byte("01234567")
|
||||
var bf bloomFilter
|
||||
if err := bf.unmarshal(data); err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBloomFilterFalsePositive(t *testing.T) {
|
||||
tokens := make([]string, 20000)
|
||||
for i := range tokens {
|
||||
tokens[i] = fmt.Sprintf("token_%d", i)
|
||||
}
|
||||
data := bloomFilterMarshal(nil, tokens)
|
||||
bf := getBloomFilter()
|
||||
defer putBloomFilter(bf)
|
||||
if err := bf.unmarshal(data); err != nil {
|
||||
t.Fatalf("unexpected error when unmarshaling bloom filter: %s", err)
|
||||
}
|
||||
|
||||
// count the number of false positives on 20K non-existing tokens
|
||||
falsePositives := 0
|
||||
for i := range tokens {
|
||||
token := fmt.Sprintf("non-existing-token_%d", i)
|
||||
if bf.containsAny([]string{token}) {
|
||||
falsePositives++
|
||||
}
|
||||
}
|
||||
p := float64(falsePositives) / float64(len(tokens))
|
||||
maxFalsePositive := 0.0011
|
||||
if p > maxFalsePositive {
|
||||
t.Fatalf("too high false positive rate; got %.4f; want %.4f max", p, maxFalsePositive)
|
||||
}
|
||||
}
|
32
lib/logstorage/consts.go
Normal file
|
@ -0,0 +1,32 @@
|
|||
package logstorage
|
||||
|
||||
// maxUncompressedIndexBlockSize is the maximum length of an uncompressed block with blockHeader entries aka index block.
|
||||
//
|
||||
// The real block length can exceed this value by a small percentage because of the block write details.
|
||||
const maxUncompressedIndexBlockSize = 128 * 1024
|
||||
|
||||
// maxUncompressedBlockSize is the maximum size of uncompressed block in bytes.
|
||||
//
|
||||
// The real uncompressed block size can exceed this value by up to 2 times because of block merge details.
|
||||
const maxUncompressedBlockSize = 2 * 1024 * 1024
|
||||
|
||||
// maxRowsPerBlock is the maximum number of log entries a single block can contain.
|
||||
const maxRowsPerBlock = 8 * 1024 * 1024
|
||||
|
||||
// maxColumnsPerBlock is the maximum number of columns per block.
|
||||
const maxColumnsPerBlock = 10000
|
||||
|
||||
// maxIndexBlockSize is the maximum size of the block with blockHeader entries (aka indexBlock)
|
||||
const maxIndexBlockSize = 8 * 1024 * 1024
|
||||
|
||||
// maxTimestampsBlockSize is the maximum size of timestamps block
|
||||
const maxTimestampsBlockSize = 8 * 1024 * 1024
|
||||
|
||||
// maxValuesBlockSize is the maximum size of values block
|
||||
const maxValuesBlockSize = 8 * 1024 * 1024
|
||||
|
||||
// maxBloomFilterBlockSize is the maximum size of bloom filter block
|
||||
const maxBloomFilterBlockSize = 8 * 1024 * 1024
|
||||
|
||||
// maxColumnsHeaderSize is the maximum size of columnsHeader block
|
||||
const maxColumnsHeaderSize = 8 * 1024 * 1024
|
990
lib/logstorage/datadb.go
Normal file
|
@ -0,0 +1,990 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
)
|
||||
|
||||
// Default number of parts to merge at once.
|
||||
//
|
||||
// This number has been obtained empirically - it gives the lowest possible overhead.
|
||||
// See appendPartsToMerge tests for details.
|
||||
const defaultPartsToMerge = 15
|
||||
|
||||
// minMergeMultiplier is the minimum multiplier for the size of the output part
|
||||
// compared to the size of the maximum input part for the merge.
|
||||
//
|
||||
// A higher value reduces write amplification (disk write IO induced by the merge),
// while increasing the number of unmerged parts.
// The value 1.7 is good enough for production workloads.
|
||||
const minMergeMultiplier = 1.7
|
||||
|
||||
// The maximum number of inmemory parts in the partition.
|
||||
//
|
||||
// If the number of inmemory parts reaches this value, then data ingestion is paced until background merges reduce the number of inmemory parts.
|
||||
const maxInmemoryPartsPerPartition = 20
|
||||
|
||||
// datadb represents a database with log data
|
||||
type datadb struct {
|
||||
// pt is the partition the datadb belongs to
|
||||
pt *partition
|
||||
|
||||
// mergeIdx is used for generating unique directory names for parts
|
||||
mergeIdx uint64
|
||||
|
||||
// path is the path to the directory with log data
|
||||
path string
|
||||
|
||||
// flushInterval is interval for flushing the inmemory parts to disk
|
||||
flushInterval time.Duration
|
||||
|
||||
// inmemoryParts contains a list of inmemory parts
|
||||
inmemoryParts []*partWrapper
|
||||
|
||||
// fileParts contains a list of file-based parts
|
||||
fileParts []*partWrapper
|
||||
|
||||
// partsLock protects parts from concurrent access
|
||||
partsLock sync.Mutex
|
||||
|
||||
// wg is used for determining when background workers stop
|
||||
wg sync.WaitGroup
|
||||
|
||||
// stopCh is used for notifying background workers to stop
|
||||
stopCh chan struct{}
|
||||
|
||||
// mergeDoneCond is used for pace-limiting the data ingestion rate
|
||||
mergeDoneCond *sync.Cond
|
||||
|
||||
// inmemoryPartsFlushersCount is the number of currently running in-memory parts flushers
|
||||
//
|
||||
// This variable must be accessed under partsLock.
|
||||
inmemoryPartsFlushersCount int
|
||||
|
||||
// mergeWorkersCount is the number of currently running merge workers
|
||||
//
|
||||
// This variable must be accessed under partsLock.
|
||||
mergeWorkersCount int
|
||||
}
|
||||
|
||||
// partWrapper is a wrapper for opened part.
|
||||
type partWrapper struct {
|
||||
// refCount is the number of references to p.
|
||||
//
|
||||
// When the number of references reaches zero, then p is closed.
|
||||
refCount int32
|
||||
|
||||
// mustBeDeleted is a flag, which is set when the part must be deleted after refCount reaches zero.
|
||||
mustBeDeleted uint32
|
||||
|
||||
// p is an opened part
|
||||
p *part
|
||||
|
||||
// mp references inmemory part used for initializing p.
|
||||
mp *inmemoryPart
|
||||
|
||||
// isInMerge is set to true if the part takes part in merge.
|
||||
isInMerge bool
|
||||
|
||||
// The deadline when in-memory part must be flushed to disk.
|
||||
flushDeadline time.Time
|
||||
}
|
||||
|
||||
func (pw *partWrapper) incRef() {
|
||||
atomic.AddInt32(&pw.refCount, 1)
|
||||
}
|
||||
|
||||
func (pw *partWrapper) decRef() {
|
||||
n := atomic.AddInt32(&pw.refCount, -1)
|
||||
if n > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
deletePath := ""
|
||||
if pw.mp == nil {
|
||||
if atomic.LoadUint32(&pw.mustBeDeleted) != 0 {
|
||||
deletePath = pw.p.path
|
||||
}
|
||||
} else {
|
||||
putInmemoryPart(pw.mp)
|
||||
pw.mp = nil
|
||||
}
|
||||
|
||||
mustClosePart(pw.p)
|
||||
pw.p = nil
|
||||
|
||||
if deletePath != "" {
|
||||
fs.MustRemoveAll(deletePath)
|
||||
}
|
||||
}
|
||||
|
||||
func mustCreateDatadb(path string) {
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
mustWritePartNames(path, []string{})
|
||||
}
|
||||
|
||||
// mustOpenDatadb opens datadb at the given path with the given flushInterval for in-memory data.
|
||||
func mustOpenDatadb(pt *partition, path string, flushInterval time.Duration) *datadb {
|
||||
// Remove temporary directories, which may be left after unclean shutdown.
|
||||
fs.MustRemoveTemporaryDirs(path)
|
||||
|
||||
partNames := mustReadPartNames(path)
|
||||
mustRemoveUnusedDirs(path, partNames)
|
||||
|
||||
pws := make([]*partWrapper, len(partNames))
|
||||
for i, partName := range partNames {
|
||||
partPath := filepath.Join(path, partName)
|
||||
p := mustOpenFilePart(pt, partPath)
|
||||
pws[i] = newPartWrapper(p, nil, time.Time{})
|
||||
}
|
||||
|
||||
ddb := &datadb{
|
||||
pt: pt,
|
||||
mergeIdx: uint64(time.Now().UnixNano()),
|
||||
flushInterval: flushInterval,
|
||||
path: path,
|
||||
fileParts: pws,
|
||||
stopCh: make(chan struct{}),
|
||||
}
|
||||
ddb.mergeDoneCond = sync.NewCond(&ddb.partsLock)
|
||||
|
||||
// Start merge workers in the hope they'll merge the remaining parts
|
||||
ddb.partsLock.Lock()
|
||||
n := getMergeWorkersCount()
|
||||
for i := 0; i < n; i++ {
|
||||
ddb.startMergeWorkerLocked()
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
|
||||
return ddb
|
||||
}
|
||||
|
||||
// startInmemoryPartsFlusherLocked starts flusher for in-memory parts to disk.
|
||||
//
|
||||
// This function must be called under partsLock.
|
||||
func (ddb *datadb) startInmemoryPartsFlusherLocked() {
|
||||
if ddb.inmemoryPartsFlushersCount >= 1 {
|
||||
return
|
||||
}
|
||||
ddb.inmemoryPartsFlushersCount++
|
||||
ddb.wg.Add(1)
|
||||
go func() {
|
||||
ddb.flushInmemoryParts()
|
||||
ddb.wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
func (ddb *datadb) flushInmemoryParts() {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
ddb.partsLock.Lock()
|
||||
pws := make([]*partWrapper, 0, len(ddb.inmemoryParts))
|
||||
pws = appendNotInMergePartsLocked(pws, ddb.inmemoryParts)
|
||||
currentTime := time.Now()
|
||||
partsToFlush := pws[:0]
|
||||
for _, pw := range pws {
|
||||
if pw.flushDeadline.Before(currentTime) {
|
||||
partsToFlush = append(partsToFlush, pw)
|
||||
}
|
||||
}
|
||||
setInMergeLocked(partsToFlush)
|
||||
if len(pws) == 0 {
|
||||
ddb.inmemoryPartsFlushersCount--
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
|
||||
if len(pws) == 0 {
|
||||
// There are no in-memory parts, so stop the flusher.
|
||||
return
|
||||
}
|
||||
ddb.mustMergePartsFinal(partsToFlush)
|
||||
|
||||
select {
|
||||
case <-ddb.stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// startMergeWorkerLocked starts a merge worker.
|
||||
//
|
||||
// This function must be called under locked partsLock.
|
||||
func (ddb *datadb) startMergeWorkerLocked() {
|
||||
if ddb.mergeWorkersCount >= getMergeWorkersCount() {
|
||||
return
|
||||
}
|
||||
ddb.mergeWorkersCount++
|
||||
ddb.wg.Add(1)
|
||||
go func() {
|
||||
globalMergeLimitCh <- struct{}{}
|
||||
ddb.mustMergeExistingParts()
|
||||
<-globalMergeLimitCh
|
||||
ddb.wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
// globalMergeLimitCh limits the number of concurrent merges across all the partitions
|
||||
var globalMergeLimitCh = make(chan struct{}, getMergeWorkersCount())
|
||||
|
||||
func getMergeWorkersCount() int {
|
||||
n := cgroup.AvailableCPUs()
|
||||
if n < 4 {
|
||||
// Use a bigger number of workers on systems with a small number of CPU cores,
// since a single worker may become busy for a long time when merging big parts.
// The remaining workers can then continue performing merges for newly added small parts.
|
||||
return 4
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func (ddb *datadb) mustMergeExistingParts() {
|
||||
for !needStop(ddb.stopCh) {
|
||||
maxOutBytes := ddb.availableDiskSpace()
|
||||
|
||||
ddb.partsLock.Lock()
|
||||
parts := make([]*partWrapper, 0, len(ddb.inmemoryParts)+len(ddb.fileParts))
|
||||
parts = appendNotInMergePartsLocked(parts, ddb.inmemoryParts)
|
||||
parts = appendNotInMergePartsLocked(parts, ddb.fileParts)
|
||||
pws := appendPartsToMerge(nil, parts, maxOutBytes)
|
||||
setInMergeLocked(pws)
|
||||
if len(pws) == 0 {
|
||||
ddb.mergeWorkersCount--
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
|
||||
if len(pws) == 0 {
|
||||
// Nothing to merge at the moment.
|
||||
return
|
||||
}
|
||||
|
||||
partsSize := getCompressedSize(pws)
|
||||
if !ddb.reserveDiskSpace(partsSize) {
|
||||
// There is no free disk space for the merge,
|
||||
// because concurrent merge workers already reserved the disk space.
|
||||
// Try again with smaller maxOutBytes.
|
||||
ddb.releasePartsToMerge(pws)
|
||||
continue
|
||||
}
|
||||
ddb.mustMergeParts(pws, false)
|
||||
ddb.releaseDiskSpace(partsSize)
|
||||
}
|
||||
}
|
||||
|
||||
// appendNotInMergePartsLocked appends src parts with isInMerge=false to dst and returns the result.
|
||||
//
|
||||
// This function must be called under partsLock.
|
||||
func appendNotInMergePartsLocked(dst, src []*partWrapper) []*partWrapper {
|
||||
for _, pw := range src {
|
||||
if !pw.isInMerge {
|
||||
dst = append(dst, pw)
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// setInMergeLocked sets isInMerge flag for pws.
|
||||
//
|
||||
// This function must be called under partsLock.
|
||||
func setInMergeLocked(pws []*partWrapper) {
|
||||
for _, pw := range pws {
|
||||
if pw.isInMerge {
|
||||
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to true")
|
||||
}
|
||||
pw.isInMerge = true
|
||||
}
|
||||
}
|
||||
|
||||
func assertIsInMerge(pws []*partWrapper) {
|
||||
for _, pw := range pws {
|
||||
if !pw.isInMerge {
|
||||
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to false")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mustMergeParts merges pws to a single resulting part.
|
||||
//
|
||||
// if isFinal is set, then the resulting part will be saved to disk.
|
||||
//
|
||||
// All the parts inside pws must have isInMerge field set to true.
|
||||
func (ddb *datadb) mustMergeParts(pws []*partWrapper, isFinal bool) {
|
||||
if len(pws) == 0 {
|
||||
// Nothing to merge.
|
||||
return
|
||||
}
|
||||
assertIsInMerge(pws)
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
// Initialize destination paths.
|
||||
dstPartType := ddb.getDstPartType(pws, isFinal)
|
||||
mergeIdx := ddb.nextMergeIdx()
|
||||
dstPartPath := ddb.getDstPartPath(dstPartType, mergeIdx)
|
||||
|
||||
if isFinal && len(pws) == 1 && pws[0].mp != nil {
|
||||
// Fast path: flush a single in-memory part to disk.
|
||||
mp := pws[0].mp
|
||||
mp.MustStoreToDisk(dstPartPath)
|
||||
pwNew := ddb.openCreatedPart(&mp.ph, pws, nil, dstPartPath)
|
||||
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
|
||||
return
|
||||
}
|
||||
|
||||
// Prepare blockStreamReaders for source parts.
|
||||
bsrs := mustOpenBlockStreamReaders(pws)
|
||||
|
||||
// Prepare BlockStreamWriter for destination part.
|
||||
srcSize := uint64(0)
|
||||
srcRowsCount := uint64(0)
|
||||
srcBlocksCount := uint64(0)
|
||||
for _, pw := range pws {
|
||||
srcSize += pw.p.ph.CompressedSizeBytes
|
||||
srcRowsCount += pw.p.ph.RowsCount
|
||||
srcBlocksCount += pw.p.ph.BlocksCount
|
||||
}
|
||||
bsw := getBlockStreamWriter()
|
||||
var mpNew *inmemoryPart
|
||||
if dstPartType == partInmemory {
|
||||
mpNew = getInmemoryPart()
|
||||
bsw.MustInitForInmemoryPart(mpNew)
|
||||
} else {
|
||||
nocache := !shouldUsePageCacheForPartSize(srcSize)
|
||||
bsw.MustInitForFilePart(dstPartPath, nocache)
|
||||
}
|
||||
|
||||
// Merge source parts to destination part.
|
||||
var ph partHeader
|
||||
stopCh := ddb.stopCh
|
||||
if isFinal {
|
||||
// The final merge shouldn't be stopped even if ddb.stopCh is closed.
|
||||
stopCh = nil
|
||||
}
|
||||
mustMergeBlockStreams(&ph, bsw, bsrs, stopCh)
|
||||
putBlockStreamWriter(bsw)
|
||||
for _, bsr := range bsrs {
|
||||
putBlockStreamReader(bsr)
|
||||
}
|
||||
|
||||
// Persist partHeader for destination part after the merge.
|
||||
if mpNew != nil {
|
||||
mpNew.ph = ph
|
||||
} else {
|
||||
ph.mustWriteMetadata(dstPartPath)
|
||||
// Make sure the created part directory listing is synced.
|
||||
fs.MustSyncPath(dstPartPath)
|
||||
}
|
||||
if needStop(stopCh) {
|
||||
ddb.releasePartsToMerge(pws)
|
||||
ddb.mergeDoneCond.Broadcast()
|
||||
// Remove incomplete destination part
|
||||
if dstPartType == partFile {
|
||||
fs.MustRemoveAll(dstPartPath)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Atomically swap the source parts with the newly created part.
|
||||
pwNew := ddb.openCreatedPart(&ph, pws, mpNew, dstPartPath)
|
||||
|
||||
dstSize := uint64(0)
|
||||
dstRowsCount := uint64(0)
|
||||
dstBlocksCount := uint64(0)
|
||||
if pwNew != nil {
|
||||
pDst := pwNew.p
|
||||
dstSize = pDst.ph.CompressedSizeBytes
|
||||
dstRowsCount = pDst.ph.RowsCount
|
||||
dstBlocksCount = pDst.ph.BlocksCount
|
||||
}
|
||||
|
||||
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
|
||||
|
||||
d := time.Since(startTime)
|
||||
if d <= 30*time.Second {
|
||||
return
|
||||
}
|
||||
|
||||
// Log stats for long merges.
|
||||
durationSecs := d.Seconds()
|
||||
rowsPerSec := int(float64(srcRowsCount) / durationSecs)
|
||||
logger.Infof("merged (%d parts, %d rows, %d blocks, %d bytes) into (1 part, %d rows, %d blocks, %d bytes) in %.3f seconds at %d rows/sec to %q",
|
||||
len(pws), srcRowsCount, srcBlocksCount, srcSize, dstRowsCount, dstBlocksCount, dstSize, durationSecs, rowsPerSec, dstPartPath)
|
||||
}
|
||||
|
||||
func (ddb *datadb) nextMergeIdx() uint64 {
|
||||
return atomic.AddUint64(&ddb.mergeIdx, 1)
|
||||
}
|
||||
|
||||
type partType int
|
||||
|
||||
var (
|
||||
partInmemory = partType(0)
|
||||
partFile = partType(1)
|
||||
)
|
||||
|
||||
func (ddb *datadb) getDstPartType(pws []*partWrapper, isFinal bool) partType {
|
||||
if isFinal {
|
||||
return partFile
|
||||
}
|
||||
dstPartSize := getCompressedSize(pws)
|
||||
if dstPartSize > getMaxInmemoryPartSize() {
|
||||
return partFile
|
||||
}
|
||||
if !areAllInmemoryParts(pws) {
|
||||
// If at least a single source part is located in file,
|
||||
// then the destination part must be in file for durability reasons.
|
||||
return partFile
|
||||
}
|
||||
return partInmemory
|
||||
}
|
||||
|
||||
func (ddb *datadb) getDstPartPath(dstPartType partType, mergeIdx uint64) string {
|
||||
ptPath := ddb.path
|
||||
dstPartPath := ""
|
||||
if dstPartType != partInmemory {
|
||||
dstPartPath = filepath.Join(ptPath, fmt.Sprintf("%016X", mergeIdx))
|
||||
}
|
||||
return dstPartPath
|
||||
}
|
||||
|
||||
func (ddb *datadb) openCreatedPart(ph *partHeader, pws []*partWrapper, mpNew *inmemoryPart, dstPartPath string) *partWrapper {
|
||||
// Open the created part.
|
||||
if ph.RowsCount == 0 {
|
||||
// The created part is empty. Remove it
|
||||
if mpNew == nil {
|
||||
fs.MustRemoveAll(dstPartPath)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
var p *part
|
||||
var flushDeadline time.Time
|
||||
if mpNew != nil {
|
||||
// Open the created part from memory.
|
||||
p = mustOpenInmemoryPart(ddb.pt, mpNew)
|
||||
flushDeadline = ddb.getFlushToDiskDeadline(pws)
|
||||
} else {
|
||||
// Open the created part from disk.
|
||||
p = mustOpenFilePart(ddb.pt, dstPartPath)
|
||||
}
|
||||
return newPartWrapper(p, mpNew, flushDeadline)
|
||||
}
|
||||
|
||||
func (ddb *datadb) mustAddRows(lr *LogRows) {
|
||||
if len(lr.streamIDs) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
mp := getInmemoryPart()
|
||||
mp.mustInitFromRows(lr)
|
||||
p := mustOpenInmemoryPart(ddb.pt, mp)
|
||||
|
||||
flushDeadline := time.Now().Add(ddb.flushInterval)
|
||||
pw := newPartWrapper(p, mp, flushDeadline)
|
||||
|
||||
ddb.partsLock.Lock()
|
||||
ddb.inmemoryParts = append(ddb.inmemoryParts, pw)
|
||||
ddb.startInmemoryPartsFlusherLocked()
|
||||
if len(ddb.inmemoryParts) > defaultPartsToMerge {
|
||||
ddb.startMergeWorkerLocked()
|
||||
}
|
||||
for len(ddb.inmemoryParts) > maxInmemoryPartsPerPartition {
|
||||
// Limit the data ingestion pace if too many inmemory parts are created.
|
||||
ddb.mergeDoneCond.Wait()
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
}
|
||||
|
||||
// DatadbStats contains various stats for datadb.
|
||||
type DatadbStats struct {
|
||||
// InmemoryRowsCount is the number of rows, which weren't flushed to disk yet.
|
||||
InmemoryRowsCount uint64
|
||||
|
||||
// FileRowsCount is the number of rows stored on disk.
|
||||
FileRowsCount uint64
|
||||
|
||||
// InmemoryParts is the number of in-memory parts, which weren't flushed to disk yet.
|
||||
InmemoryParts uint64
|
||||
|
||||
// FileParts is the number of file-based parts stored on disk.
|
||||
FileParts uint64
|
||||
|
||||
// InmemoryBlocks is the number of in-memory blocks, which weren't flushed to disk yet.
|
||||
InmemoryBlocks uint64
|
||||
|
||||
// FileBlocks is the number of file-based blocks stored on disk.
|
||||
FileBlocks uint64
|
||||
|
||||
// CompressedInmemorySize is the size of compressed data stored in memory.
|
||||
CompressedInmemorySize uint64
|
||||
|
||||
// CompressedFileSize is the size of compressed data stored on disk.
|
||||
CompressedFileSize uint64
|
||||
|
||||
// UncompressedInmemorySize is the size of uncompressed data stored in memory.
|
||||
UncompressedInmemorySize uint64
|
||||
|
||||
// UncompressedFileSize is the size of uncompressed data stored on disk.
|
||||
UncompressedFileSize uint64
|
||||
}
|
||||
|
||||
func (s *DatadbStats) reset() {
|
||||
*s = DatadbStats{}
|
||||
}
|
||||
|
||||
// RowsCount returns the number of rows stored in datadb.
|
||||
func (s *DatadbStats) RowsCount() uint64 {
|
||||
return s.InmemoryRowsCount + s.FileRowsCount
|
||||
}
|
||||
|
||||
// updateStats updates s with ddb stats
|
||||
func (ddb *datadb) updateStats(s *DatadbStats) {
|
||||
ddb.partsLock.Lock()
|
||||
|
||||
s.InmemoryRowsCount += getRowsCount(ddb.inmemoryParts)
|
||||
s.FileRowsCount += getRowsCount(ddb.fileParts)
|
||||
|
||||
s.InmemoryParts += uint64(len(ddb.inmemoryParts))
|
||||
s.FileParts += uint64(len(ddb.fileParts))
|
||||
|
||||
s.InmemoryBlocks += getBlocksCount(ddb.inmemoryParts)
|
||||
s.FileBlocks += getBlocksCount(ddb.fileParts)
|
||||
|
||||
s.CompressedInmemorySize += getCompressedSize(ddb.inmemoryParts)
|
||||
s.CompressedFileSize += getCompressedSize(ddb.fileParts)
|
||||
|
||||
s.UncompressedInmemorySize += getUncompressedSize(ddb.inmemoryParts)
|
||||
s.UncompressedFileSize += getUncompressedSize(ddb.fileParts)
|
||||
|
||||
ddb.partsLock.Unlock()
|
||||
}
|
||||
|
||||
// debugFlush() makes sure that the recently ingested data is available for search.
|
||||
func (ddb *datadb) debugFlush() {
|
||||
// Nothing to do, since all the ingested data is available for search via ddb.inmemoryParts.
|
||||
}
|
||||
|
||||
func (ddb *datadb) mustMergePartsFinal(pws []*partWrapper) {
|
||||
assertIsInMerge(pws)
|
||||
|
||||
var pwsChunk []*partWrapper
|
||||
for len(pws) > 0 {
|
||||
pwsChunk = appendPartsToMerge(pwsChunk[:0], pws, (1<<64)-1)
|
||||
if len(pwsChunk) == 0 {
|
||||
pwsChunk = append(pwsChunk[:0], pws...)
|
||||
}
|
||||
ddb.mustMergeParts(pwsChunk, true)
|
||||
|
||||
partsToRemove := partsToMap(pwsChunk)
|
||||
removedParts := 0
|
||||
pws, removedParts = removeParts(pws, partsToRemove)
|
||||
if removedParts != len(pwsChunk) {
|
||||
logger.Panicf("BUG: unexpected number of parts removed; got %d; want %d", removedParts, len(pwsChunk))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func partsToMap(pws []*partWrapper) map[*partWrapper]struct{} {
|
||||
m := make(map[*partWrapper]struct{}, len(pws))
|
||||
for _, pw := range pws {
|
||||
m[pw] = struct{}{}
|
||||
}
|
||||
if len(m) != len(pws) {
|
||||
logger.Panicf("BUG: %d duplicate parts found out of %d parts", len(pws)-len(m), len(pws))
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func (ddb *datadb) swapSrcWithDstParts(pws []*partWrapper, pwNew *partWrapper, dstPartType partType) {
|
||||
// Atomically unregister old parts and add new part to pt.
|
||||
partsToRemove := partsToMap(pws)
|
||||
removedInmemoryParts := 0
|
||||
removedFileParts := 0
|
||||
|
||||
ddb.partsLock.Lock()
|
||||
|
||||
ddb.inmemoryParts, removedInmemoryParts = removeParts(ddb.inmemoryParts, partsToRemove)
|
||||
ddb.fileParts, removedFileParts = removeParts(ddb.fileParts, partsToRemove)
|
||||
if pwNew != nil {
|
||||
switch dstPartType {
|
||||
case partInmemory:
|
||||
ddb.inmemoryParts = append(ddb.inmemoryParts, pwNew)
|
||||
ddb.startInmemoryPartsFlusherLocked()
|
||||
case partFile:
|
||||
ddb.fileParts = append(ddb.fileParts, pwNew)
|
||||
default:
|
||||
logger.Panicf("BUG: unknown partType=%d", dstPartType)
|
||||
}
|
||||
if len(ddb.inmemoryParts)+len(ddb.fileParts) > defaultPartsToMerge {
|
||||
ddb.startMergeWorkerLocked()
|
||||
}
|
||||
}
|
||||
|
||||
// Atomically store the updated list of file-based parts on disk.
|
||||
// This must be performed under partsLock in order to prevent races
// when multiple concurrently running goroutines update the list.
|
||||
if removedFileParts > 0 || pwNew != nil && dstPartType == partFile {
|
||||
partNames := getPartNames(ddb.fileParts)
|
||||
mustWritePartNames(ddb.path, partNames)
|
||||
}
|
||||
|
||||
ddb.partsLock.Unlock()
|
||||
|
||||
removedParts := removedInmemoryParts + removedFileParts
|
||||
if removedParts != len(partsToRemove) {
|
||||
logger.Panicf("BUG: unexpected number of parts removed; got %d, want %d", removedParts, len(partsToRemove))
|
||||
}
|
||||
|
||||
// Mark old parts as must be deleted and decrement reference count,
|
||||
// so they are eventually closed and deleted.
|
||||
for _, pw := range pws {
|
||||
atomic.StoreUint32(&pw.mustBeDeleted, 1)
|
||||
pw.decRef()
|
||||
}
|
||||
|
||||
ddb.mergeDoneCond.Broadcast()
|
||||
}
|
||||
|
||||
func removeParts(pws []*partWrapper, partsToRemove map[*partWrapper]struct{}) ([]*partWrapper, int) {
|
||||
dst := pws[:0]
|
||||
for _, pw := range pws {
|
||||
if _, ok := partsToRemove[pw]; !ok {
|
||||
dst = append(dst, pw)
|
||||
}
|
||||
}
|
||||
for i := len(dst); i < len(pws); i++ {
|
||||
pws[i] = nil
|
||||
}
|
||||
return dst, len(pws) - len(dst)
|
||||
}
|
||||
|
||||
func mustOpenBlockStreamReaders(pws []*partWrapper) []*blockStreamReader {
|
||||
bsrs := make([]*blockStreamReader, 0, len(pws))
|
||||
for _, pw := range pws {
|
||||
bsr := getBlockStreamReader()
|
||||
if pw.mp != nil {
|
||||
bsr.MustInitFromInmemoryPart(pw.mp)
|
||||
} else {
|
||||
bsr.MustInitFromFilePart(pw.p.path)
|
||||
}
|
||||
bsrs = append(bsrs, bsr)
|
||||
}
|
||||
return bsrs
|
||||
}
|
||||
|
||||
func newPartWrapper(p *part, mp *inmemoryPart, flushDeadline time.Time) *partWrapper {
|
||||
pw := &partWrapper{
|
||||
p: p,
|
||||
mp: mp,
|
||||
|
||||
flushDeadline: flushDeadline,
|
||||
}
|
||||
|
||||
// Increase reference counter for newly created part - it is decreased when the part
|
||||
// is removed from the list of open parts.
|
||||
pw.incRef()
|
||||
|
||||
return pw
|
||||
}
|
||||
|
||||
func (ddb *datadb) getFlushToDiskDeadline(pws []*partWrapper) time.Time {
|
||||
d := time.Now().Add(ddb.flushInterval)
|
||||
for _, pw := range pws {
|
||||
if pw.mp != nil && pw.flushDeadline.Before(d) {
|
||||
d = pw.flushDeadline
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func getMaxInmemoryPartSize() uint64 {
|
||||
// Allocate 10% of allowed memory for in-memory parts.
|
||||
n := uint64(0.1 * float64(memory.Allowed()) / maxInmemoryPartsPerPartition)
|
||||
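// For instance, with a hypothetical memory.Allowed() of 8 GiB this yields
// 0.1*8GiB/20 = ~43 MB per in-memory part, well above the 1e6 floor below.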
if n < 1e6 {
|
||||
n = 1e6
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func areAllInmemoryParts(pws []*partWrapper) bool {
|
||||
for _, pw := range pws {
|
||||
if pw.mp == nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (ddb *datadb) releasePartsToMerge(pws []*partWrapper) {
|
||||
ddb.partsLock.Lock()
|
||||
for _, pw := range pws {
|
||||
if !pw.isInMerge {
|
||||
logger.Panicf("BUG: missing isInMerge flag on the part %q", pw.p.path)
|
||||
}
|
||||
pw.isInMerge = false
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
}
|
||||
|
||||
func (ddb *datadb) availableDiskSpace() uint64 {
|
||||
available := fs.MustGetFreeSpace(ddb.path)
|
||||
reserved := atomic.LoadUint64(&reservedDiskSpace)
|
||||
if available < reserved {
|
||||
return 0
|
||||
}
|
||||
return available - reserved
|
||||
}
|
||||
|
||||
func (ddb *datadb) reserveDiskSpace(n uint64) bool {
|
||||
available := fs.MustGetFreeSpace(ddb.path)
|
||||
reserved := atomic.AddUint64(&reservedDiskSpace, n)
|
||||
if available > reserved {
|
||||
return true
|
||||
}
|
||||
ddb.releaseDiskSpace(n)
|
||||
return false
|
||||
}
|
||||
|
||||
func (ddb *datadb) releaseDiskSpace(n uint64) {
|
||||
atomic.AddUint64(&reservedDiskSpace, -n)
|
||||
}
|
||||
|
||||
// reservedDiskSpace tracks global reserved disk space for currently executed
|
||||
// background merges across all the partitions.
|
||||
//
|
||||
// It helps avoid starting background merges when there is not enough free disk space.
|
||||
var reservedDiskSpace uint64
|
||||
|
||||
func needStop(stopCh <-chan struct{}) bool {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// mustCloseDatadb can be called only when nobody accesses ddb.
|
||||
func mustCloseDatadb(ddb *datadb) {
|
||||
// Stop background workers
|
||||
close(ddb.stopCh)
|
||||
ddb.wg.Wait()
|
||||
|
||||
// flush in-memory data to disk
|
||||
pws := append([]*partWrapper{}, ddb.inmemoryParts...)
|
||||
setInMergeLocked(pws)
|
||||
ddb.mustMergePartsFinal(pws)
|
||||
|
||||
// There is no need to use ddb.partsLock here, since nobody should access ddb now.
|
||||
for _, pw := range ddb.inmemoryParts {
|
||||
pw.decRef()
|
||||
if pw.refCount != 0 {
|
||||
logger.Panicf("BUG: there are %d references to inmemoryPart", pw.refCount)
|
||||
}
|
||||
}
|
||||
ddb.inmemoryParts = nil
|
||||
|
||||
for _, pw := range ddb.fileParts {
|
||||
pw.decRef()
|
||||
if pw.refCount != 0 {
|
||||
logger.Panicf("BUG: ther are %d references to filePart", pw.refCount)
|
||||
}
|
||||
}
|
||||
ddb.fileParts = nil
|
||||
|
||||
ddb.path = ""
|
||||
ddb.pt = nil
|
||||
}
|
||||
|
||||
func getPartNames(pws []*partWrapper) []string {
|
||||
partNames := make([]string, 0, len(pws))
|
||||
for _, pw := range pws {
|
||||
if pw.mp != nil {
|
||||
// Skip in-memory parts
|
||||
continue
|
||||
}
|
||||
partName := filepath.Base(pw.p.path)
|
||||
partNames = append(partNames, partName)
|
||||
}
|
||||
sort.Strings(partNames)
|
||||
return partNames
|
||||
}
|
||||
|
||||
func mustWritePartNames(path string, partNames []string) {
|
||||
data, err := json.Marshal(partNames)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: cannot marshal partNames to JSON: %s", err)
|
||||
}
|
||||
partNamesPath := filepath.Join(path, partsFilename)
|
||||
fs.MustWriteAtomic(partNamesPath, data, true)
|
||||
}
|
||||
|
||||
func mustReadPartNames(path string) []string {
|
||||
partNamesPath := filepath.Join(path, partsFilename)
|
||||
data, err := os.ReadFile(partNamesPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot read %s: %s", partNamesPath, err)
|
||||
}
|
||||
var partNames []string
|
||||
if err := json.Unmarshal(data, &partNames); err != nil {
|
||||
logger.Panicf("FATAL: cannot parse %s: %s", partNamesPath, err)
|
||||
}
|
||||
return partNames
|
||||
}
|
||||
|
||||
// mustRemoveUnusedDirs removes dirs at path, which are missing in partNames.
|
||||
//
|
||||
// These dirs may be left after unclean shutdown.
|
||||
func mustRemoveUnusedDirs(path string, partNames []string) {
|
||||
des := fs.MustReadDir(path)
|
||||
m := make(map[string]struct{}, len(partNames))
|
||||
for _, partName := range partNames {
|
||||
m[partName] = struct{}{}
|
||||
}
|
||||
removedDirs := 0
|
||||
for _, de := range des {
|
||||
if !fs.IsDirOrSymlink(de) {
|
||||
// Skip non-directories.
|
||||
continue
|
||||
}
|
||||
fn := de.Name()
|
||||
if _, ok := m[fn]; !ok {
|
||||
deletePath := filepath.Join(path, fn)
|
||||
fs.MustRemoveAll(deletePath)
|
||||
removedDirs++
|
||||
}
|
||||
}
|
||||
if removedDirs > 0 {
|
||||
fs.MustSyncPath(path)
|
||||
}
|
||||
}
|
||||
|
||||
// appendPartsToMerge finds optimal parts to merge from src,
|
||||
// appends them to dst and returns the result.
|
||||
func appendPartsToMerge(dst, src []*partWrapper, maxOutBytes uint64) []*partWrapper {
|
||||
if len(src) < 2 {
|
||||
// There is no need to merge zero or one part :)
|
||||
return dst
|
||||
}
|
||||
|
||||
// Filter out too big parts.
|
||||
// This should reduce N for O(N^2) algorithm below.
|
||||
maxInPartBytes := uint64(float64(maxOutBytes) / minMergeMultiplier)
|
||||
tmp := make([]*partWrapper, 0, len(src))
|
||||
for _, pw := range src {
|
||||
if pw.p.ph.CompressedSizeBytes > maxInPartBytes {
|
||||
continue
|
||||
}
|
||||
tmp = append(tmp, pw)
|
||||
}
|
||||
src = tmp
|
||||
|
||||
sortPartsForOptimalMerge(src)
|
||||
|
||||
maxSrcParts := defaultPartsToMerge
|
||||
if maxSrcParts > len(src) {
|
||||
maxSrcParts = len(src)
|
||||
}
|
||||
minSrcParts := (maxSrcParts + 1) / 2
|
||||
if minSrcParts < 2 {
|
||||
minSrcParts = 2
|
||||
}
|
||||
|
||||
// Exhaustive search for parts giving the lowest write amplification when merged.
|
||||
var pws []*partWrapper
|
||||
maxM := float64(0)
|
||||
for i := minSrcParts; i <= maxSrcParts; i++ {
|
||||
for j := 0; j <= len(src)-i; j++ {
|
||||
a := src[j : j+i]
|
||||
if a[0].p.ph.CompressedSizeBytes*uint64(len(a)) < a[len(a)-1].p.ph.CompressedSizeBytes {
|
||||
// Do not merge parts with too big difference in size,
|
||||
// since this results in unbalanced merges.
|
||||
continue
|
||||
}
|
||||
outSize := getCompressedSize(a)
|
||||
if outSize > maxOutBytes {
|
||||
// There is no need to verify the remaining parts with bigger sizes.
|
||||
break
|
||||
}
|
||||
m := float64(outSize) / float64(a[len(a)-1].p.ph.CompressedSizeBytes)
|
||||
if m < maxM {
|
||||
continue
|
||||
}
|
||||
maxM = m
|
||||
pws = a
|
||||
}
|
||||
}
|
||||
|
||||
minM := float64(defaultPartsToMerge) / 2
|
||||
if minM < minMergeMultiplier {
|
||||
minM = minMergeMultiplier
|
||||
}
|
||||
if maxM < minM {
|
||||
// It makes no sense to merge parts with too small a multiplier m,
// since this leads to high disk write IO.
|
||||
return dst
|
||||
}
|
||||
return append(dst, pws...)
|
||||
}
|
||||
|
||||
func sortPartsForOptimalMerge(pws []*partWrapper) {
|
||||
// Sort src parts by size and backwards timestamp.
|
||||
// This should improve adjacent points' locality in the merged parts.
|
||||
sort.Slice(pws, func(i, j int) bool {
|
||||
a := &pws[i].p.ph
|
||||
b := &pws[j].p.ph
|
||||
if a.CompressedSizeBytes == b.CompressedSizeBytes {
|
||||
return a.MinTimestamp > b.MinTimestamp
|
||||
}
|
||||
return a.CompressedSizeBytes < b.CompressedSizeBytes
|
||||
})
|
||||
}
|
||||
|
||||
func getCompressedSize(pws []*partWrapper) uint64 {
|
||||
n := uint64(0)
|
||||
for _, pw := range pws {
|
||||
n += pw.p.ph.CompressedSizeBytes
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func getUncompressedSize(pws []*partWrapper) uint64 {
|
||||
n := uint64(0)
|
||||
for _, pw := range pws {
|
||||
n += pw.p.ph.UncompressedSizeBytes
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func getRowsCount(pws []*partWrapper) uint64 {
|
||||
n := uint64(0)
|
||||
for _, pw := range pws {
|
||||
n += pw.p.ph.RowsCount
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func getBlocksCount(pws []*partWrapper) uint64 {
|
||||
n := uint64(0)
|
||||
for _, pw := range pws {
|
||||
n += pw.p.ph.BlocksCount
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func shouldUsePageCacheForPartSize(size uint64) bool {
|
||||
mem := memory.Remaining() / defaultPartsToMerge
|
||||
return size <= uint64(mem)
|
||||
}
|
91
lib/logstorage/datadb_test.go
Normal file
@@ -0,0 +1,91 @@
package logstorage
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAppendPartsToMergeManyParts(t *testing.T) {
|
||||
// Verify that a big number of parts is merged into a minimal number of parts
|
||||
// using minimum merges.
|
||||
var sizes []uint64
|
||||
maxOutSize := uint64(0)
|
||||
r := rand.New(rand.NewSource(1))
|
||||
for i := 0; i < 1024; i++ {
|
||||
n := uint64(uint32(r.NormFloat64() * 1e9))
|
||||
n++
|
||||
maxOutSize += n
|
||||
sizes = append(sizes, n)
|
||||
}
|
||||
pws := newTestPartWrappersForSizes(sizes)
|
||||
|
||||
iterationsCount := 0
|
||||
sizeMergedTotal := uint64(0)
|
||||
for {
|
||||
pms := appendPartsToMerge(nil, pws, maxOutSize)
|
||||
if len(pms) == 0 {
|
||||
break
|
||||
}
|
||||
m := make(map[*partWrapper]bool)
|
||||
for _, pw := range pms {
|
||||
m[pw] = true
|
||||
}
|
||||
var pwsNew []*partWrapper
|
||||
size := uint64(0)
|
||||
for _, pw := range pws {
|
||||
if m[pw] {
|
||||
size += pw.p.ph.CompressedSizeBytes
|
||||
} else {
|
||||
pwsNew = append(pwsNew, pw)
|
||||
}
|
||||
}
|
||||
pw := &partWrapper{
|
||||
p: &part{
|
||||
ph: partHeader{
|
||||
CompressedSizeBytes: size,
|
||||
},
|
||||
},
|
||||
}
|
||||
sizeMergedTotal += size
|
||||
pwsNew = append(pwsNew, pw)
|
||||
pws = pwsNew
|
||||
iterationsCount++
|
||||
}
|
||||
sizes = newTestSizesFromPartWrappers(pws)
|
||||
sizeTotal := uint64(0)
|
||||
for _, size := range sizes {
|
||||
sizeTotal += uint64(size)
|
||||
}
|
||||
overhead := float64(sizeMergedTotal) / float64(sizeTotal)
|
||||
if overhead > 2.1 {
|
||||
t.Fatalf("too big overhead; sizes=%d, iterationsCount=%d, sizeTotal=%d, sizeMergedTotal=%d, overhead=%f",
|
||||
sizes, iterationsCount, sizeTotal, sizeMergedTotal, overhead)
|
||||
}
|
||||
if len(sizes) > 18 {
|
||||
t.Fatalf("too many sizes %d; sizes=%d, iterationsCount=%d, sizeTotal=%d, sizeMergedTotal=%d, overhead=%f",
|
||||
len(sizes), sizes, iterationsCount, sizeTotal, sizeMergedTotal, overhead)
|
||||
}
|
||||
}
|
||||
|
||||
func newTestSizesFromPartWrappers(pws []*partWrapper) []uint64 {
|
||||
var sizes []uint64
|
||||
for _, pw := range pws {
|
||||
sizes = append(sizes, pw.p.ph.CompressedSizeBytes)
|
||||
}
|
||||
return sizes
|
||||
}
|
||||
|
||||
func newTestPartWrappersForSizes(sizes []uint64) []*partWrapper {
|
||||
var pws []*partWrapper
|
||||
for _, size := range sizes {
|
||||
pw := &partWrapper{
|
||||
p: &part{
|
||||
ph: partHeader{
|
||||
CompressedSizeBytes: size,
|
||||
},
|
||||
},
|
||||
}
|
||||
pws = append(pws, pw)
|
||||
}
|
||||
return pws
|
||||
}
|
314
lib/logstorage/encoding.go
Normal file
@@ -0,0 +1,314 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
)
|
||||
|
||||
// marshalStringsBlock marshals a and appends the result to dst.
|
||||
//
|
||||
// The marshaled strings block can be unmarshaled with stringsBlockUnmarshaler.
|
||||
func marshalStringsBlock(dst []byte, a []string) []byte {
|
||||
// Encode string lengths
|
||||
u64s := encoding.GetUint64s(len(a))
|
||||
aLens := u64s.A[:0]
|
||||
for _, s := range a {
|
||||
aLens = append(aLens, uint64(len(s)))
|
||||
}
|
||||
u64s.A = aLens
|
||||
dst = marshalUint64Block(dst, u64s.A)
|
||||
encoding.PutUint64s(u64s)
|
||||
|
||||
// Encode strings
|
||||
bb := bbPool.Get()
|
||||
b := bb.B
|
||||
for _, s := range a {
|
||||
b = append(b, s...)
|
||||
}
|
||||
bb.B = b
|
||||
dst = marshalBytesBlock(dst, bb.B)
|
||||
bbPool.Put(bb)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
// stringsBlockUnmarshaler is used for unmarshaling the block returned from marshalStringsBlock()
|
||||
//
|
||||
// use getStringsBlockUnmarshaler() for obtaining the unmarshaler from the pool in order to save memory allocations.
|
||||
type stringsBlockUnmarshaler struct {
|
||||
// data contains the data for the unmarshaled values
|
||||
data []byte
|
||||
}
|
||||
|
||||
func (sbu *stringsBlockUnmarshaler) reset() {
|
||||
sbu.data = sbu.data[:0]
|
||||
}
|
||||
|
||||
// unmarshal unmarshals itemsCount strings from src, appends them to dst and returns the result.
|
||||
//
|
||||
// The returned strings are valid until sbu.reset() call.
|
||||
func (sbu *stringsBlockUnmarshaler) unmarshal(dst []string, src []byte, itemsCount uint64) ([]string, error) {
|
||||
u64s := encoding.GetUint64s(0)
|
||||
defer encoding.PutUint64s(u64s)
|
||||
|
||||
// Decode string lengths
|
||||
var tail []byte
|
||||
var err error
|
||||
u64s.A, tail, err = unmarshalUint64Block(u64s.A[:0], src, itemsCount)
|
||||
if err != nil {
|
||||
return dst, fmt.Errorf("cannot unmarshal string lengths: %w", err)
|
||||
}
|
||||
aLens := u64s.A
|
||||
src = tail
|
||||
|
||||
// Read bytes block into sbu.data
|
||||
dataLen := len(sbu.data)
|
||||
sbu.data, tail, err = unmarshalBytesBlock(sbu.data, src)
|
||||
if err != nil {
|
||||
return dst, fmt.Errorf("cannot unmarshal bytes block with strings: %w", err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
return dst, fmt.Errorf("unexpected non-empty tail after reading bytes block with strings; len(tail)=%d", len(tail))
|
||||
}
|
||||
|
||||
// Decode strings from sbu.data into dst
|
||||
data := sbu.data[dataLen:]
|
||||
for _, sLen := range aLens {
|
||||
if uint64(len(data)) < sLen {
|
||||
return dst, fmt.Errorf("cannot unmarshal a string with the length %d bytes from %d bytes", sLen, len(data))
|
||||
}
|
||||
s := bytesutil.ToUnsafeString(data[:sLen])
|
||||
data = data[sLen:]
|
||||
dst = append(dst, s)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
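// NOTE (editor's sketch, not part of the original diff): a round trip through
// marshalStringsBlock and stringsBlockUnmarshaler. The number of strings is not
// stored inside the block, so the caller must pass itemsCount to unmarshal():
//
//	a := []string{"foo", "bar", "baz"}
//	data := marshalStringsBlock(nil, a)
//	sbu := getStringsBlockUnmarshaler()
//	values, err := sbu.unmarshal(nil, data, uint64(len(a)))
//	if err != nil {
//		// handle the error
//	}
//	// values now equals a; the returned strings remain valid only until
//	// sbu.reset() is called via putStringsBlockUnmarshaler(sbu).
//	putStringsBlockUnmarshaler(sbu)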
|
||||
|
||||
// marshalUint64Block appends marshaled a to dst and returns the result.
|
||||
func marshalUint64Block(dst []byte, a []uint64) []byte {
|
||||
bb := bbPool.Get()
|
||||
bb.B = marshalUint64Items(bb.B[:0], a)
|
||||
dst = marshalBytesBlock(dst, bb.B)
|
||||
bbPool.Put(bb)
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshalUint64Block appends unmarshaled from src itemsCount uint64 items to dst and returns the result.
|
||||
func unmarshalUint64Block(dst []uint64, src []byte, itemsCount uint64) ([]uint64, []byte, error) {
|
||||
bb := bbPool.Get()
|
||||
defer bbPool.Put(bb)
|
||||
|
||||
// Unmarshal the underlying bytes block
|
||||
var err error
|
||||
bb.B, src, err = unmarshalBytesBlock(bb.B[:0], src)
|
||||
if err != nil {
|
||||
return dst, src, fmt.Errorf("cannot unmarshal bytes block: %w", err)
|
||||
}
|
||||
|
||||
// Unmarshal the items from bb.
|
||||
dst, err = unmarshalUint64Items(dst, bb.B, itemsCount)
|
||||
if err != nil {
|
||||
return dst, src, fmt.Errorf("cannot unmarshal %d uint64 items from bytes block of length %d bytes: %w", itemsCount, len(bb.B), err)
|
||||
}
|
||||
return dst, src, nil
|
||||
}
|
||||
|
||||
const (
|
||||
uintBlockType8 = 0
|
||||
uintBlockType16 = 1
|
||||
uintBlockType32 = 2
|
||||
uintBlockType64 = 3
|
||||
)
|
||||
|
||||
// marshalUint64Items appends the marshaled a items to dst and returns the result.
|
||||
func marshalUint64Items(dst []byte, a []uint64) []byte {
|
||||
// Do not marshal len(a), since it is expected that unmarshaler knows it.
|
||||
nMax := uint64(0)
|
||||
for _, n := range a {
|
||||
if n > nMax {
|
||||
nMax = n
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case nMax < (1 << 8):
|
||||
dst = append(dst, uintBlockType8)
|
||||
for _, n := range a {
|
||||
dst = append(dst, byte(n))
|
||||
}
|
||||
case nMax < (1 << 16):
|
||||
dst = append(dst, uintBlockType16)
|
||||
for _, n := range a {
|
||||
dst = encoding.MarshalUint16(dst, uint16(n))
|
||||
}
|
||||
case nMax < (1 << 32):
|
||||
dst = append(dst, uintBlockType32)
|
||||
for _, n := range a {
|
||||
dst = encoding.MarshalUint32(dst, uint32(n))
|
||||
}
|
||||
default:
|
||||
dst = append(dst, uintBlockType64)
|
||||
for _, n := range a {
|
||||
dst = encoding.MarshalUint64(dst, uint64(n))
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshalUint64Items appends unmarshaled from src itemsCount uint64 items to dst and returns the result.
|
||||
func unmarshalUint64Items(dst []uint64, src []byte, itemsCount uint64) ([]uint64, error) {
|
||||
// Unmarshal block type
|
||||
if len(src) < 1 {
|
||||
return dst, fmt.Errorf("cannot unmarshal uint64 block type from empty src")
|
||||
}
|
||||
blockType := src[0]
|
||||
src = src[1:]
|
||||
|
||||
switch blockType {
|
||||
case uintBlockType8:
|
||||
// A block with items smaller than 1<<8; each item occupies a single byte
|
||||
if uint64(len(src)) != itemsCount {
|
||||
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), itemsCount)
|
||||
}
|
||||
for _, v := range src {
|
||||
dst = append(dst, uint64(v))
|
||||
}
|
||||
case uintBlockType16:
|
||||
// A block with items smaller than 1<<16; each item occupies 2 bytes
|
||||
if uint64(len(src)) != 2*itemsCount {
|
||||
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 2*itemsCount)
|
||||
}
|
||||
for len(src) > 0 {
|
||||
v := encoding.UnmarshalUint16(src)
|
||||
src = src[2:]
|
||||
dst = append(dst, uint64(v))
|
||||
}
|
||||
case uintBlockType32:
|
||||
// A block with items smaller than 1<<32; each item occupies 4 bytes
|
||||
if uint64(len(src)) != 4*itemsCount {
|
||||
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 4*itemsCount)
|
||||
}
|
||||
for len(src) > 0 {
|
||||
v := encoding.UnmarshalUint32(src)
|
||||
src = src[4:]
|
||||
dst = append(dst, uint64(v))
|
||||
}
|
||||
case uintBlockType64:
|
||||
// A block with arbitrary uint64 items; each item occupies 8 bytes
|
||||
if uint64(len(src)) != 8*itemsCount {
|
||||
return dst, fmt.Errorf("unexpected block length for %d items; got %d bytes; want %d bytes", itemsCount, len(src), 8*itemsCount)
|
||||
}
|
||||
for len(src) > 0 {
|
||||
v := encoding.UnmarshalUint64(src)
|
||||
src = src[8:]
|
||||
dst = append(dst, v)
|
||||
}
|
||||
default:
|
||||
return dst, fmt.Errorf("unexpected uint64 block type: %d; want 0, 1, 2 or 3", blockType)
|
||||
}
|
||||
return dst, nil
|
||||
}
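// NOTE (editor's sketch, not part of the original diff): the uint64 block consists of
// a single type byte (uintBlockType8/16/32/64) followed by fixed-width encodings of
// every item; the width is chosen from the maximum value in the block. For example:
//
//	a := []uint64{3, 70000, 12}
//	data := marshalUint64Items(nil, a) // type byte uintBlockType32, then 4 bytes per item
//	items, err := unmarshalUint64Items(nil, data, uint64(len(a)))
//	// err == nil and items equals a; the item count is supplied by the caller,
//	// since it is not part of the encoded block.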
|
||||
|
||||
const (
|
||||
marshalBytesTypePlain = 0
|
||||
marshalBytesTypeZSTD = 1
|
||||
)
|
||||
|
||||
func marshalBytesBlock(dst, src []byte) []byte {
|
||||
if len(src) < 128 {
|
||||
// Marshal the block in plain without compression
|
||||
dst = append(dst, marshalBytesTypePlain)
|
||||
dst = append(dst, byte(len(src)))
|
||||
return append(dst, src...)
|
||||
}
|
||||
|
||||
// Compress the block
|
||||
dst = append(dst, marshalBytesTypeZSTD)
|
||||
bb := bbPool.Get()
|
||||
bb.B = encoding.CompressZSTDLevel(bb.B[:0], src, 1)
|
||||
dst = encoding.MarshalVarUint64(dst, uint64(len(bb.B)))
|
||||
dst = append(dst, bb.B...)
|
||||
bbPool.Put(bb)
|
||||
return dst
|
||||
}
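// NOTE (editor's sketch, not part of the original diff): the bytes block is framed as
// a type byte, a length and the payload:
//
//	marshalBytesTypePlain (0): 1-byte length (len(src) < 128), then the raw bytes
//	marshalBytesTypeZSTD  (1): varuint64 compressed length, then zstd-compressed bytes
//
// Blocks shorter than 128 bytes are stored as-is, since compressing them is unlikely
// to pay off; longer blocks are compressed with zstd at level 1. Round trip:
//
//	data := marshalBytesBlock(nil, payload)
//	plain, tail, err := unmarshalBytesBlock(nil, data) // plain == payload, len(tail) == 0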
|
||||
|
||||
func unmarshalBytesBlock(dst, src []byte) ([]byte, []byte, error) {
|
||||
if len(src) < 1 {
|
||||
return dst, src, fmt.Errorf("cannot unmarshal block type from empty src")
|
||||
}
|
||||
blockType := src[0]
|
||||
src = src[1:]
|
||||
switch blockType {
|
||||
case marshalBytesTypePlain:
|
||||
// Plain block
|
||||
|
||||
// Read block length
|
||||
if len(src) < 1 {
|
||||
return dst, src, fmt.Errorf("cannot unmarshal plain block size from empty src")
|
||||
}
|
||||
blockLen := int(src[0])
|
||||
src = src[1:]
|
||||
if len(src) < blockLen {
|
||||
return dst, src, fmt.Errorf("cannot read plain block with the size %d bytes from %b bytes", blockLen, len(src))
|
||||
}
|
||||
|
||||
// Copy the block to dst
|
||||
dst = append(dst, src[:blockLen]...)
|
||||
src = src[blockLen:]
|
||||
return dst, src, nil
|
||||
case marshalBytesTypeZSTD:
|
||||
// Compressed block
|
||||
|
||||
// Read block length
|
||||
tail, blockLen, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return dst, src, fmt.Errorf("cannot unmarshal compressed block size: %w", err)
|
||||
}
|
||||
src = tail
|
||||
if uint64(len(src)) < blockLen {
|
||||
return dst, src, fmt.Errorf("cannot read compressed block with the size %d bytes from %d bytes", blockLen, len(src))
|
||||
}
|
||||
compressedBlock := src[:blockLen]
|
||||
src = src[blockLen:]
|
||||
|
||||
// Decompress the block
|
||||
bb := bbPool.Get()
|
||||
bb.B, err = encoding.DecompressZSTD(bb.B[:0], compressedBlock)
|
||||
if err != nil {
|
||||
return dst, src, fmt.Errorf("cannot decompress block: %w", err)
|
||||
}
|
||||
|
||||
// Copy the decompressed block to dst.
|
||||
dst = append(dst, bb.B...)
|
||||
bbPool.Put(bb)
|
||||
return dst, src, nil
|
||||
default:
|
||||
return dst, src, fmt.Errorf("unexpected block type: %d; supported types: 0, 1", blockType)
|
||||
}
|
||||
}
|
||||
|
||||
var bbPool bytesutil.ByteBufferPool
|
||||
|
||||
// getStringsBlockUnmarshaler returns stringsBlockUnmarshaler from the pool.
|
||||
//
|
||||
// Return back the stringsBlockUnmarshaler to the pool by calling putStringsBlockUnmarshaler().
|
||||
func getStringsBlockUnmarshaler() *stringsBlockUnmarshaler {
|
||||
v := sbuPool.Get()
|
||||
if v == nil {
|
||||
return &stringsBlockUnmarshaler{}
|
||||
}
|
||||
return v.(*stringsBlockUnmarshaler)
|
||||
}
|
||||
|
||||
// putStringsBlockUnmarshaler returns back sbu to the pool.
|
||||
//
|
||||
// sbu mustn't be used after returning to the pool.
|
||||
func putStringsBlockUnmarshaler(sbu *stringsBlockUnmarshaler) {
|
||||
sbu.reset()
|
||||
sbuPool.Put(sbu)
|
||||
}
|
||||
|
||||
var sbuPool sync.Pool
|
86
lib/logstorage/encoding_test.go
Normal file
@@ -0,0 +1,86 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMarshalUnmarshalStringsBlock(t *testing.T) {
|
||||
f := func(logs string, blockLenExpected int) {
|
||||
t.Helper()
|
||||
var a []string
|
||||
if logs != "" {
|
||||
a = strings.Split(logs, "\n")
|
||||
}
|
||||
data := marshalStringsBlock(nil, a)
|
||||
if len(data) != blockLenExpected {
|
||||
t.Fatalf("unexpected block length; got %d; want %d; block=%q", len(data), blockLenExpected, data)
|
||||
}
|
||||
sbu := getStringsBlockUnmarshaler()
|
||||
values, err := sbu.unmarshal(nil, data, uint64(len(a)))
|
||||
if err != nil {
|
||||
t.Fatalf("cannot unmarshal strings block: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(values, a) {
|
||||
t.Fatalf("unexpected strings after unmarshaling;\ngot\n%q\nwant\n%q", values, a)
|
||||
}
|
||||
putStringsBlockUnmarshaler(sbu)
|
||||
}
|
||||
f("", 5)
|
||||
f("foo", 9)
|
||||
f(`foo
|
||||
bar
|
||||
baz
|
||||
`, 18)
|
||||
f(`
|
||||
Apr 28 13:39:06 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
|
||||
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
|
||||
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
|
||||
Apr 28 13:40:05 localhost kernel: [35544.823503] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 1 (2437/0 MHz)
|
||||
Apr 28 13:40:15 localhost kernel: [35554.295612] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 2 (2447/0 MHz)
|
||||
Apr 28 13:43:37 localhost NetworkManager[1516]: <info> [1651142617.3668] manager: NetworkManager state is now CONNECTED_GLOBAL
|
||||
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Activating via systemd: service name='org.freedesktop.nm_dispatcher' unit='dbus-org.freedesktop.nm-dispatcher.service' requested by ':1.13' (uid=0 pid=1516 comm="/usr/sbin/NetworkManager --no-daemon " label="unconfined")
|
||||
Apr 28 13:43:37 localhost systemd[1]: Starting Network Manager Script Dispatcher Service...
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] The default IPv4 route is: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Not a paid data plan: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Found usable connection: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Successfully activated service 'org.freedesktop.nm_dispatcher'
|
||||
Apr 28 13:43:37 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
|
||||
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
|
||||
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
|
||||
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
|
||||
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497807] CPU2: Core temperature above threshold, cpu clock throttled (total events = 22034)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497809] CPU1: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU3: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU2: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497812] CPU0: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU2: Core temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU0: Core temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499856] CPU1: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499857] CPU3: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499858] CPU0: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499859] CPU2: Package temperature/speed normal
|
||||
`, 951)
|
||||
|
||||
// Generate a string longer than 1<<16 bytes
|
||||
s := "foo"
|
||||
for len(s) < (1 << 16) {
|
||||
s += s
|
||||
}
|
||||
s += "\n"
|
||||
lines := s
|
||||
f(lines, 36)
|
||||
lines += s
|
||||
f(lines, 52)
|
||||
|
||||
// Generate more than 256 strings
|
||||
lines = ""
|
||||
for i := 0; i < 1000; i++ {
|
||||
lines += fmt.Sprintf("line %d\n", i)
|
||||
}
|
||||
f(lines, 766)
|
||||
}
|
73
lib/logstorage/encoding_timing_test.go
Normal file
@@ -0,0 +1,73 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkMarshalStringsBlock(b *testing.B) {
|
||||
block := strings.Split(benchLogs, "\n")
|
||||
|
||||
b.SetBytes(int64(len(benchLogs)))
|
||||
b.ReportAllocs()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
var buf []byte
|
||||
for pb.Next() {
|
||||
buf = marshalStringsBlock(buf[:0], block)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func BenchmarkStringsBlockUnmarshaler_Unmarshal(b *testing.B) {
|
||||
block := strings.Split(benchLogs, "\n")
|
||||
data := marshalStringsBlock(nil, block)
|
||||
|
||||
b.SetBytes(int64(len(benchLogs)))
|
||||
b.ReportAllocs()
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
sbu := getStringsBlockUnmarshaler()
|
||||
var values []string
|
||||
for pb.Next() {
|
||||
var err error
|
||||
values, err = sbu.unmarshal(values[:0], data, uint64(len(block)))
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("unexpected error: %w", err))
|
||||
}
|
||||
sbu.reset()
|
||||
}
|
||||
putStringsBlockUnmarshaler(sbu)
|
||||
})
|
||||
}
|
||||
|
||||
const benchLogs = `
|
||||
Apr 28 13:39:06 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
|
||||
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
|
||||
Apr 28 13:39:06 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
|
||||
Apr 28 13:40:05 localhost kernel: [35544.823503] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 1 (2437/0 MHz)
|
||||
Apr 28 13:40:15 localhost kernel: [35554.295612] wlp4s0: AP c8:ea:f8:00:6a:31 changed bandwidth, new config is 2437 MHz, width 2 (2447/0 MHz)
|
||||
Apr 28 13:43:37 localhost NetworkManager[1516]: <info> [1651142617.3668] manager: NetworkManager state is now CONNECTED_GLOBAL
|
||||
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Activating via systemd: service name='org.freedesktop.nm_dispatcher' unit='dbus-org.freedesktop.nm-dispatcher.service' requested by ':1.13' (uid=0 pid=1516 comm="/usr/sbin/NetworkManager --no-daemon " label="unconfined")
|
||||
Apr 28 13:43:37 localhost systemd[1]: Starting Network Manager Script Dispatcher Service...
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] The default IPv4 route is: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Not a paid data plan: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost whoopsie[2812]: [13:43:37] Found usable connection: /org/freedesktop/NetworkManager/ActiveConnection/10
|
||||
Apr 28 13:43:37 localhost dbus-daemon[1475]: [system] Successfully activated service 'org.freedesktop.nm_dispatcher'
|
||||
Apr 28 13:43:37 localhost systemd[1]: Started Network Manager Script Dispatcher Service.
|
||||
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': new request (2 scripts)
|
||||
Apr 28 13:43:37 localhost nm-dispatcher: req:1 'connectivity-change': start running ordered scripts...
|
||||
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
|
||||
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497807] CPU2: Core temperature above threshold, cpu clock throttled (total events = 22034)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497809] CPU1: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU3: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497810] CPU2: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.497812] CPU0: Package temperature above threshold, cpu clock throttled (total events = 27400)
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU2: Core temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499855] CPU0: Core temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499856] CPU1: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499857] CPU3: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499858] CPU0: Package temperature/speed normal
|
||||
Apr 28 13:48:01 localhost kernel: [36020.499859] CPU2: Package temperature/speed normal
|
||||
`
|
22
lib/logstorage/filenames.go
Normal file
@@ -0,0 +1,22 @@
package logstorage

const (
	metaindexFilename     = "metaindex.bin"
	indexFilename         = "index.bin"
	columnsHeaderFilename = "columns_header.bin"
	timestampsFilename    = "timestamps.bin"
	fieldValuesFilename   = "field_values.bin"
	fieldBloomFilename    = "field_bloom.bin"
	messageValuesFilename = "message_values.bin"
	messageBloomFilename  = "message_bloom.bin"

	metadataFilename = "metadata.json"
	partsFilename    = "parts.json"

	streamIDCacheFilename = "stream_id.bin"

	indexdbDirname    = "indexdb"
	datadbDirname     = "datadb"
	cacheDirname      = "cache"
	partitionsDirname = "partitions"
)
3053
lib/logstorage/filters.go
Normal file
File diff suppressed because it is too large
9296
lib/logstorage/filters_test.go
Normal file
File diff suppressed because it is too large
38
lib/logstorage/hash128.go
Normal file
@@ -0,0 +1,38 @@
package logstorage

import (
	"sync"

	"github.com/cespare/xxhash/v2"
)

func hash128(data []byte) u128 {
	h := getHasher()
	_, _ = h.Write(data)
	hi := h.Sum64()
	_, _ = h.Write(magicSuffixForHash)
	lo := h.Sum64()
	putHasher(h)

	return u128{
		hi: hi,
		lo: lo,
	}
}

var magicSuffixForHash = []byte("magic!")

func getHasher() *xxhash.Digest {
	v := hasherPool.Get()
	if v == nil {
		return xxhash.New()
	}
	return v.(*xxhash.Digest)
}

func putHasher(h *xxhash.Digest) {
	h.Reset()
	hasherPool.Put(h)
}

var hasherPool sync.Pool
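// NOTE (editor's sketch, not part of the original diff): hash128 derives the two
// 64-bit halves from a single streaming xxhash digest - hi is the hash of data,
// while lo is the hash of data followed by magicSuffixForHash, so the halves differ
// even though only one hasher is used:
//
//	h := hash128([]byte("abc"))
//	// h.hi and h.lo together form the 128-bit hash (see hash128_test.go for
//	// the exact expected values).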
24
lib/logstorage/hash128_test.go
Normal file
@@ -0,0 +1,24 @@
package logstorage
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHash128(t *testing.T) {
|
||||
f := func(data string, hashExpected u128) {
|
||||
t.Helper()
|
||||
h := hash128([]byte(data))
|
||||
if !h.equal(&hashExpected) {
|
||||
t.Fatalf("unexpected hash; got %s; want %s", &h, &hashExpected)
|
||||
}
|
||||
}
|
||||
f("", u128{
|
||||
hi: 17241709254077376921,
|
||||
lo: 13138662262368978769,
|
||||
})
|
||||
|
||||
f("abc", u128{
|
||||
hi: 4952883123889572249,
|
||||
lo: 3255951525518405514,
|
||||
})
|
||||
}
|
29
lib/logstorage/hash128_timing_test.go
Normal file
@@ -0,0 +1,29 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkHash128(b *testing.B) {
|
||||
a := make([][]byte, 100)
|
||||
for i := range a {
|
||||
a[i] = []byte(fmt.Sprintf("some string %d", i))
|
||||
}
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(len(a)))
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
var n uint64
|
||||
for pb.Next() {
|
||||
for _, b := range a {
|
||||
h := hash128(b)
|
||||
n += h.hi
|
||||
n += h.lo
|
||||
}
|
||||
}
|
||||
atomic.AddUint64(&GlobalSinkU64, n)
|
||||
})
|
||||
}
|
||||
|
||||
var GlobalSinkU64 uint64
|
164
lib/logstorage/index_block_header.go
Normal file
@@ -0,0 +1,164 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// indexBlockHeader contains index information about multiple blocks.
|
||||
//
|
||||
// It allows locating the block by streamID and/or by time range.
|
||||
type indexBlockHeader struct {
|
||||
// streamID is the minimum streamID covered by the indexBlockHeader
|
||||
streamID streamID
|
||||
|
||||
// minTimestamp is the minimum timestamp seen across blocks covered by the indexBlockHeader
|
||||
minTimestamp int64
|
||||
|
||||
// maxTimestamp is the maximum timestamp seen across blocks covered by the indexBlockHeader
|
||||
maxTimestamp int64
|
||||
|
||||
// indexBlockOffset is an offset of the linked index block at indexFilename
|
||||
indexBlockOffset uint64
|
||||
|
||||
// indexBlockSize is the size of the linked index block at indexFilename
|
||||
indexBlockSize uint64
|
||||
}
|
||||
|
||||
// reset resets ih for subsequent re-use.
|
||||
func (ih *indexBlockHeader) reset() {
|
||||
ih.streamID.reset()
|
||||
ih.minTimestamp = 0
|
||||
ih.maxTimestamp = 0
|
||||
ih.indexBlockOffset = 0
|
||||
ih.indexBlockSize = 0
|
||||
}
|
||||
|
||||
// mustWriteIndexBlock writes data with the given additional args to sw and updates ih accordingly.
|
||||
func (ih *indexBlockHeader) mustWriteIndexBlock(data []byte, sidFirst streamID, minTimestamp, maxTimestamp int64, sw *streamWriters) {
|
||||
ih.streamID = sidFirst
|
||||
ih.minTimestamp = minTimestamp
|
||||
ih.maxTimestamp = maxTimestamp
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B = encoding.CompressZSTDLevel(bb.B[:0], data, 1)
|
||||
ih.indexBlockOffset = sw.indexWriter.bytesWritten
|
||||
ih.indexBlockSize = uint64(len(bb.B))
|
||||
sw.indexWriter.MustWrite(bb.B)
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
|
||||
// mustReadNextIndexBlock reads the next index block associated with ih from sr, appends it to dst and returns the result.
|
||||
func (ih *indexBlockHeader) mustReadNextIndexBlock(dst []byte, sr *streamReaders) []byte {
|
||||
indexReader := &sr.indexReader
|
||||
|
||||
indexBlockSize := ih.indexBlockSize
|
||||
if indexBlockSize > maxIndexBlockSize {
|
||||
logger.Panicf("FATAL: %s: indexBlockHeader.indexBlockSize=%d cannot exceed %d bytes", indexReader.Path(), indexBlockSize, maxIndexBlockSize)
|
||||
}
|
||||
if ih.indexBlockOffset != indexReader.bytesRead {
|
||||
logger.Panicf("FATAL: %s: indexBlockHeader.indexBlockOffset=%d must equal to %d", indexReader.Path(), ih.indexBlockOffset, indexReader.bytesRead)
|
||||
}
|
||||
bbCompressed := longTermBufPool.Get()
|
||||
bbCompressed.B = bytesutil.ResizeNoCopyMayOverallocate(bbCompressed.B, int(indexBlockSize))
|
||||
indexReader.MustReadFull(bbCompressed.B)
|
||||
|
||||
// Decompress bbCompressed to dst
|
||||
var err error
|
||||
dst, err = encoding.DecompressZSTD(dst, bbCompressed.B)
|
||||
longTermBufPool.Put(bbCompressed)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: %s: cannot decompress indexBlock read at offset %d with size %d: %s", indexReader.Path(), ih.indexBlockOffset, indexBlockSize, err)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// marshal appends marshaled ih to dst and returns the result.
|
||||
func (ih *indexBlockHeader) marshal(dst []byte) []byte {
|
||||
dst = ih.streamID.marshal(dst)
|
||||
dst = encoding.MarshalUint64(dst, uint64(ih.minTimestamp))
|
||||
dst = encoding.MarshalUint64(dst, uint64(ih.maxTimestamp))
|
||||
dst = encoding.MarshalUint64(dst, ih.indexBlockOffset)
|
||||
dst = encoding.MarshalUint64(dst, ih.indexBlockSize)
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshal unmarshals ih from src and returns the tail left.
|
||||
func (ih *indexBlockHeader) unmarshal(src []byte) ([]byte, error) {
|
||||
srcOrig := src
|
||||
|
||||
// unmarshal ih.streamID
|
||||
tail, err := ih.streamID.unmarshal(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal streamID: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
// unmarshal the rest of indexBlockHeader fields
|
||||
if len(src) < 32 {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal indexBlockHeader from %d bytes; need at least 32 bytes", len(src))
|
||||
}
|
||||
ih.minTimestamp = int64(encoding.UnmarshalUint64(src))
|
||||
ih.maxTimestamp = int64(encoding.UnmarshalUint64(src[8:]))
|
||||
ih.indexBlockOffset = encoding.UnmarshalUint64(src[16:])
|
||||
ih.indexBlockSize = encoding.UnmarshalUint64(src[24:])
|
||||
|
||||
return src[32:], nil
|
||||
}
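// NOTE (editor's sketch, not part of the original diff): the marshaled indexBlockHeader
// has a fixed size of 56 bytes - 24 bytes for the streamID (TenantID plus the u128 id)
// followed by four 8-byte fields: minTimestamp, maxTimestamp, indexBlockOffset and
// indexBlockSize. This matches the sizes asserted in index_block_header_test.go:
//
//	var ih indexBlockHeader
//	data := ih.marshal(nil) // len(data) == 56
//	var ih2 indexBlockHeader
//	tail, err := ih2.unmarshal(data)
//	// err == nil, len(tail) == 0 and ih2 is a copy of ih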
|
||||
|
||||
// mustReadIndexBlockHeaders reads indexBlockHeader entries from r, appends them to dst and returns the result.
|
||||
func mustReadIndexBlockHeaders(dst []indexBlockHeader, r *readerWithStats) []indexBlockHeader {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot read indexBlockHeader entries from %s: %s", r.Path(), err)
|
||||
}
|
||||
|
||||
bb := longTermBufPool.Get()
|
||||
bb.B, err = encoding.DecompressZSTD(bb.B[:0], data)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot decompress indexBlockHeader entries from %s: %s", r.Path(), err)
|
||||
}
|
||||
dst, err = unmarshalIndexBlockHeaders(dst, bb.B)
|
||||
if len(bb.B) < 1024*1024 {
|
||||
longTermBufPool.Put(bb)
|
||||
}
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot parse indexBlockHeader entries from %s: %s", r.Path(), err)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// unmarshalIndexBlockHeaders appends unmarshaled from src indexBlockHeader entries to dst and returns the result.
|
||||
func unmarshalIndexBlockHeaders(dst []indexBlockHeader, src []byte) ([]indexBlockHeader, error) {
|
||||
dstOrig := dst
|
||||
for len(src) > 0 {
|
||||
if len(dst) < cap(dst) {
|
||||
dst = dst[:len(dst)+1]
|
||||
} else {
|
||||
dst = append(dst, indexBlockHeader{})
|
||||
}
|
||||
ih := &dst[len(dst)-1]
|
||||
tail, err := ih.unmarshal(src)
|
||||
if err != nil {
|
||||
return dstOrig, fmt.Errorf("cannot unmarshal indexBlockHeader %d: %w", len(dst)-len(dstOrig), err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
if err := validateIndexBlockHeaders(dst[len(dstOrig):]); err != nil {
|
||||
return dstOrig, err
|
||||
}
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
func validateIndexBlockHeaders(ihs []indexBlockHeader) error {
|
||||
for i := 1; i < len(ihs); i++ {
|
||||
if ihs[i].streamID.less(&ihs[i-1].streamID) {
|
||||
return fmt.Errorf("unexpected indexBlockHeader with smaller streamID=%s after bigger streamID=%s", &ihs[i].streamID, &ihs[i-1].streamID)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
138
lib/logstorage/index_block_header_test.go
Normal file
@@ -0,0 +1,138 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestIndexBlockHeaderMarshalUnmarshal(t *testing.T) {
|
||||
f := func(ih *indexBlockHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := ih.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected marshaled length of indexBlockHeader; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
var ih2 indexBlockHeader
|
||||
tail, err := ih2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot unmarshal indexBlockHeader: %s", err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
t.Fatalf("unexpected non-empty tail left after unmarshaling indexBlockHeader: %X", tail)
|
||||
}
|
||||
if !reflect.DeepEqual(ih, &ih2) {
|
||||
t.Fatalf("unexpected unmarshaled indexBlockHeader\ngot\n%v\nwant\n%v", &ih2, ih)
|
||||
}
|
||||
}
|
||||
f(&indexBlockHeader{}, 56)
|
||||
f(&indexBlockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
hi: 214,
|
||||
lo: 2111,
|
||||
},
|
||||
},
|
||||
minTimestamp: 1234,
|
||||
maxTimestamp: 898943,
|
||||
indexBlockOffset: 234,
|
||||
indexBlockSize: 898,
|
||||
}, 56)
|
||||
}
|
||||
|
||||
func TestIndexBlockHeaderUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var ih indexBlockHeader
|
||||
tail, err := ih.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
|
||||
ih := &indexBlockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
hi: 214,
|
||||
lo: 2111,
|
||||
},
|
||||
},
|
||||
minTimestamp: 1234,
|
||||
maxTimestamp: 898943,
|
||||
indexBlockOffset: 234,
|
||||
indexBlockSize: 898,
|
||||
}
|
||||
data := ih.marshal(nil)
|
||||
for len(data) > 0 {
|
||||
data = data[:len(data)-1]
|
||||
f(data)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndexBlockHeaderReset(t *testing.T) {
|
||||
ih := &indexBlockHeader{
|
||||
streamID: streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
hi: 214,
|
||||
lo: 2111,
|
||||
},
|
||||
},
|
||||
minTimestamp: 1234,
|
||||
maxTimestamp: 898943,
|
||||
indexBlockOffset: 234,
|
||||
indexBlockSize: 898,
|
||||
}
|
||||
ih.reset()
|
||||
ihZero := &indexBlockHeader{}
|
||||
if !reflect.DeepEqual(ih, ihZero) {
|
||||
t.Fatalf("unexpected non-zero indexBlockHeader after reset: %v", ih)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarshalUnmarshalIndexBlockHeaders(t *testing.T) {
|
||||
f := func(ihs []indexBlockHeader, marshaledLen int) {
|
||||
t.Helper()
|
||||
var data []byte
|
||||
for i := range ihs {
|
||||
data = ihs[i].marshal(data)
|
||||
}
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected marshaled length for indexBlockHeader entries; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
ihs2, err := unmarshalIndexBlockHeaders(nil, data)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot unmarshal indexBlockHeader entries: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(ihs, ihs2) {
|
||||
t.Fatalf("unexpected indexBlockHeader entries after unmarshaling\ngot\n%v\nwant\n%v", ihs2, ihs)
|
||||
}
|
||||
}
|
||||
f(nil, 0)
|
||||
f([]indexBlockHeader{{}}, 56)
|
||||
f([]indexBlockHeader{
|
||||
{
|
||||
indexBlockOffset: 234,
|
||||
indexBlockSize: 5432,
|
||||
},
|
||||
{
|
||||
minTimestamp: -123,
|
||||
},
|
||||
}, 112)
|
||||
}
|
900
lib/logstorage/indexdb.go
Normal file
@@ -0,0 +1,900 @@
package logstorage
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/mergeset"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
|
||||
)
|
||||
|
||||
const (
|
||||
// (tenantID:streamID) entries have this prefix
|
||||
//
|
||||
// These entries are used for detecting whether the given stream is already registered
|
||||
nsPrefixStreamID = 0
|
||||
|
||||
// (tenantID:streamID -> streamTagsCanonical) entries have this prefix
|
||||
nsPrefixStreamIDToStreamTags = 1
|
||||
|
||||
// (tenantID:name:value => streamIDs) entries have this prefix
|
||||
nsPrefixTagToStreamIDs = 2
|
||||
)
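// NOTE (editor's sketch, not part of the original diff): every indexdb row starts with
// a common prefix built from one of the namespace prefixes above plus the tenantID
// (see the marshalCommonPrefix calls below). Roughly:
//
//	nsPrefixStreamID:             <prefix(0, tenantID)> <streamID.id>
//	nsPrefixStreamIDToStreamTags: <prefix(1, tenantID)> <streamID.id> <streamTagsCanonical>
//	nsPrefixTagToStreamIDs:       <prefix(2, tenantID)> <tagName> <tagValue> <streamID.id>...
//
// This is the layout written by mustRegisterStream() and scanned by the
// getStreamIDsFor*() helpers below.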
|
||||
|
||||
// IndexdbStats contains indexdb stats
|
||||
type IndexdbStats struct {
|
||||
// StreamsCreatedTotal is the number of log streams created since the indexdb initialization.
|
||||
StreamsCreatedTotal uint64
|
||||
}
|
||||
|
||||
type indexdb struct {
|
||||
// streamsCreatedTotal is the number of log streams created since the indexdb initialization.
|
||||
streamsCreatedTotal uint64
|
||||
|
||||
// path is the path to indexdb
|
||||
path string
|
||||
|
||||
// partitionName is the name of the partition for the indexdb.
|
||||
partitionName string
|
||||
|
||||
// tb is the storage for indexdb
|
||||
tb *mergeset.Table
|
||||
|
||||
// indexSearchPool is a pool of indexSearch struct for the given indexdb
|
||||
indexSearchPool sync.Pool
|
||||
|
||||
// the generation of the streamFilterCache.
|
||||
// It is updated each time new item is added to tb.
|
||||
streamFilterCacheGeneration uint32
|
||||
|
||||
// s is the storage where indexdb belongs to.
|
||||
s *Storage
|
||||
}
|
||||
|
||||
func mustCreateIndexdb(path string) {
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
}
|
||||
|
||||
func mustOpenIndexdb(path, partitionName string, s *Storage) *indexdb {
|
||||
idb := &indexdb{
|
||||
path: path,
|
||||
partitionName: partitionName,
|
||||
s: s,
|
||||
}
|
||||
isReadOnly := uint32(0)
|
||||
idb.tb = mergeset.MustOpenTable(path, idb.invalidateStreamFilterCache, mergeTagToStreamIDsRows, &isReadOnly)
|
||||
return idb
|
||||
}
|
||||
|
||||
func mustCloseIndexdb(idb *indexdb) {
|
||||
idb.tb.MustClose()
|
||||
idb.tb = nil
|
||||
idb.s = nil
|
||||
idb.partitionName = ""
|
||||
idb.path = ""
|
||||
}
|
||||
|
||||
func (idb *indexdb) debugFlush() {
|
||||
idb.tb.DebugFlush()
|
||||
}
|
||||
|
||||
func (idb *indexdb) updateStats(d *IndexdbStats) {
|
||||
d.StreamsCreatedTotal += atomic.LoadUint64(&idb.streamsCreatedTotal)
|
||||
}
|
||||
|
||||
func (idb *indexdb) appendStreamTagsByStreamID(dst []byte, sid *streamID) []byte {
|
||||
is := idb.getIndexSearch()
|
||||
defer idb.putIndexSearch(is)
|
||||
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
|
||||
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixStreamIDToStreamTags, sid.tenantID)
|
||||
kb.B = sid.id.marshal(kb.B)
|
||||
|
||||
if err := ts.FirstItemWithPrefix(kb.B); err != nil {
|
||||
if err == io.EOF {
|
||||
return dst
|
||||
}
|
||||
logger.Panicf("FATAL: unexpected error when searching for StreamTags by streamID=%s in indexdb: %s", sid, err)
|
||||
}
|
||||
data := ts.Item[len(kb.B):]
|
||||
dst = append(dst, data...)
|
||||
return dst
|
||||
}
|
||||
|
||||
// hasStreamID returns true if streamID exists in idb
|
||||
func (idb *indexdb) hasStreamID(sid *streamID) bool {
|
||||
is := idb.getIndexSearch()
|
||||
defer idb.putIndexSearch(is)
|
||||
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
|
||||
kb.B = marshalCommonPrefix(kb.B, nsPrefixStreamID, sid.tenantID)
|
||||
kb.B = sid.id.marshal(kb.B)
|
||||
|
||||
if err := ts.FirstItemWithPrefix(kb.B); err != nil {
|
||||
if err == io.EOF {
|
||||
return false
|
||||
}
|
||||
logger.Panicf("FATAL: unexpected error when searching for streamID=%s in indexdb: %s", sid, err)
|
||||
}
|
||||
return len(kb.B) == len(ts.Item)
|
||||
}
|
||||
|
||||
type indexSearch struct {
|
||||
idb *indexdb
|
||||
ts mergeset.TableSearch
|
||||
kb bytesutil.ByteBuffer
|
||||
}
|
||||
|
||||
func (idb *indexdb) getIndexSearch() *indexSearch {
|
||||
v := idb.indexSearchPool.Get()
|
||||
if v == nil {
|
||||
v = &indexSearch{
|
||||
idb: idb,
|
||||
}
|
||||
}
|
||||
is := v.(*indexSearch)
|
||||
is.ts.Init(idb.tb)
|
||||
return is
|
||||
}
|
||||
|
||||
func (idb *indexdb) putIndexSearch(is *indexSearch) {
|
||||
is.idb = nil
|
||||
is.ts.MustClose()
|
||||
is.kb.Reset()
|
||||
|
||||
idb.indexSearchPool.Put(is)
|
||||
}
|
||||
|
||||
// searchStreamIDs returns streamIDs for the given tenantIDs and the given stream filters
|
||||
func (idb *indexdb) searchStreamIDs(tenantIDs []TenantID, sf *StreamFilter) []streamID {
|
||||
// Try obtaining streamIDs from cache
|
||||
streamIDs, ok := idb.loadStreamIDsFromCache(tenantIDs, sf)
|
||||
if ok {
|
||||
// Fast path - streamIDs found in the cache.
|
||||
return streamIDs
|
||||
}
|
||||
|
||||
// Slow path - collect streamIDs from indexdb.
|
||||
|
||||
// Collect streamIDs for all the specified tenantIDs.
|
||||
is := idb.getIndexSearch()
|
||||
m := make(map[streamID]struct{})
|
||||
for _, tenantID := range tenantIDs {
|
||||
for _, asf := range sf.orFilters {
|
||||
is.updateStreamIDs(m, tenantID, asf)
|
||||
}
|
||||
}
|
||||
idb.putIndexSearch(is)
|
||||
|
||||
// Convert the collected streamIDs from m to sorted slice.
|
||||
streamIDs = make([]streamID, 0, len(m))
|
||||
for streamID := range m {
|
||||
streamIDs = append(streamIDs, streamID)
|
||||
}
|
||||
sortStreamIDs(streamIDs)
|
||||
|
||||
// Store the collected streamIDs to cache.
|
||||
idb.storeStreamIDsToCache(tenantIDs, sf, streamIDs)
|
||||
|
||||
return streamIDs
|
||||
}
|
||||
|
||||
func sortStreamIDs(streamIDs []streamID) {
|
||||
sort.Slice(streamIDs, func(i, j int) bool {
|
||||
return streamIDs[i].less(&streamIDs[j])
|
||||
})
|
||||
}
|
||||
|
||||
func (is *indexSearch) updateStreamIDs(dst map[streamID]struct{}, tenantID TenantID, asf *andStreamFilter) {
|
||||
var m map[u128]struct{}
|
||||
for _, tf := range asf.tagFilters {
|
||||
ids := is.getStreamIDsForTagFilter(tenantID, tf)
|
||||
if len(ids) == 0 {
|
||||
// There is no need in checking the remaining filters,
|
||||
// since the result will be empty in any case.
|
||||
return
|
||||
}
|
||||
if m == nil {
|
||||
m = ids
|
||||
} else {
|
||||
for id := range m {
|
||||
if _, ok := ids[id]; !ok {
|
||||
delete(m, id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var sid streamID
|
||||
for id := range m {
|
||||
sid.tenantID = tenantID
|
||||
sid.id = id
|
||||
dst[sid] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func (is *indexSearch) getStreamIDsForTagFilter(tenantID TenantID, tf *streamTagFilter) map[u128]struct{} {
|
||||
switch tf.op {
|
||||
case "=":
|
||||
if tf.value == "" {
|
||||
// (field="")
|
||||
return is.getStreamIDsForEmptyTagValue(tenantID, tf.tagName)
|
||||
}
|
||||
// (field="value")
|
||||
return is.getStreamIDsForNonEmptyTagValue(tenantID, tf.tagName, tf.value)
|
||||
case "!=":
|
||||
if tf.value == "" {
|
||||
// (field!="")
|
||||
return is.getStreamIDsForTagName(tenantID, tf.tagName)
|
||||
}
|
||||
// (field!="value") => (all and not field="value")
|
||||
ids := is.getStreamIDsForTenant(tenantID)
|
||||
idsForTag := is.getStreamIDsForNonEmptyTagValue(tenantID, tf.tagName, tf.value)
|
||||
for id := range idsForTag {
|
||||
delete(ids, id)
|
||||
}
|
||||
return ids
|
||||
case "=~":
|
||||
re := tf.getRegexp()
|
||||
if re.MatchString("") {
|
||||
// (field=~"|re") => (field="" or field=~"re")
|
||||
ids := is.getStreamIDsForEmptyTagValue(tenantID, tf.tagName)
|
||||
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
|
||||
for id := range idsForRe {
|
||||
ids[id] = struct{}{}
|
||||
}
|
||||
return ids
|
||||
}
|
||||
return is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
|
||||
case "!~":
|
||||
re := tf.getRegexp()
|
||||
if re.MatchString("") {
|
||||
// (field!~"|re") => (field!="" and not field=~"re")
|
||||
ids := is.getStreamIDsForTagName(tenantID, tf.tagName)
|
||||
if len(ids) == 0 {
|
||||
return ids
|
||||
}
|
||||
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
|
||||
for id := range idsForRe {
|
||||
delete(ids, id)
|
||||
}
|
||||
return ids
|
||||
}
|
||||
// (field!~"re") => (all and not field=~"re")
|
||||
ids := is.getStreamIDsForTenant(tenantID)
|
||||
idsForRe := is.getStreamIDsForTagRegexp(tenantID, tf.tagName, re)
|
||||
for id := range idsForRe {
|
||||
delete(ids, id)
|
||||
}
|
||||
return ids
|
||||
default:
|
||||
logger.Panicf("BUG: unexpected operation in stream tag filter: %q", tf.op)
|
||||
return nil
|
||||
}
|
||||
}
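// NOTE (editor's sketch, not part of the original diff): the switch above maps every
// stream tag filter to plain set operations over streamID sets:
//
//	field="value"  -> ids(field=value)
//	field=""       -> all(tenant) - ids(field)
//	field!="value" -> all(tenant) - ids(field=value)
//	field!=""      -> ids(field)
//	field=~"re"    -> ids(field matching re), plus ids(field="") when re matches ""
//	field!~"re"    -> all(tenant) - ids(field matching re); when re matches "",
//	                  ids(field) - ids(field matching re)
//
// where all(tenant) is getStreamIDsForTenant() and ids(...) come from the other helpers below.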
|
||||
|
||||
func (is *indexSearch) getStreamIDsForNonEmptyTagValue(tenantID TenantID, tagName, tagValue string) map[u128]struct{} {
|
||||
ids := make(map[u128]struct{})
|
||||
var sp tagToStreamIDsRowParser
|
||||
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
|
||||
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
|
||||
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagValue))
|
||||
prefix := kb.B
|
||||
ts.Seek(prefix)
|
||||
for ts.NextItem() {
|
||||
item := ts.Item
|
||||
if !bytes.HasPrefix(item, prefix) {
|
||||
break
|
||||
}
|
||||
tail := item[len(prefix):]
|
||||
sp.UpdateStreamIDs(ids, tail)
|
||||
}
|
||||
if err := ts.Error(); err != nil {
|
||||
logger.Panicf("FATAL: unexpected error: %s", err)
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func (is *indexSearch) getStreamIDsForEmptyTagValue(tenantID TenantID, tagName string) map[u128]struct{} {
|
||||
ids := is.getStreamIDsForTenant(tenantID)
|
||||
idsForTag := is.getStreamIDsForTagName(tenantID, tagName)
|
||||
for id := range idsForTag {
|
||||
delete(ids, id)
|
||||
}
|
||||
return ids
|
||||
}
|
||||
|
||||
func (is *indexSearch) getStreamIDsForTenant(tenantID TenantID) map[u128]struct{} {
|
||||
ids := make(map[u128]struct{})
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixStreamID, tenantID)
|
||||
prefix := kb.B
|
||||
ts.Seek(prefix)
|
||||
var id u128
|
||||
for ts.NextItem() {
|
||||
item := ts.Item
|
||||
if !bytes.HasPrefix(item, prefix) {
|
||||
break
|
||||
}
|
||||
tail, err := id.unmarshal(item[len(prefix):])
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot unmarshal streamID from (tenantID:streamID) entry: %s", err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
logger.Panicf("FATAL: unexpected non-empty tail left after unmarshaling streamID from (tenantID:streamID); tail len=%d", len(tail))
|
||||
}
|
||||
ids[id] = struct{}{}
|
||||
}
|
||||
if err := ts.Error(); err != nil {
|
||||
logger.Panicf("FATAL: unexpected error: %s", err)
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func (is *indexSearch) getStreamIDsForTagName(tenantID TenantID, tagName string) map[u128]struct{} {
|
||||
ids := make(map[u128]struct{})
|
||||
var sp tagToStreamIDsRowParser
|
||||
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
|
||||
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
|
||||
prefix := kb.B
|
||||
ts.Seek(prefix)
|
||||
for ts.NextItem() {
|
||||
item := ts.Item
|
||||
if !bytes.HasPrefix(item, prefix) {
|
||||
break
|
||||
}
|
||||
tail := item[len(prefix):]
|
||||
n := bytes.IndexByte(tail, tagSeparatorChar)
|
||||
if n < 0 {
|
||||
logger.Panicf("FATAL: cannot find the end of tag value")
|
||||
}
|
||||
tail = tail[n+1:]
|
||||
sp.UpdateStreamIDs(ids, tail)
|
||||
}
|
||||
if err := ts.Error(); err != nil {
|
||||
logger.Panicf("FATAL: unexpected error: %s", err)
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func (is *indexSearch) getStreamIDsForTagRegexp(tenantID TenantID, tagName string, re *regexutil.PromRegex) map[u128]struct{} {
|
||||
ids := make(map[u128]struct{})
|
||||
var sp tagToStreamIDsRowParser
|
||||
var tagValue, prevMatchingTagValue []byte
|
||||
var err error
|
||||
|
||||
ts := &is.ts
|
||||
kb := &is.kb
|
||||
kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToStreamIDs, tenantID)
|
||||
kb.B = marshalTagValue(kb.B, bytesutil.ToUnsafeBytes(tagName))
|
||||
prefix := kb.B
|
||||
ts.Seek(prefix)
|
||||
for ts.NextItem() {
|
||||
item := ts.Item
|
||||
if !bytes.HasPrefix(item, prefix) {
|
||||
break
|
||||
}
|
||||
tail := item[len(prefix):]
|
||||
tail, tagValue, err = unmarshalTagValue(tagValue[:0], tail)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot unmarshal tag value: %s", err)
|
||||
}
|
||||
if !bytes.Equal(tagValue, prevMatchingTagValue) {
|
||||
if !re.MatchString(bytesutil.ToUnsafeString(tagValue)) {
|
||||
continue
|
||||
}
|
||||
prevMatchingTagValue = append(prevMatchingTagValue[:0], tagValue...)
|
||||
}
|
||||
sp.UpdateStreamIDs(ids, tail)
|
||||
}
|
||||
if err := ts.Error(); err != nil {
|
||||
logger.Panicf("FATAL: unexpected error: %s", err)
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func (idb *indexdb) mustRegisterStream(streamID *streamID, streamTagsCanonical []byte) {
|
||||
st := GetStreamTags()
|
||||
mustUnmarshalStreamTags(st, streamTagsCanonical)
|
||||
tenantID := streamID.tenantID
|
||||
|
||||
bi := getBatchItems()
|
||||
buf := bi.buf[:0]
|
||||
items := bi.items[:0]
|
||||
|
||||
// Register tenantID:streamID entry.
|
||||
bufLen := len(buf)
|
||||
buf = marshalCommonPrefix(buf, nsPrefixStreamID, tenantID)
|
||||
buf = streamID.id.marshal(buf)
|
||||
items = append(items, buf[bufLen:])
|
||||
|
||||
// Register tenantID:streamID -> streamTagsCanonical entry.
|
||||
bufLen = len(buf)
|
||||
buf = marshalCommonPrefix(buf, nsPrefixStreamIDToStreamTags, tenantID)
|
||||
buf = streamID.id.marshal(buf)
|
||||
buf = append(buf, streamTagsCanonical...)
|
||||
items = append(items, buf[bufLen:])
|
||||
|
||||
// Register tenantID:name:value -> streamIDs entries.
|
||||
tags := st.tags
|
||||
for i := range tags {
|
||||
bufLen = len(buf)
|
||||
buf = marshalCommonPrefix(buf, nsPrefixTagToStreamIDs, tenantID)
|
||||
buf = tags[i].indexdbMarshal(buf)
|
||||
buf = streamID.id.marshal(buf)
|
||||
items = append(items, buf[bufLen:])
|
||||
}
|
||||
PutStreamTags(st)
|
||||
|
||||
// Add items to the storage
|
||||
idb.tb.AddItems(items)
|
||||
|
||||
bi.buf = buf
|
||||
bi.items = items
|
||||
putBatchItems(bi)
|
||||
|
||||
atomic.AddUint64(&idb.streamsCreatedTotal, 1)
|
||||
}
|
||||
|
||||
func (idb *indexdb) invalidateStreamFilterCache() {
|
||||
// This function must be fast, since it is called each
|
||||
// time a new indexdb entry is added.
|
||||
atomic.AddUint32(&idb.streamFilterCacheGeneration, 1)
|
||||
}
|
||||
|
||||
func (idb *indexdb) marshalStreamFilterCacheKey(dst []byte, tenantIDs []TenantID, sf *StreamFilter) []byte {
|
||||
dst = encoding.MarshalUint32(dst, idb.streamFilterCacheGeneration)
|
||||
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(idb.partitionName))
|
||||
dst = encoding.MarshalVarUint64(dst, uint64(len(tenantIDs)))
|
||||
for i := range tenantIDs {
|
||||
dst = tenantIDs[i].marshal(dst)
|
||||
}
|
||||
dst = sf.marshalForCacheKey(dst)
|
||||
return dst
|
||||
}
|
||||
|
||||
func (idb *indexdb) loadStreamIDsFromCache(tenantIDs []TenantID, sf *StreamFilter) ([]streamID, bool) {
|
||||
bb := bbPool.Get()
|
||||
bb.B = idb.marshalStreamFilterCacheKey(bb.B[:0], tenantIDs, sf)
|
||||
data := idb.s.streamFilterCache.GetBig(nil, bb.B)
|
||||
bbPool.Put(bb)
|
||||
if len(data) == 0 {
|
||||
// Cache miss
|
||||
return nil, false
|
||||
}
|
||||
// Cache hit - unpack streamIDs from data.
|
||||
tail, n, err := encoding.UnmarshalVarUint64(data)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: unexpected error when unmarshaling the number of streamIDs from cache: %s", err)
|
||||
}
|
||||
src := tail
|
||||
streamIDs := make([]streamID, n)
|
||||
for i := uint64(0); i < n; i++ {
|
||||
tail, err = streamIDs[i].unmarshal(src)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: unexpected error when unmarshaling streamID #%d: %s", i, err)
|
||||
}
|
||||
src = tail
|
||||
}
|
||||
if len(src) > 0 {
|
||||
logger.Panicf("BUG: unexpected non-empty tail left with len=%d", len(src))
|
||||
}
|
||||
return streamIDs, true
|
||||
}
|
||||
|
||||
func (idb *indexdb) storeStreamIDsToCache(tenantIDs []TenantID, sf *StreamFilter, streamIDs []streamID) {
|
||||
// marshal streamIDs
|
||||
var b []byte
|
||||
b = encoding.MarshalVarUint64(b, uint64(len(streamIDs)))
|
||||
for i := 0; i < len(streamIDs); i++ {
|
||||
b = streamIDs[i].marshal(b)
|
||||
}
|
||||
|
||||
// Store marshaled streamIDs to cache.
|
||||
bb := bbPool.Get()
|
||||
bb.B = idb.marshalStreamFilterCacheKey(bb.B[:0], tenantIDs, sf)
|
||||
idb.s.streamFilterCache.SetBig(bb.B, b)
|
||||
bbPool.Put(bb)
|
||||
}
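// NOTE (editor's sketch, not part of the original diff): the cached value is simply
//
//	varuint64(len(streamIDs)) followed by the marshaled streamID entries
//
// which is exactly what loadStreamIDsFromCache() above decodes. The cache key embeds
// streamFilterCacheGeneration, so bumping the generation in invalidateStreamFilterCache()
// implicitly invalidates every previously stored entry.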
|
||||
|
||||
type batchItems struct {
|
||||
buf []byte
|
||||
|
||||
items [][]byte
|
||||
}
|
||||
|
||||
func (bi *batchItems) reset() {
|
||||
bi.buf = bi.buf[:0]
|
||||
|
||||
items := bi.items
|
||||
for i := range items {
|
||||
items[i] = nil
|
||||
}
|
||||
bi.items = items[:0]
|
||||
}
|
||||
|
||||
func getBatchItems() *batchItems {
|
||||
v := batchItemsPool.Get()
|
||||
if v == nil {
|
||||
return &batchItems{}
|
||||
}
|
||||
return v.(*batchItems)
|
||||
}
|
||||
|
||||
func putBatchItems(bi *batchItems) {
|
||||
bi.reset()
|
||||
batchItemsPool.Put(bi)
|
||||
}
|
||||
|
||||
var batchItemsPool sync.Pool
|
||||
|
||||
func mergeTagToStreamIDsRows(data []byte, items []mergeset.Item) ([]byte, []mergeset.Item) {
|
||||
// Perform quick checks whether items contain rows starting from nsPrefixTagToStreamIDs
|
||||
// based on the fact that items are sorted.
|
||||
if len(items) <= 2 {
|
||||
// The first and the last row must remain unchanged.
|
||||
return data, items
|
||||
}
|
||||
firstItem := items[0].Bytes(data)
|
||||
if len(firstItem) > 0 && firstItem[0] > nsPrefixTagToStreamIDs {
|
||||
return data, items
|
||||
}
|
||||
lastItem := items[len(items)-1].Bytes(data)
|
||||
if len(lastItem) > 0 && lastItem[0] < nsPrefixTagToStreamIDs {
|
||||
return data, items
|
||||
}
|
||||
|
||||
// items contain at least one row starting from nsPrefixTagToStreamIDs. Merge rows with common tag.
|
||||
tsm := getTagToStreamIDsRowsMerger()
|
||||
tsm.dataCopy = append(tsm.dataCopy[:0], data...)
|
||||
tsm.itemsCopy = append(tsm.itemsCopy[:0], items...)
|
||||
sp := &tsm.sp
|
||||
spPrev := &tsm.spPrev
|
||||
dstData := data[:0]
|
||||
dstItems := items[:0]
|
||||
for i, it := range items {
|
||||
item := it.Bytes(data)
|
||||
if len(item) == 0 || item[0] != nsPrefixTagToStreamIDs || i == 0 || i == len(items)-1 {
|
||||
// Write rows not starting with nsPrefixTagToStreamIDs as-is.
|
||||
// Additionally write the first and the last row as-is in order to preserve
|
||||
// sort order for adjacent blocks.
|
||||
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
|
||||
dstData = append(dstData, item...)
|
||||
dstItems = append(dstItems, mergeset.Item{
|
||||
Start: uint32(len(dstData) - len(item)),
|
||||
End: uint32(len(dstData)),
|
||||
})
|
||||
continue
|
||||
}
|
||||
if err := sp.Init(item); err != nil {
|
||||
logger.Panicf("FATAL: cannot parse row during merge: %s", err)
|
||||
}
|
||||
if sp.StreamIDsLen() >= maxStreamIDsPerRow {
|
||||
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
|
||||
dstData = append(dstData, item...)
|
||||
dstItems = append(dstItems, mergeset.Item{
|
||||
Start: uint32(len(dstData) - len(item)),
|
||||
End: uint32(len(dstData)),
|
||||
})
|
||||
continue
|
||||
}
|
||||
if !sp.EqualPrefix(spPrev) {
|
||||
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
|
||||
}
|
||||
sp.ParseStreamIDs()
|
||||
tsm.pendingStreamIDs = append(tsm.pendingStreamIDs, sp.StreamIDs...)
|
||||
spPrev, sp = sp, spPrev
|
||||
if len(tsm.pendingStreamIDs) >= maxStreamIDsPerRow {
|
||||
dstData, dstItems = tsm.flushPendingStreamIDs(dstData, dstItems, spPrev)
|
||||
}
|
||||
}
|
||||
if len(tsm.pendingStreamIDs) > 0 {
|
||||
logger.Panicf("BUG: tsm.pendingStreamIDs must be empty at this point; got %d items", len(tsm.pendingStreamIDs))
|
||||
}
|
||||
if !checkItemsSorted(dstData, dstItems) {
|
||||
// Items could become unsorted if initial items contain duplicate streamIDs:
|
||||
//
|
||||
// item1: 1, 1, 5
|
||||
// item2: 1, 4
|
||||
//
|
||||
// Items could become the following after the merge:
|
||||
//
|
||||
// item1: 1, 5
|
||||
// item2: 1, 4
|
||||
//
|
||||
// i.e. item1 > item2
|
||||
//
|
||||
// Leave the original items unmerged, so they can be merged next time.
|
||||
// This case should be quite rare - if multiple log rows are simultaneously inserted
// into the same new log stream from multiple concurrent goroutines.
|
||||
dstData = append(dstData[:0], tsm.dataCopy...)
|
||||
dstItems = append(dstItems[:0], tsm.itemsCopy...)
|
||||
if !checkItemsSorted(dstData, dstItems) {
|
||||
logger.Panicf("BUG: the original items weren't sorted; items=%q", dstItems)
|
||||
}
|
||||
}
|
||||
putTagToStreamIDsRowsMerger(tsm)
|
||||
return dstData, dstItems
|
||||
}
|
||||
|
||||
// maxStreamIDsPerRow limits the number of streamIDs in tenantID:name:value -> streamIDs row.
|
||||
//
|
||||
// This reduces overhead on index and metaindex in lib/mergeset.
|
||||
const maxStreamIDsPerRow = 32
|
||||
|
||||
type u128Sorter []u128
|
||||
|
||||
func (s u128Sorter) Len() int { return len(s) }
|
||||
func (s u128Sorter) Less(i, j int) bool {
|
||||
return s[i].less(&s[j])
|
||||
}
|
||||
func (s u128Sorter) Swap(i, j int) {
|
||||
s[i], s[j] = s[j], s[i]
|
||||
}
|
||||
|
||||
type tagToStreamIDsRowsMerger struct {
|
||||
pendingStreamIDs u128Sorter
|
||||
sp tagToStreamIDsRowParser
|
||||
spPrev tagToStreamIDsRowParser
|
||||
|
||||
itemsCopy []mergeset.Item
|
||||
dataCopy []byte
|
||||
}
|
||||
|
||||
func (tsm *tagToStreamIDsRowsMerger) Reset() {
|
||||
tsm.pendingStreamIDs = tsm.pendingStreamIDs[:0]
|
||||
tsm.sp.Reset()
|
||||
tsm.spPrev.Reset()
|
||||
|
||||
tsm.itemsCopy = tsm.itemsCopy[:0]
|
||||
tsm.dataCopy = tsm.dataCopy[:0]
|
||||
}
|
||||
|
||||
func (tsm *tagToStreamIDsRowsMerger) flushPendingStreamIDs(dstData []byte, dstItems []mergeset.Item, sp *tagToStreamIDsRowParser) ([]byte, []mergeset.Item) {
|
||||
if len(tsm.pendingStreamIDs) == 0 {
|
||||
// Nothing to flush
|
||||
return dstData, dstItems
|
||||
}
|
||||
// Use sort.Sort instead of sort.Slice in order to reduce memory allocations.
|
||||
sort.Sort(&tsm.pendingStreamIDs)
|
||||
tsm.pendingStreamIDs = removeDuplicateStreamIDs(tsm.pendingStreamIDs)
|
||||
|
||||
// Marshal pendingStreamIDs
|
||||
dstDataLen := len(dstData)
|
||||
dstData = sp.MarshalPrefix(dstData)
|
||||
pendingStreamIDs := tsm.pendingStreamIDs
|
||||
for i := range pendingStreamIDs {
|
||||
dstData = pendingStreamIDs[i].marshal(dstData)
|
||||
}
|
||||
dstItems = append(dstItems, mergeset.Item{
|
||||
Start: uint32(dstDataLen),
|
||||
End: uint32(len(dstData)),
|
||||
})
|
||||
tsm.pendingStreamIDs = tsm.pendingStreamIDs[:0]
|
||||
return dstData, dstItems
|
||||
}
|
||||
|
||||
func removeDuplicateStreamIDs(sortedStreamIDs []u128) []u128 {
|
||||
if len(sortedStreamIDs) < 2 {
|
||||
return sortedStreamIDs
|
||||
}
|
||||
hasDuplicates := false
|
||||
for i := 1; i < len(sortedStreamIDs); i++ {
|
||||
if sortedStreamIDs[i-1] == sortedStreamIDs[i] {
|
||||
hasDuplicates = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasDuplicates {
|
||||
return sortedStreamIDs
|
||||
}
|
||||
dstStreamIDs := sortedStreamIDs[:1]
|
||||
for i := 1; i < len(sortedStreamIDs); i++ {
|
||||
if sortedStreamIDs[i-1] == sortedStreamIDs[i] {
|
||||
continue
|
||||
}
|
||||
dstStreamIDs = append(dstStreamIDs, sortedStreamIDs[i])
|
||||
}
|
||||
return dstStreamIDs
|
||||
}
|
||||
|
||||
func getTagToStreamIDsRowsMerger() *tagToStreamIDsRowsMerger {
|
||||
v := tsmPool.Get()
|
||||
if v == nil {
|
||||
return &tagToStreamIDsRowsMerger{}
|
||||
}
|
||||
return v.(*tagToStreamIDsRowsMerger)
|
||||
}
|
||||
|
||||
func putTagToStreamIDsRowsMerger(tsm *tagToStreamIDsRowsMerger) {
|
||||
tsm.Reset()
|
||||
tsmPool.Put(tsm)
|
||||
}
|
||||
|
||||
var tsmPool sync.Pool
|
||||
|
||||
type tagToStreamIDsRowParser struct {
|
||||
// TenantID contains TenantID of the parsed row
|
||||
TenantID TenantID
|
||||
|
||||
// StreamIDs contains parsed StreamIDs after ParseStreamIDs call
|
||||
StreamIDs []u128
|
||||
|
||||
// streamIDsParsed is set to true after ParseStreamIDs call
|
||||
streamIDsParsed bool
|
||||
|
||||
// Tag contains parsed tag after Init call
|
||||
Tag streamTag
|
||||
|
||||
// tail contains the remaining unparsed streamIDs
|
||||
tail []byte
|
||||
}
|
||||
|
||||
func (sp *tagToStreamIDsRowParser) Reset() {
|
||||
sp.TenantID.Reset()
|
||||
sp.StreamIDs = sp.StreamIDs[:0]
|
||||
sp.streamIDsParsed = false
|
||||
sp.Tag.reset()
|
||||
sp.tail = nil
|
||||
}
|
||||
|
||||
// Init initializes sp from b, which should contain encoded tenantID:name:value -> streamIDs row.
|
||||
//
|
||||
// b cannot be re-used until Reset call.
|
||||
//
|
||||
// ParseStreamIDs() must be called later for obtaining sp.StreamIDs from the given tail.
|
||||
func (sp *tagToStreamIDsRowParser) Init(b []byte) error {
|
||||
tail, nsPrefix, err := unmarshalCommonPrefix(&sp.TenantID, b)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid tenantID:name:value -> streamIDs row %q: %w", b, err)
|
||||
}
|
||||
if nsPrefix != nsPrefixTagToStreamIDs {
|
||||
return fmt.Errorf("invalid prefix for tenantID:name:value -> streamIDs row %q; got %d; want %d", b, nsPrefix, nsPrefixTagToStreamIDs)
|
||||
}
|
||||
tail, err = sp.Tag.indexdbUnmarshal(tail)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot unmarshal tag from tenantID:name:value -> streamIDs row %q: %w", b, err)
|
||||
}
|
||||
if err = sp.InitOnlyTail(tail); err != nil {
|
||||
return fmt.Errorf("cannot initialize tail from tenantID:name:value -> streamIDs row %q: %w", b, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalPrefix marshals row prefix without tail to dst.
|
||||
func (sp *tagToStreamIDsRowParser) MarshalPrefix(dst []byte) []byte {
|
||||
dst = marshalCommonPrefix(dst, nsPrefixTagToStreamIDs, sp.TenantID)
|
||||
dst = sp.Tag.indexdbMarshal(dst)
|
||||
return dst
|
||||
}
|
||||
|
||||
// InitOnlyTail initializes sp.tail from tail, which must contain streamIDs.
|
||||
//
|
||||
// tail cannot be re-used until Reset call.
|
||||
//
|
||||
// ParseStreamIDs() must be called later for obtaining sp.StreamIDs from the given tail.
|
||||
func (sp *tagToStreamIDsRowParser) InitOnlyTail(tail []byte) error {
|
||||
if len(tail) == 0 {
|
||||
return fmt.Errorf("missing streamID in the tenantID:name:value -> streamIDs row")
|
||||
}
|
||||
if len(tail)%16 != 0 {
|
||||
return fmt.Errorf("invalid tail length in the tenantID:name:value -> streamIDs row; got %d bytes; must be multiple of 16 bytes", len(tail))
|
||||
}
|
||||
sp.tail = tail
|
||||
sp.streamIDsParsed = false
|
||||
return nil
|
||||
}
|
||||
|
||||
// EqualPrefix returns true if prefixes for sp and x are equal.
|
||||
//
|
||||
// Prefix contains (tenantID:name:value)
|
||||
func (sp *tagToStreamIDsRowParser) EqualPrefix(x *tagToStreamIDsRowParser) bool {
|
||||
if !sp.TenantID.equal(&x.TenantID) {
|
||||
return false
|
||||
}
|
||||
if !sp.Tag.equal(&x.Tag) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// StreamIDsLen returns the number of StreamIDs in the sp.tail
|
||||
func (sp *tagToStreamIDsRowParser) StreamIDsLen() int {
|
||||
return len(sp.tail) / 16
|
||||
}
|
||||
|
||||
// ParseStreamIDs parses StreamIDs from sp.tail into sp.StreamIDs.
|
||||
func (sp *tagToStreamIDsRowParser) ParseStreamIDs() {
|
||||
if sp.streamIDsParsed {
|
||||
return
|
||||
}
|
||||
tail := sp.tail
|
||||
n := len(tail) / 16
|
||||
streamIDs := sp.StreamIDs[:0]
|
||||
if n <= cap(streamIDs) {
|
||||
streamIDs = streamIDs[:n]
|
||||
} else {
|
||||
streamIDs = append(streamIDs[:cap(streamIDs)], make([]u128, n-cap(streamIDs))...)
|
||||
}
|
||||
sp.StreamIDs = streamIDs
|
||||
for i := 0; i < n; i++ {
|
||||
var err error
|
||||
tail, err = streamIDs[i].unmarshal(tail)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot unmarshal streamID: %s", err)
|
||||
}
|
||||
}
|
||||
sp.streamIDsParsed = true
|
||||
}
|
||||
|
||||
func (sp *tagToStreamIDsRowParser) UpdateStreamIDs(ids map[u128]struct{}, tail []byte) {
|
||||
sp.Reset()
|
||||
if err := sp.InitOnlyTail(tail); err != nil {
|
||||
logger.Panicf("FATAL: cannot parse '(date, tag) -> streamIDs' row: %s", err)
|
||||
}
|
||||
sp.ParseStreamIDs()
|
||||
for _, id := range sp.StreamIDs {
|
||||
ids[id] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
// commonPrefixLen is the length of common prefix for indexdb rows
|
||||
// 1 byte for ns* prefix + 8 bytes for tenantID
|
||||
const commonPrefixLen = 1 + 8
|
||||
|
||||
func marshalCommonPrefix(dst []byte, nsPrefix byte, tenantID TenantID) []byte {
|
||||
dst = append(dst, nsPrefix)
|
||||
dst = tenantID.marshal(dst)
|
||||
return dst
|
||||
}
|
||||
|
||||
func unmarshalCommonPrefix(dstTenantID *TenantID, src []byte) ([]byte, byte, error) {
|
||||
if len(src) < commonPrefixLen {
|
||||
return nil, 0, fmt.Errorf("cannot unmarshal common prefix from %d bytes; need at least %d bytes; data=%X", len(src), commonPrefixLen, src)
|
||||
}
|
||||
prefix := src[0]
|
||||
src = src[1:]
|
||||
tail, err := dstTenantID.unmarshal(src)
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("cannot unmarshal tenantID: %s", err)
|
||||
}
|
||||
return tail, prefix, nil
|
||||
}
|
||||
|
||||
func checkItemsSorted(data []byte, items []mergeset.Item) bool {
|
||||
if len(items) == 0 {
|
||||
return true
|
||||
}
|
||||
prevItem := items[0].String(data)
|
||||
for _, it := range items[1:] {
|
||||
currItem := it.String(data)
|
||||
if prevItem > currItem {
|
||||
return false
|
||||
}
|
||||
prevItem = currItem
|
||||
}
|
||||
return true
|
||||
}
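
// The sketch below is an editorial illustration and is not part of the original
// commit: it shows how the marshalCommonPrefix / unmarshalCommonPrefix pair above
// round-trips the common row prefix (1 byte ns* prefix followed by an 8-byte
// tenantID). It assumes it is compiled inside the logstorage package next to the
// code above; the function name and the tenantID values are hypothetical.
func commonPrefixRoundTripSketch() {
	tenantID := TenantID{AccountID: 123, ProjectID: 456}

	// Marshal the prefix: the result must be exactly commonPrefixLen bytes long.
	buf := marshalCommonPrefix(nil, nsPrefixTagToStreamIDs, tenantID)
	if len(buf) != commonPrefixLen {
		logger.Panicf("BUG: unexpected prefix length; got %d; want %d", len(buf), commonPrefixLen)
	}

	// Unmarshal it back and verify the ns prefix, the tenantID and the empty tail.
	var gotTenantID TenantID
	tail, nsPrefix, err := unmarshalCommonPrefix(&gotTenantID, buf)
	if err != nil {
		logger.Panicf("BUG: cannot unmarshal common prefix: %s", err)
	}
	if nsPrefix != nsPrefixTagToStreamIDs || !gotTenantID.equal(&tenantID) || len(tail) > 0 {
		logger.Panicf("BUG: unexpected round-trip result; nsPrefix=%d; tail=%q", nsPrefix, tail)
	}
}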
253
lib/logstorage/indexdb_test.go
Normal file
@@ -0,0 +1,253 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
func TestStorageSearchStreamIDs(t *testing.T) {
|
||||
const path = "TestStorageSearchStreamIDs"
|
||||
const partitionName = "foobar"
|
||||
s := newTestStorage()
|
||||
mustCreateIndexdb(path)
|
||||
idb := mustOpenIndexdb(path, partitionName, s)
|
||||
|
||||
tenantID := TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 567,
|
||||
}
|
||||
getStreamIDForTags := func(tags map[string]string) (streamID, []byte) {
|
||||
st := GetStreamTags()
|
||||
for k, v := range tags {
|
||||
st.Add(k, v)
|
||||
}
|
||||
streamTagsCanonical := st.MarshalCanonical(nil)
|
||||
PutStreamTags(st)
|
||||
id := hash128(streamTagsCanonical)
|
||||
sid := streamID{
|
||||
tenantID: tenantID,
|
||||
id: id,
|
||||
}
|
||||
return sid, streamTagsCanonical
|
||||
}
|
||||
|
||||
// Create indexdb entries
|
||||
const jobsCount = 7
|
||||
const instancesCount = 5
|
||||
for i := 0; i < jobsCount; i++ {
|
||||
for j := 0; j < instancesCount; j++ {
|
||||
sid, streamTagsCanonical := getStreamIDForTags(map[string]string{
|
||||
"job": fmt.Sprintf("job-%d", i),
|
||||
"instance": fmt.Sprintf("instance-%d", j),
|
||||
})
|
||||
idb.mustRegisterStream(&sid, streamTagsCanonical)
|
||||
}
|
||||
}
|
||||
idb.debugFlush()
|
||||
|
||||
f := func(streamFilter string, expectedStreamIDs []streamID) {
|
||||
t.Helper()
|
||||
sf := mustNewStreamFilter(streamFilter)
|
||||
if expectedStreamIDs == nil {
|
||||
expectedStreamIDs = []streamID{}
|
||||
}
|
||||
sortStreamIDs(expectedStreamIDs)
|
||||
for i := 0; i < 3; i++ {
|
||||
streamIDs := idb.searchStreamIDs([]TenantID{tenantID}, sf)
|
||||
if !reflect.DeepEqual(streamIDs, expectedStreamIDs) {
|
||||
t.Fatalf("unexpected streamIDs on iteration %d; got %v; want %v", i, streamIDs, expectedStreamIDs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("missing-tenant-id", func(t *testing.T) {
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 2,
|
||||
}
|
||||
sf := mustNewStreamFilter(`{job="job-0",instance="instance-0"}`)
|
||||
for i := 0; i < 3; i++ {
|
||||
streamIDs := idb.searchStreamIDs([]TenantID{tenantID}, sf)
|
||||
if len(streamIDs) > 0 {
|
||||
t.Fatalf("unexpected non-empty streamIDs on iteration %d: %d", i, len(streamIDs))
|
||||
}
|
||||
}
|
||||
})
|
||||
t.Run("missing-job", func(t *testing.T) {
|
||||
f(`{job="non-existing-job",instance="instance-0"}`, nil)
|
||||
})
|
||||
t.Run("missing-job-re", func(t *testing.T) {
|
||||
f(`{job=~"non-existing-job|",instance="instance-0"}`, nil)
|
||||
})
|
||||
t.Run("missing-job-negative-re", func(t *testing.T) {
|
||||
f(`{job!~"job.+",instance="instance-0"}`, nil)
|
||||
})
|
||||
t.Run("empty-job", func(t *testing.T) {
|
||||
f(`{job="",instance="instance-0"}`, nil)
|
||||
})
|
||||
t.Run("missing-instance", func(t *testing.T) {
|
||||
f(`{job="job-0",instance="non-existing-instance"}`, nil)
|
||||
})
|
||||
t.Run("missing-instance-re", func(t *testing.T) {
|
||||
f(`{job="job-0",instance=~"non-existing-instance|"}`, nil)
|
||||
})
|
||||
t.Run("missing-instance-negative-re", func(t *testing.T) {
|
||||
f(`{job="job-0",instance!~"instance.+"}`, nil)
|
||||
})
|
||||
t.Run("empty-instance", func(t *testing.T) {
|
||||
f(`{job="job-0",instance=""}`, nil)
|
||||
})
|
||||
t.Run("non-existing-tag", func(t *testing.T) {
|
||||
f(`{job="job-0",instance="instance-0",non_existing_tag="foobar"}`, nil)
|
||||
})
|
||||
t.Run("non-existing-non-empty-tag", func(t *testing.T) {
|
||||
f(`{job="job-0",instance="instance-0",non_existing_tag!=""}`, nil)
|
||||
})
|
||||
t.Run("non-existing-tag-re", func(t *testing.T) {
|
||||
f(`{job="job-0",instance="instance-0",non_existing_tag=~"foo.+"}`, nil)
|
||||
})
|
||||
t.Run("non-existing-non-empty-tag-re", func(t *testing.T) {
|
||||
f(`{job="job-0",instance="instance-0",non_existing_tag!~""}`, nil)
|
||||
})
|
||||
|
||||
t.Run("match-job-instance", func(t *testing.T) {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": "instance-0",
|
||||
"job": "job-0",
|
||||
})
|
||||
f(`{job="job-0",instance="instance-0"}`, []streamID{sid})
|
||||
})
|
||||
t.Run("match-non-existing-tag", func(t *testing.T) {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": "instance-0",
|
||||
"job": "job-0",
|
||||
})
|
||||
f(`{job="job-0",instance="instance-0",non_existing_tag=~"foo|"}`, []streamID{sid})
|
||||
})
|
||||
t.Run("match-job", func(t *testing.T) {
|
||||
var streamIDs []streamID
|
||||
for i := 0; i < instancesCount; i++ {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", i),
|
||||
"job": "job-0",
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
f(`{job="job-0"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-instance", func(t *testing.T) {
|
||||
var streamIDs []streamID
|
||||
for i := 0; i < jobsCount; i++ {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": "instance-1",
|
||||
"job": fmt.Sprintf("job-%d", i),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
f(`{instance="instance-1"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-re", func(t *testing.T) {
|
||||
var streamIDs []streamID
|
||||
for _, instanceID := range []int{3, 1} {
|
||||
for _, jobID := range []int{0, 2} {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", instanceID),
|
||||
"job": fmt.Sprintf("job-%d", jobID),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
}
|
||||
f(`{job=~"job-(0|2)",instance=~"instance-[13]"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-re-empty-match", func(t *testing.T) {
|
||||
var streamIDs []streamID
|
||||
for _, instanceID := range []int{3, 1} {
|
||||
for _, jobID := range []int{0, 2} {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", instanceID),
|
||||
"job": fmt.Sprintf("job-%d", jobID),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
}
|
||||
f(`{job=~"job-(0|2)|",instance=~"instance-[13]"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-negative-re", func(t *testing.T) {
|
||||
var instanceIDs []int
|
||||
for i := 0; i < instancesCount; i++ {
|
||||
if i != 0 && i != 1 {
|
||||
instanceIDs = append(instanceIDs, i)
|
||||
}
|
||||
}
|
||||
var jobIDs []int
|
||||
for i := 0; i < jobsCount; i++ {
|
||||
if i > 2 {
|
||||
jobIDs = append(jobIDs, i)
|
||||
}
|
||||
}
|
||||
var streamIDs []streamID
|
||||
for _, instanceID := range instanceIDs {
|
||||
for _, jobID := range jobIDs {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", instanceID),
|
||||
"job": fmt.Sprintf("job-%d", jobID),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
}
|
||||
f(`{job!~"job-[0-2]",instance!~"instance-(0|1)"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-negative-re-empty-match", func(t *testing.T) {
|
||||
var instanceIDs []int
|
||||
for i := 0; i < instancesCount; i++ {
|
||||
if i != 0 && i != 1 {
|
||||
instanceIDs = append(instanceIDs, i)
|
||||
}
|
||||
}
|
||||
var jobIDs []int
|
||||
for i := 0; i < jobsCount; i++ {
|
||||
if i > 2 {
|
||||
jobIDs = append(jobIDs, i)
|
||||
}
|
||||
}
|
||||
var streamIDs []streamID
|
||||
for _, instanceID := range instanceIDs {
|
||||
for _, jobID := range jobIDs {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", instanceID),
|
||||
"job": fmt.Sprintf("job-%d", jobID),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
}
|
||||
f(`{job!~"job-[0-2]",instance!~"instance-(0|1)|"}`, streamIDs)
|
||||
})
|
||||
t.Run("match-negative-job", func(t *testing.T) {
|
||||
instanceIDs := []int{2}
|
||||
var jobIDs []int
|
||||
for i := 0; i < jobsCount; i++ {
|
||||
if i != 1 {
|
||||
jobIDs = append(jobIDs, i)
|
||||
}
|
||||
}
|
||||
var streamIDs []streamID
|
||||
for _, instanceID := range instanceIDs {
|
||||
for _, jobID := range jobIDs {
|
||||
sid, _ := getStreamIDForTags(map[string]string{
|
||||
"instance": fmt.Sprintf("instance-%d", instanceID),
|
||||
"job": fmt.Sprintf("job-%d", jobID),
|
||||
})
|
||||
streamIDs = append(streamIDs, sid)
|
||||
}
|
||||
}
|
||||
f(`{instance="instance-2",job!="job-1"}`, streamIDs)
|
||||
})
|
||||
|
||||
mustCloseIndexdb(idb)
|
||||
fs.MustRemoveAll(path)
|
||||
|
||||
closeTestStorage(s)
|
||||
}
155
lib/logstorage/inmemory_part.go
Normal file
@@ -0,0 +1,155 @@
package logstorage
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
// inmemoryPart is an in-memory part.
|
||||
type inmemoryPart struct {
|
||||
// ph contains partHeader information for the given in-memory part.
|
||||
ph partHeader
|
||||
|
||||
metaindex bytesutil.ByteBuffer
|
||||
index bytesutil.ByteBuffer
|
||||
columnsHeader bytesutil.ByteBuffer
|
||||
timestamps bytesutil.ByteBuffer
|
||||
fieldValues bytesutil.ByteBuffer
|
||||
fieldBloomFilter bytesutil.ByteBuffer
|
||||
messageValues bytesutil.ByteBuffer
|
||||
messageBloomFilter bytesutil.ByteBuffer
|
||||
}
|
||||
|
||||
// reset resets mp, so it can be re-used
|
||||
func (mp *inmemoryPart) reset() {
|
||||
mp.ph.reset()
|
||||
|
||||
mp.metaindex.Reset()
|
||||
mp.index.Reset()
|
||||
mp.columnsHeader.Reset()
|
||||
mp.timestamps.Reset()
|
||||
mp.fieldValues.Reset()
|
||||
mp.fieldBloomFilter.Reset()
|
||||
mp.messageValues.Reset()
|
||||
mp.messageBloomFilter.Reset()
|
||||
}
|
||||
|
||||
// mustInitFromRows initializes mp from lr.
|
||||
func (mp *inmemoryPart) mustInitFromRows(lr *LogRows) {
|
||||
mp.reset()
|
||||
|
||||
if len(lr.timestamps) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
sort.Sort(lr)
|
||||
|
||||
bsw := getBlockStreamWriter()
|
||||
bsw.MustInitForInmemoryPart(mp)
|
||||
trs := getTmpRows()
|
||||
var sidPrev *streamID
|
||||
uncompressedBlockSizeBytes := uint64(0)
|
||||
timestamps := lr.timestamps
|
||||
rows := lr.rows
|
||||
streamIDs := lr.streamIDs
|
||||
for i := range timestamps {
|
||||
streamID := &streamIDs[i]
|
||||
if sidPrev == nil {
|
||||
sidPrev = streamID
|
||||
}
|
||||
|
||||
if uncompressedBlockSizeBytes >= maxUncompressedBlockSize || !streamID.equal(sidPrev) {
|
||||
bsw.MustWriteRows(sidPrev, trs.timestamps, trs.rows)
|
||||
trs.reset()
|
||||
sidPrev = streamID
|
||||
uncompressedBlockSizeBytes = 0
|
||||
}
|
||||
fields := rows[i]
|
||||
trs.timestamps = append(trs.timestamps, timestamps[i])
|
||||
trs.rows = append(trs.rows, fields)
|
||||
uncompressedBlockSizeBytes += uncompressedRowSizeBytes(fields)
|
||||
}
|
||||
bsw.MustWriteRows(sidPrev, trs.timestamps, trs.rows)
|
||||
putTmpRows(trs)
|
||||
bsw.Finalize(&mp.ph)
|
||||
putBlockStreamWriter(bsw)
|
||||
}
|
||||
|
||||
// MustStoreToDisk stores mp to disk at the given path.
|
||||
func (mp *inmemoryPart) MustStoreToDisk(path string) {
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
|
||||
metaindexPath := filepath.Join(path, metaindexFilename)
|
||||
indexPath := filepath.Join(path, indexFilename)
|
||||
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
|
||||
timestampsPath := filepath.Join(path, timestampsFilename)
|
||||
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
|
||||
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
|
||||
messageValuesPath := filepath.Join(path, messageValuesFilename)
|
||||
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
|
||||
|
||||
fs.MustWriteSync(metaindexPath, mp.metaindex.B)
|
||||
fs.MustWriteSync(indexPath, mp.index.B)
|
||||
fs.MustWriteSync(columnsHeaderPath, mp.columnsHeader.B)
|
||||
fs.MustWriteSync(timestampsPath, mp.timestamps.B)
|
||||
fs.MustWriteSync(fieldValuesPath, mp.fieldValues.B)
|
||||
fs.MustWriteSync(fieldBloomFilterPath, mp.fieldBloomFilter.B)
|
||||
fs.MustWriteSync(messageValuesPath, mp.messageValues.B)
|
||||
fs.MustWriteSync(messageBloomFilterPath, mp.messageBloomFilter.B)
|
||||
|
||||
mp.ph.mustWriteMetadata(path)
|
||||
|
||||
fs.MustSyncPath(path)
|
||||
// Do not sync parent directory - it must be synced by the caller.
|
||||
}
|
||||
|
||||
// tmpRows is used as a helper for inmemoryPart.mustInitFromRows()
|
||||
type tmpRows struct {
|
||||
timestamps []int64
|
||||
|
||||
rows [][]Field
|
||||
}
|
||||
|
||||
func (trs *tmpRows) reset() {
|
||||
trs.timestamps = trs.timestamps[:0]
|
||||
|
||||
rows := trs.rows
|
||||
for i := range rows {
|
||||
rows[i] = nil
|
||||
}
|
||||
trs.rows = rows[:0]
|
||||
}
|
||||
|
||||
func getTmpRows() *tmpRows {
|
||||
v := tmpRowsPool.Get()
|
||||
if v == nil {
|
||||
return &tmpRows{}
|
||||
}
|
||||
return v.(*tmpRows)
|
||||
}
|
||||
|
||||
func putTmpRows(trs *tmpRows) {
|
||||
trs.reset()
|
||||
tmpRowsPool.Put(trs)
|
||||
}
|
||||
|
||||
var tmpRowsPool sync.Pool
|
||||
|
||||
func getInmemoryPart() *inmemoryPart {
|
||||
v := inmemoryPartPool.Get()
|
||||
if v == nil {
|
||||
return &inmemoryPart{}
|
||||
}
|
||||
return v.(*inmemoryPart)
|
||||
}
|
||||
|
||||
func putInmemoryPart(mp *inmemoryPart) {
|
||||
mp.reset()
|
||||
inmemoryPartPool.Put(mp)
|
||||
}
|
||||
|
||||
var inmemoryPartPool sync.Pool
343
lib/logstorage/inmemory_part_test.go
Normal file
@@ -0,0 +1,343 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"math/rand"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
func TestInmemoryPartMustInitFromRows(t *testing.T) {
|
||||
f := func(lr *LogRows, blocksCountExpected int, compressionRateExpected float64) {
|
||||
t.Helper()
|
||||
|
||||
uncompressedSizeBytesExpected := uncompressedRowsSizeBytes(lr.rows)
|
||||
rowsCountExpected := len(lr.timestamps)
|
||||
minTimestampExpected := int64(math.MaxInt64)
|
||||
maxTimestampExpected := int64(math.MinInt64)
|
||||
|
||||
// make a copy of lr - it is used for comparing the results later,
|
||||
// since lr may be modified by inmemoryPart.mustInitFromRows()
|
||||
lrOrig := GetLogRows(nil, nil)
|
||||
for i, timestamp := range lr.timestamps {
|
||||
if timestamp < minTimestampExpected {
|
||||
minTimestampExpected = timestamp
|
||||
}
|
||||
if timestamp > maxTimestampExpected {
|
||||
maxTimestampExpected = timestamp
|
||||
}
|
||||
lrOrig.mustAddInternal(lr.streamIDs[i], timestamp, lr.rows[i], lr.streamTagsCanonicals[i])
|
||||
}
|
||||
|
||||
// Create inmemory part from lr
|
||||
mp := getInmemoryPart()
|
||||
mp.mustInitFromRows(lr)
|
||||
|
||||
// Check mp.ph
|
||||
ph := &mp.ph
|
||||
checkCompressionRate(t, ph, compressionRateExpected)
|
||||
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
|
||||
t.Fatalf("unexpected UncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
|
||||
}
|
||||
if ph.RowsCount != uint64(rowsCountExpected) {
|
||||
t.Fatalf("unexpected rowsCount in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
|
||||
}
|
||||
if ph.BlocksCount != uint64(blocksCountExpected) {
|
||||
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
|
||||
}
|
||||
if ph.RowsCount > 0 {
|
||||
if ph.MinTimestamp != minTimestampExpected {
|
||||
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
|
||||
}
|
||||
if ph.MaxTimestamp != maxTimestampExpected {
|
||||
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
|
||||
}
|
||||
}
|
||||
|
||||
// Read log entries from mp into lrResult
|
||||
sbu := getStringsBlockUnmarshaler()
|
||||
defer putStringsBlockUnmarshaler(sbu)
|
||||
vd := getValuesDecoder()
|
||||
defer putValuesDecoder(vd)
|
||||
lrResult := mp.readLogRows(sbu, vd)
|
||||
putInmemoryPart(mp)
|
||||
|
||||
// compare lrOrig to lrResult
|
||||
if err := checkEqualRows(lrResult, lrOrig); err != nil {
|
||||
t.Fatalf("unequal log entries: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
f(GetLogRows(nil, nil), 0, 0)
|
||||
|
||||
// Check how inmemoryPart works with a single stream
|
||||
f(newTestLogRows(1, 1, 0), 1, 0.8)
|
||||
f(newTestLogRows(1, 2, 0), 1, 0.9)
|
||||
f(newTestLogRows(1, 10, 0), 1, 2.0)
|
||||
f(newTestLogRows(1, 1000, 0), 1, 7.1)
|
||||
f(newTestLogRows(1, 20000, 0), 2, 7.2)
|
||||
|
||||
// Check how inmemoryPart works with multiple streams
|
||||
f(newTestLogRows(2, 1, 0), 2, 0.8)
|
||||
f(newTestLogRows(10, 1, 0), 10, 0.9)
|
||||
f(newTestLogRows(100, 1, 0), 100, 1.0)
|
||||
f(newTestLogRows(10, 5, 0), 10, 1.4)
|
||||
f(newTestLogRows(10, 1000, 0), 10, 7.2)
|
||||
f(newTestLogRows(100, 100, 0), 100, 5.0)
|
||||
}
|
||||
|
||||
func checkCompressionRate(t *testing.T, ph *partHeader, compressionRateExpected float64) {
|
||||
t.Helper()
|
||||
compressionRate := float64(ph.UncompressedSizeBytes) / float64(ph.CompressedSizeBytes)
|
||||
if math.Abs(compressionRate-compressionRateExpected) > 0.1 {
|
||||
t.Fatalf("unexpected compression rate; got %.1f; want %.1f", compressionRate, compressionRateExpected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInmemoryPartInitFromBlockStreamReaders(t *testing.T) {
|
||||
f := func(lrs []*LogRows, blocksCountExpected int, compressionRateExpected float64) {
|
||||
t.Helper()
|
||||
|
||||
uncompressedSizeBytesExpected := uint64(0)
|
||||
rowsCountExpected := 0
|
||||
minTimestampExpected := int64(math.MaxInt64)
|
||||
maxTimestampExpected := int64(math.MinInt64)
|
||||
|
||||
// make a copy of lrs in order to compare the results after merge.
|
||||
lrOrig := GetLogRows(nil, nil)
|
||||
for _, lr := range lrs {
|
||||
uncompressedSizeBytesExpected += uncompressedRowsSizeBytes(lr.rows)
|
||||
rowsCountExpected += len(lr.timestamps)
|
||||
for j, timestamp := range lr.timestamps {
|
||||
if timestamp < minTimestampExpected {
|
||||
minTimestampExpected = timestamp
|
||||
}
|
||||
if timestamp > maxTimestampExpected {
|
||||
maxTimestampExpected = timestamp
|
||||
}
|
||||
lrOrig.mustAddInternal(lr.streamIDs[j], timestamp, lr.rows[j], lr.streamTagsCanonicals[j])
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize readers from lrs
|
||||
var mpsSrc []*inmemoryPart
|
||||
var bsrs []*blockStreamReader
|
||||
for _, lr := range lrs {
|
||||
mp := getInmemoryPart()
|
||||
mp.mustInitFromRows(lr)
|
||||
mpsSrc = append(mpsSrc, mp)
|
||||
|
||||
bsr := getBlockStreamReader()
|
||||
bsr.MustInitFromInmemoryPart(mp)
|
||||
bsrs = append(bsrs, bsr)
|
||||
}
|
||||
defer func() {
|
||||
for _, bsr := range bsrs {
|
||||
putBlockStreamReader(bsr)
|
||||
}
|
||||
for _, mp := range mpsSrc {
|
||||
putInmemoryPart(mp)
|
||||
}
|
||||
}()
|
||||
|
||||
// Merge data from bsrs into mpDst
|
||||
mpDst := getInmemoryPart()
|
||||
bsw := getBlockStreamWriter()
|
||||
bsw.MustInitForInmemoryPart(mpDst)
|
||||
mustMergeBlockStreams(&mpDst.ph, bsw, bsrs, nil)
|
||||
putBlockStreamWriter(bsw)
|
||||
|
||||
// Check mpDst.ph stats
|
||||
ph := &mpDst.ph
|
||||
checkCompressionRate(t, ph, compressionRateExpected)
|
||||
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
|
||||
t.Fatalf("unexpected uncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
|
||||
}
|
||||
if ph.RowsCount != uint64(rowsCountExpected) {
|
||||
t.Fatalf("unexpected number of entries in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
|
||||
}
|
||||
if ph.BlocksCount != uint64(blocksCountExpected) {
|
||||
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
|
||||
}
|
||||
if ph.RowsCount > 0 {
|
||||
if ph.MinTimestamp != minTimestampExpected {
|
||||
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
|
||||
}
|
||||
if ph.MaxTimestamp != maxTimestampExpected {
|
||||
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
|
||||
}
|
||||
}
|
||||
|
||||
// Read log entries from mpDst into lrResult
|
||||
sbu := getStringsBlockUnmarshaler()
|
||||
defer putStringsBlockUnmarshaler(sbu)
|
||||
vd := getValuesDecoder()
|
||||
defer putValuesDecoder(vd)
|
||||
lrResult := mpDst.readLogRows(sbu, vd)
|
||||
putInmemoryPart(mpDst)
|
||||
|
||||
// compare lrOrig to lrResult
|
||||
if err := checkEqualRows(lrResult, lrOrig); err != nil {
|
||||
t.Fatalf("unequal log entries: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Check empty readers
|
||||
f(nil, 0, 0)
|
||||
f([]*LogRows{GetLogRows(nil, nil)}, 0, 0)
|
||||
f([]*LogRows{GetLogRows(nil, nil), GetLogRows(nil, nil)}, 0, 0)
|
||||
|
||||
// Check merge with a single reader
|
||||
f([]*LogRows{newTestLogRows(1, 1, 0)}, 1, 0.8)
|
||||
f([]*LogRows{newTestLogRows(1, 10, 0)}, 1, 2.0)
|
||||
f([]*LogRows{newTestLogRows(1, 100, 0)}, 1, 4.9)
|
||||
f([]*LogRows{newTestLogRows(1, 1000, 0)}, 1, 7.1)
|
||||
f([]*LogRows{newTestLogRows(1, 10000, 0)}, 1, 7.4)
|
||||
f([]*LogRows{newTestLogRows(10, 1, 0)}, 10, 0.9)
|
||||
f([]*LogRows{newTestLogRows(100, 1, 0)}, 100, 1.0)
|
||||
f([]*LogRows{newTestLogRows(1000, 1, 0)}, 1000, 1.0)
|
||||
f([]*LogRows{newTestLogRows(10, 10, 0)}, 10, 2.1)
|
||||
f([]*LogRows{newTestLogRows(10, 100, 0)}, 10, 4.9)
|
||||
|
||||
// Check merge with multiple readers
|
||||
f([]*LogRows{
|
||||
newTestLogRows(1, 1, 0),
|
||||
newTestLogRows(1, 1, 1),
|
||||
}, 2, 0.9)
|
||||
f([]*LogRows{
|
||||
newTestLogRows(2, 2, 0),
|
||||
newTestLogRows(2, 2, 0),
|
||||
}, 2, 1.8)
|
||||
f([]*LogRows{
|
||||
newTestLogRows(1, 20, 0),
|
||||
newTestLogRows(1, 10, 1),
|
||||
newTestLogRows(1, 5, 2),
|
||||
}, 3, 2.2)
|
||||
f([]*LogRows{
|
||||
newTestLogRows(10, 20, 0),
|
||||
newTestLogRows(20, 10, 1),
|
||||
newTestLogRows(30, 5, 2),
|
||||
}, 60, 2.0)
|
||||
f([]*LogRows{
|
||||
newTestLogRows(10, 20, 0),
|
||||
newTestLogRows(20, 10, 1),
|
||||
newTestLogRows(30, 5, 2),
|
||||
newTestLogRows(20, 7, 3),
|
||||
newTestLogRows(10, 9, 4),
|
||||
}, 90, 1.9)
|
||||
}
|
||||
|
||||
func newTestLogRows(streams, rowsPerStream int, seed int64) *LogRows {
|
||||
streamTags := []string{
|
||||
"some-stream-tag",
|
||||
}
|
||||
lr := GetLogRows(streamTags, nil)
|
||||
rng := rand.New(rand.NewSource(seed))
|
||||
var fields []Field
|
||||
for i := 0; i < streams; i++ {
|
||||
tenantID := TenantID{
|
||||
AccountID: rng.Uint32(),
|
||||
ProjectID: rng.Uint32(),
|
||||
}
|
||||
for j := 0; j < rowsPerStream; j++ {
|
||||
// Add stream tags
|
||||
fields = append(fields[:0], Field{
|
||||
Name: "some-stream-tag",
|
||||
Value: fmt.Sprintf("some-stream-value-%d", i),
|
||||
})
|
||||
// Add the remaining tags
|
||||
for k := 0; k < 5; k++ {
|
||||
if rng.Float64() < 0.5 {
|
||||
fields = append(fields, Field{
|
||||
Name: fmt.Sprintf("field_%d", k),
|
||||
Value: fmt.Sprintf("value_%d_%d_%d", i, j, k),
|
||||
})
|
||||
}
|
||||
}
|
||||
// add a message field
|
||||
fields = append(fields, Field{
|
||||
Name: "",
|
||||
Value: fmt.Sprintf("some row number %d at stream %d", j, i),
|
||||
})
|
||||
// add a field with constant value
|
||||
fields = append(fields, Field{
|
||||
Name: "job",
|
||||
Value: "foobar",
|
||||
})
|
||||
// add a field with uint value
|
||||
fields = append(fields, Field{
|
||||
Name: "response_size_bytes",
|
||||
Value: fmt.Sprintf("%d", rng.Intn(1234)),
|
||||
})
|
||||
// shuffle fields in order to check de-shuffling algorithm
|
||||
rng.Shuffle(len(fields), func(i, j int) {
|
||||
fields[i], fields[j] = fields[j], fields[i]
|
||||
})
|
||||
timestamp := rng.Int63()
|
||||
lr.MustAdd(tenantID, timestamp, fields)
|
||||
}
|
||||
}
|
||||
return lr
|
||||
}
|
||||
|
||||
func checkEqualRows(lrResult, lrOrig *LogRows) error {
|
||||
if len(lrResult.timestamps) != len(lrOrig.timestamps) {
|
||||
return fmt.Errorf("unexpected length LogRows; got %d; want %d", len(lrResult.timestamps), len(lrOrig.timestamps))
|
||||
}
|
||||
|
||||
sort.Sort(lrResult)
|
||||
sort.Sort(lrOrig)
|
||||
|
||||
sortFieldNames := func(fields []Field) {
|
||||
sort.Slice(fields, func(i, j int) bool {
|
||||
return fields[i].Name < fields[j].Name
|
||||
})
|
||||
}
|
||||
for i := range lrOrig.timestamps {
|
||||
if !lrOrig.streamIDs[i].equal(&lrResult.streamIDs[i]) {
|
||||
return fmt.Errorf("unexpected streamID for log entry %d\ngot\n%s\nwant\n%s", i, &lrResult.streamIDs[i], &lrOrig.streamIDs[i])
|
||||
}
|
||||
if lrOrig.timestamps[i] != lrResult.timestamps[i] {
|
||||
return fmt.Errorf("unexpected timestamp for log entry %d\ngot\n%d\nwant\n%d", i, lrResult.timestamps[i], lrOrig.timestamps[i])
|
||||
}
|
||||
fieldsOrig := lrOrig.rows[i]
|
||||
fieldsResult := lrResult.rows[i]
|
||||
if len(fieldsOrig) != len(fieldsResult) {
|
||||
return fmt.Errorf("unexpected number of fields at log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
|
||||
}
|
||||
sortFieldNames(fieldsOrig)
|
||||
sortFieldNames(fieldsResult)
|
||||
if !reflect.DeepEqual(fieldsOrig, fieldsResult) {
|
||||
return fmt.Errorf("unexpected fields for log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// readLogRows reads log entries from mp.
|
||||
//
|
||||
// This function is for testing and debugging purposes only.
|
||||
func (mp *inmemoryPart) readLogRows(sbu *stringsBlockUnmarshaler, vd *valuesDecoder) *LogRows {
|
||||
lr := GetLogRows(nil, nil)
|
||||
bsr := getBlockStreamReader()
|
||||
defer putBlockStreamReader(bsr)
|
||||
bsr.MustInitFromInmemoryPart(mp)
|
||||
var tmp rows
|
||||
for bsr.NextBlock() {
|
||||
bd := &bsr.blockData
|
||||
streamID := bd.streamID
|
||||
if err := bd.unmarshalRows(&tmp, sbu, vd); err != nil {
|
||||
logger.Panicf("BUG: cannot unmarshal log entries from inmemoryPart: %s", err)
|
||||
}
|
||||
for i, timestamp := range tmp.timestamps {
|
||||
lr.MustAdd(streamID.tenantID, timestamp, tmp.rows[i])
|
||||
lr.streamIDs[len(lr.streamIDs)-1] = streamID
|
||||
}
|
||||
tmp.reset()
|
||||
}
|
||||
return lr
|
||||
}
34
lib/logstorage/inmemory_part_timing_test.go
Normal file
@@ -0,0 +1,34 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkInmemoryPart_MustInitFromRows(b *testing.B) {
|
||||
for _, streams := range []int{1, 10, 100} {
|
||||
b.Run(fmt.Sprintf("streams_%d", streams), func(b *testing.B) {
|
||||
for _, rowsPerStream := range []int{1, 10, 100, 1000} {
|
||||
b.Run(fmt.Sprintf("rowsPerStream_%d", rowsPerStream), func(b *testing.B) {
|
||||
benchmarkInmemoryPartMustInitFromRows(b, streams, rowsPerStream)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkInmemoryPartMustInitFromRows(b *testing.B, streams, rowsPerStream int) {
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(streams * rowsPerStream))
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
lr := newTestLogRows(streams, rowsPerStream, 0)
|
||||
mp := getInmemoryPart()
|
||||
for pb.Next() {
|
||||
mp.mustInitFromRows(lr)
|
||||
if mp.ph.RowsCount != uint64(len(lr.timestamps)) {
|
||||
panic(fmt.Errorf("unexpecte number of entries in the output stream; got %d; want %d", mp.ph.RowsCount, len(lr.timestamps)))
|
||||
}
|
||||
}
|
||||
putInmemoryPart(mp)
|
||||
})
|
||||
}
277
lib/logstorage/log_rows.go
Normal file
@@ -0,0 +1,277 @@
package logstorage
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
)
|
||||
|
||||
// LogRows holds a set of rows needed for Storage.MustAddRows
|
||||
//
|
||||
// LogRows must be obtained via GetLogRows()
|
||||
type LogRows struct {
|
||||
// buf holds all the bytes referred by items in LogRows
|
||||
buf []byte
|
||||
|
||||
// fieldsBuf holds all the fields referred by items in LogRows
|
||||
fieldsBuf []Field
|
||||
|
||||
// streamIDs holds streamIDs for rows added to LogRows
|
||||
streamIDs []streamID
|
||||
|
||||
// streamTagsCanonicals holds streamTagsCanonical entries for rows added to LogRows
|
||||
streamTagsCanonicals [][]byte
|
||||
|
||||
// timestamps holds timestamps for rows added to LogRows
|
||||
timestamps []int64
|
||||
|
||||
// rows holds fields for rows added to LogRows.
|
||||
rows [][]Field
|
||||
|
||||
// sf is a helper for sorting fields in every added row
|
||||
sf sortedFields
|
||||
|
||||
// streamFields contains names for stream fields
|
||||
streamFields map[string]struct{}
|
||||
|
||||
// ignoreFields contains names for log fields, which must be skipped during data ingestion
|
||||
ignoreFields map[string]struct{}
|
||||
}
|
||||
|
||||
type sortedFields []Field
|
||||
|
||||
func (sf *sortedFields) Len() int {
|
||||
return len(*sf)
|
||||
}
|
||||
|
||||
func (sf *sortedFields) Less(i, j int) bool {
|
||||
a := *sf
|
||||
return a[i].Name < a[j].Name
|
||||
}
|
||||
|
||||
func (sf *sortedFields) Swap(i, j int) {
|
||||
a := *sf
|
||||
a[i], a[j] = a[j], a[i]
|
||||
}
|
||||
|
||||
// RowFormatter implements fmt.Stringer for []Field aka a single log row
|
||||
type RowFormatter []Field
|
||||
|
||||
// String returns user-readable representation for rf
|
||||
func (rf *RowFormatter) String() string {
|
||||
b := append([]byte{}, '{')
|
||||
|
||||
fields := *rf
|
||||
if len(fields) > 0 {
|
||||
b = append(b, fields[0].String()...)
|
||||
fields = fields[1:]
|
||||
for _, field := range fields {
|
||||
b = append(b, ',')
|
||||
b = append(b, field.String()...)
|
||||
}
|
||||
}
|
||||
|
||||
b = append(b, '}')
|
||||
return string(b)
|
||||
}
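
// The sketch below is an editorial illustration and is not part of the original
// commit: it shows how RowFormatter may be used to render a single row for logging.
// The exact output depends on Field.String(), which is defined elsewhere in this
// package, so the rendered form is not spelled out here; the field values and the
// function name are hypothetical, and "fmt" is assumed to be imported.
func rowFormatterUsageSketch() {
	fields := []Field{
		{Name: "job", Value: "foobar"},
		{Name: "", Value: "some log message"},
	}
	rf := RowFormatter(fields)
	// String() is defined on *RowFormatter, so pass a pointer when relying on fmt.Stringer.
	fmt.Printf("row: %s\n", &rf)
}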
|
||||
|
||||
// Reset resets lr
|
||||
func (lr *LogRows) Reset() {
|
||||
lr.buf = lr.buf[:0]
|
||||
|
||||
fb := lr.fieldsBuf
|
||||
for i := range fb {
|
||||
fb[i].Reset()
|
||||
}
|
||||
lr.fieldsBuf = fb[:0]
|
||||
|
||||
sids := lr.streamIDs
|
||||
for i := range sids {
|
||||
sids[i].reset()
|
||||
}
|
||||
lr.streamIDs = sids[:0]
|
||||
|
||||
sns := lr.streamTagsCanonicals
|
||||
for i := range sns {
|
||||
sns[i] = nil
|
||||
}
|
||||
lr.streamTagsCanonicals = sns[:0]
|
||||
|
||||
lr.timestamps = lr.timestamps[:0]
|
||||
|
||||
rows := lr.rows
|
||||
for i := range rows {
|
||||
rows[i] = nil
|
||||
}
|
||||
lr.rows = rows[:0]
|
||||
|
||||
lr.sf = nil
|
||||
|
||||
sfs := lr.streamFields
|
||||
for k := range sfs {
|
||||
delete(sfs, k)
|
||||
}
|
||||
|
||||
ifs := lr.ignoreFields
|
||||
for k := range ifs {
|
||||
delete(ifs, k)
|
||||
}
|
||||
}
|
||||
|
||||
// NeedFlush returns true if lr contains too much data, so it must be flushed to the storage.
|
||||
func (lr *LogRows) NeedFlush() bool {
|
||||
return len(lr.buf) > (maxUncompressedBlockSize/8)*7
|
||||
}
|
||||
|
||||
// MustAdd adds a log entry with the given args to lr.
|
||||
//
|
||||
// It is OK to modify the args after returning from the function,
|
||||
// since lr copies all the args to internal data.
|
||||
func (lr *LogRows) MustAdd(tenantID TenantID, timestamp int64, fields []Field) {
|
||||
// Compose StreamTags from fields according to lr.streamFields
|
||||
sfs := lr.streamFields
|
||||
st := GetStreamTags()
|
||||
for i := range fields {
|
||||
f := &fields[i]
|
||||
if _, ok := sfs[f.Name]; ok {
|
||||
st.Add(f.Name, f.Value)
|
||||
}
|
||||
}
|
||||
|
||||
// Marshal StreamTags
|
||||
bb := bbPool.Get()
|
||||
bb.B = st.MarshalCanonical(bb.B)
|
||||
PutStreamTags(st)
|
||||
|
||||
// Calculate the id for the StreamTags
|
||||
var sid streamID
|
||||
sid.tenantID = tenantID
|
||||
sid.id = hash128(bb.B)
|
||||
|
||||
// Store the row
|
||||
lr.mustAddInternal(sid, timestamp, fields, bb.B)
|
||||
bbPool.Put(bb)
|
||||
}
|
||||
|
||||
func (lr *LogRows) mustAddInternal(sid streamID, timestamp int64, fields []Field, streamTagsCanonical []byte) {
|
||||
buf := lr.buf
|
||||
bufLen := len(buf)
|
||||
buf = append(buf, streamTagsCanonical...)
|
||||
|
||||
lr.streamTagsCanonicals = append(lr.streamTagsCanonicals, buf[bufLen:])
|
||||
lr.streamIDs = append(lr.streamIDs, sid)
|
||||
lr.timestamps = append(lr.timestamps, timestamp)
|
||||
|
||||
// Store all the fields
|
||||
ifs := lr.ignoreFields
|
||||
fb := lr.fieldsBuf
|
||||
fieldsLen := len(fb)
|
||||
for i := range fields {
|
||||
f := &fields[i]
|
||||
|
||||
if _, ok := ifs[f.Name]; ok {
|
||||
// Skip fields from the ifs map
|
||||
continue
|
||||
}
|
||||
if f.Value == "" {
|
||||
// Skip fields without values
|
||||
continue
|
||||
}
|
||||
|
||||
fb = append(fb, Field{})
|
||||
dstField := &fb[len(fb)-1]
|
||||
|
||||
bufLen = len(buf)
|
||||
if f.Name != "_msg" {
|
||||
buf = append(buf, f.Name...)
|
||||
}
|
||||
dstField.Name = bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
|
||||
bufLen = len(buf)
|
||||
buf = append(buf, f.Value...)
|
||||
dstField.Value = bytesutil.ToUnsafeString(buf[bufLen:])
|
||||
}
|
||||
lr.sf = fb[fieldsLen:]
|
||||
sort.Sort(&lr.sf)
|
||||
lr.rows = append(lr.rows, lr.sf)
|
||||
|
||||
lr.fieldsBuf = fb
|
||||
lr.buf = buf
|
||||
}
|
||||
|
||||
// GetLogRows returns LogRows from the pool for the given streamFields.
|
||||
//
|
||||
// streamFields is a set of field names, which must be associated with the stream.
|
||||
//
|
||||
// Return it back to the pool with PutLogRows() when it is no longer needed.
|
||||
func GetLogRows(streamFields, ignoreFields []string) *LogRows {
|
||||
v := logRowsPool.Get()
|
||||
if v == nil {
|
||||
v = &LogRows{}
|
||||
}
|
||||
lr := v.(*LogRows)
|
||||
|
||||
// Initialize streamFields
|
||||
sfs := lr.streamFields
|
||||
if sfs == nil {
|
||||
sfs = make(map[string]struct{}, len(streamFields))
|
||||
lr.streamFields = sfs
|
||||
}
|
||||
for _, f := range streamFields {
|
||||
sfs[f] = struct{}{}
|
||||
}
|
||||
|
||||
// Initialize ignoreFields
|
||||
ifs := lr.ignoreFields
|
||||
if ifs == nil {
|
||||
ifs = make(map[string]struct{}, len(ignoreFields))
|
||||
lr.ignoreFields = ifs
|
||||
}
|
||||
for _, f := range ignoreFields {
|
||||
if f != "" {
|
||||
ifs[f] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
return lr
|
||||
}
|
||||
|
||||
// PutLogRows returns lr to the pool.
|
||||
func PutLogRows(lr *LogRows) {
|
||||
lr.Reset()
|
||||
logRowsPool.Put(lr)
|
||||
}
|
||||
|
||||
var logRowsPool sync.Pool
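
// The sketch below is an editorial illustration and is not part of the original
// commit: it shows the intended GetLogRows / MustAdd / PutLogRows lifecycle,
// mirroring how the benchmark in log_rows_timing_test.go drives this API.
// The stream field names, tenant, timestamp and field values are hypothetical,
// and the actual flush to the storage is elided.
func logRowsUsageSketch() {
	// Treat "host" as a stream field; ingest everything else as regular log fields.
	lr := GetLogRows([]string{"host"}, nil)

	tid := TenantID{AccountID: 1, ProjectID: 2}
	timestamp := int64(1_686_000_000_000_000_000) // nanoseconds; hypothetical value
	fields := []Field{
		{Name: "host", Value: "host-123"},
		{Name: "level", Value: "info"},
		{Name: "_msg", Value: "GET /index.html 200"},
	}
	lr.MustAdd(tid, timestamp, fields)

	if lr.NeedFlush() {
		// Pass lr to the storage here (e.g. Storage.MustAddRows) and then reset or re-obtain it.
	}
	PutLogRows(lr)
}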
|
||||
|
||||
// Len returns the number of items in lr.
|
||||
func (lr *LogRows) Len() int {
|
||||
return len(lr.streamIDs)
|
||||
}
|
||||
|
||||
// Less returns true if (streamID, timestamp) for row i is smaller than the (streamID, timestamp) for row j
|
||||
func (lr *LogRows) Less(i, j int) bool {
|
||||
a := &lr.streamIDs[i]
|
||||
b := &lr.streamIDs[j]
|
||||
if !a.equal(b) {
|
||||
return a.less(b)
|
||||
}
|
||||
return lr.timestamps[i] < lr.timestamps[j]
|
||||
}
|
||||
|
||||
// Swap swaps rows i and j in lr.
|
||||
func (lr *LogRows) Swap(i, j int) {
|
||||
a := &lr.streamIDs[i]
|
||||
b := &lr.streamIDs[j]
|
||||
*a, *b = *b, *a
|
||||
|
||||
tsA, tsB := &lr.timestamps[i], &lr.timestamps[j]
|
||||
*tsA, *tsB = *tsB, *tsA
|
||||
|
||||
snA, snB := &lr.streamTagsCanonicals[i], &lr.streamTagsCanonicals[j]
|
||||
*snA, *snB = *snB, *snA
|
||||
|
||||
fieldsA, fieldsB := &lr.rows[i], &lr.rows[j]
|
||||
*fieldsA, *fieldsB = *fieldsB, *fieldsA
|
||||
}
83
lib/logstorage/log_rows_timing_test.go
Normal file
@@ -0,0 +1,83 @@
package logstorage
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func BenchmarkLogRowsMustAdd(b *testing.B) {
|
||||
rows := newBenchRows(map[string]string{
|
||||
"input.type": "filestream",
|
||||
"ecs.version": "8.0.0",
|
||||
"host.hostname": "foobar-baz-abc",
|
||||
"host.architecture": "x86_64",
|
||||
"host.name": "foobar-baz-abc",
|
||||
"host.os.codename": "bionic",
|
||||
"host.os.type": "linux",
|
||||
"host.os.platform": "ubuntu",
|
||||
"host.os.version": "18.04.6 LTS (Bionic Beaver)",
|
||||
"host.os.family": "debian",
|
||||
"host.os.name": "Ubuntu",
|
||||
"host.os.kernel": "4.15.0-211-generic",
|
||||
"host.id": "a634d50249af449dbcb3ce724822568a",
|
||||
"host.containerized": "false",
|
||||
"host.ip": `["10.0.0.42","10.224.112.1","172.20.0.1","172.18.0.1","172.19.0.1","fc00:f853:ccd:e793::1","fe80::1","172.21.0.1","172.17.0.1"]`,
|
||||
"host.mac": `["02-42-42-90-52-D9","02-42-C6-48-A6-84","02-42-FD-91-7E-17","52-54-00-F5-13-E7","54-E1-AD-89-1A-4C","F8-34-41-3C-C0-85"]`,
|
||||
"agent.ephemeral_id": "6c251f67-7210-4cef-8f72-a9546cbb48cc",
|
||||
"agent.id": "e97243c5-5ef3-4dc1-8828-504f68731e87",
|
||||
"agent.name": "foobar-baz-abc",
|
||||
"agent.type": "filebeat",
|
||||
"agent.version": "8.8.0",
|
||||
"log.file.path": "/var/log/auth.log",
|
||||
"log.offset": "37908",
|
||||
}, []string{
|
||||
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=0)",
|
||||
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=1)",
|
||||
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=2)",
|
||||
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=3)",
|
||||
"Jun 4 20:34:07 foobar-baz-abc sudo: pam_unix(sudo:session): session opened for user root by (uid=4)",
|
||||
})
|
||||
streamFields := []string{
|
||||
"host.hostname",
|
||||
"agent.name",
|
||||
"log.file.path",
|
||||
}
|
||||
|
||||
b.ReportAllocs()
|
||||
b.SetBytes(int64(len(rows)))
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
benchmarkLogRowsMustAdd(rows, streamFields)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func benchmarkLogRowsMustAdd(rows [][]Field, streamFields []string) {
|
||||
lr := GetLogRows(streamFields, nil)
|
||||
var tid TenantID
|
||||
for i, fields := range rows {
|
||||
tid.AccountID = uint32(i)
|
||||
tid.ProjectID = uint32(2 * i)
|
||||
timestamp := int64(i) * 1000
|
||||
lr.MustAdd(tid, timestamp, fields)
|
||||
}
|
||||
PutLogRows(lr)
|
||||
}
|
||||
|
||||
func newBenchRows(constFields map[string]string, messages []string) [][]Field {
|
||||
rows := make([][]Field, 0, len(messages))
|
||||
for _, msg := range messages {
|
||||
row := make([]Field, 0, len(constFields)+1)
|
||||
for k, v := range constFields {
|
||||
row = append(row, Field{
|
||||
Name: k,
|
||||
Value: v,
|
||||
})
|
||||
}
|
||||
row = append(row, Field{
|
||||
Name: "_msg",
|
||||
Value: msg,
|
||||
})
|
||||
rows = append(rows, row)
|
||||
}
|
||||
return rows
|
||||
}
1100
lib/logstorage/parser.go
Normal file
File diff suppressed because it is too large
966
lib/logstorage/parser_test.go
Normal file
@@ -0,0 +1,966 @@
package logstorage
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLexer(t *testing.T) {
|
||||
f := func(s string, tokensExpected []string) {
|
||||
t.Helper()
|
||||
lex := newLexer(s)
|
||||
for _, tokenExpected := range tokensExpected {
|
||||
lex.nextToken()
|
||||
if lex.token != tokenExpected {
|
||||
t.Fatalf("unexpected token; got %q; want %q", lex.token, tokenExpected)
|
||||
}
|
||||
}
|
||||
lex.nextToken()
|
||||
if lex.token != "" {
|
||||
t.Fatalf("unexpected tail token: %q", lex.token)
|
||||
}
|
||||
}
|
||||
|
||||
f("", nil)
|
||||
f(" ", nil)
|
||||
f("foo", []string{"foo"})
|
||||
f("тест123", []string{"тест123"})
|
||||
f("foo:bar", []string{"foo", ":", "bar"})
|
||||
f(` re ( "тест(\":" ) `, []string{"re", "(", `тест(":`, ")"})
|
||||
f(" `foo, bar`* AND baz:(abc or 'd\\'\"ЙЦУК `'*)", []string{"foo, bar", "*", "AND", "baz", ":", "(", "abc", "or", `d'"ЙЦУК ` + "`", "*", ")"})
|
||||
f(`_stream:{foo="bar",a=~"baz", b != 'cd',"d,}a"!~abc}`,
|
||||
[]string{"_stream", ":", "{", "foo", "=", "bar", ",", "a", "=~", "baz", ",", "b", "!=", "cd", ",", "d,}a", "!~", "abc", "}"})
|
||||
}
|
||||
|
||||
func TestNewStreamFilterSuccess(t *testing.T) {
|
||||
f := func(s, resultExpected string) {
|
||||
t.Helper()
|
||||
sf, err := newStreamFilter(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
result := sf.String()
|
||||
if result != resultExpected {
|
||||
t.Fatalf("unexpected StreamFilter; got %s; want %s", result, resultExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f("{}", "{}")
|
||||
f(`{foo="bar"}`, `{foo="bar"}`)
|
||||
f(`{ "foo" =~ "bar.+" , baz!="a" or x="y"}`, `{foo=~"bar.+",baz!="a" or x="y"}`)
|
||||
f(`{"a b"='c}"d' OR de="aaa"}`, `{"a b"="c}\"d" or de="aaa"}`)
|
||||
f(`{a="b", c="d" or x="y"}`, `{a="b",c="d" or x="y"}`)
|
||||
}
|
||||
|
||||
func TestNewStreamFilterFailure(t *testing.T) {
|
||||
f := func(s string) {
|
||||
t.Helper()
|
||||
sf, err := newStreamFilter(s)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if sf != nil {
|
||||
t.Fatalf("expecting nil sf; got %v", sf)
|
||||
}
|
||||
}
|
||||
|
||||
f("")
|
||||
f("}")
|
||||
f("{")
|
||||
f("{foo")
|
||||
f("{foo}")
|
||||
f("{'foo")
|
||||
f("{foo=")
|
||||
f("{foo or bar}")
|
||||
f("{foo=bar")
|
||||
f("{foo=bar baz}")
|
||||
f("{foo='bar' baz='x'}")
|
||||
}
|
||||
|
||||
func TestParseTimeRange(t *testing.T) {
|
||||
f := func(s string, minTimestampExpected, maxTimestampExpected int64) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery("_time:" + s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
tf, ok := q.f.(*timeFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter; got %T; want *timeFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if tf.stringRepr != s {
|
||||
t.Fatalf("unexpected string represenation for timeFilter; got %q; want %q", tf.stringRepr, s)
|
||||
}
|
||||
if tf.minTimestamp != minTimestampExpected {
|
||||
t.Fatalf("unexpected minTimestamp; got %s; want %s", timestampToString(tf.minTimestamp), timestampToString(minTimestampExpected))
|
||||
}
|
||||
if tf.maxTimestamp != maxTimestampExpected {
|
||||
t.Fatalf("unexpected maxTimestamp; got %s; want %s", timestampToString(tf.maxTimestamp), timestampToString(maxTimestampExpected))
|
||||
}
|
||||
}
|
||||
|
||||
var minTimestamp, maxTimestamp int64
|
||||
|
||||
// _time:YYYY -> _time:[YYYY, YYYY+1)
|
||||
minTimestamp = time.Date(2023, time.January, 1, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2024, time.January, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023", minTimestamp, maxTimestamp)
|
||||
f("2023Z", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-hh:mm -> _time:[YYYY-hh:mm, (YYYY+1)-hh:mm)
|
||||
minTimestamp = time.Date(2023, time.January, 1, 2, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2024, time.January, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY+hh:mm -> _time:[YYYY+hh:mm, (YYYY+1)+hh:mm)
|
||||
minTimestamp = time.Date(2022, time.December, 31, 22, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.December, 31, 22, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023+02:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM -> _time:[YYYY-MM, YYYY-MM+1)
|
||||
minTimestamp = time.Date(2023, time.February, 1, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02", minTimestamp, maxTimestamp)
|
||||
f("2023-02Z", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-hh:mm -> _time:[YYYY-MM-hh:mm, (YYYY-MM+1)-hh:mm)
|
||||
minTimestamp = time.Date(2023, time.February, 1, 2, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-02:00", minTimestamp, maxTimestamp)
|
||||
// March
|
||||
minTimestamp = time.Date(2023, time.March, 1, 2, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.April, 1, 2, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03-02:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM+hh:mm -> _time:[YYYY-MM+hh:mm, (YYYY-MM+1)+hh:mm)
|
||||
minTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 31, 21, 35, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03+02:25", minTimestamp, maxTimestamp)
|
||||
// February with timezone offset
|
||||
minTimestamp = time.Date(2023, time.January, 31, 21, 35, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02+02:25", minTimestamp, maxTimestamp)
|
||||
// February with timezone offset at leap year
|
||||
minTimestamp = time.Date(2024, time.January, 31, 21, 35, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2024, time.February, 29, 21, 35, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2024-02+02:25", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DD
|
||||
minTimestamp = time.Date(2023, time.February, 12, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.February, 13, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-12", minTimestamp, maxTimestamp)
|
||||
f("2023-02-12Z", minTimestamp, maxTimestamp)
|
||||
// February 28
|
||||
minTimestamp = time.Date(2023, time.February, 28, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28", minTimestamp, maxTimestamp)
|
||||
// January 31
|
||||
minTimestamp = time.Date(2023, time.January, 31, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.February, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-01-31", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DD-hh:mm
|
||||
minTimestamp = time.Date(2023, time.January, 31, 2, 25, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.February, 1, 2, 25, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-01-31-02:25", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DD+hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 21, 35, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 21, 35, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03-01+02:25", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28T23", minTimestamp, maxTimestamp)
|
||||
f("2023-02-28T23Z", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH-hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 01, 25, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.February, 28, 02, 25, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-27T23-02:25", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH+hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 35, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 00, 35, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03-01T02+02:25", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28T23:59", minTimestamp, maxTimestamp)
|
||||
f("2023-02-28T23:59Z", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM-hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28T22:59-01:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM+hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03-01T00:59+01:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM:SS-hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28T23:59:59", minTimestamp, maxTimestamp)
|
||||
f("2023-02-28T23:59:59Z", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM:SS-hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-02-28T22:59:59-01:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:YYYY-MM-DDTHH:MM:SS+hh:mm
|
||||
minTimestamp = time.Date(2023, time.February, 28, 23, 59, 59, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f("2023-03-01T00:59:59+01:00", minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:(start, end)
|
||||
minTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano() + 1
|
||||
maxTimestamp = time.Date(2023, time.April, 6, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f(`(2023-03-01,2023-04-06)`, minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:[start, end)
|
||||
minTimestamp = time.Date(2023, time.March, 1, 0, 0, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.April, 6, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f(`[2023-03-01,2023-04-06)`, minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:(start, end]
|
||||
minTimestamp = time.Date(2023, time.March, 1, 21, 20, 0, 0, time.UTC).UnixNano() + 1
|
||||
maxTimestamp = time.Date(2023, time.April, 7, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f(`(2023-03-01T21:20,2023-04-06]`, minTimestamp, maxTimestamp)
|
||||
|
||||
// _time:[start, end]
|
||||
minTimestamp = time.Date(2023, time.February, 28, 21, 40, 0, 0, time.UTC).UnixNano()
|
||||
maxTimestamp = time.Date(2023, time.April, 7, 0, 0, 0, 0, time.UTC).UnixNano() - 1
|
||||
f(`[2023-03-01+02:20,2023-04-06T23]`, minTimestamp, maxTimestamp)
|
||||
}
|
||||
|
||||
func TestParseSequenceFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected string, phrasesExpected []string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
sf, ok := q.f.(*sequenceFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *sequenceFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if sf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", sf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if !reflect.DeepEqual(sf.phrases, phrasesExpected) {
|
||||
t.Fatalf("unexpected phrases\ngot\n%q\nwant\n%q", sf.phrases, phrasesExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`seq()`, ``, nil)
|
||||
f(`foo:seq(foo)`, `foo`, []string{"foo"})
|
||||
f(`_msg:seq("foo bar,baz")`, `_msg`, []string{"foo bar,baz"})
|
||||
f(`seq(foo,bar-baz.aa"bb","c,)d")`, ``, []string{"foo", `bar-baz.aa"bb"`, "c,)d"})
|
||||
}
|
||||
|
||||
func TestParseInFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected string, valuesExpected []string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
f, ok := q.f.(*inFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *inFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if f.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", f.fieldName, fieldNameExpected)
|
||||
}
|
||||
if !reflect.DeepEqual(f.values, valuesExpected) {
|
||||
t.Fatalf("unexpected values\ngot\n%q\nwant\n%q", f.values, valuesExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`in()`, ``, nil)
|
||||
f(`foo:in(foo)`, `foo`, []string{"foo"})
|
||||
f(`:in("foo bar,baz")`, ``, []string{"foo bar,baz"})
|
||||
f(`ip:in(1.2.3.4, 5.6.7.8, 9.10.11.12)`, `ip`, []string{"1.2.3.4", "5.6.7.8", "9.10.11.12"})
|
||||
f(`foo-bar:in(foo,bar-baz.aa"bb","c,)d")`, `foo-bar`, []string{"foo", `bar-baz.aa"bb"`, "c,)d"})
|
||||
}
|
||||
|
||||
func TestParseIPv4RangeFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected string, minValueExpected, maxValueExpected uint32) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
rf, ok := q.f.(*ipv4RangeFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *ipv4RangeFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if rf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if rf.minValue != minValueExpected {
|
||||
t.Fatalf("unexpected minValue; got %08x; want %08x", rf.minValue, minValueExpected)
|
||||
}
|
||||
if rf.maxValue != maxValueExpected {
|
||||
t.Fatalf("unexpected maxValue; got %08x; want %08x", rf.maxValue, maxValueExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7.8)`, ``, 0x01020304, 0x05060708)
|
||||
f(`_msg:ipv4_range("0.0.0.0", 255.255.255.255)`, `_msg`, 0, 0xffffffff)
|
||||
f(`ip:ipv4_range(1.2.3.0/24)`, `ip`, 0x01020300, 0x010203ff)
|
||||
f(`:ipv4_range("1.2.3.34/24")`, ``, 0x01020300, 0x010203ff)
|
||||
f(`ipv4_range("1.2.3.34/20")`, ``, 0x01020000, 0x01020fff)
|
||||
f(`ipv4_range("1.2.3.15/32")`, ``, 0x0102030f, 0x0102030f)
|
||||
f(`ipv4_range(1.2.3.34/0)`, ``, 0, 0xffffffff)
|
||||
}
|
||||
|
||||
func TestParseStringRangeFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected, minValueExpected, maxValueExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
rf, ok := q.f.(*stringRangeFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *stringRangeFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if rf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if rf.minValue != minValueExpected {
|
||||
t.Fatalf("unexpected minValue; got %q; want %q", rf.minValue, minValueExpected)
|
||||
}
|
||||
if rf.maxValue != maxValueExpected {
|
||||
t.Fatalf("unexpected maxValue; got %q; want %q", rf.maxValue, maxValueExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f("string_range(foo, bar)", ``, "foo", "bar")
|
||||
f(`abc:string_range("foo,bar", "baz) !")`, `abc`, `foo,bar`, `baz) !`)
|
||||
}
|
||||
|
||||
func TestParseRegexpFilter(t *testing.T) {
|
||||
f := func(s, reExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery("re(" + s + ")")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
rf, ok := q.f.(*regexpFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *regexpFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if reString := rf.re.String(); reString != reExpected {
|
||||
t.Fatalf("unexpected regexp; got %q; want %q", reString, reExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`""`, ``)
|
||||
f(`foo`, `foo`)
|
||||
f(`"foo.+|bar.*"`, `foo.+|bar.*`)
|
||||
f(`"foo(bar|baz),x[y]"`, `foo(bar|baz),x[y]`)
|
||||
}
|
||||
|
||||
func TestParseAnyCasePhraseFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected, phraseExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
pf, ok := q.f.(*anyCasePhraseFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *anyCasePhraseFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if pf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if pf.phrase != phraseExpected {
|
||||
t.Fatalf("unexpected phrase; got %q; want %q", pf.phrase, phraseExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`i("")`, ``, ``)
|
||||
f(`i(foo)`, ``, `foo`)
|
||||
f(`abc-de.fg:i(foo-bar+baz)`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":i("foo-bar+baz")`, `abc-de.fg`, `foo-bar+baz`)
|
||||
}
|
||||
|
||||
func TestParseAnyCasePrefixFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected, prefixExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
pf, ok := q.f.(*anyCasePrefixFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *anyCasePrefixFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if pf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if pf.prefix != prefixExpected {
|
||||
t.Fatalf("unexpected prefix; got %q; want %q", pf.prefix, prefixExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`i(*)`, ``, ``)
|
||||
f(`i(""*)`, ``, ``)
|
||||
f(`i(foo*)`, ``, `foo`)
|
||||
f(`abc-de.fg:i(foo-bar+baz*)`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":i("foo-bar+baz"*)`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":i("foo-bar*baz *"*)`, `abc-de.fg`, `foo-bar*baz *`)
|
||||
}
|
||||
|
||||
func TestParsePhraseFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected, phraseExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
pf, ok := q.f.(*phraseFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *phraseFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if pf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if pf.phrase != phraseExpected {
|
||||
t.Fatalf("unexpected prefix; got %q; want %q", pf.phrase, phraseExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`""`, ``, ``)
|
||||
f(`foo`, ``, `foo`)
|
||||
f(`abc-de.fg:foo-bar+baz`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":"foo-bar+baz"`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":"foo-bar*baz *"`, `abc-de.fg`, `foo-bar*baz *`)
|
||||
f(`"foo:bar*,( baz"`, ``, `foo:bar*,( baz`)
|
||||
}
|
||||
|
||||
func TestParsePrefixFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected, prefixExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
pf, ok := q.f.(*prefixFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *prefixFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if pf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", pf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if pf.prefix != prefixExpected {
|
||||
t.Fatalf("unexpected prefix; got %q; want %q", pf.prefix, prefixExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`*`, ``, ``)
|
||||
f(`""*`, ``, ``)
|
||||
f(`foo*`, ``, `foo`)
|
||||
f(`abc-de.fg:foo-bar+baz*`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":"foo-bar+baz"*`, `abc-de.fg`, `foo-bar+baz`)
|
||||
f(`"abc-de.fg":"foo-bar*baz *"*`, `abc-de.fg`, `foo-bar*baz *`)
|
||||
}
|
||||
|
||||
func TestParseRangeFilter(t *testing.T) {
|
||||
f := func(s, fieldNameExpected string, minValueExpected, maxValueExpected float64) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
rf, ok := q.f.(*rangeFilter)
|
||||
if !ok {
|
||||
t.Fatalf("unexpected filter type; got %T; want *ipv4RangeFilter; filter: %s", q.f, q.f)
|
||||
}
|
||||
if rf.fieldName != fieldNameExpected {
|
||||
t.Fatalf("unexpected fieldName; got %q; want %q", rf.fieldName, fieldNameExpected)
|
||||
}
|
||||
if rf.minValue != minValueExpected {
|
||||
t.Fatalf("unexpected minValue; got %v; want %v", rf.minValue, minValueExpected)
|
||||
}
|
||||
if rf.maxValue != maxValueExpected {
|
||||
t.Fatalf("unexpected maxValue; got %v; want %v", rf.maxValue, maxValueExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(`range[-1.234, +2e5]`, ``, -1.234, 2e5)
|
||||
f(`foo:range[-1.234e-5, 2e5]`, `foo`, -1.234e-5, 2e5)
|
||||
f(`range:range["-1.234e5", "-2e-5"]`, `range`, -1.234e5, -2e-5)
|
||||
|
||||
f(`_msg:range[1, 2]`, `_msg`, 1, 2)
|
||||
f(`:range(1, 2)`, ``, math.Nextafter(1, math.Inf(1)), math.Nextafter(2, math.Inf(-1)))
|
||||
f(`range[1, 2)`, ``, 1, math.Nextafter(2, math.Inf(-1)))
|
||||
f(`range("1", 2]`, ``, math.Nextafter(1, math.Inf(1)), 2)
|
||||
}
|
||||
|
||||
func TestParseQuerySuccess(t *testing.T) {
|
||||
f := func(s, resultExpected string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
result := q.String()
|
||||
if result != resultExpected {
|
||||
t.Fatalf("unexpected result;\ngot\n%s\nwant\n%s", result, resultExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f("foo", "foo")
|
||||
f(":foo", "foo")
|
||||
f(`"":foo`, "foo")
|
||||
f(`"" bar`, `"" bar`)
|
||||
f(`!''`, `!""`)
|
||||
f(`foo:""`, `foo:""`)
|
||||
f(`!foo:""`, `!foo:""`)
|
||||
f(`not foo:""`, `!foo:""`)
|
||||
f(`not(foo)`, `!foo`)
|
||||
f(`not (foo)`, `!foo`)
|
||||
f(`not ( foo or bar )`, `!(foo or bar)`)
|
||||
f(`foo:!""`, `!foo:""`)
|
||||
f("_msg:foo", "foo")
|
||||
f("'foo:bar'", `"foo:bar"`)
|
||||
f("'!foo'", `"!foo"`)
|
||||
f("foo 'and' and bar", `foo "and" bar`)
|
||||
f("foo bar", "foo bar")
|
||||
f("foo and bar", "foo bar")
|
||||
f("foo AND bar", "foo bar")
|
||||
f("foo or bar", "foo or bar")
|
||||
f("foo OR bar", "foo or bar")
|
||||
f("not foo", "!foo")
|
||||
f("! foo", "!foo")
|
||||
f("not !`foo bar`", `"foo bar"`)
|
||||
f("foo or bar and not baz", "foo or bar !baz")
|
||||
f("'foo bar' !baz", `"foo bar" !baz`)
|
||||
f("foo:!bar", `!foo:bar`)
|
||||
f(`foo and bar and baz or x or y or z and zz`, `foo bar baz or x or y or z zz`)
|
||||
f(`foo and bar and (baz or x or y or z) and zz`, `foo bar (baz or x or y or z) zz`)
|
||||
f(`(foo or bar or baz) and x and y and (z or zz)`, `(foo or bar or baz) x y (z or zz)`)
|
||||
f(`(foo or bar or baz) and x and y and not (z or zz)`, `(foo or bar or baz) x y !(z or zz)`)
|
||||
f(`NOT foo AND bar OR baz`, `!foo bar or baz`)
|
||||
f(`NOT (foo AND bar) OR baz`, `!(foo bar) or baz`)
|
||||
f(`foo OR bar AND baz`, `foo or bar baz`)
|
||||
f(`(foo OR bar) AND baz`, `(foo or bar) baz`)
|
||||
|
||||
// parens
|
||||
f(`foo:(bar baz or not :xxx)`, `foo:bar foo:baz or !foo:xxx`)
|
||||
f(`(foo:bar and (foo:baz or aa:bb) and xx) and y`, `foo:bar (foo:baz or aa:bb) xx y`)
|
||||
f("level:error and _msg:(a or b)", "level:error (a or b)")
|
||||
f("level: ( ((error or warn*) and re(foo))) (not (bar))", `(level:error or level:warn*) level:re("foo") !bar`)
|
||||
f("!(foo bar or baz and not aa*)", `!(foo bar or baz !aa*)`)
|
||||
|
||||
// prefix search
|
||||
f(`'foo'* and (a:x* and x:* or y:i(""*)) and i("abc def"*)`, `foo* (a:x* x:* or y:i(*)) i("abc def"*)`)
|
||||
|
||||
// This isn't a prefix search - it is equivalent to `foo AND *`
|
||||
f(`foo *`, `foo *`)
|
||||
f(`"foo" *`, `foo *`)
|
||||
|
||||
// empty filter
|
||||
f(`"" or foo:"" and not bar:""`, `"" or foo:"" !bar:""`)
|
||||
|
||||
// _stream filters
|
||||
f(`_stream:{}`, ``)
|
||||
f(`_stream:{foo="bar", baz=~"x" OR or!="b", "x=},"="d}{"}`, `_stream:{foo="bar",baz=~"x" or "or"!="b","x=},"="d}{"}`)
|
||||
f(`_stream:{or=a or ","="b"}`, `_stream:{"or"="a" or ","="b"}`)
|
||||
f("_stream : { foo = bar , } ", `_stream:{foo="bar"}`)
|
||||
|
||||
// _time filters
|
||||
f(`_time:[-5m,now)`, `_time:[-5m,now)`)
|
||||
f(`_time:( now-1h , now-5m34s5ms]`, `_time:(now-1h,now-5m34s5ms]`)
|
||||
f(`_time:[2023, 2023-01)`, `_time:[2023,2023-01)`)
|
||||
f(`_time:[2023-01-02, 2023-02-03T04)`, `_time:[2023-01-02,2023-02-03T04)`)
|
||||
f(`_time:[2023-01-02T04:05, 2023-02-03T04:05:06)`, `_time:[2023-01-02T04:05,2023-02-03T04:05:06)`)
|
||||
f(`_time:[2023-01-02T04:05:06Z, 2023-02-03T04:05:06.234Z)`, `_time:[2023-01-02T04:05:06Z,2023-02-03T04:05:06.234Z)`)
|
||||
f(`_time:[2023-01-02T04:05:06+02:30, 2023-02-03T04:05:06.234-02:45)`, `_time:[2023-01-02T04:05:06+02:30,2023-02-03T04:05:06.234-02:45)`)
|
||||
f(`_time:[2023-06-07T23:56:34.3456-02:30, now)`, `_time:[2023-06-07T23:56:34.3456-02:30,now)`)
|
||||
f(`_time:("2024-01-02+02:00", now)`, `_time:(2024-01-02+02:00,now)`)
|
||||
f(`_time:now`, `_time:now`)
|
||||
f(`_time:"now"`, `_time:now`)
|
||||
f(`_time:2024Z`, `_time:2024Z`)
|
||||
f(`_time:2024-02:30`, `_time:2024-02:30`)
|
||||
f(`_time:2024-01-02:30`, `_time:2024-01-02:30`)
|
||||
f(`_time:2024-01-02:30`, `_time:2024-01-02:30`)
|
||||
f(`_time:2024-01-02+03:30`, `_time:2024-01-02+03:30`)
|
||||
f(`_time:2024-01-02T10+03:30`, `_time:2024-01-02T10+03:30`)
|
||||
f(`_time:2024-01-02T10:20+03:30`, `_time:2024-01-02T10:20+03:30`)
|
||||
f(`_time:2024-01-02T10:20:40+03:30`, `_time:2024-01-02T10:20:40+03:30`)
|
||||
f(`_time:2024-01-02T10:20:40-03:30`, `_time:2024-01-02T10:20:40-03:30`)
|
||||
f(`_time:"2024-01-02T10:20:40Z"`, `_time:2024-01-02T10:20:40Z`)
|
||||
f(`_time:2023-01-02T04:05:06.789Z`, `_time:2023-01-02T04:05:06.789Z`)
|
||||
f(`_time:2023-01-02T04:05:06.789-02:30`, `_time:2023-01-02T04:05:06.789-02:30`)
|
||||
f(`_time:2023-01-02T04:05:06.789+02:30`, `_time:2023-01-02T04:05:06.789+02:30`)
|
||||
f(`_time:[1234567890, 1400000000]`, `_time:[1234567890,1400000000]`)
|
||||
|
||||
// reserved keywords
|
||||
f("and", `"and"`)
|
||||
f("and and or", `"and" "or"`)
|
||||
f("AnD", `"AnD"`)
|
||||
f("or", `"or"`)
|
||||
f("re 'and' `or` 'not'", `"re" "and" "or" "not"`)
|
||||
f("foo:and", `foo:"and"`)
|
||||
f("'re':or or x", `"re":"or" or x`)
|
||||
f(`"-"`, `"-"`)
|
||||
f(`"!"`, `"!"`)
|
||||
f(`"not"`, `"not"`)
|
||||
f(`''`, `""`)
|
||||
|
||||
// reserved functions
|
||||
f("exact", `"exact"`)
|
||||
f("exact:a", `"exact":a`)
|
||||
f("exact-foo", `exact-foo`)
|
||||
f("a:exact", `a:"exact"`)
|
||||
f("a:exact-foo", `a:exact-foo`)
|
||||
f("exact-foo:b", `exact-foo:b`)
|
||||
f("exact_prefix", `"exact_prefix"`)
|
||||
f("exact_prefix:a", `"exact_prefix":a`)
|
||||
f("exact_prefix-foo", `exact_prefix-foo`)
|
||||
f("a:exact_prefix", `a:"exact_prefix"`)
|
||||
f("a:exact_prefix-foo", `a:exact_prefix-foo`)
|
||||
f("exact_prefix-foo:b", `exact_prefix-foo:b`)
|
||||
f("i", `"i"`)
|
||||
f("i-foo", `i-foo`)
|
||||
f("a:i-foo", `a:i-foo`)
|
||||
f("i-foo:b", `i-foo:b`)
|
||||
f("in", `"in"`)
|
||||
f("in:a", `"in":a`)
|
||||
f("in-foo", `in-foo`)
|
||||
f("a:in", `a:"in"`)
|
||||
f("a:in-foo", `a:in-foo`)
|
||||
f("in-foo:b", `in-foo:b`)
|
||||
f("ipv4_range", `"ipv4_range"`)
|
||||
f("ipv4_range:a", `"ipv4_range":a`)
|
||||
f("ipv4_range-foo", `ipv4_range-foo`)
|
||||
f("a:ipv4_range", `a:"ipv4_range"`)
|
||||
f("a:ipv4_range-foo", `a:ipv4_range-foo`)
|
||||
f("ipv4_range-foo:b", `ipv4_range-foo:b`)
|
||||
f("len_range", `"len_range"`)
|
||||
f("len_range:a", `"len_range":a`)
|
||||
f("len_range-foo", `len_range-foo`)
|
||||
f("a:len_range", `a:"len_range"`)
|
||||
f("a:len_range-foo", `a:len_range-foo`)
|
||||
f("len_range-foo:b", `len_range-foo:b`)
|
||||
f("range", `"range"`)
|
||||
f("range:a", `"range":a`)
|
||||
f("range-foo", `range-foo`)
|
||||
f("a:range", `a:"range"`)
|
||||
f("a:range-foo", `a:range-foo`)
|
||||
f("range-foo:b", `range-foo:b`)
|
||||
f("re", `"re"`)
|
||||
f("re-bar", `re-bar`)
|
||||
f("a:re-bar", `a:re-bar`)
|
||||
f("re-bar:a", `re-bar:a`)
|
||||
f("seq", `"seq"`)
|
||||
f("seq-a", `seq-a`)
|
||||
f("x:seq-a", `x:seq-a`)
|
||||
f("seq-a:x", `seq-a:x`)
|
||||
f("string_range", `"string_range"`)
|
||||
f("string_range-a", `string_range-a`)
|
||||
f("x:string_range-a", `x:string_range-a`)
|
||||
f("string_range-a:x", `string_range-a:x`)
|
||||
|
||||
// exact filter
|
||||
f("exact(foo)", `exact(foo)`)
|
||||
f("exact('foo bar),|baz')", `exact("foo bar),|baz")`)
|
||||
f(`exact(foo-bar,)`, `exact(foo-bar)`)
|
||||
f(`exact(foo|b:ar)`, `exact("foo|b:ar")`)
|
||||
f(`foo:exact(f,)`, `foo:exact(f)`)
|
||||
|
||||
// exact_prefix filter
|
||||
f("exact_prefix(foo)", `exact_prefix(foo)`)
|
||||
f(`exact_prefix("foo bar")`, `exact_prefix("foo bar")`)
|
||||
f(`exact_prefix(foo-bar,)`, `exact_prefix(foo-bar)`)
|
||||
f(`exact_prefix(foo|b:ar)`, `exact_prefix("foo|b:ar")`)
|
||||
f(`foo:exact_prefix(f,)`, `foo:exact_prefix(f)`)
|
||||
|
||||
// i filter
|
||||
f("i(foo)", `i(foo)`)
|
||||
f("i(foo*)", `i(foo*)`)
|
||||
f("i(`foo`* )", `i(foo*)`)
|
||||
f("i(' foo ) bar')", `i(" foo ) bar")`)
|
||||
f("i('foo bar'*)", `i("foo bar"*)`)
|
||||
f(`foo:i(foo:bar-baz|aa+bb)`, `foo:i("foo:bar-baz|aa+bb")`)
|
||||
|
||||
// in filter
|
||||
f(`in()`, `in()`)
|
||||
f(`in(foo)`, `in(foo)`)
|
||||
f(`in(foo, bar)`, `in(foo,bar)`)
|
||||
f(`in("foo bar", baz)`, `in("foo bar",baz)`)
|
||||
f(`foo:in(foo-bar|baz)`, `foo:in("foo-bar|baz")`)
|
||||
|
||||
// ipv4_range filter
|
||||
f(`ipv4_range(1.2.3.4, "5.6.7.8")`, `ipv4_range(1.2.3.4, 5.6.7.8)`)
|
||||
f(`foo:ipv4_range(1.2.3.4, "5.6.7.8" , )`, `foo:ipv4_range(1.2.3.4, 5.6.7.8)`)
|
||||
f(`ipv4_range(1.2.3.4)`, `ipv4_range(1.2.3.4, 1.2.3.4)`)
|
||||
f(`ipv4_range(1.2.3.4/20)`, `ipv4_range(1.2.0.0, 1.2.15.255)`)
|
||||
f(`ipv4_range(1.2.3.4,)`, `ipv4_range(1.2.3.4, 1.2.3.4)`)
|
||||
|
||||
// len_range filter
|
||||
f(`len_range(10, 20)`, `len_range(10,20)`)
|
||||
f(`foo:len_range("10", 20, )`, `foo:len_range(10,20)`)
|
||||
|
||||
// range filter
|
||||
f(`range(1.234, 5656.43454)`, `range(1.234,5656.43454)`)
|
||||
f(`foo:range(-2343.344, 2343.4343)`, `foo:range(-2343.344,2343.4343)`)
|
||||
f(`range(-1.234e-5 , 2.34E+3)`, `range(-1.234e-5,2.34E+3)`)
|
||||
f(`range[123, 456)`, `range[123,456)`)
|
||||
f(`range(123, 445]`, `range(123,445]`)
|
||||
f(`range("1.234e-4", -23)`, `range(1.234e-4,-23)`)
|
||||
|
||||
// re filter
|
||||
f("re('foo|ba(r.+)')", `re("foo|ba(r.+)")`)
|
||||
f("re(foo)", `re("foo")`)
|
||||
f(`foo:re(foo-bar|baz.)`, `foo:re("foo-bar|baz.")`)
|
||||
|
||||
// seq filter
|
||||
f(`seq()`, `seq()`)
|
||||
f(`seq(foo)`, `seq(foo)`)
|
||||
f(`seq("foo, bar", baz, abc)`, `seq("foo, bar",baz,abc)`)
|
||||
f(`foo:seq(foo"bar-baz+aa, b)`, `foo:seq("foo\"bar-baz+aa",b)`)
|
||||
|
||||
// string_range filter
|
||||
f(`string_range(foo, bar)`, `string_range(foo, bar)`)
|
||||
f(`foo:string_range("foo, bar", baz)`, `foo:string_range("foo, bar", baz)`)
|
||||
|
||||
// reserved field names
|
||||
f(`"_stream"`, `_stream`)
|
||||
f(`"_time"`, `_time`)
|
||||
f(`"_msg"`, `_msg`)
|
||||
f(`_stream and _time or _msg`, `_stream _time or _msg`)
|
||||
|
||||
// invalid rune
|
||||
f("\xff", `"\xff"`)
|
||||
|
||||
// ip addresses in the query
|
||||
f("1.2.3.4 or ip:5.6.7.9", "1.2.3.4 or ip:5.6.7.9")
|
||||
|
||||
// '-' and '.' chars in field name and search phrase
|
||||
f("trace-id.foo.bar:baz", `trace-id.foo.bar:baz`)
|
||||
f(`custom-Time:2024-01-02T03:04:05+08:00 fooBar OR !baz:xxx`, `custom-Time:"2024-01-02T03:04:05+08:00" fooBar or !baz:xxx`)
|
||||
f("foo-bar+baz*", `"foo-bar+baz"*`)
|
||||
f("foo- bar", `foo- bar`)
|
||||
f("foo -bar", `foo -bar`)
|
||||
f("foo!bar", `"foo!bar"`)
|
||||
f("foo:aa!bb:cc", `foo:"aa!bb:cc"`)
|
||||
f(`foo:bar:baz`, `foo:"bar:baz"`)
|
||||
f(`foo:(bar baz:xxx)`, `foo:bar foo:"baz:xxx"`)
|
||||
f(`foo:(_time:abc or not z)`, `foo:"_time:abc" or !foo:z`)
|
||||
f(`foo:(_msg:a :x _stream:{c="d"})`, `foo:"_msg:a" foo:x foo:"_stream:{c=\"d\"}"`)
|
||||
f(`:(_msg:a:b c)`, `"a:b" c`)
|
||||
f(`"foo"bar baz:"a'b"c`, `"\"foo\"bar" baz:"\"a'b\"c"`)
|
||||
|
||||
// complex queries
|
||||
f(`_time:[-1h, now] _stream:{job="foo",env=~"prod|staging"} level:(error or warn*) and not "connection reset by peer"`,
|
||||
`_time:[-1h,now] _stream:{job="foo",env=~"prod|staging"} (level:error or level:warn*) !"connection reset by peer"`)
|
||||
f(`(_time:(2023-04-20, now] or _time:[-10m, -1m))
|
||||
and (_stream:{job="a"} or _stream:{instance!="b"})
|
||||
and (err* or ip:(ipv4_range(1.2.3.0, 1.2.3.255) and not 1.2.3.4))`,
|
||||
`(_time:(2023-04-20,now] or _time:[-10m,-1m)) (_stream:{job="a"} or _stream:{instance!="b"}) (err* or ip:ipv4_range(1.2.3.0, 1.2.3.255) !ip:1.2.3.4)`)
|
||||
}
|
||||
|
||||
func TestParseQueryFailure(t *testing.T) {
|
||||
f := func(s string) {
|
||||
t.Helper()
|
||||
q, err := ParseQuery(s)
|
||||
if q != nil {
|
||||
t.Fatalf("expecting nil result; got %s", q)
|
||||
}
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
}
|
||||
|
||||
f("")
|
||||
f("|")
|
||||
f("foo|")
|
||||
f("foo|bar")
|
||||
f("foo and")
|
||||
f("foo OR ")
|
||||
f("not")
|
||||
f("NOT")
|
||||
f("not (abc")
|
||||
f("!")
|
||||
|
||||
// invalid parens
|
||||
f("(")
|
||||
f("foo (bar ")
|
||||
f("(foo:'bar")
|
||||
|
||||
// missing filter
|
||||
f(":")
|
||||
f(": ")
|
||||
f("foo: ")
|
||||
f("_msg : ")
|
||||
f(`"": `)
|
||||
|
||||
// invalid quoted strings
|
||||
f(`"foo`)
|
||||
f(`'foo`)
|
||||
f("`foo")
|
||||
|
||||
// invalid _stream filters
|
||||
f("_stream:")
|
||||
f("_stream:{")
|
||||
f("_stream:(")
|
||||
f("_stream:{foo")
|
||||
f("_stream:{foo}")
|
||||
f("_stream:{foo=")
|
||||
f("_stream:{foo='bar")
|
||||
f("_stream:{foo='bar}")
|
||||
f("_stream:{foo=bar or")
|
||||
f("_stream:{foo=bar or}")
|
||||
f("_stream:{foo=bar or baz}")
|
||||
f("_stream:{foo=bar baz x=y}")
|
||||
f("_stream:{foo=bar,")
|
||||
f("_stream:{foo=bar")
|
||||
f("_stream:foo")
|
||||
f("_stream:(foo)")
|
||||
f("_stream:[foo]")
|
||||
|
||||
// invalid _time filters
|
||||
f("_time:")
|
||||
f("_time:[")
|
||||
f("_time:foo")
|
||||
f("_time:{}")
|
||||
f("_time:[foo,bar)")
|
||||
f("_time:(now)")
|
||||
f("_time:[now,")
|
||||
f("_time:(now, not now]")
|
||||
f("_time:(-5m, -1m}")
|
||||
f("_time:[-")
|
||||
f("_time:[now-foo,-bar]")
|
||||
f("_time:[2023-ab,2023]")
|
||||
f("_time:[fooo-02,2023]")
|
||||
f("_time:[2023-01-02T04:05:06+12,2023]")
|
||||
f("_time:[2023-01-02T04:05:06-12,2023]")
|
||||
f("_time:2023-01-02T04:05:06.789")
|
||||
|
||||
// long query with error
|
||||
f(`very long query with error aaa ffdfd fdfdfd fdfd:( ffdfdfdfdfd`)
|
||||
|
||||
// query with unexpected tail
|
||||
f(`foo | bar`)
|
||||
|
||||
// unexpected comma
|
||||
f(`foo,bar`)
|
||||
f(`foo, bar`)
|
||||
f(`foo ,bar`)
|
||||
|
||||
// unexpected token
|
||||
f(`[foo`)
|
||||
f(`foo]bar`)
|
||||
f(`foo] bar`)
|
||||
f(`foo ]bar`)
|
||||
f(`) foo`)
|
||||
f(`foo)bar`)
|
||||
|
||||
// unknown function
|
||||
f(`unknown_function(foo)`)
|
||||
|
||||
// invalid exact
|
||||
f(`exact(`)
|
||||
f(`exact(f, b)`)
|
||||
f(`exact(foo`)
|
||||
f(`exact(foo,`)
|
||||
f(`exact(foo*)`)
|
||||
f(`exact(foo bar)`)
|
||||
f(`exact(foo, bar`)
|
||||
|
||||
// invalid i
|
||||
f(`i(`)
|
||||
f(`i(aa`)
|
||||
f(`i(aa, bb)`)
|
||||
f(`i(*`)
|
||||
f(`i(aaa*`)
|
||||
f(`i(a**)`)
|
||||
f(`i("foo`)
|
||||
f(`i(foo bar)`)
|
||||
|
||||
// invalid in
|
||||
f(`in(`)
|
||||
f(`in(,)`)
|
||||
f(`in(f, b c)`)
|
||||
f(`in(foo`)
|
||||
f(`in(foo,`)
|
||||
f(`in(foo*)`)
|
||||
f(`in(foo, "bar baz"*)`)
|
||||
f(`in(foo, "bar baz"*, abc)`)
|
||||
f(`in(foo bar)`)
|
||||
f(`in(foo, bar`)
|
||||
|
||||
// invalid ipv4_range
|
||||
f(`ipv4_range(`)
|
||||
f(`ipv4_range(foo,bar)`)
|
||||
f(`ipv4_range(1.2.3.4*)`)
|
||||
f(`ipv4_range("1.2.3.4"*)`)
|
||||
f(`ipv4_range(1.2.3.4`)
|
||||
f(`ipv4_range(1.2.3.4,`)
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7)`)
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7.8`)
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7.8,`)
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7.8,,`)
|
||||
f(`ipv4_range(1.2.3.4, 5.6.7.8,5.3.2.1)`)
|
||||
|
||||
// invalid len_range
|
||||
f(`len_range(`)
|
||||
f(`len_range(1)`)
|
||||
f(`len_range(foo, bar)`)
|
||||
f(`len_range(1, bar)`)
|
||||
f(`len_range(1, 2`)
|
||||
f(`len_range(1.2, 3.4)`)
|
||||
|
||||
// invalid range
|
||||
f(`range(`)
|
||||
f(`range(foo,bar)`)
|
||||
f(`range(1"`)
|
||||
f(`range(1,`)
|
||||
f(`range(1)`)
|
||||
f(`range(1,)`)
|
||||
f(`range(1,2,`)
|
||||
f(`range[1,foo)`)
|
||||
f(`range[1,2,3)`)
|
||||
f(`range(1)`)
|
||||
|
||||
// invalid re
|
||||
f("re(")
|
||||
f("re(a, b)")
|
||||
f("foo:re(bar")
|
||||
f("re(`ab(`)")
|
||||
f(`re(a b)`)
|
||||
|
||||
// invalid seq
|
||||
f(`seq(`)
|
||||
f(`seq(,)`)
|
||||
f(`seq(foo`)
|
||||
f(`seq(foo,`)
|
||||
f(`seq(foo*)`)
|
||||
f(`seq(foo*, bar)`)
|
||||
f(`seq(foo bar)`)
|
||||
f(`seq(foo, bar`)
|
||||
|
||||
// invalid string_range
|
||||
f(`string_range(`)
|
||||
f(`string_range(,)`)
|
||||
f(`string_range(foo`)
|
||||
f(`string_range(foo,`)
|
||||
f(`string_range(foo*)`)
|
||||
f(`string_range(foo bar)`)
|
||||
f(`string_range(foo, bar`)
|
||||
f(`string_range(foo)`)
|
||||
f(`string_range(foo, bar, baz)`)
|
||||
}
|
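// Illustrative sketch (not part of the original change): using ParseQuery
// outside of the tests above. ParseQuery returns a *Query whose String()
// method yields the normalized form asserted throughout this file; the
// sample query below is taken from TestParseQuerySuccess.
//
//	q, err := ParseQuery(`_time:[-1h, now] level:(error or warn*)`)
//	if err != nil {
//		// handle the malformed query
//	}
//	fmt.Println(q.String()) // prints the normalized query string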
102
lib/logstorage/part.go
Normal file
|
@@ -0,0 +1,102 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
type part struct {
|
||||
// pt is the partition the part belongs to
|
||||
pt *partition
|
||||
|
||||
// path is the path to the part on disk.
|
||||
//
|
||||
// If the part is in-memory then the path is empty.
|
||||
path string
|
||||
|
||||
// ph contains partHeader for the given part.
|
||||
ph partHeader
|
||||
|
||||
// indexBlockHeaders contains a list of indexBlockHeader entries for the given part.
|
||||
indexBlockHeaders []indexBlockHeader
|
||||
|
||||
indexFile fs.MustReadAtCloser
|
||||
columnsHeaderFile fs.MustReadAtCloser
|
||||
timestampsFile fs.MustReadAtCloser
|
||||
fieldValuesFile fs.MustReadAtCloser
|
||||
fieldBloomFilterFile fs.MustReadAtCloser
|
||||
messageValuesFile fs.MustReadAtCloser
|
||||
messageBloomFilterFile fs.MustReadAtCloser
|
||||
}
|
||||
|
||||
func mustOpenInmemoryPart(pt *partition, mp *inmemoryPart) *part {
|
||||
var p part
|
||||
p.pt = pt
|
||||
p.path = ""
|
||||
p.ph = mp.ph
|
||||
|
||||
// Read metaindex
|
||||
metaindexReader := mp.metaindex.NewReader()
|
||||
var mrs readerWithStats
|
||||
mrs.init(metaindexReader)
|
||||
p.indexBlockHeaders = mustReadIndexBlockHeaders(p.indexBlockHeaders[:0], &mrs)
|
||||
|
||||
// Open data files
|
||||
p.indexFile = &mp.index
|
||||
p.columnsHeaderFile = &mp.columnsHeader
|
||||
p.timestampsFile = &mp.timestamps
|
||||
p.fieldValuesFile = &mp.fieldValues
|
||||
p.fieldBloomFilterFile = &mp.fieldBloomFilter
|
||||
p.messageValuesFile = &mp.messageValues
|
||||
p.messageBloomFilterFile = &mp.messageBloomFilter
|
||||
|
||||
return &p
|
||||
}
|
||||
|
||||
func mustOpenFilePart(pt *partition, path string) *part {
|
||||
var p part
|
||||
p.pt = pt
|
||||
p.path = path
|
||||
p.ph.mustReadMetadata(path)
|
||||
|
||||
metaindexPath := filepath.Join(path, metaindexFilename)
|
||||
indexPath := filepath.Join(path, indexFilename)
|
||||
columnsHeaderPath := filepath.Join(path, columnsHeaderFilename)
|
||||
timestampsPath := filepath.Join(path, timestampsFilename)
|
||||
fieldValuesPath := filepath.Join(path, fieldValuesFilename)
|
||||
fieldBloomFilterPath := filepath.Join(path, fieldBloomFilename)
|
||||
messageValuesPath := filepath.Join(path, messageValuesFilename)
|
||||
messageBloomFilterPath := filepath.Join(path, messageBloomFilename)
|
||||
|
||||
// Read metaindex
|
||||
metaindexReader := filestream.MustOpen(metaindexPath, true)
|
||||
var mrs readerWithStats
|
||||
mrs.init(metaindexReader)
|
||||
p.indexBlockHeaders = mustReadIndexBlockHeaders(p.indexBlockHeaders[:0], &mrs)
|
||||
mrs.MustClose()
|
||||
|
||||
// Open data files
|
||||
p.indexFile = fs.MustOpenReaderAt(indexPath)
|
||||
p.columnsHeaderFile = fs.MustOpenReaderAt(columnsHeaderPath)
|
||||
p.timestampsFile = fs.MustOpenReaderAt(timestampsPath)
|
||||
p.fieldValuesFile = fs.MustOpenReaderAt(fieldValuesPath)
|
||||
p.fieldBloomFilterFile = fs.MustOpenReaderAt(fieldBloomFilterPath)
|
||||
p.messageValuesFile = fs.MustOpenReaderAt(messageValuesPath)
|
||||
p.messageBloomFilterFile = fs.MustOpenReaderAt(messageBloomFilterPath)
|
||||
|
||||
return &p
|
||||
}
|
||||
|
||||
func mustClosePart(p *part) {
|
||||
p.indexFile.MustClose()
|
||||
p.columnsHeaderFile.MustClose()
|
||||
p.timestampsFile.MustClose()
|
||||
p.fieldValuesFile.MustClose()
|
||||
p.fieldBloomFilterFile.MustClose()
|
||||
p.messageValuesFile.MustClose()
|
||||
p.messageBloomFilterFile.MustClose()
|
||||
|
||||
p.pt = nil
|
||||
}
|
84
lib/logstorage/part_header.go
Normal file
|
@@ -0,0 +1,84 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// partHeader contains the information about a single part
|
||||
type partHeader struct {
|
||||
// CompressedSizeBytes is the physical size of the part
|
||||
CompressedSizeBytes uint64
|
||||
|
||||
// UncompressedSizeBytes is the original size of log entries stored in the part
|
||||
UncompressedSizeBytes uint64
|
||||
|
||||
// RowsCount is the number of log entries in the part
|
||||
RowsCount uint64
|
||||
|
||||
// BlocksCount is the number of blocks in the part
|
||||
BlocksCount uint64
|
||||
|
||||
// MinTimestamp is the minimum timestamp seen in the part
|
||||
MinTimestamp int64
|
||||
|
||||
// MaxTimestamp is the maximum timestamp seen in the part
|
||||
MaxTimestamp int64
|
||||
}
|
||||
|
||||
// reset resets ph for subsequent re-use
|
||||
func (ph *partHeader) reset() {
|
||||
ph.CompressedSizeBytes = 0
|
||||
ph.UncompressedSizeBytes = 0
|
||||
ph.RowsCount = 0
|
||||
ph.BlocksCount = 0
|
||||
ph.MinTimestamp = 0
|
||||
ph.MaxTimestamp = 0
|
||||
}
|
||||
|
||||
// String returns string representation of ph.
|
||||
func (ph *partHeader) String() string {
|
||||
return fmt.Sprintf("{CompressedSizeBytes=%d, UncompressedSizeBytes=%d, RowsCount=%d, BlocksCount=%d, MinTimestamp=%s, MaxTimestamp=%s}",
|
||||
ph.CompressedSizeBytes, ph.UncompressedSizeBytes, ph.RowsCount, ph.BlocksCount, timestampToString(ph.MinTimestamp), timestampToString(ph.MaxTimestamp))
|
||||
}
|
||||
|
||||
func (ph *partHeader) mustReadMetadata(partPath string) {
|
||||
ph.reset()
|
||||
|
||||
metadataPath := filepath.Join(partPath, metadataFilename)
|
||||
metadata, err := os.ReadFile(metadataPath)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot read %q: %s", metadataPath, err)
|
||||
}
|
||||
if err := json.Unmarshal(metadata, ph); err != nil {
|
||||
logger.Panicf("FATAL: cannot parse %q: %s", metadataPath, err)
|
||||
}
|
||||
|
||||
// Perform various checks
|
||||
if ph.MinTimestamp > ph.MaxTimestamp {
|
||||
logger.Panicf("FATAL: MinTimestamp cannot exceed MaxTimestamp; got %d vs %d", ph.MinTimestamp, ph.MaxTimestamp)
|
||||
}
|
||||
}
|
||||
|
||||
func (ph *partHeader) mustWriteMetadata(partPath string) {
|
||||
metadata, err := json.Marshal(ph)
|
||||
if err != nil {
|
||||
logger.Panicf("BUG: cannot marshal partHeader: %s", err)
|
||||
}
|
||||
metadataPath := filepath.Join(partPath, metadataFilename)
|
||||
fs.MustWriteSync(metadataPath, metadata)
|
||||
}
|
||||
|
||||
func timestampToString(timestamp int64) string {
|
||||
t := time.Unix(0, timestamp).UTC()
|
||||
return strings.Replace(t.Format(timestampForPathname), ".", "", 1)
|
||||
}
|
||||
|
||||
const timestampForPathname = "20060102150405.000000000"
|
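// Illustrative sketch (not part of the original change): a concrete
// example of the formatting performed by timestampToString() above. The
// function name is hypothetical; it relies only on packages this file
// already imports (fmt, strings, time).
func exampleTimestampToString() {
	ts := time.Date(2023, time.June, 20, 5, 55, 12, 345678901, time.UTC).UnixNano()
	s := strings.Replace(time.Unix(0, ts).UTC().Format(timestampForPathname), ".", "", 1)
	fmt.Println(s) // prints 20230620055512345678901
}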
21
lib/logstorage/part_header_test.go
Normal file
|
@@ -0,0 +1,21 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPartHeaderReset(t *testing.T) {
|
||||
ph := &partHeader{
|
||||
CompressedSizeBytes: 123,
|
||||
UncompressedSizeBytes: 234,
|
||||
RowsCount: 1234,
|
||||
MinTimestamp: 3434,
|
||||
MaxTimestamp: 32434,
|
||||
}
|
||||
ph.reset()
|
||||
phZero := &partHeader{}
|
||||
if !reflect.DeepEqual(ph, phZero) {
|
||||
t.Fatalf("unexpected non-zero partHeader after reset: %v", ph)
|
||||
}
|
||||
}
|
237
lib/logstorage/partition.go
Normal file
|
@@ -0,0 +1,237 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// PartitionStats contains stats for the partition.
|
||||
type PartitionStats struct {
|
||||
DatadbStats
|
||||
IndexdbStats
|
||||
}
|
||||
|
||||
type partition struct {
|
||||
// s is the parent storage for the partition
|
||||
s *Storage
|
||||
|
||||
// path is the path to the partition directory
|
||||
path string
|
||||
|
||||
// name is the partition name. It is basically the directory name obtained from path.
|
||||
// It is used for creating keys for partition caches.
|
||||
name string
|
||||
|
||||
// idb is indexdb used for the given partition
|
||||
idb *indexdb
|
||||
|
||||
// ddb is the datadb used for the given partition
|
||||
ddb *datadb
|
||||
}
|
||||
|
||||
// mustCreatePartition creates a partition at the given path.
|
||||
//
|
||||
// The created partition can be opened with mustOpenPartition() after it has been created.
|
||||
//
|
||||
// The created partition can be deleted with mustDeletePartition() when it is no longer needed.
|
||||
func mustCreatePartition(path string) {
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
|
||||
indexdbPath := filepath.Join(path, indexdbDirname)
|
||||
mustCreateIndexdb(indexdbPath)
|
||||
|
||||
datadbPath := filepath.Join(path, datadbDirname)
|
||||
mustCreateDatadb(datadbPath)
|
||||
}
|
||||
|
||||
// mustDeletePartition deletes partition at the given path.
|
||||
//
|
||||
// The partition must be closed with mustClosePartition() before deleting it.
|
||||
func mustDeletePartition(path string) {
|
||||
fs.MustRemoveAll(path)
|
||||
}
|
||||
|
||||
// mustOpenPartition opens partition at the given path for the given Storage.
|
||||
//
|
||||
// The returned partition must be closed when no longer needed with mustClosePartition() call.
|
||||
func mustOpenPartition(s *Storage, path string) *partition {
|
||||
name := filepath.Base(path)
|
||||
|
||||
// Open indexdb
|
||||
indexdbPath := filepath.Join(path, indexdbDirname)
|
||||
idb := mustOpenIndexdb(indexdbPath, name, s)
|
||||
|
||||
// Start initializing the partition
|
||||
pt := &partition{
|
||||
s: s,
|
||||
path: path,
|
||||
name: name,
|
||||
idb: idb,
|
||||
}
|
||||
|
||||
// Open datadb
|
||||
datadbPath := filepath.Join(path, datadbDirname)
|
||||
pt.ddb = mustOpenDatadb(pt, datadbPath, s.flushInterval)
|
||||
|
||||
return pt
|
||||
}
|
||||
|
||||
// mustClosePartition closes pt.
|
||||
//
|
||||
// The caller must ensure that pt is no longer used before the call to mustClosePartition().
|
||||
//
|
||||
// The partition can be deleted if needed after it is closed via mustDeletePartition() call.
|
||||
func mustClosePartition(pt *partition) {
|
||||
// Close indexdb
|
||||
mustCloseIndexdb(pt.idb)
|
||||
pt.idb = nil
|
||||
|
||||
// Close datadb
|
||||
mustCloseDatadb(pt.ddb)
|
||||
pt.ddb = nil
|
||||
|
||||
pt.name = ""
|
||||
pt.path = ""
|
||||
pt.s = nil
|
||||
}
|
||||
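// Illustrative sketch (not part of the original change): the call order
// implied by the lifecycle comments above. `s` and `path` are assumed to
// be an initialized *Storage and a partition directory path; see
// TestPartitionLifecycle in partition_test.go for a complete example.
//
//	mustCreatePartition(path)        // create the on-disk layout once
//	pt := mustOpenPartition(s, path) // open it for reads and writes
//	// ... ingest and query data via pt ...
//	mustClosePartition(pt)           // release indexdb and datadb resources
//	mustDeletePartition(path)        // remove the partition when it is no longer needed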
|
||||
func (pt *partition) mustAddRows(lr *LogRows) {
|
||||
// Register rows in indexdb
|
||||
var pendingRows []int
|
||||
streamIDs := lr.streamIDs
|
||||
for i := range lr.timestamps {
|
||||
streamID := &streamIDs[i]
|
||||
if pt.hasStreamIDInCache(streamID) {
|
||||
continue
|
||||
}
|
||||
if len(pendingRows) == 0 || !streamIDs[pendingRows[len(pendingRows)-1]].equal(streamID) {
|
||||
pendingRows = append(pendingRows, i)
|
||||
}
|
||||
}
|
||||
if len(pendingRows) > 0 {
|
||||
logNewStreams := pt.s.logNewStreams
|
||||
streamTagsCanonicals := lr.streamTagsCanonicals
|
||||
sort.Slice(pendingRows, func(i, j int) bool {
|
||||
return streamIDs[pendingRows[i]].less(&streamIDs[pendingRows[j]])
|
||||
})
|
||||
for i, rowIdx := range pendingRows {
|
||||
streamID := &streamIDs[rowIdx]
|
||||
if i > 0 && streamIDs[pendingRows[i-1]].equal(streamID) {
|
||||
continue
|
||||
}
|
||||
if pt.hasStreamIDInCache(streamID) {
|
||||
continue
|
||||
}
|
||||
if !pt.idb.hasStreamID(streamID) {
|
||||
streamTagsCanonical := streamTagsCanonicals[rowIdx]
|
||||
pt.idb.mustRegisterStream(streamID, streamTagsCanonical)
|
||||
if logNewStreams {
|
||||
pt.logNewStream(streamTagsCanonical, lr.rows[rowIdx])
|
||||
}
|
||||
}
|
||||
pt.putStreamIDToCache(streamID)
|
||||
}
|
||||
}
|
||||
|
||||
// Add rows to datadb
|
||||
pt.ddb.mustAddRows(lr)
|
||||
if pt.s.logIngestedRows {
|
||||
pt.logIngestedRows(lr)
|
||||
}
|
||||
}
|
||||
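// Illustrative sketch (not part of the original change): the
// sort-then-skip-adjacent-duplicates pattern used by mustAddRows() above
// when registering new streams, shown with plain ints instead of
// streamIDs. The function name is hypothetical; only the already-imported
// "sort" package is used.
func uniqueSortedExample(ids []int) []int {
	idxs := make([]int, 0, len(ids))
	for i := range ids {
		idxs = append(idxs, i)
	}
	// Sort the indexes by the values they point to.
	sort.Slice(idxs, func(i, j int) bool { return ids[idxs[i]] < ids[idxs[j]] })
	var unique []int
	for k, idx := range idxs {
		if k > 0 && ids[idxs[k-1]] == ids[idx] {
			// Adjacent entries refer to the same id - handle it only once.
			continue
		}
		unique = append(unique, ids[idx])
	}
	return unique
}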
|
||||
func (pt *partition) logNewStream(streamTagsCanonical []byte, fields []Field) {
|
||||
streamTags := getStreamTagsString(streamTagsCanonical)
|
||||
rf := RowFormatter(fields)
|
||||
logger.Infof("partition %s: new stream %s for log entry %s", pt.path, streamTags, &rf)
|
||||
}
|
||||
|
||||
func (pt *partition) logIngestedRows(lr *LogRows) {
|
||||
var rf RowFormatter
|
||||
for i, fields := range lr.rows {
|
||||
tf := TimeFormatter(lr.timestamps[i])
|
||||
streamTags := getStreamTagsString(lr.streamTagsCanonicals[i])
|
||||
rf = append(rf[:0], fields...)
|
||||
rf = append(rf, Field{
|
||||
Name: "_time",
|
||||
Value: tf.String(),
|
||||
})
|
||||
rf = append(rf, Field{
|
||||
Name: "_stream",
|
||||
Value: streamTags,
|
||||
})
|
||||
sort.Slice(rf, func(i, j int) bool {
|
||||
return rf[i].Name < rf[j].Name
|
||||
})
|
||||
logger.Infof("partition %s: new log entry %s", pt.path, &rf)
|
||||
}
|
||||
}
|
||||
|
||||
// appendStreamTagsByStreamID appends canonical representation of stream tags for the given sid to dst
|
||||
// and returns the result.
|
||||
func (pt *partition) appendStreamTagsByStreamID(dst []byte, sid *streamID) []byte {
|
||||
// Search for the StreamTags in the cache.
|
||||
key := bbPool.Get()
|
||||
defer bbPool.Put(key)
|
||||
|
||||
// There is no need to put the partition name into the key here,
|
||||
// since StreamTags is uniquely identified by streamID.
|
||||
key.B = sid.marshal(key.B)
|
||||
dstLen := len(dst)
|
||||
dst = pt.s.streamTagsCache.GetBig(dst, key.B)
|
||||
if len(dst) > dstLen {
|
||||
// Fast path - the StreamTags have been found in cache.
|
||||
return dst
|
||||
}
|
||||
|
||||
// Slow path - search for StreamTags in idb
|
||||
dst = pt.idb.appendStreamTagsByStreamID(dst, sid)
|
||||
if len(dst) > dstLen {
|
||||
// Store the found StreamTags to cache
|
||||
pt.s.streamTagsCache.SetBig(key.B, dst[dstLen:])
|
||||
}
|
||||
return dst
|
||||
}
|
||||
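// Illustrative sketch (not part of the original change): the read-through
// cache pattern used by appendStreamTagsByStreamID() above, reduced to a
// plain map. The function is hypothetical; the real code consults
// pt.s.streamTagsCache first and falls back to the indexdb lookup.
func lookupWithCacheExample(cache map[string]string, slowLookup func(string) string, key string) string {
	if v, ok := cache[key]; ok {
		// Fast path - the value has been found in the cache.
		return v
	}
	// Slow path - consult the authoritative source and remember the result.
	v := slowLookup(key)
	cache[key] = v
	return v
}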
|
||||
func (pt *partition) hasStreamIDInCache(sid *streamID) bool {
|
||||
var result [1]byte
|
||||
|
||||
bb := bbPool.Get()
|
||||
bb.B = pt.marshalStreamIDCacheKey(bb.B, sid)
|
||||
value := pt.s.streamIDCache.Get(result[:0], bb.B)
|
||||
bbPool.Put(bb)
|
||||
|
||||
return bytes.Equal(value, okValue)
|
||||
}
|
||||
|
||||
func (pt *partition) putStreamIDToCache(sid *streamID) {
|
||||
bb := bbPool.Get()
|
||||
bb.B = pt.marshalStreamIDCacheKey(bb.B, sid)
|
||||
pt.s.streamIDCache.Set(bb.B, okValue)
|
||||
bbPool.Put(bb)
|
||||
}
|
||||
|
||||
func (pt *partition) marshalStreamIDCacheKey(dst []byte, sid *streamID) []byte {
|
||||
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(pt.name))
|
||||
dst = sid.marshal(dst)
|
||||
return dst
|
||||
}
|
||||
|
||||
var okValue = []byte("1")
|
||||
|
||||
// debugFlush makes sure that all the recently ingested data becomes searchable
|
||||
func (pt *partition) debugFlush() {
|
||||
pt.ddb.debugFlush()
|
||||
pt.idb.debugFlush()
|
||||
}
|
||||
|
||||
func (pt *partition) updateStats(ps *PartitionStats) {
|
||||
pt.ddb.updateStats(&ps.DatadbStats)
|
||||
pt.idb.updateStats(&ps.IndexdbStats)
|
||||
}
|
187
lib/logstorage/partition_test.go
Normal file
|
@@ -0,0 +1,187 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
|
||||
)
|
||||
|
||||
func TestPartitionLifecycle(t *testing.T) {
|
||||
const path = "TestPartitionLifecycle"
|
||||
var ddbStats DatadbStats
|
||||
|
||||
s := newTestStorage()
|
||||
for i := 0; i < 3; i++ {
|
||||
mustCreatePartition(path)
|
||||
for j := 0; j < 2; j++ {
|
||||
pt := mustOpenPartition(s, path)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != 0 {
|
||||
t.Fatalf("unexpected non-zero number of entries in empty partition: %d", n)
|
||||
}
|
||||
if ddbStats.InmemoryParts != 0 {
|
||||
t.Fatalf("unexpected non-zero number of in-memory parts in empty partition: %d", ddbStats.InmemoryParts)
|
||||
}
|
||||
if ddbStats.FileParts != 0 {
|
||||
t.Fatalf("unexpected non-zero number of file parts in empty partition: %d", ddbStats.FileParts)
|
||||
}
|
||||
if ddbStats.CompressedInmemorySize != 0 {
|
||||
t.Fatalf("unexpected non-zero size of inmemory parts for empty partition")
|
||||
}
|
||||
if ddbStats.CompressedFileSize != 0 {
|
||||
t.Fatalf("unexpected non-zero size of file parts for empty partition")
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
mustClosePartition(pt)
|
||||
}
|
||||
mustDeletePartition(path)
|
||||
}
|
||||
closeTestStorage(s)
|
||||
}
|
||||
|
||||
func TestPartitionMustAddRowsSerial(t *testing.T) {
|
||||
const path = "TestPartitionMustAddRowsSerial"
|
||||
var ddbStats DatadbStats
|
||||
|
||||
s := newTestStorage()
|
||||
mustCreatePartition(path)
|
||||
pt := mustOpenPartition(s, path)
|
||||
|
||||
// Try adding the same entry one at a time.
|
||||
totalRowsCount := uint64(0)
|
||||
for i := 0; i < 100; i++ {
|
||||
lr := newTestLogRows(1, 1, 0)
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
pt.mustAddRows(lr)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Try adding a different entry each time.
|
||||
for i := 0; i < 100; i++ {
|
||||
lr := newTestLogRows(1, 1, int64(i))
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
pt.mustAddRows(lr)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Re-open the partition and verify the number of entries remains the same
|
||||
mustClosePartition(pt)
|
||||
pt = mustOpenPartition(s, path)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries after re-opening the partition; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
if ddbStats.InmemoryParts != 0 {
|
||||
t.Fatalf("unexpected non-zero number of in-memory parts after re-opening the partition: %d", ddbStats.InmemoryParts)
|
||||
}
|
||||
if ddbStats.FileParts == 0 {
|
||||
t.Fatalf("the number of file parts must be greater than 0 after re-opening the partition")
|
||||
}
|
||||
|
||||
// Try adding entries for multiple streams at a time
|
||||
for i := 0; i < 5; i++ {
|
||||
lr := newTestLogRows(3, 7, 0)
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
pt.mustAddRows(lr)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in partition; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
|
||||
// Re-open the partition and verify the number of entries remains the same
|
||||
mustClosePartition(pt)
|
||||
pt = mustOpenPartition(s, path)
|
||||
ddbStats.reset()
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries after re-opening the partition; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
if ddbStats.InmemoryParts != 0 {
|
||||
t.Fatalf("unexpected non-zero number of in-memory parts after re-opening the partition: %d", ddbStats.InmemoryParts)
|
||||
}
|
||||
if ddbStats.FileParts == 0 {
|
||||
t.Fatalf("the number of file parts must be greater than 0 after re-opening the partition")
|
||||
}
|
||||
|
||||
mustClosePartition(pt)
|
||||
mustDeletePartition(path)
|
||||
|
||||
closeTestStorage(s)
|
||||
}
|
||||
|
||||
func TestPartitionMustAddRowsConcurrent(t *testing.T) {
|
||||
const path = "TestPartitionMustAddRowsConcurrent"
|
||||
s := newTestStorage()
|
||||
|
||||
mustCreatePartition(path)
|
||||
pt := mustOpenPartition(s, path)
|
||||
|
||||
const workersCount = 3
|
||||
totalRowsCount := uint64(0)
|
||||
doneCh := make(chan struct{}, workersCount)
|
||||
for i := 0; i < cap(doneCh); i++ {
|
||||
go func() {
|
||||
for j := 0; j < 7; j++ {
|
||||
lr := newTestLogRows(5, 10, int64(j))
|
||||
pt.mustAddRows(lr)
|
||||
atomic.AddUint64(&totalRowsCount, uint64(len(lr.timestamps)))
|
||||
}
|
||||
doneCh <- struct{}{}
|
||||
}()
|
||||
}
|
||||
timer := timerpool.Get(time.Second)
|
||||
defer timerpool.Put(timer)
|
||||
for i := 0; i < cap(doneCh); i++ {
|
||||
select {
|
||||
case <-doneCh:
|
||||
case <-timer.C:
|
||||
t.Fatalf("timeout")
|
||||
}
|
||||
}
|
||||
|
||||
var ddbStats DatadbStats
|
||||
pt.ddb.updateStats(&ddbStats)
|
||||
if n := ddbStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
|
||||
mustClosePartition(pt)
|
||||
mustDeletePartition(path)
|
||||
|
||||
closeTestStorage(s)
|
||||
}
|
||||
|
||||
// newTestStorage creates a new Storage for tests.
|
||||
//
|
||||
// When the storage is no longer needed, closeTestStorage() must be called.
|
||||
func newTestStorage() *Storage {
|
||||
streamIDCache := workingsetcache.New(1024 * 1024)
|
||||
streamFilterCache := workingsetcache.New(1024 * 1024)
|
||||
return &Storage{
|
||||
flushInterval: time.Second,
|
||||
streamIDCache: streamIDCache,
|
||||
streamFilterCache: streamFilterCache,
|
||||
}
|
||||
}
|
||||
|
||||
// closeTestStorage closes storage created via newTestStorage().
|
||||
func closeTestStorage(s *Storage) {
|
||||
s.streamIDCache.Stop()
|
||||
s.streamFilterCache.Stop()
|
||||
}
|
123
lib/logstorage/rows.go
Normal file
|
@@ -0,0 +1,123 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
)
|
||||
|
||||
// Field is a single field for the log entry.
|
||||
type Field struct {
|
||||
// Name is the name of the field
|
||||
Name string
|
||||
|
||||
// Value is the value of the field
|
||||
Value string
|
||||
}
|
||||
|
||||
// Reset resets f for future re-use.
|
||||
func (f *Field) Reset() {
|
||||
f.Name = ""
|
||||
f.Value = ""
|
||||
}
|
||||
|
||||
// String returns string representation of f.
|
||||
func (f *Field) String() string {
|
||||
name := f.Name
|
||||
if name == "" {
|
||||
name = "_msg"
|
||||
}
|
||||
return fmt.Sprintf("%q:%q", name, f.Value)
|
||||
}
|
||||
|
||||
func (f *Field) marshal(dst []byte) []byte {
|
||||
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.Name))
|
||||
dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.Value))
|
||||
return dst
|
||||
}
|
||||
|
||||
func (f *Field) unmarshal(src []byte) ([]byte, error) {
|
||||
srcOrig := src
|
||||
|
||||
// Unmarshal field name
|
||||
tail, b, err := encoding.UnmarshalBytes(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal field name: %w", err)
|
||||
}
|
||||
// Do not use bytesutil.InternBytes(b) here, since it is slower than string(b) in production
|
||||
f.Name = string(b)
|
||||
src = tail
|
||||
|
||||
// Unmarshal field value
|
||||
tail, b, err = encoding.UnmarshalBytes(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal field value: %w", err)
|
||||
}
|
||||
// Do not use bytesutil.InternBytes(b) here, since it is slower than string(b) in production
|
||||
f.Value = string(b)
|
||||
src = tail
|
||||
|
||||
return src, nil
|
||||
}
|
||||
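// Illustrative sketch (not part of the original change): the
// length-prefixed encoding idea behind Field.marshal()/unmarshal() above,
// written against the standard library instead of the VictoriaMetrics
// encoding package. This is a simplified stand-in, not the actual wire
// format, and it omits error handling.
//
//	package main
//
//	import (
//		"encoding/binary"
//		"fmt"
//	)
//
//	func marshalString(dst []byte, s string) []byte {
//		dst = binary.AppendUvarint(dst, uint64(len(s))) // length prefix
//		return append(dst, s...)                        // raw bytes
//	}
//
//	func unmarshalString(src []byte) (string, []byte) {
//		n, size := binary.Uvarint(src)
//		src = src[size:]
//		return string(src[:n]), src[n:]
//	}
//
//	func main() {
//		buf := marshalString(nil, "level")  // field name
//		buf = marshalString(buf, "error")   // field value
//		name, tail := unmarshalString(buf)
//		value, _ := unmarshalString(tail)
//		fmt.Println(name, value) // level error
//	}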
|
||||
// rows is an auxiliary structure used when merging rows
|
||||
type rows struct {
|
||||
fieldsBuf []Field
|
||||
|
||||
timestamps []int64
|
||||
|
||||
rows [][]Field
|
||||
}
|
||||
|
||||
// reset resets rs
|
||||
func (rs *rows) reset() {
|
||||
fb := rs.fieldsBuf
|
||||
for i := range fb {
|
||||
fb[i].Reset()
|
||||
}
|
||||
rs.fieldsBuf = fb[:0]
|
||||
|
||||
rs.timestamps = rs.timestamps[:0]
|
||||
|
||||
rows := rs.rows
|
||||
for i := range rows {
|
||||
rows[i] = nil
|
||||
}
|
||||
rs.rows = rows[:0]
|
||||
}
|
||||
|
||||
// appendRows appends rows with the given timestamps to rs.
|
||||
func (rs *rows) appendRows(timestamps []int64, rows [][]Field) {
|
||||
rs.timestamps = append(rs.timestamps, timestamps...)
|
||||
|
||||
fieldsBuf := rs.fieldsBuf
|
||||
for _, fields := range rows {
|
||||
fieldsLen := len(fieldsBuf)
|
||||
fieldsBuf = append(fieldsBuf, fields...)
|
||||
rs.rows = append(rs.rows, fieldsBuf[fieldsLen:])
|
||||
}
|
||||
rs.fieldsBuf = fieldsBuf
|
||||
}
|
||||
|
||||
// mergeRows merges the two sets of rows sorted by timestamps and appends the merged result to rs.
|
||||
func (rs *rows) mergeRows(timestampsA, timestampsB []int64, fieldsA, fieldsB [][]Field) {
|
||||
for len(timestampsA) > 0 && len(timestampsB) > 0 {
|
||||
i := 0
|
||||
minTimestamp := timestampsB[0]
|
||||
for i < len(timestampsA) && timestampsA[i] <= minTimestamp {
|
||||
i++
|
||||
}
|
||||
rs.appendRows(timestampsA[:i], fieldsA[:i])
|
||||
fieldsA = fieldsA[i:]
|
||||
timestampsA = timestampsA[i:]
|
||||
|
||||
fieldsA, fieldsB = fieldsB, fieldsA
|
||||
timestampsA, timestampsB = timestampsB, timestampsA
|
||||
}
|
||||
if len(timestampsA) == 0 {
|
||||
rs.appendRows(timestampsB, fieldsB)
|
||||
} else {
|
||||
rs.appendRows(timestampsA, fieldsA)
|
||||
}
|
||||
}
|
287
lib/logstorage/rows_test.go
Normal file
|
@@ -0,0 +1,287 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetRowsSizeBytes(t *testing.T) {
|
||||
f := func(rows [][]Field, uncompressedSizeBytesExpected int) {
|
||||
t.Helper()
|
||||
sizeBytes := uncompressedRowsSizeBytes(rows)
|
||||
if sizeBytes != uint64(uncompressedSizeBytesExpected) {
|
||||
t.Fatalf("unexpected sizeBytes; got %d; want %d", sizeBytes, uncompressedSizeBytesExpected)
|
||||
}
|
||||
}
|
||||
f(nil, 0)
|
||||
f([][]Field{}, 0)
|
||||
f([][]Field{{}}, 35)
|
||||
f([][]Field{{{Name: "foo"}}}, 40)
|
||||
|
||||
_, rows := newTestRows(1000, 10)
|
||||
f(rows, 233900)
|
||||
}
|
||||
|
||||
func TestRowsAppendRows(t *testing.T) {
|
||||
var rs rows
|
||||
|
||||
timestamps := []int64{1}
|
||||
rows := [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
rs.appendRows(timestamps, rows)
|
||||
if len(rs.timestamps) != 1 {
|
||||
t.Fatalf("unexpected number of row items; got %d; want 1", len(rs.timestamps))
|
||||
}
|
||||
rs.appendRows(timestamps, rows)
|
||||
if len(rs.timestamps) != 2 {
|
||||
t.Fatalf("unexpected number of row items; got %d; want 2", len(rs.timestamps))
|
||||
}
|
||||
for i := range rs.timestamps {
|
||||
if rs.timestamps[i] != timestamps[0] {
|
||||
t.Fatalf("unexpected timestamps copied; got %d; want %d", rs.timestamps[i], timestamps[0])
|
||||
}
|
||||
if !reflect.DeepEqual(rs.rows[i], rows[0]) {
|
||||
t.Fatalf("unexpected fields copied\ngot\n%v\nwant\n%v", rs.rows[i], rows[0])
|
||||
}
|
||||
}
|
||||
|
||||
// append multiple log entries
|
||||
timestamps, rows = newTestRows(100, 4)
|
||||
rs.appendRows(timestamps, rows)
|
||||
if len(rs.timestamps) != 102 {
|
||||
t.Fatalf("unexpected number of row items; got %d; want 102", len(rs.timestamps))
|
||||
}
|
||||
for i := range timestamps {
|
||||
if rs.timestamps[i+2] != timestamps[i] {
|
||||
t.Fatalf("unexpected timestamps copied; got %d; want %d", rs.timestamps[i+2], timestamps[i])
|
||||
}
|
||||
if !reflect.DeepEqual(rs.rows[i+2], rows[i]) {
|
||||
t.Fatalf("unexpected log entry copied\ngot\n%v\nwant\n%v", rs.rows[i+2], rows[i])
|
||||
}
|
||||
}
|
||||
|
||||
// reset rows
|
||||
rs.reset()
|
||||
if len(rs.timestamps) != 0 {
|
||||
t.Fatalf("unexpected non-zero number of row items after reset: %d", len(rs.timestamps))
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeRows(t *testing.T) {
|
||||
f := func(timestampsA, timestampsB []int64, fieldsA, fieldsB [][]Field, timestampsExpected []int64, rowsExpected [][]Field) {
|
||||
t.Helper()
|
||||
var rs rows
|
||||
rs.mergeRows(timestampsA, timestampsB, fieldsA, fieldsB)
|
||||
if !reflect.DeepEqual(rs.timestamps, timestampsExpected) {
|
||||
t.Fatalf("unexpected timestamps after merge\ngot\n%v\nwant\n%v", rs.timestamps, timestampsExpected)
|
||||
}
|
||||
if !reflect.DeepEqual(rs.rows, rowsExpected) {
|
||||
t.Fatalf("unexpected rows after merge\ngot\n%v\nwant\n%v", rs.rows, rowsExpected)
|
||||
}
|
||||
|
||||
// check that the result doesn't change when merging in reverse order
|
||||
rs.reset()
|
||||
rs.mergeRows(timestampsB, timestampsA, fieldsB, fieldsA)
|
||||
if !reflect.DeepEqual(rs.timestamps, timestampsExpected) {
|
||||
t.Fatalf("unexpected timestamps after reverse merge\ngot\n%v\nwant\n%v", rs.timestamps, timestampsExpected)
|
||||
}
|
||||
if !reflect.DeepEqual(rs.rows, rowsExpected) {
|
||||
t.Fatalf("unexpected rows after reverse merge\ngot\n%v\nwant\n%v", rs.rows, rowsExpected)
|
||||
}
|
||||
}
|
||||
|
||||
f(nil, nil, nil, nil, nil, nil)
|
||||
|
||||
// merge single entry with zero entries
|
||||
timestampsA := []int64{123}
|
||||
timestampsB := []int64{}
|
||||
|
||||
fieldsA := [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
fieldsB := [][]Field{}
|
||||
|
||||
resultTimestamps := []int64{123}
|
||||
resultFields := [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
|
||||
|
||||
// merge two single entries
|
||||
timestampsA = []int64{123}
|
||||
timestampsB = []int64{43323}
|
||||
|
||||
fieldsA = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
fieldsB = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "asdfds",
|
||||
Value: "asdfsa",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
resultTimestamps = []int64{123, 43323}
|
||||
resultFields = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "asdfds",
|
||||
Value: "asdfsa",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
|
||||
|
||||
// merge identical entries
|
||||
timestampsA = []int64{123, 456}
|
||||
timestampsB = []int64{123, 456}
|
||||
|
||||
fieldsA = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "baz",
|
||||
},
|
||||
},
|
||||
}
|
||||
fieldsB = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "baz",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
resultTimestamps = []int64{123, 123, 456, 456}
|
||||
resultFields = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "baz",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "baz",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
|
||||
|
||||
// merge interleaved entries
|
||||
timestampsA = []int64{12, 13432}
|
||||
timestampsB = []int64{3, 43323}
|
||||
|
||||
fieldsA = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "xfoo",
|
||||
Value: "xbar",
|
||||
},
|
||||
},
|
||||
}
|
||||
fieldsB = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "asd",
|
||||
Value: "assa",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "asdfds",
|
||||
Value: "asdfsa",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
resultTimestamps = []int64{3, 12, 13432, 43323}
|
||||
resultFields = [][]Field{
|
||||
{
|
||||
{
|
||||
Name: "asd",
|
||||
Value: "assa",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "foo",
|
||||
Value: "bar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "xfoo",
|
||||
Value: "xbar",
|
||||
},
|
||||
},
|
||||
{
|
||||
{
|
||||
Name: "asdfds",
|
||||
Value: "asdfsa",
|
||||
},
|
||||
},
|
||||
}
|
||||
f(timestampsA, timestampsB, fieldsA, fieldsB, resultTimestamps, resultFields)
|
||||
}
|
532
lib/logstorage/storage.go
Normal file
|
@@ -0,0 +1,532 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
|
||||
)
|
||||
|
||||
// StorageStats represents stats for the storage. It may be obtained by calling Storage.UpdateStats().
|
||||
type StorageStats struct {
|
||||
// RowsDroppedTooBigTimestamp is the number of rows dropped during data ingestion because their timestamp is bigger than the maximum allowed
|
||||
RowsDroppedTooBigTimestamp uint64
|
||||
|
||||
// RowsDroppedTooSmallTimestamp is the number of rows dropped during data ingestion because their timestamp is smaller than the minimum allowed
|
||||
RowsDroppedTooSmallTimestamp uint64
|
||||
|
||||
// PartitionsCount is the number of partitions in the storage
|
||||
PartitionsCount uint64
|
||||
|
||||
PartitionStats
|
||||
}
|
||||
|
||||
// Reset resets s.
|
||||
func (s *StorageStats) Reset() {
|
||||
*s = StorageStats{}
|
||||
}
|
||||
|
||||
// StorageConfig is the config for the Storage.
|
||||
type StorageConfig struct {
|
||||
// Retention is the retention for the ingested data.
|
||||
//
|
||||
// Older data is automatically deleted.
|
||||
Retention time.Duration
|
||||
|
||||
// FlushInterval is the interval for flushing the in-memory data at the Storage to disk.
|
||||
FlushInterval time.Duration
|
||||
|
||||
// FutureRetention is the maximum allowed interval from the current time into the future for timestamps of the ingested data.
|
||||
//
|
||||
// Log entries with timestamps bigger than now+FutureRetention are ignored.
|
||||
FutureRetention time.Duration
|
||||
|
||||
// LogNewStreams indicates whether to log newly created log streams.
|
||||
//
|
||||
// This can be useful for debugging of high cardinality issues.
|
||||
// https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#high-cardinality
|
||||
LogNewStreams bool
|
||||
|
||||
// LogIngestedRows indicates whether to log the ingested log entries.
|
||||
//
|
||||
// This can be useful for debugging of data ingestion.
|
||||
LogIngestedRows bool
|
||||
}
|
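A minimal usage sketch for this config; the helper name, path and durations below are assumptions chosen only for illustration, while MustOpenStorage and MustClose are defined later in this file:

// exampleOpenStorage is an illustrative sketch; the function name, path and
// durations are assumptions showing how StorageConfig is typically used.
func exampleOpenStorage() {
	cfg := &StorageConfig{
		Retention:       7 * 24 * time.Hour,
		FlushInterval:   time.Second,
		FutureRetention: 24 * time.Hour,
	}
	s := MustOpenStorage("victoria-logs-data", cfg)
	defer s.MustClose()

	// Ingest and query log entries here via s.MustAddRows() and s.RunQuery().
}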
||||
|
||||
// Storage is the storage for log entries.
|
||||
type Storage struct {
|
||||
rowsDroppedTooBigTimestamp uint64
|
||||
rowsDroppedTooSmallTimestamp uint64
|
||||
|
||||
// path is the path to the Storage directory
|
||||
path string
|
||||
|
||||
// retention is the retention for the stored data
|
||||
//
|
||||
// older data is automatically deleted
|
||||
retention time.Duration
|
||||
|
||||
// flushInterval is the interval for flushing in-memory data to disk
|
||||
flushInterval time.Duration
|
||||
|
||||
// futureRetention is the maximum allowed interval to write data into the future
|
||||
futureRetention time.Duration
|
||||
|
||||
// logNewStreams instructs to log new streams if it is set to true
|
||||
logNewStreams bool
|
||||
|
||||
// logIngestedRows instructs to log all the ingested log entries if it is set to true
|
||||
logIngestedRows bool
|
||||
|
||||
// flockF is a lock file, which ensures that the Storage is opened by a single process at a time
|
||||
flockF *os.File
|
||||
|
||||
// partitions is a list of partitions for the Storage.
|
||||
//
|
||||
// It must be accessed under partitionsLock.
|
||||
partitions []*partitionWrapper
|
||||
|
||||
// ptwHot is the "hot" partition, where the last rows were ingested.
|
||||
//
|
||||
// It must be accessed under partitionsLock.
|
||||
ptwHot *partitionWrapper
|
||||
|
||||
// partitionsLock protects partitions and ptwHot.
|
||||
partitionsLock sync.Mutex
|
||||
|
||||
// stopCh is closed when the Storage must be stopped.
|
||||
stopCh chan struct{}
|
||||
|
||||
// wg is used for waiting for background workers at MustClose().
|
||||
wg sync.WaitGroup
|
||||
|
||||
// streamIDCache caches (partition, streamIDs) seen during data ingestion.
|
||||
//
|
||||
// It reduces the load on persistent storage during data ingestion by skipping
|
||||
// the check whether the given stream is already registered in the persistent storage.
|
||||
streamIDCache *workingsetcache.Cache
|
||||
|
||||
// streamTagsCache caches StreamTags entries keyed by streamID.
|
||||
//
|
||||
// There is no need to put partition into the key for StreamTags,
|
||||
// since StreamTags are uniquely identified by streamID.
|
||||
//
|
||||
// It reduces the load on persistent storage during querying
|
||||
// when StreamTags must be found for the particular streamID
|
||||
streamTagsCache *workingsetcache.Cache
|
||||
|
||||
// streamFilterCache caches streamIDs keyed by (partition, []TenantID, StreamFilter).
|
||||
//
|
||||
// It reduces the load on persistent storage during querying by _stream:{...} filter.
|
||||
streamFilterCache *workingsetcache.Cache
|
||||
}
|
||||
|
||||
type partitionWrapper struct {
|
||||
// refCount is the number of active references to p.
|
||||
// When it reaches zero, then the p is closed.
|
||||
refCount int32
|
||||
|
||||
// The flag, which is set when the partition must be deleted after refCount reaches zero.
|
||||
mustBeDeleted uint32
|
||||
|
||||
// day is the day for the partition, expressed as the Unix timestamp in nanoseconds divided by the number of nanoseconds per day.
|
||||
day int64
|
||||
|
||||
// pt is the wrapped partition.
|
||||
pt *partition
|
||||
}
|
||||
|
||||
func newPartitionWrapper(pt *partition, day int64) *partitionWrapper {
|
||||
pw := &partitionWrapper{
|
||||
day: day,
|
||||
pt: pt,
|
||||
}
|
||||
pw.incRef()
|
||||
return pw
|
||||
}
|
||||
|
||||
func (ptw *partitionWrapper) incRef() {
|
||||
atomic.AddInt32(&ptw.refCount, 1)
|
||||
}
|
||||
|
||||
func (ptw *partitionWrapper) decRef() {
|
||||
n := atomic.AddInt32(&ptw.refCount, -1)
|
||||
if n > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
deletePath := ""
|
||||
if atomic.LoadUint32(&ptw.mustBeDeleted) != 0 {
|
||||
deletePath = ptw.pt.path
|
||||
}
|
||||
|
||||
// Close pw.pt, since nobody refers to it.
|
||||
mustClosePartition(ptw.pt)
|
||||
ptw.pt = nil
|
||||
|
||||
// Delete partition if needed.
|
||||
if deletePath != "" {
|
||||
mustDeletePartition(deletePath)
|
||||
}
|
||||
}
|
||||
|
||||
func (ptw *partitionWrapper) canAddAllRows(lr *LogRows) bool {
|
||||
minTimestamp := ptw.day * nsecPerDay
|
||||
maxTimestamp := minTimestamp + nsecPerDay - 1
|
||||
for _, ts := range lr.timestamps {
|
||||
if ts < minTimestamp || ts > maxTimestamp {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// mustCreateStorage creates Storage at the given path.
|
||||
func mustCreateStorage(path string) {
|
||||
fs.MustMkdirFailIfExist(path)
|
||||
|
||||
partitionsPath := filepath.Join(path, partitionsDirname)
|
||||
fs.MustMkdirFailIfExist(partitionsPath)
|
||||
}
|
||||
|
||||
// MustOpenStorage opens Storage at the given path.
|
||||
//
|
||||
// MustClose must be called on the returned Storage when it is no longer needed.
|
||||
func MustOpenStorage(path string, cfg *StorageConfig) *Storage {
|
||||
flushInterval := cfg.FlushInterval
|
||||
if flushInterval < time.Second {
|
||||
flushInterval = time.Second
|
||||
}
|
||||
|
||||
retention := cfg.Retention
|
||||
if retention < 24*time.Hour {
|
||||
retention = 24 * time.Hour
|
||||
}
|
||||
|
||||
futureRetention := cfg.FutureRetention
|
||||
if futureRetention < 24*time.Hour {
|
||||
futureRetention = 24 * time.Hour
|
||||
}
|
||||
|
||||
if !fs.IsPathExist(path) {
|
||||
mustCreateStorage(path)
|
||||
}
|
||||
|
||||
flockF := fs.MustCreateFlockFile(path)
|
||||
|
||||
// Load caches
|
||||
mem := memory.Allowed()
|
||||
streamIDCachePath := filepath.Join(path, cacheDirname, streamIDCacheFilename)
|
||||
streamIDCache := workingsetcache.Load(streamIDCachePath, mem/16)
|
||||
|
||||
streamTagsCache := workingsetcache.New(mem / 10)
|
||||
|
||||
streamFilterCache := workingsetcache.New(mem / 10)
|
||||
|
||||
s := &Storage{
|
||||
path: path,
|
||||
retention: retention,
|
||||
flushInterval: flushInterval,
|
||||
futureRetention: futureRetention,
|
||||
logNewStreams: cfg.LogNewStreams,
|
||||
logIngestedRows: cfg.LogIngestedRows,
|
||||
flockF: flockF,
|
||||
stopCh: make(chan struct{}),
|
||||
|
||||
streamIDCache: streamIDCache,
|
||||
streamTagsCache: streamTagsCache,
|
||||
streamFilterCache: streamFilterCache,
|
||||
}
|
||||
|
||||
partitionsPath := filepath.Join(path, partitionsDirname)
|
||||
fs.MustMkdirIfNotExist(partitionsPath)
|
||||
des := fs.MustReadDir(partitionsPath)
|
||||
ptws := make([]*partitionWrapper, len(des))
|
||||
for i, de := range des {
|
||||
fname := de.Name()
|
||||
|
||||
// Parse the day for the partition
|
||||
t, err := time.Parse(partitionNameFormat, fname)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot parse partition filename %q at %q; it must be in the form YYYYMMDD: %s", fname, partitionsPath, err)
|
||||
}
|
||||
day := t.UTC().UnixNano() / nsecPerDay
|
||||
|
||||
partitionPath := filepath.Join(partitionsPath, fname)
|
||||
pt := mustOpenPartition(s, partitionPath)
|
||||
ptws[i] = newPartitionWrapper(pt, day)
|
||||
}
|
||||
sort.Slice(ptws, func(i, j int) bool {
|
||||
return ptws[i].day < ptws[j].day
|
||||
})
|
||||
|
||||
// Delete partitions from the future if needed
|
||||
maxAllowedDay := s.getMaxAllowedDay()
|
||||
j := len(ptws) - 1
|
||||
for j >= 0 {
|
||||
ptw := ptws[j]
|
||||
if ptw.day <= maxAllowedDay {
|
||||
break
|
||||
}
|
||||
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -futureRetention=%dd", ptw.pt.path, durationToDays(s.futureRetention))
|
||||
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
|
||||
ptw.decRef()
|
||||
j--
|
||||
}
|
||||
j++
|
||||
for i := j; i < len(ptws); i++ {
|
||||
ptws[i] = nil
|
||||
}
|
||||
ptws = ptws[:j]
|
||||
|
||||
s.partitions = ptws
|
||||
s.runRetentionWatcher()
|
||||
return s
|
||||
}
|
||||
|
||||
const partitionNameFormat = "20060102"
|
||||
|
||||
func (s *Storage) runRetentionWatcher() {
|
||||
s.wg.Add(1)
|
||||
go func() {
|
||||
s.watchRetention()
|
||||
s.wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
func (s *Storage) watchRetention() {
|
||||
ticker := time.NewTicker(time.Hour)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
var ptwsToDelete []*partitionWrapper
|
||||
minAllowedDay := s.getMinAllowedDay()
|
||||
|
||||
s.partitionsLock.Lock()
|
||||
|
||||
// Delete outdated partitions.
|
||||
// s.partitions are sorted by day, so the partitions that can become outdated are located at the beginning of the list
|
||||
for _, ptw := range s.partitions {
|
||||
if ptw.day >= minAllowedDay {
|
||||
break
|
||||
}
|
||||
ptwsToDelete = append(ptwsToDelete, ptw)
|
||||
}
|
||||
for i := range ptwsToDelete {
|
||||
s.partitions[i] = nil
|
||||
}
|
||||
s.partitions = s.partitions[len(ptwsToDelete):]
|
||||
|
||||
s.partitionsLock.Unlock()
|
||||
|
||||
for _, ptw := range ptwsToDelete {
|
||||
logger.Infof("the partition %s is scheduled to be deleted because it is outside the -retentionPeriod=%dd", ptw.pt.path, durationToDays(s.retention))
|
||||
atomic.StoreUint32(&ptw.mustBeDeleted, 1)
|
||||
ptw.decRef()
|
||||
}
|
||||
|
||||
select {
|
||||
case <-s.stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Storage) getMinAllowedDay() int64 {
|
||||
return time.Now().UTC().Add(-s.retention).UnixNano() / nsecPerDay
|
||||
}
|
||||
|
||||
func (s *Storage) getMaxAllowedDay() int64 {
|
||||
return time.Now().UTC().Add(s.futureRetention).UnixNano() / nsecPerDay
|
||||
}
|
||||
|
||||
// MustClose closes s.
|
||||
//
|
||||
// It is expected that nobody uses the storage at the close time.
|
||||
func (s *Storage) MustClose() {
|
||||
// Stop background workers
|
||||
close(s.stopCh)
|
||||
s.wg.Wait()
|
||||
|
||||
// Close partitions
|
||||
for _, pw := range s.partitions {
|
||||
pw.decRef()
|
||||
if pw.refCount != 0 {
|
||||
logger.Panicf("BUG: there are %d users of partition", pw.refCount)
|
||||
}
|
||||
}
|
||||
s.partitions = nil
|
||||
|
||||
// Save caches
|
||||
streamIDCachePath := filepath.Join(s.path, cacheDirname, streamIDCacheFilename)
|
||||
if err := s.streamIDCache.Save(streamIDCachePath); err != nil {
|
||||
logger.Panicf("FATAL: cannot save streamID cache to %q: %s", streamIDCachePath, err)
|
||||
}
|
||||
s.streamIDCache.Stop()
|
||||
s.streamIDCache = nil
|
||||
|
||||
s.streamTagsCache.Stop()
|
||||
s.streamTagsCache = nil
|
||||
|
||||
s.streamFilterCache.Stop()
|
||||
s.streamFilterCache = nil
|
||||
|
||||
// release lock file
|
||||
fs.MustClose(s.flockF)
|
||||
s.flockF = nil
|
||||
|
||||
s.path = ""
|
||||
}
|
||||
|
||||
// MustAddRows adds lr to s.
|
||||
func (s *Storage) MustAddRows(lr *LogRows) {
|
||||
// Fast path - try adding all the rows to the hot partition
|
||||
s.partitionsLock.Lock()
|
||||
ptwHot := s.ptwHot
|
||||
if ptwHot != nil {
|
||||
ptwHot.incRef()
|
||||
}
|
||||
s.partitionsLock.Unlock()
|
||||
|
||||
if ptwHot != nil {
|
||||
if ptwHot.canAddAllRows(lr) {
|
||||
ptwHot.pt.mustAddRows(lr)
|
||||
ptwHot.decRef()
|
||||
return
|
||||
}
|
||||
ptwHot.decRef()
|
||||
}
|
||||
|
||||
// Slow path - rows cannot be added to the hot partition, so split rows among available partitions
|
||||
minAllowedDay := s.getMinAllowedDay()
|
||||
maxAllowedDay := s.getMaxAllowedDay()
|
||||
m := make(map[int64]*LogRows)
|
||||
for i, ts := range lr.timestamps {
|
||||
day := ts / nsecPerDay
|
||||
if day < minAllowedDay {
|
||||
rf := RowFormatter(lr.rows[i])
|
||||
tsf := TimeFormatter(ts)
|
||||
minAllowedTsf := TimeFormatter(minAllowedDay * nsecPerDay)
|
||||
tooSmallTimestampLogger.Warnf("skipping log entry with too small timestamp=%s; it must be bigger than %s according "+
|
||||
"to the configured -retentionPeriod. See https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
|
||||
"log entry: %s", &tsf, &minAllowedTsf, &rf)
|
||||
atomic.AddUint64(&s.rowsDroppedTooSmallTimestamp, 1)
|
||||
continue
|
||||
}
|
||||
if day > maxAllowedDay {
|
||||
rf := RowFormatter(lr.rows[i])
|
||||
tsf := TimeFormatter(ts)
|
||||
maxAllowedTsf := TimeFormatter(maxAllowedDay * nsecPerDay)
|
||||
tooBigTimestampLogger.Warnf("skipping log entry with too big timestamp=%s; it must be smaller than %s according "+
|
||||
"to the configured -futureRetention; see https://docs.victoriametrics.com/VictoriaLogs/#retention ; "+
|
||||
"log entry: %s", &tsf, &maxAllowedTsf, &rf)
|
||||
atomic.AddUint64(&s.rowsDroppedTooBigTimestamp, 1)
|
||||
continue
|
||||
}
|
||||
lrPart := m[day]
|
||||
if lrPart == nil {
|
||||
lrPart = GetLogRows(nil, nil)
|
||||
m[day] = lrPart
|
||||
}
|
||||
lrPart.mustAddInternal(lr.streamIDs[i], ts, lr.rows[i], lr.streamTagsCanonicals[i])
|
||||
}
|
||||
for day, lrPart := range m {
|
||||
ptw := s.getPartitionForDay(day)
|
||||
ptw.pt.mustAddRows(lrPart)
|
||||
ptw.decRef()
|
||||
PutLogRows(lrPart)
|
||||
}
|
||||
}
|
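A minimal ingestion sketch built from the same calls the tests in this commit use (GetLogRows, LogRows.MustAdd, PutLogRows); the helper name, tenant and field values are illustrative assumptions:

// exampleIngestRow is an illustrative sketch; the helper name, tenant and field
// values are assumptions showing the typical MustAddRows call sequence.
func exampleIngestRow(s *Storage) {
	lr := GetLogRows(nil, nil)
	defer PutLogRows(lr)

	tenantID := TenantID{AccountID: 1, ProjectID: 2}
	fields := []Field{
		{Name: "_msg", Value: "user logged in"},
		{Name: "source-file", Value: "/var/log/app.log"},
	}
	lr.MustAdd(tenantID, time.Now().UnixNano(), fields)

	s.MustAddRows(lr)
}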
||||
|
||||
var tooSmallTimestampLogger = logger.WithThrottler("too_small_timestamp", 5*time.Second)
|
||||
var tooBigTimestampLogger = logger.WithThrottler("too_big_timestamp", 5*time.Second)
|
||||
|
||||
const nsecPerDay = 24 * 3600 * 1e9
|
||||
|
||||
// TimeFormatter implements fmt.Stringer for timestamp in nanoseconds
|
||||
type TimeFormatter int64
|
||||
|
||||
// String returns human-readable representation for tf.
|
||||
func (tf *TimeFormatter) String() string {
|
||||
ts := int64(*tf)
|
||||
t := time.Unix(0, ts).UTC()
|
||||
return t.Format(time.RFC3339)
|
||||
}
|
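A tiny usage sketch for TimeFormatter; the helper name and the log message are assumptions for illustration:

// exampleLogTimestamp is an illustrative sketch; the helper name and message text
// are assumptions showing how TimeFormatter produces human-readable timestamps.
func exampleLogTimestamp(ts int64) {
	tsf := TimeFormatter(ts)
	logger.Infof("processing log entries with timestamps up to %s", &tsf)
}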
||||
|
||||
func (s *Storage) getPartitionForDay(day int64) *partitionWrapper {
|
||||
s.partitionsLock.Lock()
|
||||
|
||||
// Search for the partition using binary search
|
||||
ptws := s.partitions
|
||||
n := sort.Search(len(ptws), func(i int) bool {
|
||||
return ptws[i].day >= day
|
||||
})
|
||||
var ptw *partitionWrapper
|
||||
if n < len(ptws) {
|
||||
ptw = ptws[n]
|
||||
if ptw.day != day {
|
||||
ptw = nil
|
||||
}
|
||||
}
|
||||
if ptw == nil {
|
||||
// Missing partition for the given day. Create it.
|
||||
fname := time.Unix(0, day*nsecPerDay).UTC().Format(partitionNameFormat)
|
||||
partitionPath := filepath.Join(s.path, partitionsDirname, fname)
|
||||
mustCreatePartition(partitionPath)
|
||||
|
||||
pt := mustOpenPartition(s, partitionPath)
|
||||
ptw = newPartitionWrapper(pt, day)
|
||||
if n == len(ptws) {
|
||||
ptws = append(ptws, ptw)
|
||||
} else {
|
||||
ptws = append(ptws[:n+1], ptws[n:]...)
|
||||
ptws[n] = ptw
|
||||
}
|
||||
s.partitions = ptws
|
||||
}
|
||||
|
||||
s.ptwHot = ptw
|
||||
ptw.incRef()
|
||||
|
||||
s.partitionsLock.Unlock()
|
||||
|
||||
return ptw
|
||||
}
|
||||
|
||||
// UpdateStats updates ss for the given s.
|
||||
func (s *Storage) UpdateStats(ss *StorageStats) {
|
||||
ss.RowsDroppedTooBigTimestamp += atomic.LoadUint64(&s.rowsDroppedTooBigTimestamp)
|
||||
ss.RowsDroppedTooSmallTimestamp += atomic.LoadUint64(&s.rowsDroppedTooSmallTimestamp)
|
||||
|
||||
s.partitionsLock.Lock()
|
||||
ss.PartitionsCount += uint64(len(s.partitions))
|
||||
for _, ptw := range s.partitions {
|
||||
ptw.pt.updateStats(&ss.PartitionStats)
|
||||
}
|
||||
s.partitionsLock.Unlock()
|
||||
}
|
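A short sketch of how callers are expected to read the stats; the helper name and the way the counters are combined are assumptions:

// exampleCollectStats is an illustrative sketch; the helper name and the way the
// counters are consumed are assumptions.
func exampleCollectStats(s *Storage) (partitions, droppedRows uint64) {
	var ss StorageStats
	s.UpdateStats(&ss)
	return ss.PartitionsCount, ss.RowsDroppedTooSmallTimestamp + ss.RowsDroppedTooBigTimestamp
}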
||||
|
||||
func (s *Storage) debugFlush() {
|
||||
s.partitionsLock.Lock()
|
||||
ptws := append([]*partitionWrapper{}, s.partitions...)
|
||||
for _, ptw := range ptws {
|
||||
ptw.incRef()
|
||||
}
|
||||
s.partitionsLock.Unlock()
|
||||
|
||||
for _, ptw := range ptws {
|
||||
ptw.pt.debugFlush()
|
||||
ptw.decRef()
|
||||
}
|
||||
}
|
||||
|
||||
func durationToDays(d time.Duration) int64 {
|
||||
return int64(d / (time.Hour * 24))
|
||||
}
|
602
lib/logstorage/storage_search.go
Normal file
|
@@ -0,0 +1,602 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
|
||||
)
|
||||
|
||||
// genericSearchOptions contain options used for search.
|
||||
type genericSearchOptions struct {
|
||||
// tenantIDs must contain the list of tenantIDs for the search.
|
||||
tenantIDs []TenantID
|
||||
|
||||
// filter is the filter to use for the search
|
||||
filter filter
|
||||
|
||||
// resultColumnNames contains the names of columns to return in the result.
|
||||
resultColumnNames []string
|
||||
}
|
||||
|
||||
type searchOptions struct {
|
||||
// Optional sorted list of tenantIDs for the search.
|
||||
// If it is empty, then the search is performed by streamIDs
|
||||
tenantIDs []TenantID
|
||||
|
||||
// Optional sorted list of streamIDs for the search.
|
||||
// If it is empty, then the search is performed by tenantIDs
|
||||
streamIDs []streamID
|
||||
|
||||
// minTimestamp is the minimum timestamp for the search
|
||||
minTimestamp int64
|
||||
|
||||
// maxTimestamp is the maximum timestamp for the search
|
||||
maxTimestamp int64
|
||||
|
||||
// filter is the filter to use for the search
|
||||
filter filter
|
||||
|
||||
// resultColumnNames contains the names of columns to return in the result
|
||||
resultColumnNames []string
|
||||
}
|
||||
|
||||
// RunQuery runs the given q and calls processBlock for results
|
||||
func (s *Storage) RunQuery(tenantIDs []TenantID, q *Query, stopCh <-chan struct{}, processBlock func(columns []BlockColumn)) {
|
||||
resultColumnNames := q.getResultColumnNames()
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: tenantIDs,
|
||||
filter: q.f,
|
||||
resultColumnNames: resultColumnNames,
|
||||
}
|
||||
workersCount := cgroup.AvailableCPUs()
|
||||
s.search(workersCount, so, stopCh, func(workerID uint, br *blockResult) {
|
||||
brs := getBlockRows()
|
||||
cs := brs.cs
|
||||
|
||||
for i, columnName := range resultColumnNames {
|
||||
cs = append(cs, BlockColumn{
|
||||
Name: columnName,
|
||||
Values: br.getColumnValues(i),
|
||||
})
|
||||
}
|
||||
processBlock(cs)
|
||||
|
||||
brs.cs = cs
|
||||
putBlockRows(brs)
|
||||
})
|
||||
}
|
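A minimal query sketch; the helper name, query text and tenant are assumptions, while ParseQuery, RunQuery and BlockColumn come from this package:

// exampleRunQuery is an illustrative sketch; the helper name, query text and tenant
// are assumptions showing how RunQuery is called with a per-block callback.
func exampleRunQuery(s *Storage) error {
	q, err := ParseQuery(`_stream:{job="foobar"} error`)
	if err != nil {
		return err
	}
	tenantIDs := []TenantID{{AccountID: 1, ProjectID: 2}}
	s.RunQuery(tenantIDs, q, nil, func(columns []BlockColumn) {
		for _, c := range columns {
			_ = c.Name   // column name, e.g. "_msg"
			_ = c.Values // one value per matching row in the block
		}
	})
	return nil
}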
||||
|
||||
type blockRows struct {
|
||||
cs []BlockColumn
|
||||
}
|
||||
|
||||
func (brs *blockRows) reset() {
|
||||
cs := brs.cs
|
||||
for i := range cs {
|
||||
cs[i].reset()
|
||||
}
|
||||
brs.cs = cs[:0]
|
||||
}
|
||||
|
||||
func getBlockRows() *blockRows {
|
||||
v := blockRowsPool.Get()
|
||||
if v == nil {
|
||||
return &blockRows{}
|
||||
}
|
||||
return v.(*blockRows)
|
||||
}
|
||||
|
||||
func putBlockRows(brs *blockRows) {
|
||||
brs.reset()
|
||||
blockRowsPool.Put(brs)
|
||||
}
|
||||
|
||||
var blockRowsPool sync.Pool
|
||||
|
||||
// BlockColumn is a single column of a block of data
|
||||
type BlockColumn struct {
|
||||
// Name is the column name
|
||||
Name string
|
||||
|
||||
// Values is column values
|
||||
Values []string
|
||||
}
|
||||
|
||||
func (c *BlockColumn) reset() {
|
||||
c.Name = ""
|
||||
c.Values = nil
|
||||
}
|
||||
|
||||
// The number of blocks to search at once by a single worker
|
||||
//
|
||||
// This number must be increased on systems with many CPU cores in order to amortize
|
||||
// the overhead for passing the blockSearchWork to worker goroutines.
|
||||
const blockSearchWorksPerBatch = 64
|
||||
|
||||
// searchResultFunc must process the given br.
|
||||
//
|
||||
// The callback is called at the worker with the given workerID.
|
||||
type searchResultFunc func(workerID uint, br *blockResult)
|
||||
|
||||
// search searches for the matching rows according to so.
|
||||
//
|
||||
// It calls f for each found matching block.
|
||||
func (s *Storage) search(workersCount int, so *genericSearchOptions, stopCh <-chan struct{}, processBlockResult searchResultFunc) {
|
||||
// Spin up workers
|
||||
var wg sync.WaitGroup
|
||||
workCh := make(chan []*blockSearchWork, workersCount)
|
||||
wg.Add(workersCount)
|
||||
for i := 0; i < workersCount; i++ {
|
||||
go func(workerID uint) {
|
||||
bs := getBlockSearch()
|
||||
for bsws := range workCh {
|
||||
for _, bsw := range bsws {
|
||||
bs.search(bsw)
|
||||
if bs.br.RowsCount() > 0 {
|
||||
processBlockResult(workerID, &bs.br)
|
||||
}
|
||||
}
|
||||
}
|
||||
putBlockSearch(bs)
|
||||
wg.Done()
|
||||
}(uint(i))
|
||||
}
|
||||
|
||||
// Obtain common time filter from so.filter
|
||||
tf, f := getCommonTimeFilter(so.filter)
|
||||
|
||||
// Select partitions according to the selected time range
|
||||
s.partitionsLock.Lock()
|
||||
ptws := s.partitions
|
||||
minDay := tf.minTimestamp / nsecPerDay
|
||||
n := sort.Search(len(ptws), func(i int) bool {
|
||||
return ptws[i].day >= minDay
|
||||
})
|
||||
ptws = ptws[n:]
|
||||
maxDay := tf.maxTimestamp / nsecPerDay
|
||||
n = sort.Search(len(ptws), func(i int) bool {
|
||||
return ptws[i].day > maxDay
|
||||
})
|
||||
ptws = ptws[:n]
|
||||
for _, ptw := range ptws {
|
||||
ptw.incRef()
|
||||
}
|
||||
s.partitionsLock.Unlock()
|
||||
|
||||
// Obtain common streamFilter from f
|
||||
var sf *StreamFilter
|
||||
sf, f = getCommonStreamFilter(f)
|
||||
|
||||
// Apply search to matching partitions
|
||||
var pws []*partWrapper
|
||||
for _, ptw := range ptws {
|
||||
pws = ptw.pt.search(pws, tf, sf, f, so, workCh, stopCh)
|
||||
}
|
||||
|
||||
// Wait until workers finish their work
|
||||
close(workCh)
|
||||
wg.Wait()
|
||||
|
||||
// Decrement references to parts
|
||||
for _, pw := range pws {
|
||||
pw.decRef()
|
||||
}
|
||||
|
||||
// Decrement references to partitions
|
||||
for _, ptw := range ptws {
|
||||
ptw.decRef()
|
||||
}
|
||||
}
|
||||
|
||||
func (pt *partition) search(pwsDst []*partWrapper, tf *timeFilter, sf *StreamFilter, f filter, so *genericSearchOptions,
|
||||
workCh chan<- []*blockSearchWork, stopCh <-chan struct{},
|
||||
) []*partWrapper {
|
||||
tenantIDs := so.tenantIDs
|
||||
var streamIDs []streamID
|
||||
if sf != nil {
|
||||
streamIDs = pt.idb.searchStreamIDs(tenantIDs, sf)
|
||||
tenantIDs = nil
|
||||
}
|
||||
if hasStreamFilters(f) {
|
||||
f = initStreamFilters(tenantIDs, pt.idb, f)
|
||||
}
|
||||
soInternal := &searchOptions{
|
||||
tenantIDs: tenantIDs,
|
||||
streamIDs: streamIDs,
|
||||
minTimestamp: tf.minTimestamp,
|
||||
maxTimestamp: tf.maxTimestamp,
|
||||
filter: f,
|
||||
resultColumnNames: so.resultColumnNames,
|
||||
}
|
||||
return pt.ddb.search(pwsDst, soInternal, workCh, stopCh)
|
||||
}
|
||||
|
||||
func hasStreamFilters(f filter) bool {
|
||||
switch t := f.(type) {
|
||||
case *andFilter:
|
||||
return hasStreamFiltersInList(t.filters)
|
||||
case *orFilter:
|
||||
return hasStreamFiltersInList(t.filters)
|
||||
case *notFilter:
|
||||
return hasStreamFilters(t.f)
|
||||
case *streamFilter:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func hasStreamFiltersInList(filters []filter) bool {
|
||||
for _, f := range filters {
|
||||
if hasStreamFilters(f) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func initStreamFilters(tenantIDs []TenantID, idb *indexdb, f filter) filter {
|
||||
switch t := f.(type) {
|
||||
case *andFilter:
|
||||
return &andFilter{
|
||||
filters: initStreamFiltersList(tenantIDs, idb, t.filters),
|
||||
}
|
||||
case *orFilter:
|
||||
return &orFilter{
|
||||
filters: initStreamFiltersList(tenantIDs, idb, t.filters),
|
||||
}
|
||||
case *notFilter:
|
||||
return ¬Filter{
|
||||
f: initStreamFilters(tenantIDs, idb, t.f),
|
||||
}
|
||||
case *streamFilter:
|
||||
return &streamFilter{
|
||||
f: t.f,
|
||||
tenantIDs: tenantIDs,
|
||||
idb: idb,
|
||||
}
|
||||
default:
|
||||
return t
|
||||
}
|
||||
}
|
||||
|
||||
func initStreamFiltersList(tenantIDs []TenantID, idb *indexdb, filters []filter) []filter {
|
||||
result := make([]filter, len(filters))
|
||||
for i, f := range filters {
|
||||
result[i] = initStreamFilters(tenantIDs, idb, f)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (ddb *datadb) search(pwsDst []*partWrapper, so *searchOptions, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) []*partWrapper {
|
||||
// Select parts with data for the given time range
|
||||
ddb.partsLock.Lock()
|
||||
pwsDstLen := len(pwsDst)
|
||||
pwsDst = appendPartsInTimeRange(pwsDst, ddb.inmemoryParts, so.minTimestamp, so.maxTimestamp)
|
||||
pwsDst = appendPartsInTimeRange(pwsDst, ddb.fileParts, so.minTimestamp, so.maxTimestamp)
|
||||
pws := pwsDst[pwsDstLen:]
|
||||
for _, pw := range pws {
|
||||
pw.incRef()
|
||||
}
|
||||
ddb.partsLock.Unlock()
|
||||
|
||||
// Apply search to matching parts
|
||||
for _, pw := range pws {
|
||||
pw.p.search(so, workCh, stopCh)
|
||||
}
|
||||
|
||||
return pwsDst
|
||||
}
|
||||
|
||||
func (p *part) search(so *searchOptions, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
|
||||
bhss := getBlockHeaders()
|
||||
if len(so.tenantIDs) > 0 {
|
||||
p.searchByTenantIDs(so, bhss, workCh, stopCh)
|
||||
} else {
|
||||
p.searchByStreamIDs(so, bhss, workCh, stopCh)
|
||||
}
|
||||
putBlockHeaders(bhss)
|
||||
}
|
||||
|
||||
func getBlockHeaders() *blockHeaders {
|
||||
v := blockHeadersPool.Get()
|
||||
if v == nil {
|
||||
return &blockHeaders{}
|
||||
}
|
||||
return v.(*blockHeaders)
|
||||
}
|
||||
|
||||
func putBlockHeaders(bhss *blockHeaders) {
|
||||
bhss.reset()
|
||||
blockHeadersPool.Put(bhss)
|
||||
}
|
||||
|
||||
var blockHeadersPool sync.Pool
|
||||
|
||||
type blockHeaders struct {
|
||||
bhs []blockHeader
|
||||
}
|
||||
|
||||
func (bhss *blockHeaders) reset() {
|
||||
bhs := bhss.bhs
|
||||
for i := range bhs {
|
||||
bhs[i].reset()
|
||||
}
|
||||
bhss.bhs = bhs[:0]
|
||||
}
|
||||
|
||||
func (p *part) searchByTenantIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
|
||||
// it is assumed that tenantIDs are sorted
|
||||
tenantIDs := so.tenantIDs
|
||||
|
||||
bsws := make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
|
||||
scheduleBlockSearch := func(bh *blockHeader) bool {
|
||||
// Do not use pool for blockSearchWork, since it is returned back to the pool
|
||||
// at another goroutine, which may run on another CPU core.
|
||||
// This means that it will be put into another per-CPU pool, which may result
|
||||
// in slowdown related to memory synchronization between CPU cores.
|
||||
// This slowdown is increased on systems with bigger number of CPU cores.
|
||||
bsw := newBlockSearchWork(p, so, bh)
|
||||
bsws = append(bsws, bsw)
|
||||
if len(bsws) < cap(bsws) {
|
||||
return true
|
||||
}
|
||||
select {
|
||||
case <-stopCh:
|
||||
return false
|
||||
case workCh <- bsws:
|
||||
bsws = make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// it is assumed that ibhs are sorted
|
||||
ibhs := p.indexBlockHeaders
|
||||
for len(ibhs) > 0 && len(tenantIDs) > 0 {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
// locate the tenantID equal to or bigger than the tenantID in ibhs[0]
|
||||
tenantID := &tenantIDs[0]
|
||||
if tenantID.less(&ibhs[0].streamID.tenantID) {
|
||||
tenantID = &ibhs[0].streamID.tenantID
|
||||
n := sort.Search(len(tenantIDs), func(i int) bool {
|
||||
return !tenantIDs[i].less(tenantID)
|
||||
})
|
||||
if n == len(tenantIDs) {
|
||||
tenantIDs = nil
|
||||
break
|
||||
}
|
||||
tenantID = &tenantIDs[n]
|
||||
tenantIDs = tenantIDs[n:]
|
||||
}
|
||||
|
||||
// locate the indexBlockHeader with a tenantID equal to or bigger than the given tenantID
|
||||
n := 0
|
||||
if ibhs[0].streamID.tenantID.less(tenantID) {
|
||||
n = sort.Search(len(ibhs), func(i int) bool {
|
||||
return !ibhs[i].streamID.tenantID.less(tenantID)
|
||||
})
|
||||
if n == len(ibhs) || n > 0 && ibhs[n].streamID.tenantID.equal(tenantID) {
|
||||
// The end of ibhs[n-1] may contain blocks for the given tenantID, so move it backwards
|
||||
n--
|
||||
}
|
||||
}
|
||||
ibh := &ibhs[n]
|
||||
ibhs = ibhs[n+1:]
|
||||
|
||||
if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
|
||||
// Skip the ibh, since it doesn't contain entries in the requested time range
|
||||
continue
|
||||
}
|
||||
|
||||
bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)
|
||||
|
||||
bhs := bhss.bhs
|
||||
for len(bhs) > 0 {
|
||||
// search for blocks with the given tenantID
|
||||
n = sort.Search(len(bhs), func(i int) bool {
|
||||
return !bhs[i].streamID.tenantID.less(tenantID)
|
||||
})
|
||||
bhs = bhs[n:]
|
||||
for len(bhs) > 0 && bhs[0].streamID.tenantID.equal(tenantID) {
|
||||
bh := &bhs[0]
|
||||
bhs = bhs[1:]
|
||||
th := &bh.timestampsHeader
|
||||
if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
|
||||
continue
|
||||
}
|
||||
if !scheduleBlockSearch(bh) {
	return
}
|
||||
}
|
||||
if len(bhs) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
// search for the next tenantID, which can potentially match tenantID from bhs[0]
|
||||
tenantID = &bhs[0].streamID.tenantID
|
||||
n = sort.Search(len(tenantIDs), func(i int) bool {
|
||||
return !tenantIDs[i].less(tenantID)
|
||||
})
|
||||
if n == len(tenantIDs) {
|
||||
tenantIDs = nil
|
||||
break
|
||||
}
|
||||
tenantID = &tenantIDs[n]
|
||||
tenantIDs = tenantIDs[n:]
|
||||
}
|
||||
}
|
||||
|
||||
// Flush the remaining work
|
||||
if len(bsws) > 0 {
|
||||
workCh <- bsws
|
||||
}
|
||||
}
|
||||
|
||||
func (p *part) searchByStreamIDs(so *searchOptions, bhss *blockHeaders, workCh chan<- []*blockSearchWork, stopCh <-chan struct{}) {
|
||||
// it is assumed that streamIDs are sorted
|
||||
streamIDs := so.streamIDs
|
||||
|
||||
bsws := make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
|
||||
scheduleBlockSearch := func(bh *blockHeader) bool {
|
||||
// Do not use pool for blockSearchWork, since it is returned back to the pool
|
||||
// at another goroutine, which may run on another CPU core.
|
||||
// This means that it will be put into another per-CPU pool, which may result
|
||||
// in slowdown related to memory synchronization between CPU cores.
|
||||
// This slowdown is increased on systems with bigger number of CPU cores.
|
||||
bsw := newBlockSearchWork(p, so, bh)
|
||||
bsws = append(bsws, bsw)
|
||||
if len(bsws) < cap(bsws) {
|
||||
return true
|
||||
}
|
||||
select {
|
||||
case <-stopCh:
|
||||
return false
|
||||
case workCh <- bsws:
|
||||
bsws = make([]*blockSearchWork, 0, blockSearchWorksPerBatch)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// it is assumed that ibhs are sorted
|
||||
ibhs := p.indexBlockHeaders
|
||||
|
||||
for len(ibhs) > 0 && len(streamIDs) > 0 {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
// locate the streamID equal to or bigger than the streamID in ibhs[0]
|
||||
streamID := &streamIDs[0]
|
||||
if streamID.less(&ibhs[0].streamID) {
|
||||
streamID = &ibhs[0].streamID
|
||||
n := sort.Search(len(streamIDs), func(i int) bool {
|
||||
return !streamIDs[i].less(streamID)
|
||||
})
|
||||
if n == len(streamIDs) {
|
||||
streamIDs = nil
|
||||
break
|
||||
}
|
||||
streamID = &streamIDs[n]
|
||||
streamIDs = streamIDs[n:]
|
||||
}
|
||||
|
||||
// locate the indexBlockHeader with a streamID equal to or bigger than the given streamID
|
||||
n := 0
|
||||
if ibhs[0].streamID.less(streamID) {
|
||||
n = sort.Search(len(ibhs), func(i int) bool {
|
||||
return !ibhs[i].streamID.less(streamID)
|
||||
})
|
||||
if n == len(ibhs) || n > 0 && ibhs[n].streamID.equal(streamID) {
|
||||
// The end of ibhs[n-1] may contain blocks for the given streamID, so move it backwards
|
||||
n--
|
||||
}
|
||||
}
|
||||
ibh := &ibhs[n]
|
||||
ibhs = ibhs[n+1:]
|
||||
|
||||
if so.minTimestamp > ibh.maxTimestamp || so.maxTimestamp < ibh.minTimestamp {
|
||||
// Skip the ibh, since it doesn't contain entries in the requested time range
|
||||
continue
|
||||
}
|
||||
|
||||
bhss.bhs = ibh.mustReadBlockHeaders(bhss.bhs[:0], p)
|
||||
|
||||
bhs := bhss.bhs
|
||||
for len(bhs) > 0 {
|
||||
// search for blocks with the given streamID
|
||||
n = sort.Search(len(bhs), func(i int) bool {
|
||||
return !bhs[i].streamID.less(streamID)
|
||||
})
|
||||
bhs = bhs[n:]
|
||||
for len(bhs) > 0 && bhs[0].streamID.equal(streamID) {
|
||||
bh := &bhs[0]
|
||||
bhs = bhs[1:]
|
||||
th := &bh.timestampsHeader
|
||||
if so.minTimestamp > th.maxTimestamp || so.maxTimestamp < th.minTimestamp {
|
||||
continue
|
||||
}
|
||||
if !scheduleBlockSearch(bh) {
|
||||
return
|
||||
}
|
||||
}
|
||||
if len(bhs) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
// search for the next streamID, which can potentially match streamID from bhs[0]
|
||||
streamID = &bhs[0].streamID
|
||||
n = sort.Search(len(streamIDs), func(i int) bool {
|
||||
return !streamIDs[i].less(streamID)
|
||||
})
|
||||
if n == len(streamIDs) {
|
||||
streamIDs = nil
|
||||
break
|
||||
}
|
||||
streamID = &streamIDs[n]
|
||||
streamIDs = streamIDs[n:]
|
||||
}
|
||||
}
|
||||
|
||||
// Flush the remaining work
|
||||
if len(bsws) > 0 {
|
||||
workCh <- bsws
|
||||
}
|
||||
}
|
||||
|
||||
func appendPartsInTimeRange(dst, src []*partWrapper, minTimestamp, maxTimestamp int64) []*partWrapper {
|
||||
for _, pw := range src {
|
||||
if maxTimestamp < pw.p.ph.MinTimestamp || minTimestamp > pw.p.ph.MaxTimestamp {
|
||||
continue
|
||||
}
|
||||
dst = append(dst, pw)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
func getCommonStreamFilter(f filter) (*StreamFilter, filter) {
|
||||
switch t := f.(type) {
|
||||
case *andFilter:
|
||||
filters := t.filters
|
||||
for i, filter := range filters {
|
||||
sf, ok := filter.(*streamFilter)
|
||||
if ok && !sf.f.isEmpty() {
|
||||
// Remove sf from filters, since it doesn't filter out anything then.
|
||||
af := &andFilter{
|
||||
filters: append(filters[:i:i], filters[i+1:]...),
|
||||
}
|
||||
return sf.f, af
|
||||
}
|
||||
}
|
||||
case *streamFilter:
|
||||
return t.f, &noopFilter{}
|
||||
}
|
||||
return nil, f
|
||||
}
|
||||
|
||||
func getCommonTimeFilter(f filter) (*timeFilter, filter) {
|
||||
switch t := f.(type) {
|
||||
case *andFilter:
|
||||
for _, filter := range t.filters {
|
||||
tf, ok := filter.(*timeFilter)
|
||||
if ok {
|
||||
// The tf must remain in the returned filter in order to properly filter out rows outside the selected time range
|
||||
return tf, f
|
||||
}
|
||||
}
|
||||
case *timeFilter:
|
||||
return t, f
|
||||
}
|
||||
return allTimeFilter, f
|
||||
}
|
||||
|
||||
var allTimeFilter = &timeFilter{
|
||||
minTimestamp: math.MinInt64,
|
||||
maxTimestamp: math.MaxInt64,
|
||||
}
|
663
lib/logstorage/storage_search_test.go
Normal file
|
@@ -0,0 +1,663 @@
|
|||
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
func TestStorageRunQuery(t *testing.T) {
|
||||
const path = "TestStorageRunQuery"
|
||||
|
||||
const tenantsCount = 11
|
||||
const streamsPerTenant = 3
|
||||
const blocksPerStream = 5
|
||||
const rowsPerBlock = 7
|
||||
|
||||
sc := &StorageConfig{
|
||||
Retention: 24 * time.Hour,
|
||||
}
|
||||
s := MustOpenStorage(path, sc)
|
||||
|
||||
// fill the storage with data
|
||||
var allTenantIDs []TenantID
|
||||
baseTimestamp := time.Now().UnixNano() - 3600*1e9
|
||||
var fields []Field
|
||||
streamTags := []string{
|
||||
"job",
|
||||
"instance",
|
||||
}
|
||||
for i := 0; i < tenantsCount; i++ {
|
||||
tenantID := TenantID{
|
||||
AccountID: uint32(i),
|
||||
ProjectID: uint32(10*i + 1),
|
||||
}
|
||||
allTenantIDs = append(allTenantIDs, tenantID)
|
||||
for j := 0; j < streamsPerTenant; j++ {
|
||||
streamIDValue := fmt.Sprintf("stream_id=%d", j)
|
||||
for k := 0; k < blocksPerStream; k++ {
|
||||
lr := GetLogRows(streamTags, nil)
|
||||
for m := 0; m < rowsPerBlock; m++ {
|
||||
timestamp := baseTimestamp + int64(m)*1e9 + int64(k)
|
||||
// Append stream fields
|
||||
fields = append(fields[:0], Field{
|
||||
Name: "job",
|
||||
Value: "foobar",
|
||||
}, Field{
|
||||
Name: "instance",
|
||||
Value: fmt.Sprintf("host-%d:234", j),
|
||||
})
|
||||
// append the remaining fields
|
||||
fields = append(fields, Field{
|
||||
Name: "_msg",
|
||||
Value: fmt.Sprintf("log message %d at block %d", m, k),
|
||||
})
|
||||
fields = append(fields, Field{
|
||||
Name: "source-file",
|
||||
Value: "/foo/bar/baz",
|
||||
})
|
||||
fields = append(fields, Field{
|
||||
Name: "tenant.id",
|
||||
Value: tenantID.String(),
|
||||
})
|
||||
fields = append(fields, Field{
|
||||
Name: "stream-id",
|
||||
Value: streamIDValue,
|
||||
})
|
||||
lr.MustAdd(tenantID, timestamp, fields)
|
||||
}
|
||||
s.MustAddRows(lr)
|
||||
PutLogRows(lr)
|
||||
}
|
||||
}
|
||||
}
|
||||
s.debugFlush()
|
||||
|
||||
// run tests on the storage data
|
||||
t.Run("missing-tenant", func(t *testing.T) {
|
||||
q := mustParseQuery(`"log message"`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 0,
|
||||
ProjectID: 0,
|
||||
}
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
})
|
||||
t.Run("missing-message-text", func(t *testing.T) {
|
||||
q := mustParseQuery(`foobar`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
})
|
||||
t.Run("matching-tenant-id", func(t *testing.T) {
|
||||
q := mustParseQuery(`tenant.id:*`)
|
||||
for i := 0; i < tenantsCount; i++ {
|
||||
tenantID := TenantID{
|
||||
AccountID: uint32(i),
|
||||
ProjectID: uint32(10*i + 1),
|
||||
}
|
||||
expectedTenantID := tenantID.String()
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
hasTenantIDColumn := false
|
||||
var columnNames []string
|
||||
for _, c := range columns {
|
||||
if c.Name == "tenant.id" {
|
||||
hasTenantIDColumn = true
|
||||
if len(c.Values) == 0 {
|
||||
panic(fmt.Errorf("unexpected zero rows"))
|
||||
}
|
||||
for _, v := range c.Values {
|
||||
if v != expectedTenantID {
|
||||
panic(fmt.Errorf("unexpected tenant.id; got %s; want %s", v, expectedTenantID))
|
||||
}
|
||||
}
|
||||
}
|
||||
columnNames = append(columnNames, c.Name)
|
||||
}
|
||||
if !hasTenantIDColumn {
|
||||
panic(fmt.Errorf("missing tenant.id column among columns: %q", columnNames))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
}
|
||||
})
|
||||
t.Run("matching-multiple-tenant-ids", func(t *testing.T) {
|
||||
q := mustParseQuery(`"log message"`)
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
s.RunQuery(allTenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-in-filter", func(t *testing.T) {
|
||||
q := mustParseQuery(`source-file:in(foobar,/foo/bar/baz)`)
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
s.RunQuery(allTenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("stream-filter-mismatch", func(t *testing.T) {
|
||||
q := mustParseQuery(`_stream:{job="foobar",instance=~"host-.+:2345"} log`)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.RunQuery(allTenantIDs, q, nil, processBlock)
|
||||
})
|
||||
t.Run("matching-stream-id", func(t *testing.T) {
|
||||
for i := 0; i < streamsPerTenant; i++ {
|
||||
q := mustParseQuery(fmt.Sprintf(`log _stream:{job="foobar",instance="host-%d:234"} AND stream-id:*`, i))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
expectedStreamID := fmt.Sprintf("stream_id=%d", i)
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
hasStreamIDColumn := false
|
||||
var columnNames []string
|
||||
for _, c := range columns {
|
||||
if c.Name == "stream-id" {
|
||||
hasStreamIDColumn = true
|
||||
if len(c.Values) == 0 {
|
||||
panic(fmt.Errorf("unexpected zero rows"))
|
||||
}
|
||||
for _, v := range c.Values {
|
||||
if v != expectedStreamID {
|
||||
panic(fmt.Errorf("unexpected stream-id; got %s; want %s", v, expectedStreamID))
|
||||
}
|
||||
}
|
||||
}
|
||||
columnNames = append(columnNames, c.Name)
|
||||
}
|
||||
if !hasStreamIDColumn {
|
||||
panic(fmt.Errorf("missing stream-id column among columns: %q", columnNames))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows for stream %d; got %d; want %d", i, rowsCount, expectedRowsCount)
|
||||
}
|
||||
}
|
||||
})
|
||||
t.Run("matching-multiple-stream-ids-with-re-filter", func(t *testing.T) {
|
||||
q := mustParseQuery(`_msg:log _stream:{job="foobar",instance=~"host-[^:]+:234"} and re("message [02] at")`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream * 2
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-time-range", func(t *testing.T) {
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
|
||||
q := mustParseQuery(fmt.Sprintf(`_time:[%f,%f]`, float64(minTimestamp)/1e9, float64(maxTimestamp)/1e9))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-stream-id-with-time-range", func(t *testing.T) {
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
|
||||
q := mustParseQuery(fmt.Sprintf(`_time:[%f,%f] _stream:{job="foobar",instance="host-1:234"}`, float64(minTimestamp)/1e9, float64(maxTimestamp)/1e9))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
atomic.AddUint32(&rowsCount, uint32(len(columns[0].Values)))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
|
||||
expectedRowsCount := blocksPerStream
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-stream-id-missing-time-range", func(t *testing.T) {
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
|
||||
q := mustParseQuery(fmt.Sprintf(`_stream:{job="foobar",instance="host-1:234"} _time:[%d, %d)`, minTimestamp/1e9, maxTimestamp/1e9))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
})
|
||||
t.Run("missing-time-range", func(t *testing.T) {
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
|
||||
q := mustParseQuery(fmt.Sprintf(`_time:[%d, %d)`, minTimestamp/1e9, maxTimestamp/1e9))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
processBlock := func(columns []BlockColumn) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
tenantIDs := []TenantID{tenantID}
|
||||
s.RunQuery(tenantIDs, q, nil, processBlock)
|
||||
})
|
||||
|
||||
// Close the storage and delete its data
|
||||
s.MustClose()
|
||||
fs.MustRemoveAll(path)
|
||||
}
|
||||
|
||||
func mustParseQuery(query string) *Query {
|
||||
q, err := ParseQuery(query)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("BUG: cannot parse %s: %s", query, err))
|
||||
}
|
||||
return q
|
||||
}
|
||||
|
||||
func TestStorageSearch(t *testing.T) {
|
||||
const path = "TestStorageSearch"
|
||||
|
||||
const tenantsCount = 11
|
||||
const streamsPerTenant = 3
|
||||
const blocksPerStream = 5
|
||||
const rowsPerBlock = 7
|
||||
|
||||
sc := &StorageConfig{
|
||||
Retention: 24 * time.Hour,
|
||||
}
|
||||
s := MustOpenStorage(path, sc)
|
||||
|
||||
// fill the storage with data.
|
||||
var allTenantIDs []TenantID
|
||||
baseTimestamp := time.Now().UnixNano() - 3600*1e9
|
||||
var fields []Field
|
||||
streamTags := []string{
|
||||
"job",
|
||||
"instance",
|
||||
}
|
||||
for i := 0; i < tenantsCount; i++ {
|
||||
tenantID := TenantID{
|
||||
AccountID: uint32(i),
|
||||
ProjectID: uint32(10*i + 1),
|
||||
}
|
||||
allTenantIDs = append(allTenantIDs, tenantID)
|
||||
for j := 0; j < streamsPerTenant; j++ {
|
||||
for k := 0; k < blocksPerStream; k++ {
|
||||
lr := GetLogRows(streamTags, nil)
|
||||
for m := 0; m < rowsPerBlock; m++ {
|
||||
timestamp := baseTimestamp + int64(m)*1e9 + int64(k)
|
||||
// Append stream fields
|
||||
fields = append(fields[:0], Field{
|
||||
Name: "job",
|
||||
Value: "foobar",
|
||||
}, Field{
|
||||
Name: "instance",
|
||||
Value: fmt.Sprintf("host-%d:234", j),
|
||||
})
|
||||
// append the remaining fields
|
||||
fields = append(fields, Field{
|
||||
Name: "_msg",
|
||||
Value: fmt.Sprintf("log message %d at block %d", m, k),
|
||||
})
|
||||
fields = append(fields, Field{
|
||||
Name: "source-file",
|
||||
Value: "/foo/bar/baz",
|
||||
})
|
||||
lr.MustAdd(tenantID, timestamp, fields)
|
||||
}
|
||||
s.MustAddRows(lr)
|
||||
PutLogRows(lr)
|
||||
}
|
||||
}
|
||||
}
|
||||
s.debugFlush()
|
||||
|
||||
// run tests on the filled storage
|
||||
const workersCount = 3
|
||||
|
||||
getBaseFilter := func(minTimestamp, maxTimestamp int64, sf *StreamFilter) filter {
|
||||
var filters []filter
|
||||
filters = append(filters, &timeFilter{
|
||||
minTimestamp: minTimestamp,
|
||||
maxTimestamp: maxTimestamp,
|
||||
})
|
||||
if sf != nil {
|
||||
filters = append(filters, &streamFilter{
|
||||
f: sf,
|
||||
})
|
||||
}
|
||||
return &andFilter{
|
||||
filters: filters,
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("missing-tenant-smaller-than-existing", func(t *testing.T) {
|
||||
tenantID := TenantID{
|
||||
AccountID: 0,
|
||||
ProjectID: 0,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
})
|
||||
t.Run("missing-tenant-bigger-than-existing", func(t *testing.T) {
|
||||
tenantID := TenantID{
|
||||
AccountID: tenantsCount + 1,
|
||||
ProjectID: 0,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
})
|
||||
t.Run("missing-tenant-middle", func(t *testing.T) {
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 0,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
})
|
||||
t.Run("matching-tenant-id", func(t *testing.T) {
|
||||
for i := 0; i < tenantsCount; i++ {
|
||||
tenantID := TenantID{
|
||||
AccountID: uint32(i),
|
||||
ProjectID: uint32(10*i + 1),
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
if !br.streamID.tenantID.equal(&tenantID) {
|
||||
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
}
|
||||
})
|
||||
t.Run("matching-multiple-tenant-ids", func(t *testing.T) {
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, nil)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: allTenantIDs,
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := tenantsCount * streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of matching rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("stream-filter-mismatch", func(t *testing.T) {
|
||||
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-.+:2345"}`)
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: allTenantIDs,
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
})
|
||||
t.Run("matching-stream-id", func(t *testing.T) {
|
||||
for i := 0; i < streamsPerTenant; i++ {
|
||||
sf := mustNewStreamFilter(fmt.Sprintf(`{job="foobar",instance="host-%d:234"}`, i))
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
if !br.streamID.tenantID.equal(&tenantID) {
|
||||
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
}
|
||||
})
|
||||
t.Run("matching-multiple-stream-ids", func(t *testing.T) {
|
||||
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-[^:]+:234"}`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
if !br.streamID.tenantID.equal(&tenantID) {
|
||||
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream * rowsPerBlock
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-multiple-stream-ids-with-re-filter", func(t *testing.T) {
|
||||
sf := mustNewStreamFilter(`{job="foobar",instance=~"host-[^:]+:234"}`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
minTimestamp := baseTimestamp
|
||||
maxTimestamp := baseTimestamp + rowsPerBlock*1e9 + blocksPerStream
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
f = &andFilter{
|
||||
filters: []filter{
|
||||
f,
|
||||
®expFilter{
|
||||
fieldName: "_msg",
|
||||
re: regexp.MustCompile("message [02] at "),
|
||||
},
|
||||
},
|
||||
}
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
if !br.streamID.tenantID.equal(&tenantID) {
|
||||
panic(fmt.Errorf("unexpected tenantID; got %s; want %s", &br.streamID.tenantID, &tenantID))
|
||||
}
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := streamsPerTenant * blocksPerStream * 2
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-stream-id-smaller-time-range", func(t *testing.T) {
|
||||
sf := mustNewStreamFilter(`{job="foobar",instance="host-1:234"}`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock-2)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock-1)*1e9 - 1
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
rowsCount := uint32(0)
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
atomic.AddUint32(&rowsCount, uint32(br.RowsCount()))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
|
||||
expectedRowsCount := blocksPerStream
|
||||
if rowsCount != uint32(expectedRowsCount) {
|
||||
t.Fatalf("unexpected number of rows; got %d; want %d", rowsCount, expectedRowsCount)
|
||||
}
|
||||
})
|
||||
t.Run("matching-stream-id-missing-time-range", func(t *testing.T) {
|
||||
sf := mustNewStreamFilter(`{job="foobar",instance="host-1:234"}`)
|
||||
tenantID := TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 11,
|
||||
}
|
||||
minTimestamp := baseTimestamp + (rowsPerBlock+1)*1e9
|
||||
maxTimestamp := baseTimestamp + (rowsPerBlock+2)*1e9
|
||||
f := getBaseFilter(minTimestamp, maxTimestamp, sf)
|
||||
so := &genericSearchOptions{
|
||||
tenantIDs: []TenantID{tenantID},
|
||||
filter: f,
|
||||
resultColumnNames: []string{"_msg"},
|
||||
}
|
||||
processBlock := func(workerID uint, br *blockResult) {
|
||||
panic(fmt.Errorf("unexpected match"))
|
||||
}
|
||||
s.search(workersCount, so, nil, processBlock)
|
||||
})
|
||||
|
||||
s.MustClose()
|
||||
fs.MustRemoveAll(path)
|
||||
}
|
||||
|
||||
func mustNewStreamFilter(s string) *StreamFilter {
|
||||
sf, err := newStreamFilter(s)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("unexpected error in newStreamFilter(%q): %s", s, err))
|
||||
}
|
||||
return sf
|
||||
}
|
lib/logstorage/storage_test.go (new file, 102 lines)
@@ -0,0 +1,102 @@
package logstorage
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
func TestStorageLifecycle(t *testing.T) {
|
||||
const path = "TestStorageLifecycle"
|
||||
|
||||
for i := 0; i < 3; i++ {
|
||||
cfg := &StorageConfig{}
|
||||
s := MustOpenStorage(path, cfg)
|
||||
s.MustClose()
|
||||
}
|
||||
fs.MustRemoveAll(path)
|
||||
}
|
||||
|
||||
func TestStorageMustAddRows(t *testing.T) {
|
||||
const path = "TestStorageMustAddRows"
|
||||
|
||||
var sStats StorageStats
|
||||
|
||||
cfg := &StorageConfig{}
|
||||
s := MustOpenStorage(path, cfg)
|
||||
|
||||
// Try adding the same entry multiple times.
|
||||
totalRowsCount := uint64(0)
|
||||
for i := 0; i < 100; i++ {
|
||||
lr := newTestLogRows(1, 1, 0)
|
||||
lr.timestamps[0] = time.Now().UTC().UnixNano()
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
s.MustAddRows(lr)
|
||||
sStats.Reset()
|
||||
s.UpdateStats(&sStats)
|
||||
if n := sStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
}
|
||||
|
||||
s.MustClose()
|
||||
|
||||
// Re-open the storage and try writing data to it
|
||||
s = MustOpenStorage(path, cfg)
|
||||
|
||||
sStats.Reset()
|
||||
s.UpdateStats(&sStats)
|
||||
if n := sStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
|
||||
lr := newTestLogRows(3, 10, 0)
|
||||
for i := range lr.timestamps {
|
||||
lr.timestamps[i] = time.Now().UTC().UnixNano()
|
||||
}
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
s.MustAddRows(lr)
|
||||
sStats.Reset()
|
||||
s.UpdateStats(&sStats)
|
||||
if n := sStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
|
||||
s.MustClose()
|
||||
|
||||
// Re-open the storage with big retention and try writing data
|
||||
// to different days in the past and in the future
|
||||
cfg = &StorageConfig{
|
||||
Retention: 365 * 24 * time.Hour,
|
||||
FutureRetention: 365 * 24 * time.Hour,
|
||||
}
|
||||
s = MustOpenStorage(path, cfg)
|
||||
|
||||
lr = newTestLogRows(3, 10, 0)
|
||||
now := time.Now().UTC().UnixNano() - int64(len(lr.timestamps)/2)*nsecPerDay
|
||||
for i := range lr.timestamps {
|
||||
lr.timestamps[i] = now
|
||||
now += nsecPerDay
|
||||
}
|
||||
totalRowsCount += uint64(len(lr.timestamps))
|
||||
s.MustAddRows(lr)
|
||||
sStats.Reset()
|
||||
s.UpdateStats(&sStats)
|
||||
if n := sStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
|
||||
s.MustClose()
|
||||
|
||||
// Make sure the stats is valid after re-opening the storage
|
||||
s = MustOpenStorage(path, cfg)
|
||||
sStats.Reset()
|
||||
s.UpdateStats(&sStats)
|
||||
if n := sStats.RowsCount(); n != totalRowsCount {
|
||||
t.Fatalf("unexpected number of entries in storage; got %d; want %d", n, totalRowsCount)
|
||||
}
|
||||
s.MustClose()
|
||||
|
||||
fs.MustRemoveAll(path)
|
||||
}
|
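The lifecycle tests above exercise the full open/add/close cycle. Below is a minimal ingestion sketch using the same API (hypothetical example, not part of this commit; field names and values are arbitrary, and the `time` import is assumed):

// ingestSingleEntry is an illustrative sketch of adding one log entry.
func ingestSingleEntry(path string) {
	s := MustOpenStorage(path, &StorageConfig{})

	lr := GetLogRows([]string{"job", "instance"}, nil)
	fields := []Field{
		{Name: "job", Value: "webapp"},
		{Name: "instance", Value: "host-0:80"},
		{Name: "_msg", Value: "hello world"},
	}
	lr.MustAdd(TenantID{AccountID: 0, ProjectID: 0}, time.Now().UnixNano(), fields)

	s.MustAddRows(lr)
	PutLogRows(lr)

	s.MustClose()
}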
lib/logstorage/stream_filter.go (new file, 90 lines)
@@ -0,0 +1,90 @@
package logstorage

import (
	"strconv"
	"strings"
	"sync"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
)

// StreamFilter is a filter for streams, e.g. `_stream:{...}`
type StreamFilter struct {
	orFilters []*andStreamFilter
}

func (sf *StreamFilter) isEmpty() bool {
	for _, af := range sf.orFilters {
		if len(af.tagFilters) > 0 {
			return false
		}
	}
	return true
}

func (sf *StreamFilter) marshalForCacheKey(dst []byte) []byte {
	dst = encoding.MarshalVarUint64(dst, uint64(len(sf.orFilters)))
	for _, af := range sf.orFilters {
		dst = encoding.MarshalVarUint64(dst, uint64(len(af.tagFilters)))
		for _, f := range af.tagFilters {
			dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.tagName))
			dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.op))
			dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(f.value))
		}
	}
	return dst
}

func (sf *StreamFilter) String() string {
	a := make([]string, len(sf.orFilters))
	for i := range a {
		a[i] = sf.orFilters[i].String()
	}
	return "{" + strings.Join(a, " or ") + "}"
}

type andStreamFilter struct {
	tagFilters []*streamTagFilter
}

func (af *andStreamFilter) String() string {
	a := make([]string, len(af.tagFilters))
	for i := range a {
		a[i] = af.tagFilters[i].String()
	}
	return strings.Join(a, ",")
}

// streamTagFilter is a filter for `tagName op value`
type streamTagFilter struct {
	// tagName is the name for the tag to filter
	tagName string

	// op is an operation such as `=`, `!=`, `=~` or `!~`
	op string

	// value is the filter value
	value string

	regexpOnce sync.Once
	regexp     *regexutil.PromRegex
}

func (tf *streamTagFilter) getRegexp() *regexutil.PromRegex {
	tf.regexpOnce.Do(tf.initRegexp)
	return tf.regexp
}

func (tf *streamTagFilter) initRegexp() {
	re, err := regexutil.NewPromRegex(tf.value)
	if err != nil {
		logger.Panicf("BUG: cannot parse regexp %q: %s", tf.value, err)
	}
	tf.regexp = re
}

func (tf *streamTagFilter) String() string {
	return quoteTokenIfNeeded(tf.tagName) + tf.op + strconv.Quote(tf.value)
}
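A StreamFilter is a disjunction of andStreamFilter groups, each holding streamTagFilter conditions. The sketch below builds the equivalent of `{job="foobar" or instance=~"host-.+"}` by hand instead of parsing it (hypothetical example, not part of this commit; assumes fmt is imported):

// exampleStreamFilter is an illustrative sketch, not used anywhere in this commit.
func exampleStreamFilter() {
	sf := &StreamFilter{
		orFilters: []*andStreamFilter{
			{tagFilters: []*streamTagFilter{{tagName: "job", op: "=", value: "foobar"}}},
			{tagFilters: []*streamTagFilter{{tagName: "instance", op: "=~", value: "host-.+"}}},
		},
	}
	fmt.Println(sf.String()) // {job="foobar" or instance=~"host-.+"}

	// marshalForCacheKey produces a stable byte representation suitable for cache keys.
	cacheKey := sf.marshalForCacheKey(nil)
	_ = cacheKey
}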
lib/logstorage/stream_id.go (new file, 69 lines)
@@ -0,0 +1,69 @@
package logstorage

import (
	"fmt"
)

// streamID is an internal id of a log stream.
//
// Blocks are ordered by streamID inside parts.
type streamID struct {
	// tenantID is a tenant id for the given stream.
	// It is located at the beginning of streamID in order
	// to physically group blocks for the same tenants on the storage.
	tenantID TenantID

	// id is an internal id, which uniquely identifies the stream in the tenant by its labels.
	// It is calculated as a hash of canonically sorted stream labels.
	//
	// Streams with identical sets of labels, which belong to distinct tenants, have the same id.
	id u128
}

// reset resets sid for subsequent re-use
func (sid *streamID) reset() {
	*sid = streamID{}
}

// String returns human-readable representation for sid.
func (sid *streamID) String() string {
	return fmt.Sprintf("(tenant_id=%s, id=%s)", &sid.tenantID, &sid.id)
}

// less returns true if sid is less than a.
func (sid *streamID) less(a *streamID) bool {
	if !sid.tenantID.equal(&a.tenantID) {
		return sid.tenantID.less(&a.tenantID)
	}
	return sid.id.less(&a.id)
}

// equal returns true if sid equals a.
func (sid *streamID) equal(a *streamID) bool {
	if !sid.tenantID.equal(&a.tenantID) {
		return false
	}
	return sid.id.equal(&a.id)
}

// marshal appends the marshaled sid to dst and returns the result
func (sid *streamID) marshal(dst []byte) []byte {
	dst = sid.tenantID.marshal(dst)
	dst = sid.id.marshal(dst)
	return dst
}

// unmarshal unmarshals sid from src and returns the tail from src.
func (sid *streamID) unmarshal(src []byte) ([]byte, error) {
	srcOrig := src
	tail, err := sid.tenantID.unmarshal(src)
	if err != nil {
		return srcOrig, err
	}
	src = tail
	tail, err = sid.id.unmarshal(src)
	if err != nil {
		return srcOrig, err
	}
	return tail, nil
}
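Since streamID.less compares the tenantID before the 128-bit stream hash, sorting streamIDs groups blocks of the same tenant together, which is the ordering used for blocks inside parts. A minimal sketch (hypothetical, not part of this commit; assumes the sort package is imported):

// sortStreamIDs is an illustrative sketch showing the block ordering in action.
func sortStreamIDs(sids []streamID) {
	sort.Slice(sids, func(i, j int) bool {
		return sids[i].less(&sids[j])
	})
}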
lib/logstorage/stream_id_test.go (new file, 172 lines)
@@ -0,0 +1,172 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestStreamIDMarshalUnmarshal(t *testing.T) {
|
||||
f := func(sid *streamID, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := sid.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected length of marshaled streamID; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
var sid2 streamID
|
||||
tail, err := sid2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error on unmarshal(%s): %s", sid, err)
|
||||
}
|
||||
if len(tail) != 0 {
|
||||
t.Fatalf("unexpected non-empty tail on unmarshal(%s): %X", sid, tail)
|
||||
}
|
||||
if !reflect.DeepEqual(sid, &sid2) {
|
||||
t.Fatalf("unexpected result on unmarshal; got %s; want %s", &sid2, sid)
|
||||
}
|
||||
s1 := sid.String()
|
||||
s2 := sid2.String()
|
||||
if s1 != s2 {
|
||||
t.Fatalf("unexpected string result on unmarshal; got %s; want %s", s2, s1)
|
||||
}
|
||||
}
|
||||
f(&streamID{}, 24)
|
||||
f(&streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
lo: 89,
|
||||
hi: 344334,
|
||||
},
|
||||
}, 24)
|
||||
}
|
||||
|
||||
func TestStreamIDUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var sid streamID
|
||||
tail, err := sid.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
f([]byte("1234567890"))
|
||||
}
|
||||
|
||||
func TestStreamIDLessEqual(t *testing.T) {
|
||||
// compare equal values
|
||||
sid1 := &streamID{}
|
||||
sid2 := &streamID{}
|
||||
if sid1.less(sid2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if sid2.less(sid1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !sid1.equal(sid2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid1, sid2)
|
||||
}
|
||||
if !sid2.equal(sid1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid2, sid1)
|
||||
}
|
||||
|
||||
sid1 = &streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 2,
|
||||
},
|
||||
id: u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
},
|
||||
}
|
||||
sid2 = &streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 1,
|
||||
ProjectID: 2,
|
||||
},
|
||||
id: u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
},
|
||||
}
|
||||
if sid1.less(sid2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if sid2.less(sid1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !sid1.equal(sid2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid1, sid2)
|
||||
}
|
||||
if !sid2.equal(sid1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", sid2, sid1)
|
||||
}
|
||||
|
||||
// compare unequal values
|
||||
sid1 = &streamID{
|
||||
id: u128{
|
||||
lo: 456,
|
||||
},
|
||||
}
|
||||
sid2 = &streamID{
|
||||
id: u128{
|
||||
hi: 123,
|
||||
},
|
||||
}
|
||||
if !sid1.less(sid2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", sid1, sid2)
|
||||
}
|
||||
if sid2.less(sid1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", sid2, sid1)
|
||||
}
|
||||
if sid1.equal(sid2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", sid1, sid2)
|
||||
}
|
||||
|
||||
sid1 = &streamID{
|
||||
id: u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
},
|
||||
}
|
||||
sid2 = &streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
},
|
||||
}
|
||||
if !sid1.less(sid2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", sid1, sid2)
|
||||
}
|
||||
if sid2.less(sid1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", sid2, sid1)
|
||||
}
|
||||
if sid1.equal(sid2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", sid1, sid2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStreamIDReset(t *testing.T) {
|
||||
sid := &streamID{
|
||||
tenantID: TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
},
|
||||
id: u128{
|
||||
hi: 234,
|
||||
lo: 9843,
|
||||
},
|
||||
}
|
||||
sid.reset()
|
||||
sidZero := &streamID{}
|
||||
if !reflect.DeepEqual(sid, sidZero) {
|
||||
t.Fatalf("non-zero streamID after reset(): %s", sid)
|
||||
}
|
||||
}
|
lib/logstorage/stream_tags.go (new file, 298 lines)
@@ -0,0 +1,298 @@
package logstorage
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// GetStreamTags returns a StreamTags from pool.
|
||||
func GetStreamTags() *StreamTags {
|
||||
v := streamTagsPool.Get()
|
||||
if v == nil {
|
||||
return &StreamTags{}
|
||||
}
|
||||
return v.(*StreamTags)
|
||||
}
|
||||
|
||||
// PutStreamTags returns st to the pool.
|
||||
func PutStreamTags(st *StreamTags) {
|
||||
st.Reset()
|
||||
streamTagsPool.Put(st)
|
||||
}
|
||||
|
||||
var streamTagsPool sync.Pool
|
||||
|
||||
// StreamTags contains stream tags.
|
||||
type StreamTags struct {
|
||||
// buf holds all the data backed by tags
|
||||
buf []byte
|
||||
|
||||
// tags contains added tags.
|
||||
tags []streamTag
|
||||
}
|
||||
|
||||
// Reset resets st for re-use
|
||||
func (st *StreamTags) Reset() {
|
||||
st.buf = st.buf[:0]
|
||||
|
||||
tags := st.tags
|
||||
for i := range tags {
|
||||
t := &tags[i]
|
||||
t.Name = nil
|
||||
t.Value = nil
|
||||
}
|
||||
st.tags = tags[:0]
|
||||
}
|
||||
|
||||
// String returns string representation of st.
|
||||
func (st *StreamTags) String() string {
|
||||
b := st.marshalString(nil)
|
||||
return string(b)
|
||||
}
|
||||
|
||||
func (st *StreamTags) marshalString(dst []byte) []byte {
|
||||
dst = append(dst, '{')
|
||||
|
||||
tags := st.tags
|
||||
if len(tags) > 0 {
|
||||
dst = tags[0].marshalString(dst)
|
||||
tags = tags[1:]
|
||||
for i := range tags {
|
||||
dst = append(dst, ',')
|
||||
dst = tags[i].marshalString(dst)
|
||||
}
|
||||
}
|
||||
|
||||
dst = append(dst, '}')
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
// Add adds (name:value) tag to st.
|
||||
func (st *StreamTags) Add(name, value string) {
|
||||
if len(name) == 0 || len(value) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
buf := st.buf
|
||||
|
||||
bufLen := len(buf)
|
||||
buf = append(buf, name...)
|
||||
bName := buf[bufLen:]
|
||||
|
||||
bufLen = len(buf)
|
||||
buf = append(buf, value...)
|
||||
bValue := buf[bufLen:]
|
||||
|
||||
st.buf = buf
|
||||
|
||||
st.tags = append(st.tags, streamTag{
|
||||
Name: bName,
|
||||
Value: bValue,
|
||||
})
|
||||
}
|
||||
|
||||
// MarshalCanonical marshals st in a canonical way
|
||||
func (st *StreamTags) MarshalCanonical(dst []byte) []byte {
|
||||
sort.Sort(st)
|
||||
|
||||
tags := st.tags
|
||||
dst = encoding.MarshalVarUint64(dst, uint64(len(tags)))
|
||||
for i := range tags {
|
||||
tag := &tags[i]
|
||||
dst = encoding.MarshalBytes(dst, tag.Name)
|
||||
dst = encoding.MarshalBytes(dst, tag.Value)
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// UnmarshalCanonical unmarshals st from src marshaled with MarshalCanonical.
|
||||
func (st *StreamTags) UnmarshalCanonical(src []byte) ([]byte, error) {
|
||||
st.Reset()
|
||||
|
||||
srcOrig := src
|
||||
|
||||
tail, n, err := encoding.UnmarshalVarUint64(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal tags len: %w", err)
|
||||
}
|
||||
src = tail
|
||||
for i := uint64(0); i < n; i++ {
|
||||
tail, name, err := encoding.UnmarshalBytes(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal tag name: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
tail, value, err := encoding.UnmarshalBytes(src)
|
||||
if err != nil {
|
||||
return srcOrig, fmt.Errorf("cannot unmarshal tag value: %w", err)
|
||||
}
|
||||
src = tail
|
||||
|
||||
sName := bytesutil.ToUnsafeString(name)
|
||||
sValue := bytesutil.ToUnsafeString(value)
|
||||
st.Add(sName, sValue)
|
||||
}
|
||||
|
||||
return src, nil
|
||||
}
|
||||
|
||||
func getStreamTagsString(streamTagsCanonical []byte) string {
|
||||
st := GetStreamTags()
|
||||
mustUnmarshalStreamTags(st, streamTagsCanonical)
|
||||
s := st.String()
|
||||
PutStreamTags(st)
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
func mustUnmarshalStreamTags(dst *StreamTags, src []byte) {
|
||||
tail, err := dst.UnmarshalCanonical(src)
|
||||
if err != nil {
|
||||
logger.Panicf("FATAL: cannot unmarshal StreamTags from value obtained from cache: %s", err)
|
||||
}
|
||||
if len(tail) > 0 {
|
||||
logger.Panicf("FATAL: unexpected tail left after unmarshaling StreamTags; len(tail)=%d; tail=%q", len(tail), tail)
|
||||
}
|
||||
}
|
||||
|
||||
// Len returns the number of tags in st.
|
||||
func (st *StreamTags) Len() int {
|
||||
return len(st.tags)
|
||||
}
|
||||
|
||||
// Less returns true if tag i is smaller than tag j.
|
||||
func (st *StreamTags) Less(i, j int) bool {
|
||||
tags := st.tags
|
||||
return tags[i].less(&tags[j])
|
||||
}
|
||||
|
||||
// Swap swaps i and j tags
|
||||
func (st *StreamTags) Swap(i, j int) {
|
||||
tags := st.tags
|
||||
tags[i], tags[j] = tags[j], tags[i]
|
||||
}
|
||||
|
||||
// streamTag represents a (name:value) tag for stream.
|
||||
type streamTag struct {
|
||||
Name []byte
|
||||
Value []byte
|
||||
}
|
||||
|
||||
func (tag *streamTag) marshalString(dst []byte) []byte {
|
||||
dst = append(dst, tag.Name...)
|
||||
dst = append(dst, '=')
|
||||
dst = strconv.AppendQuote(dst, bytesutil.ToUnsafeString(tag.Value))
|
||||
return dst
|
||||
}
|
||||
|
||||
// reset resets the tag.
|
||||
func (tag *streamTag) reset() {
|
||||
tag.Name = tag.Name[:0]
|
||||
tag.Value = tag.Value[:0]
|
||||
}
|
||||
|
||||
func (tag *streamTag) equal(t *streamTag) bool {
|
||||
return string(tag.Name) == string(t.Name) && string(tag.Value) == string(t.Value)
|
||||
}
|
||||
|
||||
func (tag *streamTag) less(t *streamTag) bool {
|
||||
if string(tag.Name) != string(t.Name) {
|
||||
return string(tag.Name) < string(t.Name)
|
||||
}
|
||||
return string(tag.Value) < string(t.Value)
|
||||
}
|
||||
|
||||
func (tag *streamTag) indexdbMarshal(dst []byte) []byte {
|
||||
dst = marshalTagValue(dst, tag.Name)
|
||||
dst = marshalTagValue(dst, tag.Value)
|
||||
return dst
|
||||
}
|
||||
|
||||
func (tag *streamTag) indexdbUnmarshal(src []byte) ([]byte, error) {
|
||||
var err error
|
||||
src, tag.Name, err = unmarshalTagValue(tag.Name[:0], src)
|
||||
if err != nil {
|
||||
return src, fmt.Errorf("cannot unmarshal key: %w", err)
|
||||
}
|
||||
src, tag.Value, err = unmarshalTagValue(tag.Value[:0], src)
|
||||
if err != nil {
|
||||
return src, fmt.Errorf("cannot unmarshal value: %w", err)
|
||||
}
|
||||
return src, nil
|
||||
}
|
||||
|
||||
const (
|
||||
escapeChar = 0
|
||||
tagSeparatorChar = 1
|
||||
kvSeparatorChar = 2
|
||||
)
|
||||
|
||||
func marshalTagValue(dst, src []byte) []byte {
|
||||
n1 := bytes.IndexByte(src, escapeChar)
|
||||
n2 := bytes.IndexByte(src, tagSeparatorChar)
|
||||
n3 := bytes.IndexByte(src, kvSeparatorChar)
|
||||
if n1 < 0 && n2 < 0 && n3 < 0 {
|
||||
// Fast path.
|
||||
dst = append(dst, src...)
|
||||
dst = append(dst, tagSeparatorChar)
|
||||
return dst
|
||||
}
|
||||
|
||||
// Slow path.
|
||||
for _, ch := range src {
|
||||
switch ch {
|
||||
case escapeChar:
|
||||
dst = append(dst, escapeChar, '0')
|
||||
case tagSeparatorChar:
|
||||
dst = append(dst, escapeChar, '1')
|
||||
case kvSeparatorChar:
|
||||
dst = append(dst, escapeChar, '2')
|
||||
default:
|
||||
dst = append(dst, ch)
|
||||
}
|
||||
}
|
||||
|
||||
dst = append(dst, tagSeparatorChar)
|
||||
return dst
|
||||
}
|
||||
|
||||
func unmarshalTagValue(dst, src []byte) ([]byte, []byte, error) {
|
||||
n := bytes.IndexByte(src, tagSeparatorChar)
|
||||
if n < 0 {
|
||||
return src, dst, fmt.Errorf("cannot find the end of tag value")
|
||||
}
|
||||
b := src[:n]
|
||||
src = src[n+1:]
|
||||
for {
|
||||
n := bytes.IndexByte(b, escapeChar)
|
||||
if n < 0 {
|
||||
dst = append(dst, b...)
|
||||
return src, dst, nil
|
||||
}
|
||||
dst = append(dst, b[:n]...)
|
||||
b = b[n+1:]
|
||||
if len(b) == 0 {
|
||||
return src, dst, fmt.Errorf("missing escaped char")
|
||||
}
|
||||
switch b[0] {
|
||||
case '0':
|
||||
dst = append(dst, escapeChar)
|
||||
case '1':
|
||||
dst = append(dst, tagSeparatorChar)
|
||||
case '2':
|
||||
dst = append(dst, kvSeparatorChar)
|
||||
default:
|
||||
return src, dst, fmt.Errorf("unsupported escaped char: %c", b[0])
|
||||
}
|
||||
b = b[1:]
|
||||
}
|
||||
}
|
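MarshalCanonical sorts the tags before marshaling, so the resulting bytes act as a stable identity for a stream's label set regardless of insertion order. A round-trip sketch (hypothetical, not part of this commit; assumes fmt is imported):

// streamTagsRoundTrip is an illustrative sketch, not used anywhere in this commit.
func streamTagsRoundTrip() {
	st := GetStreamTags()
	st.Add("job", "foobar")
	st.Add("instance", "host-1:234")
	data := st.MarshalCanonical(nil)
	PutStreamTags(st)

	st2 := GetStreamTags()
	if _, err := st2.UnmarshalCanonical(data); err != nil {
		panic(err)
	}
	fmt.Println(st2.String()) // {instance="host-1:234",job="foobar"}
	PutStreamTags(st2)
}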
lib/logstorage/tenant_id.go (new file, 91 lines)
@@ -0,0 +1,91 @@
package logstorage

import (
	"fmt"
	"net/http"
	"strconv"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)

// TenantID is an id of a tenant for log streams.
//
// Each log stream is associated with a single TenantID.
type TenantID struct {
	// AccountID is the id of the account for the log stream.
	AccountID uint32

	// ProjectID is the id of the project for the log stream.
	ProjectID uint32
}

// Reset resets tid.
func (tid *TenantID) Reset() {
	tid.AccountID = 0
	tid.ProjectID = 0
}

// String returns human-readable representation of tid
func (tid *TenantID) String() string {
	return fmt.Sprintf("{accountID=%d,projectID=%d}", tid.AccountID, tid.ProjectID)
}

// equal returns true if tid equals a.
func (tid *TenantID) equal(a *TenantID) bool {
	return tid.AccountID == a.AccountID && tid.ProjectID == a.ProjectID
}

// less returns true if tid is less than a.
func (tid *TenantID) less(a *TenantID) bool {
	if tid.AccountID != a.AccountID {
		return tid.AccountID < a.AccountID
	}
	return tid.ProjectID < a.ProjectID
}

// marshal appends the marshaled tid to dst and returns the result
func (tid *TenantID) marshal(dst []byte) []byte {
	dst = encoding.MarshalUint32(dst, tid.AccountID)
	dst = encoding.MarshalUint32(dst, tid.ProjectID)
	return dst
}

// unmarshal unmarshals tid from src and returns the remaining tail.
func (tid *TenantID) unmarshal(src []byte) ([]byte, error) {
	if len(src) < 8 {
		return src, fmt.Errorf("cannot unmarshal tenantID from %d bytes; need at least 8 bytes", len(src))
	}
	tid.AccountID = encoding.UnmarshalUint32(src[:4])
	tid.ProjectID = encoding.UnmarshalUint32(src[4:])
	return src[8:], nil
}

// GetTenantIDFromRequest returns tenantID from r.
func GetTenantIDFromRequest(r *http.Request) (TenantID, error) {
	var tenantID TenantID

	accountID, err := getUint32FromHeader(r, "AccountID")
	if err != nil {
		return tenantID, err
	}
	projectID, err := getUint32FromHeader(r, "ProjectID")
	if err != nil {
		return tenantID, err
	}

	tenantID.AccountID = accountID
	tenantID.ProjectID = projectID
	return tenantID, nil
}

func getUint32FromHeader(r *http.Request, headerName string) (uint32, error) {
	s := r.Header.Get(headerName)
	if len(s) == 0 {
		return 0, nil
	}
	n, err := strconv.ParseUint(s, 10, 32)
	if err != nil {
		return 0, fmt.Errorf("cannot parse %s header %q: %w", headerName, s, err)
	}
	return uint32(n), nil
}
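GetTenantIDFromRequest reads the AccountID and ProjectID HTTP headers and falls back to 0 when a header is missing. A usage sketch (hypothetical, not part of this commit; the URL is arbitrary and fmt/net/http are assumed to be imported):

// exampleTenantIDFromRequest is an illustrative sketch, not used anywhere in this commit.
func exampleTenantIDFromRequest() {
	req, err := http.NewRequest("GET", "http://localhost/query", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("AccountID", "12")
	req.Header.Set("ProjectID", "34")

	tenantID, err := GetTenantIDFromRequest(req)
	if err != nil {
		panic(err)
	}
	fmt.Println(tenantID.String()) // {accountID=12,projectID=34}
}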
lib/logstorage/tenant_id_test.go (new file, 124 lines)
@@ -0,0 +1,124 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestTenantIDMarshalUnmarshal(t *testing.T) {
|
||||
f := func(tid *TenantID) {
|
||||
t.Helper()
|
||||
data := tid.marshal(nil)
|
||||
var tid2 TenantID
|
||||
tail, err := tid2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error at unmarshal(%s): %s", tid, err)
|
||||
}
|
||||
if len(tail) != 0 {
|
||||
t.Fatalf("unexpected non-empty tail after unmarshal(%s): %X", tid, tail)
|
||||
}
|
||||
if !reflect.DeepEqual(tid, &tid2) {
|
||||
t.Fatalf("unexpected value after unmarshal; got %s; want %s", &tid2, tid)
|
||||
}
|
||||
s1 := tid.String()
|
||||
s2 := tid2.String()
|
||||
if s1 != s2 {
|
||||
t.Fatalf("unexpected string value after unmarshal; got %s; want %s", s2, s1)
|
||||
}
|
||||
}
|
||||
f(&TenantID{})
|
||||
f(&TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
})
|
||||
}
|
||||
|
||||
func TestTenantIDUnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var tid TenantID
|
||||
tail, err := tid.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("abc"))
|
||||
}
|
||||
|
||||
func TestTenantIDLessEqual(t *testing.T) {
|
||||
// compare equal values
|
||||
tid1 := &TenantID{}
|
||||
tid2 := &TenantID{}
|
||||
if tid1.less(tid2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if tid2.less(tid1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !tid1.equal(tid2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid1, tid2)
|
||||
}
|
||||
if !tid2.equal(tid1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid2, tid1)
|
||||
}
|
||||
|
||||
tid1 = &TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
}
|
||||
tid2 = &TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
}
|
||||
if tid1.less(tid2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if tid2.less(tid1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !tid1.equal(tid2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid1, tid2)
|
||||
}
|
||||
if !tid2.equal(tid1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", tid2, tid1)
|
||||
}
|
||||
|
||||
// compare unequal values
|
||||
tid1 = &TenantID{
|
||||
ProjectID: 456,
|
||||
}
|
||||
tid2 = &TenantID{
|
||||
AccountID: 123,
|
||||
}
|
||||
if !tid1.less(tid2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", tid1, tid2)
|
||||
}
|
||||
if tid2.less(tid1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", tid2, tid1)
|
||||
}
|
||||
if tid1.equal(tid2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", tid1, tid2)
|
||||
}
|
||||
|
||||
tid1 = &TenantID{
|
||||
AccountID: 123,
|
||||
}
|
||||
tid2 = &TenantID{
|
||||
AccountID: 123,
|
||||
ProjectID: 456,
|
||||
}
|
||||
if !tid1.less(tid2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", tid1, tid2)
|
||||
}
|
||||
if tid2.less(tid1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", tid2, tid1)
|
||||
}
|
||||
if tid1.equal(tid2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", tid1, tid2)
|
||||
}
|
||||
}
|
lib/logstorage/tokenizer.go (new file, 153 lines)
@@ -0,0 +1,153 @@
package logstorage
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
|
||||
func tokenizeStrings(dst, a []string) []string {
|
||||
t := getTokenizer()
|
||||
m := t.m
|
||||
for i, s := range a {
|
||||
if i > 0 && s == a[i-1] {
|
||||
// This string has already been tokenized
|
||||
continue
|
||||
}
|
||||
tokenizeString(m, s)
|
||||
}
|
||||
dstLen := len(dst)
|
||||
for k := range t.m {
|
||||
dst = append(dst, k)
|
||||
}
|
||||
putTokenizer(t)
|
||||
|
||||
// Sort tokens with zero memory allocations
|
||||
ss := getStringsSorter(dst[dstLen:])
|
||||
sort.Sort(ss)
|
||||
putStringsSorter(ss)
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
type tokenizer struct {
|
||||
m map[string]struct{}
|
||||
}
|
||||
|
||||
func (t *tokenizer) reset() {
|
||||
m := t.m
|
||||
for k := range m {
|
||||
delete(m, k)
|
||||
}
|
||||
}
|
||||
|
||||
func tokenizeString(dst map[string]struct{}, s string) {
|
||||
for len(s) > 0 {
|
||||
// Search for the next token.
|
||||
nextIdx := len(s)
|
||||
for i, c := range s {
|
||||
if isTokenRune(c) {
|
||||
nextIdx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
s = s[nextIdx:]
|
||||
// Search for the end of the token
|
||||
nextIdx = len(s)
|
||||
for i, c := range s {
|
||||
if !isTokenRune(c) {
|
||||
nextIdx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
token := s[:nextIdx]
|
||||
if len(token) > 0 {
|
||||
dst[token] = struct{}{}
|
||||
}
|
||||
s = s[nextIdx:]
|
||||
}
|
||||
}
|
||||
|
||||
func isTokenRune(c rune) bool {
|
||||
return unicode.IsLetter(c) || unicode.IsDigit(c) || c == '_'
|
||||
}
|
||||
|
||||
func getTokenizer() *tokenizer {
|
||||
v := tokenizerPool.Get()
|
||||
if v == nil {
|
||||
return &tokenizer{
|
||||
m: make(map[string]struct{}),
|
||||
}
|
||||
}
|
||||
return v.(*tokenizer)
|
||||
}
|
||||
|
||||
func putTokenizer(t *tokenizer) {
|
||||
t.reset()
|
||||
tokenizerPool.Put(t)
|
||||
}
|
||||
|
||||
var tokenizerPool sync.Pool
|
||||
|
||||
type stringsSorter struct {
|
||||
a []string
|
||||
}
|
||||
|
||||
func (ss *stringsSorter) Len() int {
|
||||
return len(ss.a)
|
||||
}
|
||||
func (ss *stringsSorter) Swap(i, j int) {
|
||||
a := ss.a
|
||||
a[i], a[j] = a[j], a[i]
|
||||
}
|
||||
func (ss *stringsSorter) Less(i, j int) bool {
|
||||
a := ss.a
|
||||
return a[i] < a[j]
|
||||
}
|
||||
|
||||
func getStringsSorter(a []string) *stringsSorter {
|
||||
v := stringsSorterPool.Get()
|
||||
if v == nil {
|
||||
return &stringsSorter{
|
||||
a: a,
|
||||
}
|
||||
}
|
||||
ss := v.(*stringsSorter)
|
||||
ss.a = a
|
||||
return ss
|
||||
}
|
||||
|
||||
func putStringsSorter(ss *stringsSorter) {
|
||||
ss.a = nil
|
||||
stringsSorterPool.Put(ss)
|
||||
}
|
||||
|
||||
var stringsSorterPool sync.Pool
|
||||
|
||||
type tokensBuf struct {
|
||||
A []string
|
||||
}
|
||||
|
||||
func (tb *tokensBuf) reset() {
|
||||
a := tb.A
|
||||
for i := range a {
|
||||
a[i] = ""
|
||||
}
|
||||
tb.A = a[:0]
|
||||
}
|
||||
|
||||
func getTokensBuf() *tokensBuf {
|
||||
v := tokensBufPool.Get()
|
||||
if v == nil {
|
||||
return &tokensBuf{}
|
||||
}
|
||||
return v.(*tokensBuf)
|
||||
}
|
||||
|
||||
func putTokensBuf(tb *tokensBuf) {
|
||||
tb.reset()
|
||||
tokensBufPool.Put(tb)
|
||||
}
|
||||
|
||||
var tokensBufPool sync.Pool
|
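tokenizeStrings splits values into letter/digit/underscore tokens, drops duplicates via the pooled tokenizer map and returns the tokens sorted. A tiny usage sketch (hypothetical, not part of this commit; assumes fmt is imported):

// exampleTokenize is an illustrative sketch, not used anywhere in this commit.
func exampleTokenize() {
	tokens := tokenizeStrings(nil, []string{"GET /api/v1/query 200"})
	fmt.Println(tokens) // [200 GET api query v1]
}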
lib/logstorage/tokenizer_test.go (new file, 29 lines)
@@ -0,0 +1,29 @@
package logstorage

import (
	"reflect"
	"strings"
	"testing"
)

func TestTokenizeStrings(t *testing.T) {
	f := func(a, tokensExpected []string) {
		t.Helper()
		tokens := tokenizeStrings(nil, a)
		if !reflect.DeepEqual(tokens, tokensExpected) {
			t.Fatalf("unexpected tokens;\ngot\n%q\nwant\n%q", tokens, tokensExpected)
		}
	}
	f(nil, nil)
	f([]string{""}, nil)
	f([]string{"foo"}, []string{"foo"})
	f([]string{"foo bar---.!!([baz]!!! %$# TaSte"}, []string{"TaSte", "bar", "baz", "foo"})
	f([]string{"теСТ 1234 f12.34", "34 f12 AS"}, []string{"1234", "34", "AS", "f12", "теСТ"})
	f(strings.Split(`
Apr 28 13:43:38 localhost whoopsie[2812]: [13:43:38] online
Apr 28 13:45:01 localhost CRON[12181]: (root) CMD (command -v debian-sa1 > /dev/null && debian-sa1 1 1)
Apr 28 13:48:01 localhost kernel: [36020.497806] CPU0: Core temperature above threshold, cpu clock throttled (total events = 22034)
`, "\n"), []string{"01", "1", "12181", "13", "22034", "28", "2812", "36020", "38", "43", "45", "48", "497806", "Apr", "CMD", "CPU0", "CRON",
		"Core", "above", "clock", "command", "cpu", "debian", "dev", "events", "kernel", "localhost", "null", "online", "root",
		"sa1", "temperature", "threshold", "throttled", "total", "v", "whoopsie"})
}
lib/logstorage/tokenizer_timing_test.go (new file, 19 lines)
@@ -0,0 +1,19 @@
package logstorage

import (
	"strings"
	"testing"
)

func BenchmarkTokenizeStrings(b *testing.B) {
	a := strings.Split(benchLogs, "\n")

	b.ReportAllocs()
	b.SetBytes(int64(len(benchLogs)))
	b.RunParallel(func(pb *testing.PB) {
		var tokens []string
		for pb.Next() {
			tokens = tokenizeStrings(tokens[:0], a)
		}
	})
}
lib/logstorage/u128.go (new file, 50 lines)
@@ -0,0 +1,50 @@
package logstorage

import (
	"fmt"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
)

// u128 is a 128-bit uint number.
//
// It is used as a unique id of a stream.
type u128 struct {
	hi uint64
	lo uint64
}

// String returns human-readable representation of u.
func (u *u128) String() string {
	return fmt.Sprintf("{hi=%d,lo=%d}", u.hi, u.lo)
}

// less returns true if u is less than a.
func (u *u128) less(a *u128) bool {
	if u.hi != a.hi {
		return u.hi < a.hi
	}
	return u.lo < a.lo
}

// equal returns true if u equals a.
func (u *u128) equal(a *u128) bool {
	return u.hi == a.hi && u.lo == a.lo
}

// marshal appends the marshaled u to dst and returns the result.
func (u *u128) marshal(dst []byte) []byte {
	dst = encoding.MarshalUint64(dst, u.hi)
	dst = encoding.MarshalUint64(dst, u.lo)
	return dst
}

// unmarshal unmarshals u from src and returns the tail.
func (u *u128) unmarshal(src []byte) ([]byte, error) {
	if len(src) < 16 {
		return src, fmt.Errorf("cannot unmarshal u128 from %d bytes; need at least 16 bytes", len(src))
	}
	u.hi = encoding.UnmarshalUint64(src[:8])
	u.lo = encoding.UnmarshalUint64(src[8:])
	return src[16:], nil
}
lib/logstorage/u128_test.go (new file, 127 lines)
@@ -0,0 +1,127 @@
package logstorage
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestU128MarshalUnmarshal(t *testing.T) {
|
||||
f := func(u *u128, marshaledLen int) {
|
||||
t.Helper()
|
||||
data := u.marshal(nil)
|
||||
if len(data) != marshaledLen {
|
||||
t.Fatalf("unexpected length of marshaled u128; got %d; want %d", len(data), marshaledLen)
|
||||
}
|
||||
var u2 u128
|
||||
tail, err := u2.unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error at unmarshal(%s): %s", u, err)
|
||||
}
|
||||
if len(tail) != 0 {
|
||||
t.Fatalf("unexpected non-empty tail after unmarshal(%s): %X", u, tail)
|
||||
}
|
||||
if !reflect.DeepEqual(u, &u2) {
|
||||
t.Fatalf("unexpected value obtained from unmarshal(%s); got %s; want %s", u, &u2, u)
|
||||
}
|
||||
s1 := u.String()
|
||||
s2 := u2.String()
|
||||
if s1 != s2 {
|
||||
t.Fatalf("unexpected string representation after unmarshal; got %s; want %s", s2, s1)
|
||||
}
|
||||
}
|
||||
f(&u128{}, 16)
|
||||
f(&u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
}, 16)
|
||||
}
|
||||
|
||||
func TestU128UnmarshalFailure(t *testing.T) {
|
||||
f := func(data []byte) {
|
||||
t.Helper()
|
||||
dataOrig := append([]byte{}, data...)
|
||||
var u u128
|
||||
tail, err := u.unmarshal(data)
|
||||
if err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
if string(tail) != string(dataOrig) {
|
||||
t.Fatalf("unexpected tail; got %q; want %q", tail, dataOrig)
|
||||
}
|
||||
}
|
||||
f(nil)
|
||||
f([]byte("foo"))
|
||||
}
|
||||
|
||||
func TestU128LessEqual(t *testing.T) {
|
||||
// compare equal values
|
||||
u1 := &u128{}
|
||||
u2 := &u128{}
|
||||
if u1.less(u2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if u2.less(u1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !u1.equal(u2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u1, u2)
|
||||
}
|
||||
if !u2.equal(u1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u2, u1)
|
||||
}
|
||||
|
||||
u1 = &u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
}
|
||||
u2 = &u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
}
|
||||
if u1.less(u2) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if u2.less(u1) {
|
||||
t.Fatalf("less for equal values must return false")
|
||||
}
|
||||
if !u1.equal(u2) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u1, u2)
|
||||
}
|
||||
if !u2.equal(u1) {
|
||||
t.Fatalf("unexpected equal(%s, %s) result; got false; want true", u2, u1)
|
||||
}
|
||||
|
||||
// compare unequal values
|
||||
u1 = &u128{
|
||||
lo: 456,
|
||||
}
|
||||
u2 = &u128{
|
||||
hi: 123,
|
||||
}
|
||||
if !u1.less(u2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", u1, u2)
|
||||
}
|
||||
if u2.less(u1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", u2, u1)
|
||||
}
|
||||
if u1.equal(u2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", u1, u2)
|
||||
}
|
||||
|
||||
u1 = &u128{
|
||||
hi: 123,
|
||||
}
|
||||
u2 = &u128{
|
||||
hi: 123,
|
||||
lo: 456,
|
||||
}
|
||||
if !u1.less(u2) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got false; want true", u1, u2)
|
||||
}
|
||||
if u2.less(u1) {
|
||||
t.Fatalf("unexpected result for less(%s, %s); got true; want false", u2, u1)
|
||||
}
|
||||
if u1.equal(u2) {
|
||||
t.Fatalf("unexpected result for equal(%s, %s); got true; want false", u1, u2)
|
||||
}
|
||||
}
|
lib/logstorage/values_encoder.go (new file, 742 lines)
@@ -0,0 +1,742 @@
package logstorage
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"math/bits"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
// valueType is the type of values stored in every column block.
|
||||
type valueType byte
|
||||
|
||||
const (
|
||||
// valueTypeUnknown is used for determining whether the value type is unknown.
|
||||
valueTypeUnknown = valueType(0)
|
||||
|
||||
// default encoding for column blocks. Strings are stored as is.
|
||||
valueTypeString = valueType(1)
|
||||
|
||||
// column blocks with small number of unique values are encoded as dict.
|
||||
valueTypeDict = valueType(2)
|
||||
|
||||
// uint values up to 2^8-1 are encoded into valueTypeUint8.
|
||||
// Every value occupies a single byte.
|
||||
valueTypeUint8 = valueType(3)
|
||||
|
||||
// uint values up to 2^16-1 are encoded into valueTypeUint16.
|
||||
// Every value occupies 2 bytes.
|
||||
valueTypeUint16 = valueType(4)
|
||||
|
||||
// uint values up to 2^32-1 are encoded into valueTypeUint32.
|
||||
// Every value occupies 4 bytes.
|
||||
valueTypeUint32 = valueType(5)
|
||||
|
||||
// uint values up to 2^64-1 are encoded into valueTypeUint64.
|
||||
// Every value occupies 8 bytes.
|
||||
valueTypeUint64 = valueType(6)
|
||||
|
||||
// floating-point values are encoded into valueTypeFloat64.
|
||||
valueTypeFloat64 = valueType(7)
|
||||
|
||||
// column blocks with ipv4 addresses are encoded as 4-byte strings.
|
||||
valueTypeIPv4 = valueType(8)
|
||||
|
||||
// column blocks with ISO8601 timestamps are encoded into valueTypeTimestampISO8601.
|
||||
// These timestamps are commonly used by Logstash.
|
||||
valueTypeTimestampISO8601 = valueType(9)
|
||||
)
|
||||
|
||||
type valuesEncoder struct {
|
||||
// buf contains data for values.
|
||||
buf []byte
|
||||
|
||||
// values contains encoded values.
|
||||
values []string
|
||||
}
|
||||
|
||||
func (ve *valuesEncoder) reset() {
|
||||
ve.buf = ve.buf[:0]
|
||||
|
||||
vs := ve.values
|
||||
for i := range vs {
|
||||
vs[i] = ""
|
||||
}
|
||||
ve.values = vs[:0]
|
||||
}
|
||||
|
||||
// encode encodes values to ve.values and returns the encoded value type with min/max encoded values.
|
||||
func (ve *valuesEncoder) encode(values []string, dict *valuesDict) (valueType, uint64, uint64) {
|
||||
ve.reset()
|
||||
|
||||
if len(values) == 0 {
|
||||
return valueTypeString, 0, 0
|
||||
}
|
||||
|
||||
var vt valueType
|
||||
var minValue, maxValue uint64
|
||||
|
||||
// Try dict encoding at first, since it gives the highest speedup during querying.
|
||||
// It also usually gives the best compression, since every value is encoded as a single byte.
|
||||
ve.buf, ve.values, vt = tryDictEncoding(ve.buf[:0], ve.values[:0], values, dict)
|
||||
if vt != valueTypeUnknown {
|
||||
return vt, 0, 0
|
||||
}
|
||||
|
||||
ve.buf, ve.values, vt, minValue, maxValue = tryUintEncoding(ve.buf[:0], ve.values[:0], values)
|
||||
if vt != valueTypeUnknown {
|
||||
return vt, minValue, maxValue
|
||||
}
|
||||
|
||||
ve.buf, ve.values, vt, minValue, maxValue = tryFloat64Encoding(ve.buf[:0], ve.values[:0], values)
|
||||
if vt != valueTypeUnknown {
|
||||
return vt, minValue, maxValue
|
||||
}
|
||||
|
||||
ve.buf, ve.values, vt, minValue, maxValue = tryIPv4Encoding(ve.buf[:0], ve.values[:0], values)
|
||||
if vt != valueTypeUnknown {
|
||||
return vt, minValue, maxValue
|
||||
}
|
||||
|
||||
ve.buf, ve.values, vt, minValue, maxValue = tryTimestampISO8601Encoding(ve.buf[:0], ve.values[:0], values)
|
||||
if vt != valueTypeUnknown {
|
||||
return vt, minValue, maxValue
|
||||
}
|
||||
|
||||
// Fall back to default encoding, e.g. leave values as is.
|
||||
ve.values = append(ve.values[:0], values...)
|
||||
return valueTypeString, 0, 0
|
||||
}

func getValuesEncoder() *valuesEncoder {
	v := valuesEncoderPool.Get()
	if v == nil {
		return &valuesEncoder{}
	}
	return v.(*valuesEncoder)
}

func putValuesEncoder(ve *valuesEncoder) {
	ve.reset()
	valuesEncoderPool.Put(ve)
}

var valuesEncoderPool sync.Pool

type valuesDecoder struct {
	buf []byte
}

func (vd *valuesDecoder) reset() {
	vd.buf = vd.buf[:0]
}

// decodeInplace decodes values encoded with the given vt and the given dict inplace.
//
// The decoded values remain valid until vd.reset() is called.
func (vd *valuesDecoder) decodeInplace(values []string, vt valueType, dict *valuesDict) error {
	// Do not reset vd.buf, since it may contain previously decoded data,
	// which must be preserved until the reset() call.
	dstBuf := vd.buf

	switch vt {
	case valueTypeString:
		// Nothing to do - the values are already decoded.
	case valueTypeUint8:
		for i, v := range values {
			if len(v) != 1 {
				return fmt.Errorf("unexpected value length for uint8; got %d; want 1", len(v))
			}
			n := uint64(v[0])
			dstLen := len(dstBuf)
			dstBuf = strconv.AppendUint(dstBuf, n, 10)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeUint16:
		for i, v := range values {
			if len(v) != 2 {
				return fmt.Errorf("unexpected value length for uint16; got %d; want 2", len(v))
			}
			b := bytesutil.ToUnsafeBytes(v)
			n := uint64(encoding.UnmarshalUint16(b))
			dstLen := len(dstBuf)
			dstBuf = strconv.AppendUint(dstBuf, n, 10)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeUint32:
		for i, v := range values {
			if len(v) != 4 {
				return fmt.Errorf("unexpected value length for uint32; got %d; want 4", len(v))
			}
			b := bytesutil.ToUnsafeBytes(v)
			n := uint64(encoding.UnmarshalUint32(b))
			dstLen := len(dstBuf)
			dstBuf = strconv.AppendUint(dstBuf, n, 10)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeUint64:
		for i, v := range values {
			if len(v) != 8 {
				return fmt.Errorf("unexpected value length for uint64; got %d; want 8", len(v))
			}
			b := bytesutil.ToUnsafeBytes(v)
			n := encoding.UnmarshalUint64(b)
			dstLen := len(dstBuf)
			dstBuf = strconv.AppendUint(dstBuf, n, 10)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeDict:
		dictValues := dict.values
		for i, v := range values {
			id := int(v[0])
			if id >= len(dictValues) {
				return fmt.Errorf("unexpected dictionary id: %d; it must be smaller than %d", id, len(dictValues))
			}
			values[i] = dictValues[id]
		}
	case valueTypeIPv4:
		for i, v := range values {
			if len(v) != 4 {
				return fmt.Errorf("unexpected value length for ipv4; got %d; want 4", len(v))
			}
			dstLen := len(dstBuf)
			dstBuf = toIPv4String(dstBuf, v)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeTimestampISO8601:
		for i, v := range values {
			if len(v) != 8 {
				return fmt.Errorf("unexpected value length for timestampISO8601; got %d; want 8", len(v))
			}
			dstLen := len(dstBuf)
			dstBuf = toTimestampISO8601String(dstBuf, v)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	case valueTypeFloat64:
		for i, v := range values {
			if len(v) != 8 {
				return fmt.Errorf("unexpected value length for float64; got %d; want 8", len(v))
			}
			dstLen := len(dstBuf)
			dstBuf = toFloat64String(dstBuf, v)
			values[i] = bytesutil.ToUnsafeString(dstBuf[dstLen:])
		}
	default:
		return fmt.Errorf("unknown valueType=%d", vt)
	}

	vd.buf = dstBuf
	return nil
}
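
// Note (added for clarity; illustrative, not part of the original file):
// decodeInplace rewrites the entries of values in place. For numeric types the
// decoded strings reference vd.buf - e.g. a one-byte valueTypeUint8 value 0x2a
// becomes the string "42" - so they stay valid only until vd.reset() is called.
// For valueTypeDict the entries point into dict.values instead. A minimal
// sketch for a previously dict-encoded block:
//
//	vd := getValuesDecoder()
//	if err := vd.decodeInplace(encodedValues, valueTypeDict, &dict); err != nil {
//		// handle the error
//	}
//	// Use the decoded strings before calling putValuesDecoder(vd).
//	putValuesDecoder(vd)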

func toTimestampISO8601String(dst []byte, v string) []byte {
	b := bytesutil.ToUnsafeBytes(v)
	n := encoding.UnmarshalUint64(b)
	t := time.Unix(0, int64(n)).UTC()
	dst = t.AppendFormat(dst, iso8601Timestamp)
	return dst
}

func toIPv4String(dst []byte, v string) []byte {
	dst = strconv.AppendUint(dst, uint64(v[0]), 10)
	dst = append(dst, '.')
	dst = strconv.AppendUint(dst, uint64(v[1]), 10)
	dst = append(dst, '.')
	dst = strconv.AppendUint(dst, uint64(v[2]), 10)
	dst = append(dst, '.')
	dst = strconv.AppendUint(dst, uint64(v[3]), 10)
	return dst
}

func toFloat64String(dst []byte, v string) []byte {
	b := bytesutil.ToUnsafeBytes(v)
	n := encoding.UnmarshalUint64(b)
	f := math.Float64frombits(n)
	dst = strconv.AppendFloat(dst, f, 'g', -1, 64)
	return dst
}

func getValuesDecoder() *valuesDecoder {
	v := valuesDecoderPool.Get()
	if v == nil {
		return &valuesDecoder{}
	}
	return v.(*valuesDecoder)
}

func putValuesDecoder(vd *valuesDecoder) {
	vd.reset()
	valuesDecoderPool.Put(vd)
}

var valuesDecoderPool sync.Pool

func tryTimestampISO8601Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
	u64s := encoding.GetUint64s(len(srcValues))
	defer encoding.PutUint64s(u64s)
	a := u64s.A
	var minValue, maxValue uint64
	for i, v := range srcValues {
		n, ok := tryParseTimestampISO8601(v)
		if !ok {
			return dstBuf, dstValues, valueTypeUnknown, 0, 0
		}
		a[i] = n
		if i == 0 || n < minValue {
			minValue = n
		}
		if i == 0 || n > maxValue {
			maxValue = n
		}
	}
	for _, n := range a {
		dstLen := len(dstBuf)
		dstBuf = encoding.MarshalUint64(dstBuf, n)
		v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
		dstValues = append(dstValues, v)
	}
	return dstBuf, dstValues, valueTypeTimestampISO8601, minValue, maxValue
}

func tryParseTimestampISO8601(s string) (uint64, bool) {
	// Do not parse timestamps with a timezone, since they cannot be converted back
	// to the same string representation in the general case.
	// This may break search.
	if len(s) != len("2006-01-02T15:04:05.000Z") {
		return 0, false
	}

	// Parse year
	if s[len("YYYY")] != '-' {
		return 0, false
	}
	yearStr := s[:len("YYYY")]
	n, ok := tryParseUint64(yearStr)
	if !ok || n > 3000 {
		return 0, false
	}
	year := int(n)
	s = s[len("YYYY")+1:]

	// Parse month
	if s[len("MM")] != '-' {
		return 0, false
	}
	monthStr := s[:len("MM")]
	n, ok = tryParseUint64(monthStr)
	if !ok || n < 1 || n > 12 {
		return 0, false
	}
	month := time.Month(n)
	s = s[len("MM")+1:]

	// Parse day
	if s[len("DD")] != 'T' {
		return 0, false
	}
	dayStr := s[:len("DD")]
	n, ok = tryParseUint64(dayStr)
	if !ok || n < 1 || n > 31 {
		return 0, false
	}
	day := int(n)
	s = s[len("DD")+1:]

	// Parse hour
	if s[len("HH")] != ':' {
		return 0, false
	}
	hourStr := s[:len("HH")]
	n, ok = tryParseUint64(hourStr)
	if !ok || n > 23 {
		return 0, false
	}
	hour := int(n)
	s = s[len("HH")+1:]

	// Parse minute
	if s[len("MM")] != ':' {
		return 0, false
	}
	minuteStr := s[:len("MM")]
	n, ok = tryParseUint64(minuteStr)
	if !ok || n > 59 {
		return 0, false
	}
	minute := int(n)
	s = s[len("MM")+1:]

	// Parse second
	if s[len("SS")] != '.' {
		return 0, false
	}
	secondStr := s[:len("SS")]
	n, ok = tryParseUint64(secondStr)
	if !ok || n > 59 {
		return 0, false
	}
	second := int(n)
	s = s[len("SS")+1:]

	// Parse millisecond
	tzDelimiter := s[len("000")]
	if tzDelimiter != 'Z' {
		return 0, false
	}
	millisecondStr := s[:len("000")]
	n, ok = tryParseUint64(millisecondStr)
	if !ok || n > 999 {
		return 0, false
	}
	millisecond := int(n)
	s = s[len("000")+1:]

	if len(s) != 0 {
		return 0, false
	}

	t := time.Date(year, month, day, hour, minute, second, millisecond*1e6, time.UTC)
	ts := t.UnixNano()
	return uint64(ts), true
}
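
// Examples (taken from the tests below; added here for illustration):
//
//	tryParseTimestampISO8601("2023-01-15T23:45:51.123Z")      // 1673826351123000000, true
//	tryParseTimestampISO8601("2023-01-16T00:45:51.123+01:00") // 0, false - timezone suffixes are rejected
//	tryParseTimestampISO8601("2023-01-15T22:15:51.12Z")       // 0, false - milliseconds must have exactly 3 digits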

func tryParseUint64(s string) (uint64, bool) {
	if len(s) == 0 || len(s) > 18 {
		return 0, false
	}
	n := uint64(0)
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch < '0' || ch > '9' {
			return 0, false
		}
		n *= 10
		n += uint64(ch - '0')
	}
	return n, true
}
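
// Note (added for clarity): the len(s) > 18 check keeps accepted inputs below 10^18,
// so the accumulation above cannot overflow uint64. For example (from the tests
// below), "123456789012345678" parses fine, while the 19-digit
// "1234567890123456789" is rejected.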

const iso8601Timestamp = "2006-01-02T15:04:05.000Z"

func tryIPv4Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
	u32s := encoding.GetUint32s(len(srcValues))
	defer encoding.PutUint32s(u32s)
	a := u32s.A
	var minValue, maxValue uint32
	for i, v := range srcValues {
		n, ok := tryParseIPv4(v)
		if !ok {
			return dstBuf, dstValues, valueTypeUnknown, 0, 0
		}
		a[i] = n
		if i == 0 || n < minValue {
			minValue = n
		}
		if i == 0 || n > maxValue {
			maxValue = n
		}
	}
	for _, n := range a {
		dstLen := len(dstBuf)
		dstBuf = encoding.MarshalUint32(dstBuf, n)
		v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
		dstValues = append(dstValues, v)
	}
	return dstBuf, dstValues, valueTypeIPv4, uint64(minValue), uint64(maxValue)
}

func tryParseIPv4(s string) (uint32, bool) {
	if len(s) < len("1.1.1.1") || len(s) > len("255.255.255.255") || strings.Count(s, ".") != 3 {
		// Fast path - the entry isn't IPv4
		return 0, false
	}

	var octets [4]byte
	var v uint64
	var ok bool

	// Parse octet 1
	n := strings.IndexByte(s, '.')
	if n <= 0 || n > 3 {
		return 0, false
	}
	v, ok = tryParseUint64(s[:n])
	if !ok || v > 255 {
		return 0, false
	}
	octets[0] = byte(v)
	s = s[n+1:]

	// Parse octet 2
	n = strings.IndexByte(s, '.')
	if n <= 0 || n > 3 {
		return 0, false
	}
	v, ok = tryParseUint64(s[:n])
	if !ok || v > 255 {
		return 0, false
	}
	octets[1] = byte(v)
	s = s[n+1:]

	// Parse octet 3
	n = strings.IndexByte(s, '.')
	if n <= 0 || n > 3 {
		return 0, false
	}
	v, ok = tryParseUint64(s[:n])
	if !ok || v > 255 {
		return 0, false
	}
	octets[2] = byte(v)
	s = s[n+1:]

	// Parse octet 4
	v, ok = tryParseUint64(s)
	if !ok || v > 255 {
		return 0, false
	}
	octets[3] = byte(v)

	ipv4 := encoding.UnmarshalUint32(octets[:])
	return ipv4, true
}
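
// Examples (taken from the tests below; added here for illustration):
//
//	tryParseIPv4("1.2.3.4")         // 0x01020304, true - octets are packed big-endian
//	tryParseIPv4("127.0.0.1")       // 0x7f000001, true
//	tryParseIPv4("256.127.127.127") // 0, false - each octet must fit into a byte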

func tryFloat64Encoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
	u64s := encoding.GetUint64s(len(srcValues))
	defer encoding.PutUint64s(u64s)
	a := u64s.A
	var minValue, maxValue float64
	for i, v := range srcValues {
		f, ok := tryParseFloat64(v)
		if !ok {
			return dstBuf, dstValues, valueTypeUnknown, 0, 0
		}
		a[i] = math.Float64bits(f)
		if i == 0 || f < minValue {
			minValue = f
		}
		if i == 0 || f > maxValue {
			maxValue = f
		}
	}
	for _, n := range a {
		dstLen := len(dstBuf)
		dstBuf = encoding.MarshalUint64(dstBuf, n)
		v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
		dstValues = append(dstValues, v)
	}
	minValueU64 := math.Float64bits(minValue)
	maxValueU64 := math.Float64bits(maxValue)
	return dstBuf, dstValues, valueTypeFloat64, minValueU64, maxValueU64
}
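
// Note (added for clarity): the min/max values are returned as math.Float64bits
// so that they fit the uint64 min/max slots shared by all the encoders; e.g. in
// the tests below 4607182418800017408 is the bit pattern of 1.0 and
// 4613937818241073152 is the bit pattern of 3.0.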

func tryParseFloat64(s string) (float64, bool) {
	if len(s) == 0 || len(s) > 20 {
		return 0, false
	}
	// Allow only decimal digits, minus and a dot.
	// Do not allow scientific notation (for example 1.23E+05),
	// since it cannot be converted back to the same string form.

	minus := s[0] == '-'
	if minus {
		s = s[1:]
	}
	n := strings.IndexByte(s, '.')
	if n < 0 {
		// Fast path - there are no dots.
		n, ok := tryParseUint64(s)
		if !ok {
			return 0, false
		}
		f := float64(n)
		if minus {
			f = -f
		}
		return f, true
	}
	if n == 0 || n == len(s)-1 {
		// Do not allow dots at the beginning and at the end of s,
		// since they cannot be converted back to the same string form.
		return 0, false
	}
	sInt := s[:n]
	sFrac := s[n+1:]
	nInt, ok := tryParseUint64(sInt)
	if !ok {
		return 0, false
	}
	nFrac, ok := tryParseUint64(sFrac)
	if !ok {
		return 0, false
	}
	f := math.FMA(float64(nFrac), math.Pow10(-len(sFrac)), float64(nInt))
	if minus {
		f = -f
	}
	return f, true
}
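
// Examples (taken from the tests below; added here for illustration):
//
//	tryParseFloat64("-1.234567") // -1.234567, true
//	tryParseFloat64("+123")      // 0, false - a leading plus cannot be round-tripped
//	tryParseFloat64("12e5")      // 0, false - scientific notation is rejected
//	tryParseFloat64(".123")      // 0, false - no leading or trailing dots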

func tryUintEncoding(dstBuf []byte, dstValues, srcValues []string) ([]byte, []string, valueType, uint64, uint64) {
	u64s := encoding.GetUint64s(len(srcValues))
	defer encoding.PutUint64s(u64s)
	a := u64s.A
	var minValue, maxValue uint64
	for i, v := range srcValues {
		n, ok := tryParseUint64(v)
		if !ok {
			return dstBuf, dstValues, valueTypeUnknown, 0, 0
		}
		a[i] = n
		if i == 0 || n < minValue {
			minValue = n
		}
		if i == 0 || n > maxValue {
			maxValue = n
		}
	}

	minBitSize := bits.Len64(maxValue)
	if minBitSize <= 8 {
		for _, n := range a {
			dstLen := len(dstBuf)
			dstBuf = append(dstBuf, byte(n))
			v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
			dstValues = append(dstValues, v)
		}
		return dstBuf, dstValues, valueTypeUint8, minValue, maxValue
	}
	if minBitSize <= 16 {
		for _, n := range a {
			dstLen := len(dstBuf)
			dstBuf = encoding.MarshalUint16(dstBuf, uint16(n))
			v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
			dstValues = append(dstValues, v)
		}
		return dstBuf, dstValues, valueTypeUint16, minValue, maxValue
	}
	if minBitSize <= 32 {
		for _, n := range a {
			dstLen := len(dstBuf)
			dstBuf = encoding.MarshalUint32(dstBuf, uint32(n))
			v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
			dstValues = append(dstValues, v)
		}
		return dstBuf, dstValues, valueTypeUint32, minValue, maxValue
	}
	for _, n := range a {
		dstLen := len(dstBuf)
		dstBuf = encoding.MarshalUint64(dstBuf, n)
		v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
		dstValues = append(dstValues, v)
	}
	return dstBuf, dstValues, valueTypeUint64, minValue, maxValue
}
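
// Note (added for clarity): the width is chosen from bits.Len64 of the largest
// value in the block, so every value in the block is stored with the same fixed
// size. For example, a block whose values all fit into 0..255 is encoded as
// valueTypeUint8 with one byte per value, while a single value of 2^32 or more
// pushes the whole block to valueTypeUint64 with eight bytes per value.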

func tryDictEncoding(dstBuf []byte, dstValues, srcValues []string, dict *valuesDict) ([]byte, []string, valueType) {
	dict.reset()
	dstBufOrig := dstBuf
	dstValuesOrig := dstValues

	for _, v := range srcValues {
		id, ok := dict.getOrAdd(v)
		if !ok {
			dict.reset()
			return dstBufOrig, dstValuesOrig, valueTypeUnknown
		}
		dstLen := len(dstBuf)
		dstBuf = append(dstBuf, id)
		v := bytesutil.ToUnsafeString(dstBuf[dstLen:])
		dstValues = append(dstValues, v)
	}
	return dstBuf, dstValues, valueTypeDict
}
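
// Note (added for clarity): dict encoding succeeds only while every distinct
// value in the block fits into the dict (at most maxDictLen entries with at
// most maxDictSizeBytes total bytes); otherwise it bails out with
// valueTypeUnknown and the caller falls through to the other encoders. This is
// why the test below with maxDictLen+1 distinct strings ends up with
// valueTypeString.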

type valuesDict struct {
	values []string
}

func (vd *valuesDict) reset() {
	vs := vd.values
	for i := range vs {
		vs[i] = ""
	}
	vd.values = vs[:0]
}

func (vd *valuesDict) copyFrom(src *valuesDict) {
	vd.reset()

	vd.values = append(vd.values[:0], src.values...)
}

func (vd *valuesDict) getOrAdd(k string) (byte, bool) {
	if len(k) > maxDictSizeBytes {
		return 0, false
	}
	vs := vd.values
	dictSizeBytes := 0
	for i, v := range vs {
		if k == v {
			return byte(i), true
		}
		dictSizeBytes += len(v)
	}
	if len(vs) >= maxDictLen || dictSizeBytes+len(k) > maxDictSizeBytes {
		return 0, false
	}
	vs = append(vs, k)
	vd.values = vs

	return byte(len(vs) - 1), true
}

func (vd *valuesDict) marshal(dst []byte) []byte {
	values := vd.values
	if len(values) > maxDictLen {
		logger.Panicf("BUG: valuesDict may contain max %d items; got %d items", maxDictLen, len(values))
	}
	dst = append(dst, byte(len(values)))
	for _, v := range values {
		dst = encoding.MarshalBytes(dst, bytesutil.ToUnsafeBytes(v))
	}
	return dst
}
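
// Note (added for clarity): the serialized dict starts with a single byte
// holding the number of entries, followed by each value marshaled via
// encoding.MarshalBytes (a length prefix plus the raw bytes), which is what
// unmarshal below reads back.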

func (vd *valuesDict) unmarshal(src []byte) ([]byte, error) {
	vd.reset()

	srcOrig := src
	if len(src) < 1 {
		return srcOrig, fmt.Errorf("cannot unmarshal dict len from 0 bytes; need at least 1 byte")
	}
	dictLen := int(src[0])
	src = src[1:]
	for i := 0; i < dictLen; i++ {
		tail, data, err := encoding.UnmarshalBytes(src)
		if err != nil {
			return srcOrig, fmt.Errorf("cannot unmarshal value %d out of %d from dict: %w", i, dictLen, err)
		}
		src = tail
		// Do not use bytesutil.InternBytes(data) here, since it works slower than string(data) in prod
		v := string(data)
		vd.values = append(vd.values, v)
	}
	return src, nil
}

// maxDictSizeBytes is the maximum total length of all the keys in the valuesDict.
const maxDictSizeBytes = 256

// maxDictLen is the maximum number of entries in the valuesDict.
//
// It must not exceed 255, since the dict len is marshaled into a single byte.
const maxDictLen = 8
228
lib/logstorage/values_encoder_test.go
Normal file

@@ -0,0 +1,228 @@
package logstorage

import (
	"fmt"
	"math"
	"reflect"
	"testing"
)

func TestValuesEncoder(t *testing.T) {
	f := func(values []string, expectedValueType valueType, expectedMinValue, expectedMaxValue uint64) {
		t.Helper()
		ve := getValuesEncoder()
		var dict valuesDict
		vt, minValue, maxValue := ve.encode(values, &dict)
		if vt != expectedValueType {
			t.Fatalf("unexpected value type; got %d; want %d", vt, expectedValueType)
		}
		if minValue != expectedMinValue {
			t.Fatalf("unexpected minValue; got %d; want %d", minValue, expectedMinValue)
		}
		if maxValue != expectedMaxValue {
			t.Fatalf("unexpected maxValue; got %d; want %d", maxValue, expectedMaxValue)
		}
		encodedValues := append([]string{}, ve.values...)
		putValuesEncoder(ve)

		vd := getValuesDecoder()
		if err := vd.decodeInplace(encodedValues, vt, &dict); err != nil {
			t.Fatalf("unexpected error in decodeInplace(): %s", err)
		}
		if len(values) == 0 {
			values = []string{}
		}
		if !reflect.DeepEqual(values, encodedValues) {
			t.Fatalf("unexpected values decoded\ngot\n%q\nwant\n%q", encodedValues, values)
		}
		putValuesDecoder(vd)
	}

	// An empty values list
	f(nil, valueTypeString, 0, 0)

	// string values
	values := make([]string, maxDictLen+1)
	for i := range values {
		values[i] = fmt.Sprintf("value_%d", i)
	}
	f(values, valueTypeString, 0, 0)

	// dict values
	f([]string{"foobar"}, valueTypeDict, 0, 0)
	f([]string{"foo", "bar"}, valueTypeDict, 0, 0)
	f([]string{"1", "2foo"}, valueTypeDict, 0, 0)

	// uint8 values
	for i := range values {
		values[i] = fmt.Sprintf("%d", i+1)
	}
	f(values, valueTypeUint8, 1, uint64(len(values)))

	// uint16 values
	for i := range values {
		values[i] = fmt.Sprintf("%d", (i+1)<<8)
	}
	f(values, valueTypeUint16, 1<<8, uint64(len(values)<<8))

	// uint32 values
	for i := range values {
		values[i] = fmt.Sprintf("%d", (i+1)<<16)
	}
	f(values, valueTypeUint32, 1<<16, uint64(len(values)<<16))

	// uint64 values
	for i := range values {
		values[i] = fmt.Sprintf("%d", (i+1)<<32)
	}
	f(values, valueTypeUint64, 1<<32, uint64(len(values)<<32))

	// ipv4 values
	for i := range values {
		values[i] = fmt.Sprintf("1.2.3.%d", i)
	}
	f(values, valueTypeIPv4, 16909056, 16909064)

	// iso8601 timestamps
	for i := range values {
		values[i] = fmt.Sprintf("2011-04-19T03:44:01.%03dZ", i)
	}
	f(values, valueTypeTimestampISO8601, 1303184641000000000, 1303184641008000000)

	// float64 values
	for i := range values {
		values[i] = fmt.Sprintf("%g", math.Sqrt(float64(i+1)))
	}
	f(values, valueTypeFloat64, 4607182418800017408, 4613937818241073152)
}

func TestTryParseIPv4(t *testing.T) {
	f := func(s string, nExpected uint32, okExpected bool) {
		t.Helper()
		n, ok := tryParseIPv4(s)
		if n != nExpected {
			t.Fatalf("unexpected n; got %d; want %d", n, nExpected)
		}
		if ok != okExpected {
			t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
		}
	}

	f("", 0, false)
	f("foo", 0, false)
	f("a.b.c.d", 0, false)
	f("1.2.3.4", 0x01020304, true)
	f("255.255.255.255", 0xffffffff, true)
	f("0.0.0.0", 0, true)
	f("127.0.0.1", 0x7f000001, true)
	f("127.0.0.x", 0, false)
	f("127.0.x.0", 0, false)
	f("127.x.0.0", 0, false)
	f("x.0.0.0", 0, false)
	f("127.127.127.256", 0, false)
	f("127.127.256.127", 0, false)
	f("127.256.127.127", 0, false)
	f("256.127.127.127", 0, false)
	f("-1.127.127.127", 0, false)
	f("127.-1.127.127", 0, false)
	f("127.127.-1.127", 0, false)
	f("127.127.127.-1", 0, false)
}

func TestTryParseTimestampISO8601(t *testing.T) {
	f := func(s string, timestampExpected uint64, okExpected bool) {
		t.Helper()
		timestamp, ok := tryParseTimestampISO8601(s)
		if timestamp != timestampExpected {
			t.Fatalf("unexpected timestamp; got %d; want %d", timestamp, timestampExpected)
		}
		if ok != okExpected {
			t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
		}
	}

	f("2023-01-15T23:45:51.123Z", 1673826351123000000, true)

	// Invalid milliseconds
	f("2023-01-15T22:15:51.12345Z", 0, false)
	f("2023-01-15T22:15:51.12Z", 0, false)
	f("2023-01-15T22:15:51Z", 0, false)

	// Missing Z
	f("2023-01-15T23:45:51.123", 0, false)

	// Invalid timestamp
	f("foo", 0, false)
	f("2023-01-15T23:45:51.123Zxyabcd", 0, false)
	f("2023-01-15T23:45:51.123Z01:00", 0, false)

	// timestamp with timezone
	f("2023-01-16T00:45:51.123+01:00", 0, false)
}

func TestTryParseFloat64(t *testing.T) {
	f := func(s string, valueExpected float64, okExpected bool) {
		t.Helper()

		value, ok := tryParseFloat64(s)
		if value != valueExpected {
			t.Fatalf("unexpected value; got %v; want %v", value, valueExpected)
		}
		if ok != okExpected {
			t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
		}
	}

	f("0", 0, true)
	f("1234567890", 1234567890, true)
	f("-1.234567", -1.234567, true)

	// Empty value
	f("", 0, false)

	// A plus in the value isn't allowed, since it cannot be converted back to the same string representation
	f("+123", 0, false)

	// A dot at the beginning or the end of the value isn't allowed, since it cannot be converted back to the same string representation
	f(".123", 0, false)
	f("123.", 0, false)

	// Multiple dots aren't allowed
	f("123.434.55", 0, false)

	// Invalid dots
	f("-.123", 0, false)
	f(".", 0, false)

	// Scientific notation isn't allowed, since it cannot be converted back to the same string representation
	f("12e5", 0, false)

	// A minus in the middle of the string isn't allowed
	f("12-5", 0, false)
}

func TestTryParseUint64(t *testing.T) {
	f := func(s string, valueExpected uint64, okExpected bool) {
		t.Helper()

		value, ok := tryParseUint64(s)
		if value != valueExpected {
			t.Fatalf("unexpected value; got %d; want %d", value, valueExpected)
		}
		if ok != okExpected {
			t.Fatalf("unexpected ok; got %v; want %v", ok, okExpected)
		}
	}

	f("0", 0, true)
	f("123456789012345678", 123456789012345678, true)

	// empty value
	f("", 0, false)

	// too big value
	f("1234567890123456789", 0, false)

	// invalid value
	f("foo", 0, false)
}
98
lib/logstorage/values_encoder_timing_test.go
Normal file

@@ -0,0 +1,98 @@
package logstorage

import (
	"fmt"
	"testing"
)

func BenchmarkTryParseTimestampISO8601(b *testing.B) {
	a := []string{
		"2023-01-15T23:45:51.123Z",
		"2023-02-15T23:45:51.123Z",
		"2023-03-15T23:45:51.123Z",
		"2023-02-15T22:45:51.123Z",
		"2023-02-15T22:45:51.000Z",
	}

	b.SetBytes(int64(len(a)))
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			for _, s := range a {
				_, ok := tryParseTimestampISO8601(s)
				if !ok {
					panic(fmt.Errorf("cannot parse timestamp %q", s))
				}
			}
		}
	})
}

func BenchmarkTryParseIPv4(b *testing.B) {
	a := []string{
		"1.2.3.4",
		"127.0.0.1",
		"255.255.255.255",
		"192.43.234.22",
		"32.34.54.198",
	}

	b.SetBytes(int64(len(a)))
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			for _, s := range a {
				_, ok := tryParseIPv4(s)
				if !ok {
					panic(fmt.Errorf("cannot parse ipv4 %q", s))
				}
			}
		}
	})
}

func BenchmarkTryParseUint64(b *testing.B) {
	a := []string{
		"1234",
		"483932",
		"28494",
		"90012",
		"889111",
	}

	b.SetBytes(int64(len(a)))
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			for _, s := range a {
				_, ok := tryParseUint64(s)
				if !ok {
					panic(fmt.Errorf("cannot parse uint %q", s))
				}
			}
		}
	})
}

func BenchmarkTryParseFloat64(b *testing.B) {
	a := []string{
		"1.234",
		"4.545",
		"456.5645",
		"-123.434",
		"434.322",
	}

	b.SetBytes(int64(len(a)))
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			for _, s := range a {
				_, ok := tryParseFloat64(s)
				if !ok {
					panic(fmt.Errorf("cannot parse float64 %q", s))
				}
			}
		}
	})
}