VictoriaMetrics/lib/logstorage/part_header.go

package logstorage

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)

// partHeader contains the information about a single part.
//
// It is marshaled to JSON by mustWriteMetadata and parsed back by mustReadMetadata,
// so the field names below are also the keys of the stored metadata.
type partHeader struct {
	// FormatVersion is the version of the part format.
	//
	// mustReadMetadata refuses versions above partFormatLatestVersion.
	FormatVersion uint

	// CompressedSizeBytes is physical size of the part
	CompressedSizeBytes uint64

	// UncompressedSizeBytes is the original size of log entries stored in the part
	UncompressedSizeBytes uint64

	// RowsCount is the number of log entries in the part.
	//
	// mustReadMetadata requires it to be at least BlocksCount.
	RowsCount uint64

	// BlocksCount is the number of blocks in the part
	BlocksCount uint64

	// MinTimestamp is the minimum timestamp seen in the part.
	//
	// String() renders it via timestampToString, which treats the value as nanoseconds since the Unix epoch.
	MinTimestamp int64

	// MaxTimestamp is the maximum timestamp seen in the part.
	//
	// mustReadMetadata requires it to be at least MinTimestamp.
	MaxTimestamp int64

	// BloomValuesShardsCount is the number of (bloom, values) shards in the part.
	//
	// It must be zero in the stored metadata when FormatVersion<=1;
	// mustReadMetadata then substitutes 8 for FormatVersion==1.
	BloomValuesShardsCount uint64

	// BloomValuesFieldsCount is the number of fields with (bloom, values) pairs in the given part.
	//
	// It must be zero in the stored metadata when FormatVersion<=1;
	// mustReadMetadata then substitutes bloomValuesMaxShardsCount for FormatVersion==1.
	BloomValuesFieldsCount uint64
}

// reset returns ph to its zero state, so it can be re-used.
func (ph *partHeader) reset() {
	// All the partHeader fields are plain integers, so assigning
	// the zero value clears every one of them in a single step.
	*ph = partHeader{}
}

// String returns a human-readable representation of ph.
//
// MinTimestamp and MaxTimestamp are rendered via timestampToString,
// while the remaining fields are printed as decimal numbers.
func (ph *partHeader) String() string {
	const format = "{FormatVersion=%d, CompressedSizeBytes=%d, UncompressedSizeBytes=%d, RowsCount=%d, BlocksCount=%d, " +
		"MinTimestamp=%s, MaxTimestamp=%s, BloomValuesShardsCount=%d, BloomValuesFieldsCount=%d}"

	minTimestamp := timestampToString(ph.MinTimestamp)
	maxTimestamp := timestampToString(ph.MaxTimestamp)

	return fmt.Sprintf(format,
		ph.FormatVersion,
		ph.CompressedSizeBytes,
		ph.UncompressedSizeBytes,
		ph.RowsCount,
		ph.BlocksCount,
		minTimestamp,
		maxTimestamp,
		ph.BloomValuesShardsCount,
		ph.BloomValuesFieldsCount,
	)
}

// mustReadMetadata loads part metadata from the metadataFilename file inside partPath into ph.
//
// ph is reset before the file is parsed, so no values from a previous use survive.
// Any failure - unreadable file, malformed JSON or inconsistent values - is reported via logger.Panicf.
func (ph *partHeader) mustReadMetadata(partPath string) {
	ph.reset()

	metadataPath := filepath.Join(partPath, metadataFilename)
	metadata, err := os.ReadFile(metadataPath)
	if err != nil {
		logger.Panicf("FATAL: cannot read %q: %s", metadataPath, err)
	}
	if err := json.Unmarshal(metadata, ph); err != nil {
		logger.Panicf("FATAL: cannot parse %q: %s", metadataPath, err)
	}

	if ph.FormatVersion <= 1 {
		// The bloom-values counters must be absent (zero) in the metadata of parts with FormatVersion<=1.
		if ph.BloomValuesShardsCount != 0 {
			logger.Panicf("FATAL: unexpected BloomValuesShardsCount for FormatVersion<=1; got %d; want 0", ph.BloomValuesShardsCount)
		}
		if ph.BloomValuesFieldsCount != 0 {
			logger.Panicf("FATAL: unexpected BloomValuesFieldsCount for FormatVersion<=1; got %d; want 0", ph.BloomValuesFieldsCount)
		}
		if ph.FormatVersion == 1 {
			// Substitute fixed values for the counters missing in FormatVersion==1 metadata.
			ph.BloomValuesShardsCount = 8
			ph.BloomValuesFieldsCount = bloomValuesMaxShardsCount
		}
	}

	// Perform various checks
	if ph.FormatVersion > partFormatLatestVersion {
		// Pass both the version found in the metadata and the supported maximum,
		// so every %d verb in the message gets its operand.
		logger.Panicf("FATAL: unsupported part format version; got %d; mustn't exceed %d", ph.FormatVersion, partFormatLatestVersion)
	}
	if ph.MinTimestamp > ph.MaxTimestamp {
		logger.Panicf("FATAL: MinTimestamp cannot exceed MaxTimestamp; got %d vs %d", ph.MinTimestamp, ph.MaxTimestamp)
	}
	if ph.BlocksCount > ph.RowsCount {
		logger.Panicf("FATAL: BlocksCount=%d cannot exceed RowsCount=%d", ph.BlocksCount, ph.RowsCount)
	}
}

// mustWriteMetadata marshals ph to JSON and stores the result
// in the metadataFilename file inside partPath via fs.MustWriteSync.
func (ph *partHeader) mustWriteMetadata(partPath string) {
	data, err := json.Marshal(ph)
	if err != nil {
		logger.Panicf("BUG: cannot marshal partHeader: %s", err)
	}

	fs.MustWriteSync(filepath.Join(partPath, metadataFilename), data)
}

// timestampToString formats timestamp, given in nanoseconds since the Unix epoch,
// as a UTC "YYYYMMDDhhmmss" string immediately followed by nine digits of nanoseconds.
func timestampToString(timestamp int64) string {
	formatted := time.Unix(0, timestamp).UTC().Format(timestampForPathname)

	// The layout contains a dot in order to emit fractional seconds;
	// drop its first occurrence, so the result consists of digits only.
	dot := strings.IndexByte(formatted, '.')
	if dot < 0 {
		return formatted
	}
	return formatted[:dot] + formatted[dot+1:]
}

// timestampForPathname is the time layout used by timestampToString.
const timestampForPathname = "20060102150405.000000000"
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`package logstorage`

			`import (`
			`"encoding/json"`
			`"fmt"`
			`"os"`
			`"path/filepath"`
			`"strings"`
			`"time"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"`
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"`
			`)`

			`// partHeader contains the information about a single part`
			`type partHeader struct {`
lib/logstorage: refactor storage format to be more efficient for querying wide events It turned out that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kubernetes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referenced in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. 
The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge . 2024-10-16 14:18:28 +00:00			`// FormatVersion is the version of the part format`
			`FormatVersion uint`

app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`// CompressedSizeBytes is physical size of the part`
			`CompressedSizeBytes uint64`

			`// UncompressedSizeBytes is the original size of log entries stored in the part`
			`UncompressedSizeBytes uint64`

			`// RowsCount is the number of log entries in the part`
			`RowsCount uint64`

			`// BlocksCount is the number of blocks in the part`
			`BlocksCount uint64`

			`// MinTimestamp is the minimum timestamp seen in the part`
			`MinTimestamp int64`

			`// MaxTimestamp is the maximum timestamp seen in the part`
			`MaxTimestamp int64`
lib/logstorage: dynamically adjust the number of (bloom, values) shards in a part depending on the number of non-const columns This allows reducing the amount of data that must be read during queries over logs with a big number of fields (aka "wide events"). This, in turn, improves query performance when the data that needs to be scanned during the query doesn't fit the OS page cache. 2024-10-29 10:28:41 +00:00
			`// BloomValuesShardsCount is the number of (bloom, values) shards in the part.`
			`BloomValuesShardsCount uint64`

			`// BloomValuesFieldsCount is the number of fields with (bloom, values) pairs in the given part.`
			`BloomValuesFieldsCount uint64`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`}`

			`// reset resets ph for subsequent re-use`
			`func (ph *partHeader) reset() {`
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. 
The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge . 2024-10-16 14:18:28 +00:00			`ph.FormatVersion = 0`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`ph.CompressedSizeBytes = 0`
			`ph.UncompressedSizeBytes = 0`
			`ph.RowsCount = 0`
			`ph.BlocksCount = 0`
			`ph.MinTimestamp = 0`
			`ph.MaxTimestamp = 0`
lib/logstorage: dynamically adjust the number of (bloom, values) shards in a part depending on the number of non-const columns This allows reducing the amounts of data, which must be read during queries over logs with big number of fields (aka "wide events"). This, in turn, improves query performance when the data, which needs to be scanned during the query, doesn't fit OS page cache. 2024-10-29 10:28:41 +00:00			`ph.BloomValuesShardsCount = 0`
			`ph.BloomValuesFieldsCount = 0`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`}`

			`// String returns string represenation for ph.`
			`func (ph *partHeader) String() string {`
lib/logstorage: dynamically adjust the number of (bloom, values) shards in a part depending on the number of non-const columns This allows reducing the amounts of data, which must be read during queries over logs with big number of fields (aka "wide events"). This, in turn, improves query performance when the data, which needs to be scanned during the query, doesn't fit OS page cache. 2024-10-29 10:28:41 +00:00			`return fmt.Sprintf("{FormatVersion=%d, CompressedSizeBytes=%d, UncompressedSizeBytes=%d, RowsCount=%d, BlocksCount=%d, "+`
			`"MinTimestamp=%s, MaxTimestamp=%s, BloomValuesShardsCount=%d, BloomValuesFieldsCount=%d}",`
			`ph.FormatVersion, ph.CompressedSizeBytes, ph.UncompressedSizeBytes, ph.RowsCount, ph.BlocksCount,`
			`timestampToString(ph.MinTimestamp), timestampToString(ph.MaxTimestamp), ph.BloomValuesShardsCount, ph.BloomValuesFieldsCount)`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`}`

			`func (ph *partHeader) mustReadMetadata(partPath string) {`
			`ph.reset()`

			`metadataPath := filepath.Join(partPath, metadataFilename)`
			`metadata, err := os.ReadFile(metadataPath)`
			`if err != nil {`
			`logger.Panicf("FATAL: cannot read %q: %s", metadataPath, err)`
			`}`
			`if err := json.Unmarshal(metadata, ph); err != nil {`
			`logger.Panicf("FATAL: cannot parse %q: %s", metadataPath, err)`
			`}`

lib/logstorage: dynamically adjust the number of (bloom, values) shards in a part depending on the number of non-const columns This allows reducing the amounts of data, which must be read during queries over logs with big number of fields (aka "wide events"). This, in turn, improves query performance when the data, which needs to be scanned during the query, doesn't fit OS page cache. 2024-10-29 10:28:41 +00:00			`if ph.FormatVersion <= 1 {`
			`if ph.BloomValuesShardsCount != 0 {`
			`logger.Panicf("FATAL: unexpected BloomValuesShardsCount for FormatVersion<=1; got %d; want 0", ph.BloomValuesShardsCount)`
			`}`
			`if ph.BloomValuesFieldsCount != 0 {`
			`logger.Panicf("FATAL: unexpected BloomValuesFieldsCount for FormatVersion<=1; got %d; want 0", ph.BloomValuesFieldsCount)`
			`}`
			`if ph.FormatVersion == 1 {`
			`ph.BloomValuesShardsCount = 8`
			`ph.BloomValuesFieldsCount = bloomValuesMaxShardsCount`
			`}`
			`}`

app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`// Perform various checks`
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. 
The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge . 2024-10-16 14:18:28 +00:00			`if ph.FormatVersion > partFormatLatestVersion {`
			`logger.Panicf("FATAL: unsupported part format version; got %d; mustn't exceed %d", partFormatLatestVersion)`
			`}`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`if ph.MinTimestamp > ph.MaxTimestamp {`
			`logger.Panicf("FATAL: MinTimestamp cannot exceed MaxTimestamp; got %d vs %d", ph.MinTimestamp, ph.MaxTimestamp)`
			`}`
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. 
The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge . 2024-10-16 14:18:28 +00:00			`if ph.BlocksCount > ph.RowsCount {`
			`logger.Panicf("FATAL: BlocksCount=%d cannot exceed RowsCount=%d", ph.BlocksCount, ph.RowsCount)`
			`}`
app/victoria-logs: initial code release 2023-06-20 05:55:12 +00:00			`}`

			`func (ph *partHeader) mustWriteMetadata(partPath string) {`
			`metadata, err := json.Marshal(ph)`
			`if err != nil {`
			`logger.Panicf("BUG: cannot marshal partHeader: %s", err)`
			`}`
			`metadataPath := filepath.Join(partPath, metadataFilename)`
			`fs.MustWriteSync(metadataPath, metadata)`
			`}`

			`func timestampToString(timestamp int64) string {`
			`t := time.Unix(0, timestamp).UTC()`
			`return strings.Replace(t.Format(timestampForPathname), ".", "", 1)`
			`}`

			`const timestampForPathname = "20060102150405.000000000"`