VictoriaMetrics/lib/logstorage/inmemory_part_test.go

344 lines
11 KiB
Go
Raw Normal View History

package logstorage
import (
"fmt"
"math"
"math/rand"
"reflect"
"sort"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
func TestInmemoryPartMustInitFromRows(t *testing.T) {
f := func(lr *LogRows, blocksCountExpected int, compressionRateExpected float64) {
t.Helper()
uncompressedSizeBytesExpected := uncompressedRowsSizeBytes(lr.rows)
rowsCountExpected := len(lr.timestamps)
minTimestampExpected := int64(math.MaxInt64)
maxTimestampExpected := int64(math.MinInt64)
// make a copy of lr - it is used for comapring the results later,
// since lr may be modified by inmemoryPart.mustInitFromRows()
lrOrig := GetLogRows(nil, nil)
for i, timestamp := range lr.timestamps {
if timestamp < minTimestampExpected {
minTimestampExpected = timestamp
}
if timestamp > maxTimestampExpected {
maxTimestampExpected = timestamp
}
lrOrig.mustAddInternal(lr.streamIDs[i], timestamp, lr.rows[i], lr.streamTagsCanonicals[i])
}
// Create inmemory part from lr
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
// Check mp.ph
ph := &mp.ph
checkCompressionRate(t, ph, compressionRateExpected)
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
t.Fatalf("unexpected UncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
}
if ph.RowsCount != uint64(rowsCountExpected) {
t.Fatalf("unexpected rowsCount in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
}
if ph.BlocksCount != uint64(blocksCountExpected) {
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
}
if ph.RowsCount > 0 {
if ph.MinTimestamp != minTimestampExpected {
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
}
if ph.MaxTimestamp != maxTimestampExpected {
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
}
}
// Read log entries from mp to rrsResult
sbu := getStringsBlockUnmarshaler()
defer putStringsBlockUnmarshaler(sbu)
vd := getValuesDecoder()
defer putValuesDecoder(vd)
lrResult := mp.readLogRows(sbu, vd)
putInmemoryPart(mp)
// compare lrOrig to lrResult
if err := checkEqualRows(lrResult, lrOrig); err != nil {
t.Fatalf("unequal log entries: %s", err)
}
}
f(GetLogRows(nil, nil), 0, 0)
// Check how inmemoryPart works with a single stream
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 14:18:28 +00:00
f(newTestLogRows(1, 1, 0), 1, 0.7)
f(newTestLogRows(1, 2, 0), 1, 0.9)
f(newTestLogRows(1, 10, 0), 1, 2.0)
f(newTestLogRows(1, 1000, 0), 1, 7.1)
f(newTestLogRows(1, 20000, 0), 2, 7.2)
// Check how inmemoryPart works with multiple streams
f(newTestLogRows(2, 1, 0), 2, 0.8)
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 14:18:28 +00:00
f(newTestLogRows(10, 1, 0), 10, 1.1)
f(newTestLogRows(100, 1, 0), 100, 1.2)
f(newTestLogRows(10, 5, 0), 10, 1.5)
f(newTestLogRows(10, 1000, 0), 10, 7.2)
f(newTestLogRows(100, 100, 0), 100, 5.0)
}
func checkCompressionRate(t *testing.T, ph *partHeader, compressionRateExpected float64) {
t.Helper()
compressionRate := float64(ph.UncompressedSizeBytes) / float64(ph.CompressedSizeBytes)
2023-07-04 21:13:14 +00:00
if math.Abs(compressionRate-compressionRateExpected) > math.Abs(compressionRate+compressionRateExpected)*0.05 {
t.Fatalf("unexpected compression rate; got %.1f; want %.1f", compressionRate, compressionRateExpected)
}
}
func TestInmemoryPartInitFromBlockStreamReaders(t *testing.T) {
f := func(lrs []*LogRows, blocksCountExpected int, compressionRateExpected float64) {
t.Helper()
uncompressedSizeBytesExpected := uint64(0)
rowsCountExpected := 0
minTimestampExpected := int64(math.MaxInt64)
maxTimestampExpected := int64(math.MinInt64)
// make a copy of rrss in order to compare the results after merge.
lrOrig := GetLogRows(nil, nil)
for _, lr := range lrs {
uncompressedSizeBytesExpected += uncompressedRowsSizeBytes(lr.rows)
rowsCountExpected += len(lr.timestamps)
for j, timestamp := range lr.timestamps {
if timestamp < minTimestampExpected {
minTimestampExpected = timestamp
}
if timestamp > maxTimestampExpected {
maxTimestampExpected = timestamp
}
lrOrig.mustAddInternal(lr.streamIDs[j], timestamp, lr.rows[j], lr.streamTagsCanonicals[j])
}
}
// Initialize readers from lrs
var mpsSrc []*inmemoryPart
var bsrs []*blockStreamReader
for _, lr := range lrs {
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
mpsSrc = append(mpsSrc, mp)
bsr := getBlockStreamReader()
bsr.MustInitFromInmemoryPart(mp)
bsrs = append(bsrs, bsr)
}
defer func() {
for _, bsr := range bsrs {
putBlockStreamReader(bsr)
}
for _, mp := range mpsSrc {
putInmemoryPart(mp)
}
}()
// Merge data from bsrs into mpDst
mpDst := getInmemoryPart()
bsw := getBlockStreamWriter()
bsw.MustInitForInmemoryPart(mpDst)
mustMergeBlockStreams(&mpDst.ph, bsw, bsrs, nil)
putBlockStreamWriter(bsw)
// Check mpDst.ph stats
ph := &mpDst.ph
checkCompressionRate(t, ph, compressionRateExpected)
if ph.UncompressedSizeBytes != uncompressedSizeBytesExpected {
t.Fatalf("unexpected uncompressedSizeBytes in partHeader; got %d; want %d", ph.UncompressedSizeBytes, uncompressedSizeBytesExpected)
}
if ph.RowsCount != uint64(rowsCountExpected) {
t.Fatalf("unexpected number of entries in partHeader; got %d; want %d", ph.RowsCount, rowsCountExpected)
}
if ph.BlocksCount != uint64(blocksCountExpected) {
t.Fatalf("unexpected blocksCount in partHeader; got %d; want %d", ph.BlocksCount, blocksCountExpected)
}
if ph.RowsCount > 0 {
if ph.MinTimestamp != minTimestampExpected {
t.Fatalf("unexpected minTimestamp in partHeader; got %d; want %d", ph.MinTimestamp, minTimestampExpected)
}
if ph.MaxTimestamp != maxTimestampExpected {
t.Fatalf("unexpected maxTimestamp in partHeader; got %d; want %d", ph.MaxTimestamp, maxTimestampExpected)
}
}
// Read log entries from mpDst to rrsResult
sbu := getStringsBlockUnmarshaler()
defer putStringsBlockUnmarshaler(sbu)
vd := getValuesDecoder()
defer putValuesDecoder(vd)
lrResult := mpDst.readLogRows(sbu, vd)
putInmemoryPart(mpDst)
// compare rrsOrig to rrsResult
if err := checkEqualRows(lrResult, lrOrig); err != nil {
t.Fatalf("unequal log entries: %s", err)
}
}
// Check empty readers
f(nil, 0, 0)
f([]*LogRows{GetLogRows(nil, nil)}, 0, 0)
f([]*LogRows{GetLogRows(nil, nil), GetLogRows(nil, nil)}, 0, 0)
// Check merge with a single reader
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 14:18:28 +00:00
f([]*LogRows{newTestLogRows(1, 1, 0)}, 1, 0.7)
f([]*LogRows{newTestLogRows(1, 10, 0)}, 1, 2.0)
f([]*LogRows{newTestLogRows(1, 100, 0)}, 1, 4.9)
f([]*LogRows{newTestLogRows(1, 1000, 0)}, 1, 7.1)
f([]*LogRows{newTestLogRows(1, 10000, 0)}, 1, 7.4)
lib/logstorage: refactor storage format to be more efficient for querying wide events It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields. For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log. Such logs are also known as "wide events". The previous storage format was optimized for logs with a few fields. When at least a single field was referenced in the query, then the all the meta-information about all the log fields was unpacked and parsed per each scanned block during the query. This could require a lot of additional disk IO and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset) index per each field in every data block. This index allows reading and extracting only the needed metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ). This allows increasing performance for queries over wide events by 10x and more. Another issue was that the data for bloom filters and field values across all the log fields except of _msg was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ). This could result in huge disk read IO overhead when some small field was referred in the query, since the Operating System usually reads more data than requested. It reads the data from disk in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB). So, if 512-byte bloom filter or values' block is read from the file, then the Operating System reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache), but this overhead may become very annoying when performing the query over large volumes of data which isn't present in OS page cache. The solution for this issue is to split bloom filters and field values across multiple shards. This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards, while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N. Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases performance for queries over large volumes of newly ingested data by up to 1000x. The new storage format is versioned as v1, while the old storage format is version as v0. It is stored in the partHeader.FormatVersion. Parts with the old storage format are converted into parts with the new storage format during background merge. It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 14:18:28 +00:00
f([]*LogRows{newTestLogRows(10, 1, 0)}, 10, 1.1)
f([]*LogRows{newTestLogRows(100, 1, 0)}, 100, 1.3)
f([]*LogRows{newTestLogRows(1000, 1, 0)}, 1000, 1.2)
f([]*LogRows{newTestLogRows(10, 10, 0)}, 10, 2.1)
f([]*LogRows{newTestLogRows(10, 100, 0)}, 10, 4.9)
//Check merge with multiple readers
f([]*LogRows{
newTestLogRows(1, 1, 0),
newTestLogRows(1, 1, 1),
}, 2, 0.9)
f([]*LogRows{
newTestLogRows(2, 2, 0),
newTestLogRows(2, 2, 0),
}, 2, 1.8)
f([]*LogRows{
newTestLogRows(1, 20, 0),
newTestLogRows(1, 10, 1),
newTestLogRows(1, 5, 2),
}, 3, 2.2)
f([]*LogRows{
newTestLogRows(10, 20, 0),
newTestLogRows(20, 10, 1),
newTestLogRows(30, 5, 2),
}, 60, 2.0)
f([]*LogRows{
newTestLogRows(10, 20, 0),
newTestLogRows(20, 10, 1),
newTestLogRows(30, 5, 2),
newTestLogRows(20, 7, 3),
newTestLogRows(10, 9, 4),
}, 90, 1.9)
}
func newTestLogRows(streams, rowsPerStream int, seed int64) *LogRows {
streamTags := []string{
"some-stream-tag",
}
lr := GetLogRows(streamTags, nil)
rng := rand.New(rand.NewSource(seed))
var fields []Field
for i := 0; i < streams; i++ {
tenantID := TenantID{
AccountID: rng.Uint32(),
ProjectID: rng.Uint32(),
}
for j := 0; j < rowsPerStream; j++ {
// Add stream tags
fields = append(fields[:0], Field{
Name: "some-stream-tag",
Value: fmt.Sprintf("some-stream-value-%d", i),
})
// Add the remaining tags
for k := 0; k < 5; k++ {
if rng.Float64() < 0.5 {
fields = append(fields, Field{
Name: fmt.Sprintf("field_%d", k),
Value: fmt.Sprintf("value_%d_%d_%d", i, j, k),
})
}
}
// add a message field
fields = append(fields, Field{
Name: "",
Value: fmt.Sprintf("some row number %d at stream %d", j, i),
})
// add a field with constant value
fields = append(fields, Field{
Name: "job",
Value: "foobar",
})
// add a field with uint value
fields = append(fields, Field{
Name: "response_size_bytes",
Value: fmt.Sprintf("%d", rng.Intn(1234)),
})
// shuffle fields in order to check de-shuffling algorithm
rng.Shuffle(len(fields), func(i, j int) {
fields[i], fields[j] = fields[j], fields[i]
})
timestamp := rng.Int63()
lr.MustAdd(tenantID, timestamp, fields)
}
}
return lr
}
func checkEqualRows(lrResult, lrOrig *LogRows) error {
if len(lrResult.timestamps) != len(lrOrig.timestamps) {
return fmt.Errorf("unexpected length LogRows; got %d; want %d", len(lrResult.timestamps), len(lrOrig.timestamps))
}
sort.Sort(lrResult)
sort.Sort(lrOrig)
sortFieldNames := func(fields []Field) {
sort.Slice(fields, func(i, j int) bool {
return fields[i].Name < fields[j].Name
})
}
for i := range lrOrig.timestamps {
if !lrOrig.streamIDs[i].equal(&lrResult.streamIDs[i]) {
return fmt.Errorf("unexpected streamID for log entry %d\ngot\n%s\nwant\n%s", i, &lrResult.streamIDs[i], &lrOrig.streamIDs[i])
}
if lrOrig.timestamps[i] != lrResult.timestamps[i] {
return fmt.Errorf("unexpected timestamp for log entry %d\ngot\n%d\nwant\n%d", i, lrResult.timestamps[i], lrOrig.timestamps[i])
}
fieldsOrig := lrOrig.rows[i]
fieldsResult := lrResult.rows[i]
if len(fieldsOrig) != len(fieldsResult) {
return fmt.Errorf("unexpected number of fields at log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
}
sortFieldNames(fieldsOrig)
sortFieldNames(fieldsResult)
if !reflect.DeepEqual(fieldsOrig, fieldsResult) {
return fmt.Errorf("unexpected fields for log entry %d\ngot\n%s\nwant\n%s", i, fieldsResult, fieldsOrig)
}
}
return nil
}
// readLogRows reads log entries from mp.
//
// This function is for testing and debugging purposes only.
func (mp *inmemoryPart) readLogRows(sbu *stringsBlockUnmarshaler, vd *valuesDecoder) *LogRows {
lr := GetLogRows(nil, nil)
bsr := getBlockStreamReader()
defer putBlockStreamReader(bsr)
bsr.MustInitFromInmemoryPart(mp)
var tmp rows
for bsr.NextBlock() {
bd := &bsr.blockData
streamID := bd.streamID
if err := bd.unmarshalRows(&tmp, sbu, vd); err != nil {
logger.Panicf("BUG: cannot unmarshal log entries from inmemoryPart: %s", err)
}
for i, timestamp := range tmp.timestamps {
lr.MustAdd(streamID.tenantID, timestamp, tmp.rows[i])
lr.streamIDs[len(lr.streamIDs)-1] = streamID
}
tmp.reset()
}
return lr
}