package logstorage import ( "math/bits" "path/filepath" "sync" "github.com/cespare/xxhash/v2" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream" "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/slicesutil" ) // writerWithStats writes data to w and tracks the total amounts of data written at bytesWritten. type writerWithStats struct { w filestream.WriteCloser bytesWritten uint64 } func (w *writerWithStats) reset() { w.w = nil w.bytesWritten = 0 } func (w *writerWithStats) init(wc filestream.WriteCloser) { w.reset() w.w = wc } func (w *writerWithStats) Path() string { return w.w.Path() } func (w *writerWithStats) MustWrite(data []byte) { fs.MustWriteData(w.w, data) w.bytesWritten += uint64(len(data)) } // MustClose closes the underlying w. func (w *writerWithStats) MustClose() { w.w.MustClose() } // streamWriters contain writers for blockStreamWriter type streamWriters struct { columnNamesWriter writerWithStats metaindexWriter writerWithStats indexWriter writerWithStats columnsHeaderIndexWriter writerWithStats columnsHeaderWriter writerWithStats timestampsWriter writerWithStats messageBloomValuesWriter bloomValuesWriter bloomValuesShards []bloomValuesWriter } type bloomValuesWriter struct { bloom writerWithStats values writerWithStats } func (w *bloomValuesWriter) reset() { w.bloom.reset() w.values.reset() } func (w *bloomValuesWriter) init(sw bloomValuesStreamWriter) { w.bloom.init(sw.bloom) w.values.init(sw.values) } func (w *bloomValuesWriter) totalBytesWritten() uint64 { return w.bloom.bytesWritten + w.values.bytesWritten } func (w *bloomValuesWriter) MustClose() { w.bloom.MustClose() w.values.MustClose() } type bloomValuesStreamWriter struct { bloom filestream.WriteCloser values filestream.WriteCloser } func (sw *streamWriters) reset() { sw.columnNamesWriter.reset() sw.metaindexWriter.reset() sw.indexWriter.reset() sw.columnsHeaderIndexWriter.reset() sw.columnsHeaderWriter.reset() sw.timestampsWriter.reset() sw.messageBloomValuesWriter.reset() for i := range sw.bloomValuesShards { sw.bloomValuesShards[i].reset() } sw.bloomValuesShards = sw.bloomValuesShards[:0] } func (sw *streamWriters) init(columnNamesWriter, metaindexWriter, indexWriter, columnsHeaderIndexWriter, columnsHeaderWriter, timestampsWriter filestream.WriteCloser, messageBloomValuesWriter bloomValuesStreamWriter, bloomValuesShards []bloomValuesStreamWriter, ) { sw.columnNamesWriter.init(columnNamesWriter) sw.metaindexWriter.init(metaindexWriter) sw.indexWriter.init(indexWriter) sw.columnsHeaderIndexWriter.init(columnsHeaderIndexWriter) sw.columnsHeaderWriter.init(columnsHeaderWriter) sw.timestampsWriter.init(timestampsWriter) sw.messageBloomValuesWriter.init(messageBloomValuesWriter) sw.bloomValuesShards = slicesutil.SetLength(sw.bloomValuesShards, len(bloomValuesShards)) for i := range sw.bloomValuesShards { sw.bloomValuesShards[i].init(bloomValuesShards[i]) } } func (sw *streamWriters) totalBytesWritten() uint64 { n := uint64(0) n += sw.columnNamesWriter.bytesWritten n += sw.metaindexWriter.bytesWritten n += sw.indexWriter.bytesWritten n += sw.columnsHeaderIndexWriter.bytesWritten n += sw.columnsHeaderWriter.bytesWritten n += sw.timestampsWriter.bytesWritten n += sw.messageBloomValuesWriter.totalBytesWritten() for i := range sw.bloomValuesShards { n += sw.bloomValuesShards[i].totalBytesWritten() } return n } func (sw *streamWriters) MustClose() { sw.columnNamesWriter.MustClose() sw.metaindexWriter.MustClose() sw.indexWriter.MustClose() sw.columnsHeaderIndexWriter.MustClose() sw.columnsHeaderWriter.MustClose() sw.timestampsWriter.MustClose() sw.messageBloomValuesWriter.MustClose() for i := range sw.bloomValuesShards { sw.bloomValuesShards[i].MustClose() } } func (sw *streamWriters) getBloomValuesWriterForColumnName(name string) *bloomValuesWriter { if name == "" { return &sw.messageBloomValuesWriter } n := len(sw.bloomValuesShards) idx := uint64(0) if n > 1 { h := xxhash.Sum64(bytesutil.ToUnsafeBytes(name)) idx = h % uint64(n) } return &sw.bloomValuesShards[idx] } // blockStreamWriter is used for writing blocks into the underlying storage in streaming manner. type blockStreamWriter struct { // streamWriters contains writer for block data streamWriters streamWriters // sidLast is the streamID for the last written block sidLast streamID // sidFirst is the streamID for the first block in the current indexBlock sidFirst streamID // bloomValuesFieldsCount is the number of fields with (bloom, values) pairs in the output part. bloomValuesFieldsCount uint64 // minTimestampLast is the minimum timestamp seen for the last written block minTimestampLast int64 // minTimestamp is the minimum timestamp seen across written blocks for the current indexBlock minTimestamp int64 // maxTimestamp is the maximum timestamp seen across written blocks for the current indexBlock maxTimestamp int64 // hasWrittenBlocks is set to true if at least a single block is written to the current indexBlock hasWrittenBlocks bool // globalUncompressedSizeBytes is the total size of all the log entries written via bsw globalUncompressedSizeBytes uint64 // globalRowsCount is the total number of log entries written via bsw globalRowsCount uint64 // globalBlocksCount is the total number of blocks written to bsw globalBlocksCount uint64 // globalMinTimestamp is the minimum timestamp seen across all the blocks written to bsw globalMinTimestamp int64 // globalMaxTimestamp is the maximum timestamp seen across all the blocks written to bsw globalMaxTimestamp int64 // indexBlockData contains marshaled blockHeader data, which isn't written yet to indexFilename indexBlockData []byte // metaindexData contains marshaled indexBlockHeader data, which isn't written yet to metaindexFilename metaindexData []byte // indexBlockHeader is used for marshaling the data to metaindexData indexBlockHeader indexBlockHeader // columnNameIDGenerator is used for generating columnName->id mapping for all the columns seen in bsw columnNameIDGenerator columnNameIDGenerator } // reset resets bsw for subsequent re-use. func (bsw *blockStreamWriter) reset() { bsw.streamWriters.reset() bsw.sidLast.reset() bsw.sidFirst.reset() bsw.bloomValuesFieldsCount = 0 bsw.minTimestampLast = 0 bsw.minTimestamp = 0 bsw.maxTimestamp = 0 bsw.hasWrittenBlocks = false bsw.globalUncompressedSizeBytes = 0 bsw.globalRowsCount = 0 bsw.globalBlocksCount = 0 bsw.globalMinTimestamp = 0 bsw.globalMaxTimestamp = 0 bsw.indexBlockData = bsw.indexBlockData[:0] if len(bsw.metaindexData) > 1024*1024 { // The length of bsw.metaindexData is unbound, so drop too long buffer // in order to conserve memory. bsw.metaindexData = nil } else { bsw.metaindexData = bsw.metaindexData[:0] } bsw.indexBlockHeader.reset() bsw.columnNameIDGenerator.reset() } // MustInitForInmemoryPart initializes bsw from mp func (bsw *blockStreamWriter) MustInitForInmemoryPart(mp *inmemoryPart) { bsw.reset() messageBloomValues := mp.messageBloomValues.NewStreamWriter() bloomValuesShards := []bloomValuesStreamWriter{ mp.fieldBloomValues.NewStreamWriter(), } bsw.streamWriters.init(&mp.columnNames, &mp.metaindex, &mp.index, &mp.columnsHeaderIndex, &mp.columnsHeader, &mp.timestamps, messageBloomValues, bloomValuesShards) } // MustInitForFilePart initializes bsw for writing data to file part located at path. // // if nocache is true, then the written data doesn't go to OS page cache. func (bsw *blockStreamWriter) MustInitForFilePart(path string, nocache bool, bloomValuesShardsCount uint64) { bsw.reset() fs.MustMkdirFailIfExist(path) columnNamesPath := filepath.Join(path, columnNamesFilename) metaindexPath := filepath.Join(path, metaindexFilename) indexPath := filepath.Join(path, indexFilename) columnsHeaderIndexPath := filepath.Join(path, columnsHeaderIndexFilename) columnsHeaderPath := filepath.Join(path, columnsHeaderFilename) timestampsPath := filepath.Join(path, timestampsFilename) // Always cache columnNames files, since it is re-read immediately after part creation columnNamesWriter := filestream.MustCreate(columnNamesPath, false) // Always cache metaindex file, since it is re-read immediately after part creation metaindexWriter := filestream.MustCreate(metaindexPath, false) indexWriter := filestream.MustCreate(indexPath, nocache) columnsHeaderIndexWriter := filestream.MustCreate(columnsHeaderIndexPath, nocache) columnsHeaderWriter := filestream.MustCreate(columnsHeaderPath, nocache) timestampsWriter := filestream.MustCreate(timestampsPath, nocache) messageBloomFilterPath := filepath.Join(path, messageBloomFilename) messageValuesPath := filepath.Join(path, messageValuesFilename) messageBloomValuesWriter := bloomValuesStreamWriter{ bloom: filestream.MustCreate(messageBloomFilterPath, nocache), values: filestream.MustCreate(messageValuesPath, nocache), } bloomValuesShardsCount = adjustBloomValuesShardsCount(bloomValuesShardsCount) bloomValuesShards := make([]bloomValuesStreamWriter, bloomValuesShardsCount) for i := range bloomValuesShards { shard := &bloomValuesShards[i] bloomPath := getBloomFilePath(path, uint64(i)) shard.bloom = filestream.MustCreate(bloomPath, nocache) valuesPath := getValuesFilePath(path, uint64(i)) shard.values = filestream.MustCreate(valuesPath, nocache) } bsw.streamWriters.init(columnNamesWriter, metaindexWriter, indexWriter, columnsHeaderIndexWriter, columnsHeaderWriter, timestampsWriter, messageBloomValuesWriter, bloomValuesShards) } func adjustBloomValuesShardsCount(n uint64) uint64 { if n == 0 { // At least a single shard is needed for writing potential non-const fields, // which can appear after merging of const fields. // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7391 return 1 } n = 1 << bits.Len64(n-1) if n > bloomValuesMaxShardsCount { n = bloomValuesMaxShardsCount } return n } // MustWriteRows writes timestamps with rows under the given sid to bsw. // // timestamps must be sorted. // sid must be bigger or equal to the sid for the previously written rs. func (bsw *blockStreamWriter) MustWriteRows(sid *streamID, timestamps []int64, rows [][]Field) { if len(timestamps) == 0 { return } b := getBlock() for len(rows) > 0 { rowsOffset := b.MustInitFromRows(timestamps, rows) bsw.MustWriteBlock(sid, b) timestamps, rows = timestamps[rowsOffset:], rows[rowsOffset:] } putBlock(b) } // MustWriteBlockData writes bd to bsw. // // The bd.streamID must be bigger or equal to the streamID for the previously written blocks. func (bsw *blockStreamWriter) MustWriteBlockData(bd *blockData) { if bd.rowsCount == 0 { return } bsw.mustWriteBlockInternal(&bd.streamID, nil, bd) } // MustWriteBlock writes b under the given sid to bsw. // // The sid must be bigger or equal to the sid for the previously written blocks. // The minimum timestamp in b must be bigger or equal to the minimum timestamp written to the same sid. func (bsw *blockStreamWriter) MustWriteBlock(sid *streamID, b *block) { rowsCount := b.Len() if rowsCount == 0 { return } bsw.mustWriteBlockInternal(sid, b, nil) } func (bsw *blockStreamWriter) mustWriteBlockInternal(sid *streamID, b *block, bd *blockData) { if sid.less(&bsw.sidLast) { logger.Panicf("BUG: the sid=%s cannot be smaller than the previously written sid=%s", sid, &bsw.sidLast) } hasWrittenBlocks := bsw.hasWrittenBlocks if !hasWrittenBlocks { bsw.sidFirst = *sid bsw.hasWrittenBlocks = true } isSeenSid := sid.equal(&bsw.sidLast) bsw.sidLast = *sid bh := getBlockHeader() columnsLen := 0 if b != nil { b.mustWriteTo(sid, bh, &bsw.streamWriters, &bsw.columnNameIDGenerator) columnsLen = len(b.columns) } else { bd.mustWriteTo(bh, &bsw.streamWriters, &bsw.columnNameIDGenerator) columnsLen = len(bd.columnsData) } if bsw.bloomValuesFieldsCount < uint64(columnsLen) { bsw.bloomValuesFieldsCount = uint64(columnsLen) } th := &bh.timestampsHeader if bsw.globalRowsCount == 0 || th.minTimestamp < bsw.globalMinTimestamp { bsw.globalMinTimestamp = th.minTimestamp } if bsw.globalRowsCount == 0 || th.maxTimestamp > bsw.globalMaxTimestamp { bsw.globalMaxTimestamp = th.maxTimestamp } if !hasWrittenBlocks || th.minTimestamp < bsw.minTimestamp { bsw.minTimestamp = th.minTimestamp } if !hasWrittenBlocks || th.maxTimestamp > bsw.maxTimestamp { bsw.maxTimestamp = th.maxTimestamp } if isSeenSid && th.minTimestamp < bsw.minTimestampLast { logger.Panicf("BUG: the block for sid=%s cannot contain timestamp smaller than %d, but it contains timestamp %d", sid, bsw.minTimestampLast, th.minTimestamp) } bsw.minTimestampLast = th.minTimestamp bsw.globalUncompressedSizeBytes += bh.uncompressedSizeBytes bsw.globalRowsCount += bh.rowsCount bsw.globalBlocksCount++ // Marshal bh bsw.indexBlockData = bh.marshal(bsw.indexBlockData) putBlockHeader(bh) if len(bsw.indexBlockData) > maxUncompressedIndexBlockSize { bsw.mustFlushIndexBlock(bsw.indexBlockData) bsw.indexBlockData = bsw.indexBlockData[:0] } } func (bsw *blockStreamWriter) mustFlushIndexBlock(data []byte) { if len(data) > 0 { bsw.indexBlockHeader.mustWriteIndexBlock(data, bsw.sidFirst, bsw.minTimestamp, bsw.maxTimestamp, &bsw.streamWriters) bsw.metaindexData = bsw.indexBlockHeader.marshal(bsw.metaindexData) } bsw.hasWrittenBlocks = false bsw.minTimestamp = 0 bsw.maxTimestamp = 0 bsw.sidFirst.reset() } // Finalize() finalizes the data write process and updates ph with the finalized stats // // It closes the writers passed to MustInit(). // // bsw can be re-used after calling Finalize(). func (bsw *blockStreamWriter) Finalize(ph *partHeader) { ph.FormatVersion = partFormatLatestVersion ph.UncompressedSizeBytes = bsw.globalUncompressedSizeBytes ph.RowsCount = bsw.globalRowsCount ph.BlocksCount = bsw.globalBlocksCount ph.MinTimestamp = bsw.globalMinTimestamp ph.MaxTimestamp = bsw.globalMaxTimestamp ph.BloomValuesShardsCount = uint64(len(bsw.streamWriters.bloomValuesShards)) ph.BloomValuesFieldsCount = bsw.bloomValuesFieldsCount bsw.mustFlushIndexBlock(bsw.indexBlockData) // Write columnNames data mustWriteColumnNames(&bsw.streamWriters.columnNamesWriter, bsw.columnNameIDGenerator.columnNames) // Write metaindex data mustWriteIndexBlockHeaders(&bsw.streamWriters.metaindexWriter, bsw.metaindexData) ph.CompressedSizeBytes = bsw.streamWriters.totalBytesWritten() bsw.streamWriters.MustClose() bsw.reset() } var longTermBufPool bytesutil.ByteBufferPool // getBlockStreamWriter returns new blockStreamWriter from the pool. // // Return back the blockStreamWriter to the pool when it is no longer needed by calling putBlockStreamWriter. func getBlockStreamWriter() *blockStreamWriter { v := blockStreamWriterPool.Get() if v == nil { return &blockStreamWriter{} } return v.(*blockStreamWriter) } // putBlockStreamWriter returns bsw to the pool. func putBlockStreamWriter(bsw *blockStreamWriter) { bsw.reset() blockStreamWriterPool.Put(bsw) } var blockStreamWriterPool sync.Pool