VictoriaMetrics/lib/mergeset/part.go
Aliaksandr Valialkin fc3d826d7f
all: add Windows build for VictoriaMetrics
This commit changes the background merge algorithm so that it becomes compatible with Windows file semantics.

The previous algorithm for background merge:

1. Merge source parts into a destination part inside the tmp directory.
2. Create a file in the txn directory with instructions on how to atomically
   swap the source parts with the destination part.
3. Perform the instructions from the file.
4. Delete the file with instructions.

This algorithm guarantees that either the source parts or the destination part
is visible in the partition after an unclean shutdown at any step above,
since any remaining instruction files are replayed on the next restart,
after which the remaining contents of the tmp directory are deleted.

Unfortunately, this algorithm doesn't work under Windows, because Windows
disallows removing and moving files that are in use.

So a new algorithm for background merge has been implemented:

1. Merge source parts into a destination part inside the partition directory itself,
   i.e. the partition directory may now contain both complete and incomplete parts.
2. Atomically update the parts.json file with the new list of parts after the merge,
   i.e. remove the source parts from the list and add the destination part to it
   before storing it to the parts.json file (see the sketch right after this list).
3. Remove the source parts from disk when they are no longer used.
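
The atomic update in step 2 can be done with the usual
write-to-temporary-file-then-rename pattern. Below is a minimal sketch with
illustrative names (writePartsJSON, partNames), not the actual VictoriaMetrics
helpers; fsync calls on the file and its directory are omitted for brevity:

    package mergeset

    import (
        "encoding/json"
        "os"
        "path/filepath"
    )

    // writePartsJSON atomically replaces parts.json in partitionDir with
    // the given list of part names.
    func writePartsJSON(partitionDir string, partNames []string) error {
        data, err := json.MarshalIndent(partNames, "", "\t")
        if err != nil {
            return err
        }
        tmpPath := filepath.Join(partitionDir, "parts.json.tmp")
        if err := os.WriteFile(tmpPath, data, 0o644); err != nil {
            return err
        }
        // The rename makes the new list visible atomically, so readers
        // observe either the old or the new set of parts, never a mix.
        return os.Rename(tmpPath, filepath.Join(partitionDir, "parts.json"))
    }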

This algorithm guarantees that either the source parts or the destination part
is visible in the partition after an unclean shutdown at any step above,
since incomplete parts from step 1 and old source parts from step 3 are removed
on the next startup by inspecting the parts.json file.
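
For instance, the startup cleanup may look like the following sketch (same
assumptions and imports as the sketch above; cleanupPartition is an
illustrative name):

    // cleanupPartition removes part directories that are not listed in
    // parts.json: incomplete destination parts from step 1 and
    // already-merged source parts from step 3.
    func cleanupPartition(partitionDir string, partNames []string) error {
        keep := make(map[string]bool, len(partNames))
        for _, name := range partNames {
            keep[name] = true
        }
        des, err := os.ReadDir(partitionDir)
        if err != nil {
            return err
        }
        for _, de := range des {
            if de.IsDir() && !keep[de.Name()] {
                if err := os.RemoveAll(filepath.Join(partitionDir, de.Name())); err != nil {
                    return err
                }
            }
        }
        return nil
    }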

This algorithm should work under Windows, since it doesn't remove or move files that are in use.
It also has the following benefits:

- It should work better for NFS.
- It fits object storage semantics.

The new algorithm changes the data storage format, so it is impossible to downgrade
to previous versions of VictoriaMetrics after upgrading to this algorithm.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3236
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3821
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70
2023-03-19 23:28:26 -07:00

package mergeset

import (
	"fmt"
	"sync"
	"unsafe"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/blockcache"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
)

// idxbCache caches recently used index blocks for all the open parts.
var idxbCache = blockcache.NewCache(getMaxIndexBlocksCacheSize)

// ibCache caches recently used inmemory blocks for all the open parts.
var ibCache = blockcache.NewCache(getMaxInmemoryBlocksCacheSize)

// SetIndexBlocksCacheSize overrides the default size of indexdb/indexBlock cache
func SetIndexBlocksCacheSize(size int) {
	maxIndexBlockCacheSize = size
}

func getMaxIndexBlocksCacheSize() int {
	maxIndexBlockCacheSizeOnce.Do(func() {
		if maxIndexBlockCacheSize <= 0 {
			// Default to 10% of the allowed memory.
			maxIndexBlockCacheSize = int(0.10 * float64(memory.Allowed()))
		}
	})
	return maxIndexBlockCacheSize
}

var (
	maxIndexBlockCacheSize     int
	maxIndexBlockCacheSizeOnce sync.Once
)

// SetDataBlocksCacheSize overrides the default size of indexdb/dataBlocks cache
func SetDataBlocksCacheSize(size int) {
	maxInmemoryBlockCacheSize = size
}

func getMaxInmemoryBlocksCacheSize() int {
	maxInmemoryBlockCacheSizeOnce.Do(func() {
		if maxInmemoryBlockCacheSize <= 0 {
			// Default to 25% of the allowed memory.
			maxInmemoryBlockCacheSize = int(0.25 * float64(memory.Allowed()))
		}
	})
	return maxInmemoryBlockCacheSize
}

var (
	maxInmemoryBlockCacheSize     int
	maxInmemoryBlockCacheSizeOnce sync.Once
)
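
// Both cache sizes can be overridden by the caller. An illustrative example
// (not part of the original file), sizing the caches explicitly at startup:
//
//	mergeset.SetIndexBlocksCacheSize(512 * 1024 * 1024)  // 512 MiB
//	mergeset.SetDataBlocksCacheSize(1024 * 1024 * 1024)  // 1 GiB
//
// A size <= 0 keeps the defaults derived from memory.Allowed() above.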

// part represents a single part of the mergeset.
type part struct {
	ph   partHeader
	path string
	size uint64
	mrs  []metaindexRow

	indexFile fs.MustReadAtCloser
	itemsFile fs.MustReadAtCloser
	lensFile  fs.MustReadAtCloser
}

// openFilePart opens a file-based part from the given path.
func openFilePart(path string) (*part, error) {
	var ph partHeader
	if err := ph.ReadMetadata(path); err != nil {
		return nil, fmt.Errorf("cannot read part metadata: %w", err)
	}

	metaindexPath := path + "/metaindex.bin"
	metaindexFile, err := filestream.Open(metaindexPath, true)
	if err != nil {
		return nil, fmt.Errorf("cannot open %q: %w", metaindexPath, err)
	}
	metaindexSize := fs.MustFileSize(metaindexPath)

	indexPath := path + "/index.bin"
	indexFile := fs.MustOpenReaderAt(indexPath)
	indexSize := fs.MustFileSize(indexPath)

	itemsPath := path + "/items.bin"
	itemsFile := fs.MustOpenReaderAt(itemsPath)
	itemsSize := fs.MustFileSize(itemsPath)

	lensPath := path + "/lens.bin"
	lensFile := fs.MustOpenReaderAt(lensPath)
	lensSize := fs.MustFileSize(lensPath)

	size := metaindexSize + indexSize + itemsSize + lensSize
	return newPart(&ph, path, size, metaindexFile, indexFile, itemsFile, lensFile)
}

func newPart(ph *partHeader, path string, size uint64, metaindexReader filestream.ReadCloser, indexFile, itemsFile, lensFile fs.MustReadAtCloser) (*part, error) {
	var errors []error
	mrs, err := unmarshalMetaindexRows(nil, metaindexReader)
	if err != nil {
		errors = append(errors, fmt.Errorf("cannot unmarshal metaindexRows: %w", err))
	}
	metaindexReader.MustClose()

	var p part
	p.path = path
	p.size = size
	p.mrs = mrs
	p.indexFile = indexFile
	p.itemsFile = itemsFile
	p.lensFile = lensFile
	p.ph.CopyFrom(ph)

	if len(errors) > 0 {
		// Return only the first error, since it makes no sense to return all of them.
		err := fmt.Errorf("error opening part %s: %w", p.path, errors[0])
		p.MustClose()
		return nil, err
	}
	return &p, nil
}

func (p *part) MustClose() {
	p.indexFile.MustClose()
	p.itemsFile.MustClose()
	p.lensFile.MustClose()

	// Drop cached blocks belonging to the closed part.
	idxbCache.RemoveBlocksForPart(p)
	ibCache.RemoveBlocksForPart(p)
}

type indexBlock struct {
	bhs []blockHeader

	// The buffer for holding the data referred to by bhs
	buf []byte
}

// SizeBytes returns the approximate memory usage of idxb.
func (idxb *indexBlock) SizeBytes() int {
	bhs := idxb.bhs[:cap(idxb.bhs)]
	n := int(unsafe.Sizeof(*idxb))
	for i := range bhs {
		n += bhs[i].SizeBytes()
	}
	return n
}