VictoriaMetrics/lib/logstorage/datadb.go

1274 lines
33 KiB
Go

package logstorage
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
)
// maxBigPartSize is the maximum size of a big part in bytes.
//
// This number limits the maximum time required for building a big part.
// This time shouldn't exceed a few days.
const maxBigPartSize = 1e12
// maxInmemoryPartsPerPartition is the maximum number of in-memory parts in the partition.
//
// The actual number of in-memory parts may exceed this value if in-memory mergers
// cannot keep up with the rate of creating new in-memory parts.
const maxInmemoryPartsPerPartition = 20
// dataFlushInterval is the interval for guaranteed flush of recently ingested data
// from memory to on-disk parts, so the data survives process crash.
var dataFlushInterval = 5 * time.Second
// defaultPartsToMerge is the default number of parts to merge at once.
//
// This number has been obtained empirically - it gives the lowest possible overhead.
// See appendPartsToMerge tests for details.
const defaultPartsToMerge = 15
// minMergeMultiplier is the minimum multiplier for the size of the output part
// compared to the size of the maximum input part for the merge.
//
// Higher values reduce write amplification (disk write IO induced by the merge),
// but increase the number of unmerged parts.
// The value 1.7 is good enough for production workloads.
const minMergeMultiplier = 1.7
// datadb represents a database with log data.
//
// The data is stored in three lists of parts - in-memory parts, small file-based parts
// and big file-based parts. All the lists are protected by partsLock.
type datadb struct {
// mergeIdx is used for generating unique directory names for parts
mergeIdx atomic.Uint64
// inmemoryMergesTotal and inmemoryActiveMerges count the total and the currently running
// merges, which produce in-memory parts. They are updated in mustMergeParts.
inmemoryMergesTotal atomic.Uint64
inmemoryActiveMerges atomic.Int64
// smallPartMergesTotal and smallPartActiveMerges count the total and the currently running
// merges, which produce small file-based parts. They are updated in mustMergeParts.
smallPartMergesTotal atomic.Uint64
smallPartActiveMerges atomic.Int64
// bigPartMergesTotal and bigPartActiveMerges count the total and the currently running
// merges, which produce big file-based parts. They are updated in mustMergeParts.
bigPartMergesTotal atomic.Uint64
bigPartActiveMerges atomic.Int64
// pt is the partition the datadb belongs to
pt *partition
// path is the path to the directory with log data
path string
// flushInterval is the interval for flushing the in-memory parts to disk.
//
// It is used for calculating flush deadlines for in-memory parts.
flushInterval time.Duration
// inmemoryParts contains a list of in-memory parts
inmemoryParts []*partWrapper
// smallParts contains a list of file-based small parts
smallParts []*partWrapper
// bigParts contains a list of file-based big parts
bigParts []*partWrapper
// partsLock protects parts from concurrent access
partsLock sync.Mutex
// wg is used for determining when background workers stop
//
// wg.Add() must be called under partsLock after checking whether stopCh isn't closed.
// This should prevent from calling wg.Add() after stopCh is closed and wg.Wait() is called.
wg sync.WaitGroup
// stopCh is used for notifying background workers to stop
//
// It must be closed under partsLock in order to prevent from calling wg.Add()
// after stopCh is closed.
stopCh chan struct{}
}
// partWrapper is a reference-counted wrapper for an opened part.
type partWrapper struct {
// refCount is the number of references to p.
//
// When the number of references reaches zero, then p is closed.
refCount atomic.Int32
// mustDrop is set when the part must be deleted after refCount reaches zero.
//
// It is consulted in decRef only for file-based parts, e.g. when mp is nil.
mustDrop atomic.Bool
// p is an opened part
p *part
// mp references the in-memory part used for initializing p.
//
// It is nil for file-based parts.
mp *inmemoryPart
// isInMerge is set to true if the part takes part in merge.
//
// It is modified under datadb.partsLock.
isInMerge bool
// flushDeadline is the deadline when the in-memory part must be flushed to disk.
flushDeadline time.Time
}
// incRef increments the reference counter for pw.
//
// Every incRef call must be paired with a decRef call.
func (pw *partWrapper) incRef() {
pw.refCount.Add(1)
}
// decRef decrements the reference counter for pw.
//
// When the counter reaches zero, pw.p is closed. Additionally, the in-memory part
// is returned to the pool of in-memory parts, while the file-based part is deleted
// from disk if it is marked with mustDrop.
//
// The function panics if the counter becomes negative, since this means that decRef
// is called more times than the number of acquired references. Without this check
// the already released part would be closed again.
func (pw *partWrapper) decRef() {
	n := pw.refCount.Add(-1)
	if n > 0 {
		return
	}
	if n < 0 {
		logger.Panicf("BUG: partWrapper.refCount must be non-negative after decRef; got %d", n)
	}

	deletePath := ""
	if pw.mp != nil {
		// In-memory parts have nothing to delete on disk; just recycle the buffers.
		putInmemoryPart(pw.mp)
		pw.mp = nil
	} else if pw.mustDrop.Load() {
		// Remember the path before pw.p is released below.
		deletePath = pw.p.path
	}

	mustClosePart(pw.p)
	pw.p = nil

	// The directory is removed only after the part is closed.
	if deletePath != "" {
		fs.MustRemoveAll(deletePath)
	}
}
// mustCreateDatadb creates an empty datadb directory at the given path.
//
// The directory mustn't exist before the call. An empty list of part names
// is written into the created directory.
func mustCreateDatadb(path string) {
fs.MustMkdirFailIfExist(path)
mustWritePartNames(path, nil, nil)
}
// mustOpenDatadb opens the datadb stored at path and starts its background workers.
//
// pt is the partition the returned datadb belongs to, while flushInterval is stored
// in the returned datadb as the interval for flushing in-memory data to disk.
func mustOpenDatadb(pt *partition, path string, flushInterval time.Duration) *datadb {
	// Temporary directories may be left after unclean shutdown. Get rid of them.
	fs.MustRemoveTemporaryDirs(path)

	// Drop directories, which are missing in the list of registered parts.
	partNames := mustReadPartNames(path)
	mustRemoveUnusedDirs(path, partNames)

	ddb := &datadb{
		pt:            pt,
		path:          path,
		flushInterval: flushInterval,
		stopCh:        make(chan struct{}),
	}

	for _, partName := range partNames {
		// Every registered part must exist on disk.
		// A missing part is an unexpected state, which cannot occur under normal operation,
		// including unclean shutdown, so manual action from the user is needed.
		partPath := filepath.Join(path, partName)
		if !fs.IsPathExist(partPath) {
			partsFile := filepath.Join(path, partsFilename)
			logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+
				"ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts",
				partPath, partsFile, partsFile, partsFile)
		}

		pw := newPartWrapper(mustOpenFilePart(pt, partPath), nil, time.Time{})
		if pw.p.ph.CompressedSizeBytes <= getMaxInmemoryPartSize() {
			ddb.smallParts = append(ddb.smallParts, pw)
			continue
		}
		ddb.bigParts = append(ddb.bigParts, pw)
	}

	ddb.mergeIdx.Store(uint64(time.Now().UnixNano()))
	ddb.startBackgroundWorkers()

	return ddb
}
// startBackgroundWorkers starts mergers for file-based parts and the flusher for in-memory parts.
func (ddb *datadb) startBackgroundWorkers() {
// Start file parts mergers, so they could start merging unmerged parts if needed.
// There is no need in starting in-memory parts mergers, since there are no in-memory parts yet.
ddb.startSmallPartsMergers()
ddb.startBigPartsMergers()
ddb.startInmemoryPartsFlusher()
}
// The channels below limit the number of concurrently executed operations per part type.
//
// A slot is acquired by sending an item to the channel and released by receiving an item from it.
// The capacity of every channel equals to the number of available CPU cores.
var (
// inmemoryPartsConcurrencyCh limits the concurrency for creating and merging in-memory parts.
inmemoryPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
// smallPartsConcurrencyCh limits the concurrency for background merges of small parts.
smallPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
// bigPartsConcurrencyCh limits the concurrency for background merges of big parts and for forced merges.
bigPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
)
// startSmallPartsMergers starts one small parts merger per slot in smallPartsConcurrencyCh.
func (ddb *datadb) startSmallPartsMergers() {
	ddb.partsLock.Lock()
	defer ddb.partsLock.Unlock()

	for remaining := cap(smallPartsConcurrencyCh); remaining > 0; remaining-- {
		ddb.startSmallPartsMergerLocked()
	}
}
// startBigPartsMergers starts one big parts merger per slot in bigPartsConcurrencyCh.
func (ddb *datadb) startBigPartsMergers() {
	ddb.partsLock.Lock()
	defer ddb.partsLock.Unlock()

	for remaining := cap(bigPartsConcurrencyCh); remaining > 0; remaining-- {
		ddb.startBigPartsMergerLocked()
	}
}
// startInmemoryPartsMergerLocked starts a background merger for in-memory parts
// unless ddb.stopCh is already closed.
//
// ddb.partsLock must be held by the caller.
func (ddb *datadb) startInmemoryPartsMergerLocked() {
	if needStop(ddb.stopCh) {
		return
	}

	ddb.wg.Add(1)
	go func() {
		defer ddb.wg.Done()
		ddb.inmemoryPartsMerger()
	}()
}
// startSmallPartsMergerLocked starts a background merger for small parts
// unless ddb.stopCh is already closed.
//
// ddb.partsLock must be held by the caller.
func (ddb *datadb) startSmallPartsMergerLocked() {
	if needStop(ddb.stopCh) {
		return
	}

	ddb.wg.Add(1)
	go func() {
		defer ddb.wg.Done()
		ddb.smallPartsMerger()
	}()
}
// startBigPartsMergerLocked starts a background merger for big parts
// unless ddb.stopCh is already closed.
//
// ddb.partsLock must be held by the caller.
func (ddb *datadb) startBigPartsMergerLocked() {
	if needStop(ddb.stopCh) {
		return
	}

	ddb.wg.Add(1)
	go func() {
		defer ddb.wg.Done()
		ddb.bigPartsMerger()
	}()
}
// startInmemoryPartsFlusher starts the background flusher for in-memory parts.
//
// The flusher is registered in ddb.wg, so it is waited for when ddb is closed.
func (ddb *datadb) startInmemoryPartsFlusher() {
	ddb.wg.Add(1)
	go func() {
		defer ddb.wg.Done()
		ddb.inmemoryPartsFlusher()
	}()
}
// inmemoryPartsFlusher periodically flushes in-memory parts with expired flush deadlines
// to disk until ddb.stopCh is closed.
func (ddb *datadb) inmemoryPartsFlusher() {
	// Flush deadlines for in-memory parts are derived from ddb.flushInterval,
	// so the ticker must fire at least once per ddb.flushInterval. Otherwise parts
	// with shorter flush interval could wait for up to dataFlushInterval after their deadline.
	//
	// Fall back to dataFlushInterval if ddb.flushInterval is non-positive or bigger than
	// dataFlushInterval, so the flusher never runs less frequently than dataFlushInterval.
	//
	// Do not add jitter to the ticker period in order to guarantee the flush interval.
	d := dataFlushInterval
	if ddb.flushInterval > 0 && ddb.flushInterval < d {
		d = ddb.flushInterval
	}

	ticker := time.NewTicker(d)
	defer ticker.Stop()

	for {
		select {
		case <-ddb.stopCh:
			return
		case <-ticker.C:
			ddb.mustFlushInmemoryPartsToFiles(false)
		}
	}
}
// mustFlushInmemoryPartsToFiles flushes in-memory parts to file-based parts.
//
// If isFinal is set, then all the in-memory parts, which aren't merged at the moment, are flushed.
// Otherwise only the parts with expired flush deadline are flushed.
func (ddb *datadb) mustFlushInmemoryPartsToFiles(isFinal bool) {
	now := time.Now()

	var pwsToFlush []*partWrapper
	ddb.partsLock.Lock()
	for _, pw := range ddb.inmemoryParts {
		if pw.isInMerge {
			// The part is already processed by another merge.
			continue
		}
		if !isFinal && !pw.flushDeadline.Before(now) {
			// The flush deadline hasn't been reached yet.
			continue
		}
		pw.isInMerge = true
		pwsToFlush = append(pwsToFlush, pw)
	}
	ddb.partsLock.Unlock()

	ddb.mustMergePartsToFiles(pwsToFlush)
}
// mustMergePartsToFiles merges pws into file-based parts via final merges.
//
// pws are split into chunks suitable for optimal merge. The chunks are merged concurrently,
// while the concurrency is limited by inmemoryPartsConcurrencyCh.
// The function returns after all the chunks are merged.
func (ddb *datadb) mustMergePartsToFiles(pws []*partWrapper) {
	wg := getWaitGroup()
	for len(pws) > 0 {
		var chunk []*partWrapper
		chunk, pws = getPartsForOptimalMerge(pws)

		wg.Add(1)
		inmemoryPartsConcurrencyCh <- struct{}{}
		go func() {
			// Deferred calls run in reverse order: the concurrency slot is released first.
			defer wg.Done()
			defer func() {
				<-inmemoryPartsConcurrencyCh
			}()
			ddb.mustMergeParts(chunk, true)
		}()
	}
	wg.Wait()
	putWaitGroup(wg)
}
// getPartsForOptimalMerge splits pws into the parts for optimal merge and the remaining parts.
//
// If no optimal subset is found, then all the pws are returned for the merge
// and the remaining parts are nil.
//
// Otherwise the pws items are replaced by nil before returning from the function.
// This helps Go GC to reclaim the referenced items faster.
func getPartsForOptimalMerge(pws []*partWrapper) ([]*partWrapper, []*partWrapper) {
	selected := appendPartsToMerge(nil, pws, math.MaxUint64)
	if len(selected) == 0 {
		return pws, nil
	}

	selectedSet := partsToMap(selected)
	rest := make([]*partWrapper, 0, len(pws)-len(selected))
	for i, pw := range pws {
		if _, isSelected := selectedSet[pw]; !isSelected {
			rest = append(rest, pw)
		}
		// Drop the reference to the item, so it could be reclaimed faster by Go GC.
		pws[i] = nil
	}

	return selected, rest
}
// getWaitGroup returns a sync.WaitGroup from the pool.
//
// A new sync.WaitGroup is allocated if the pool is empty.
// Return the wait group to the pool via putWaitGroup when it is no longer needed.
func getWaitGroup() *sync.WaitGroup {
	if wg, ok := wgPool.Get().(*sync.WaitGroup); ok {
		return wg
	}
	return &sync.WaitGroup{}
}

// putWaitGroup returns wg to the pool, so it can be reused by getWaitGroup.
//
// wg mustn't be used after the call.
func putWaitGroup(wg *sync.WaitGroup) {
	wgPool.Put(wg)
}

// wgPool is the pool of sync.WaitGroup items used by getWaitGroup and putWaitGroup.
var wgPool sync.Pool
// inmemoryPartsMerger merges in-memory parts until there is nothing to merge
// or until ddb.stopCh is closed.
func (ddb *datadb) inmemoryPartsMerger() {
	selectParts := func(maxOutBytes uint64) []*partWrapper {
		ddb.partsLock.Lock()
		defer ddb.partsLock.Unlock()
		return getPartsToMergeLocked(ddb.inmemoryParts, maxOutBytes)
	}

	for !needStop(ddb.stopCh) {
		pws := selectParts(ddb.getMaxBigPartSize())
		if len(pws) == 0 {
			// There are no parts worth merging.
			return
		}

		inmemoryPartsConcurrencyCh <- struct{}{}
		ddb.mustMergeParts(pws, false)
		<-inmemoryPartsConcurrencyCh
	}
}
// smallPartsMerger merges small file-based parts until there is nothing to merge
// or until ddb.stopCh is closed.
func (ddb *datadb) smallPartsMerger() {
	selectParts := func(maxOutBytes uint64) []*partWrapper {
		ddb.partsLock.Lock()
		defer ddb.partsLock.Unlock()
		return getPartsToMergeLocked(ddb.smallParts, maxOutBytes)
	}

	for !needStop(ddb.stopCh) {
		pws := selectParts(ddb.getMaxBigPartSize())
		if len(pws) == 0 {
			// There are no parts worth merging.
			return
		}

		smallPartsConcurrencyCh <- struct{}{}
		ddb.mustMergeParts(pws, false)
		<-smallPartsConcurrencyCh
	}
}
// bigPartsMerger merges big file-based parts until there is nothing to merge
// or until ddb.stopCh is closed.
func (ddb *datadb) bigPartsMerger() {
	selectParts := func(maxOutBytes uint64) []*partWrapper {
		ddb.partsLock.Lock()
		defer ddb.partsLock.Unlock()
		return getPartsToMergeLocked(ddb.bigParts, maxOutBytes)
	}

	for !needStop(ddb.stopCh) {
		pws := selectParts(ddb.getMaxBigPartSize())
		if len(pws) == 0 {
			// There are no parts worth merging.
			return
		}

		bigPartsConcurrencyCh <- struct{}{}
		ddb.mustMergeParts(pws, false)
		<-bigPartsConcurrencyCh
	}
}
// getPartsToMergeLocked selects optimal parts to merge from pws and marks them as being merged.
//
// Parts, which already take part in a merge, are ignored.
// The summary size of the returned parts must be smaller than maxOutBytes.
//
// The caller must hold the lock protecting pws.
func getPartsToMergeLocked(pws []*partWrapper, maxOutBytes uint64) []*partWrapper {
	candidates := make([]*partWrapper, 0, len(pws))
	for _, pw := range pws {
		if pw.isInMerge {
			continue
		}
		candidates = append(candidates, pw)
	}

	selected := appendPartsToMerge(nil, candidates, maxOutBytes)
	for i := range selected {
		pw := selected[i]
		if pw.isInMerge {
			logger.Panicf("BUG: partWrapper.isInMerge cannot be set")
		}
		pw.isInMerge = true
	}

	return selected
}
// assertIsInMerge panics if at least a single part in pws isn't marked as being merged.
func assertIsInMerge(pws []*partWrapper) {
	for i := range pws {
		if pws[i].isInMerge {
			continue
		}
		logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to false")
	}
}
// mustMergeParts merges pws to a single resulting part.
//
// If isFinal is set, then the resulting part is guaranteed to be saved to disk.
// If isFinal is set, then the merge process cannot be interrupted.
// The pws may remain unmerged after returning from the function if there is not enough disk space
// or if the non-final merge is interrupted by closing ddb.stopCh.
//
// All the parts inside pws must have isInMerge field set to true.
// The isInMerge field inside pws parts is set to false before returning from the function.
func (ddb *datadb) mustMergeParts(pws []*partWrapper, isFinal bool) {
if len(pws) == 0 {
// Nothing to merge.
return
}
assertIsInMerge(pws)
// Clear the isInMerge flag on every return path.
defer ddb.releasePartsToMerge(pws)
startTime := time.Now()
dstPartType := ddb.getDstPartType(pws, isFinal)
if dstPartType != partInmemory {
// Make sure there is enough disk space for performing the merge
partsSize := getCompressedSize(pws)
needReleaseDiskSpace := tryReserveDiskSpace(ddb.path, partsSize)
if needReleaseDiskSpace {
defer releaseDiskSpace(partsSize)
} else {
if !isFinal {
// There is not enough disk space for performing the non-final merge.
return
}
// Try performing final merge even if there is not enough disk space
// in order to persist in-memory data to disk.
// It is better to crash on out of disk space error in this case.
}
}
// Update merge stats for the destination part type.
switch dstPartType {
case partInmemory:
ddb.inmemoryMergesTotal.Add(1)
ddb.inmemoryActiveMerges.Add(1)
defer ddb.inmemoryActiveMerges.Add(-1)
case partSmall:
ddb.smallPartMergesTotal.Add(1)
ddb.smallPartActiveMerges.Add(1)
defer ddb.smallPartActiveMerges.Add(-1)
case partBig:
ddb.bigPartMergesTotal.Add(1)
ddb.bigPartActiveMerges.Add(1)
defer ddb.bigPartActiveMerges.Add(-1)
default:
logger.Panicf("BUG: unknown partType=%d", dstPartType)
}
// Initialize destination paths.
mergeIdx := ddb.nextMergeIdx()
dstPartPath := ddb.getDstPartPath(dstPartType, mergeIdx)
if isFinal && len(pws) == 1 && pws[0].mp != nil {
// Fast path: flush a single in-memory part to disk.
mp := pws[0].mp
mp.MustStoreToDisk(dstPartPath)
pwNew := ddb.openCreatedPart(&mp.ph, pws, nil, dstPartPath)
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
return
}
// Prepare blockStreamReaders for source parts.
bsrs := mustOpenBlockStreamReaders(pws)
// Collect source stats, which are logged for long merges.
srcSize := uint64(0)
srcRowsCount := uint64(0)
srcBlocksCount := uint64(0)
for _, pw := range pws {
srcSize += pw.p.ph.CompressedSizeBytes
srcRowsCount += pw.p.ph.RowsCount
srcBlocksCount += pw.p.ph.BlocksCount
}
// Prepare BlockStreamWriter for destination part.
bsw := getBlockStreamWriter()
var mpNew *inmemoryPart
if dstPartType == partInmemory {
mpNew = getInmemoryPart()
bsw.MustInitForInmemoryPart(mpNew)
} else {
// NOTE(review): nocache presumably disables OS page cache usage for big parts - confirm in MustInitForFilePart.
nocache := dstPartType == partBig
bsw.MustInitForFilePart(dstPartPath, nocache)
}
// Merge source parts to destination part.
var ph partHeader
stopCh := ddb.stopCh
if isFinal {
// The final merge shouldn't be stopped even if ddb.stopCh is closed.
stopCh = nil
}
mustMergeBlockStreams(&ph, bsw, bsrs, stopCh)
putBlockStreamWriter(bsw)
for _, bsr := range bsrs {
putBlockStreamReader(bsr)
}
// Persist partHeader for destination part after the merge.
if mpNew != nil {
mpNew.ph = ph
} else {
ph.mustWriteMetadata(dstPartPath)
// Make sure the created part directory listing is synced.
fs.MustSyncPath(dstPartPath)
}
// needStop never returns true for the nil stopCh, so final merges always proceed.
if needStop(stopCh) {
// Remove incomplete destination part
if dstPartType != partInmemory {
fs.MustRemoveAll(dstPartPath)
}
return
}
// Atomically swap the source parts with the newly created part.
pwNew := ddb.openCreatedPart(&ph, pws, mpNew, dstPartPath)
// pwNew is nil if the created part is empty, so destination stats remain zero in this case.
dstSize := uint64(0)
dstRowsCount := uint64(0)
dstBlocksCount := uint64(0)
if pwNew != nil {
pDst := pwNew.p
dstSize = pDst.ph.CompressedSizeBytes
dstRowsCount = pDst.ph.RowsCount
dstBlocksCount = pDst.ph.BlocksCount
}
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
d := time.Since(startTime)
if d <= time.Minute {
return
}
// Log stats for long merges.
durationSecs := d.Seconds()
rowsPerSec := int(float64(srcRowsCount) / durationSecs)
logger.Infof("merged (%d parts, %d rows, %d blocks, %d bytes) into (1 part, %d rows, %d blocks, %d bytes) in %.3f seconds at %d rows/sec to %q",
len(pws), srcRowsCount, srcBlocksCount, srcSize, dstRowsCount, dstBlocksCount, dstSize, durationSecs, rowsPerSec, dstPartPath)
}
// nextMergeIdx returns the next unique index, which is used for naming the directory of a newly created part.
func (ddb *datadb) nextMergeIdx() uint64 {
return ddb.mergeIdx.Add(1)
}
// partType is the type of the part - in-memory, small file-based or big file-based.
type partType int
var (
// partInmemory is the type for parts stored in memory.
partInmemory = partType(0)
// partSmall is the type for small file-based parts.
partSmall = partType(1)
// partBig is the type for big file-based parts.
partBig = partType(2)
)
// getDstPartType returns the type of the part, which must be created after merging pws.
//
// The type depends on the summary compressed size of pws, on isFinal
// and on whether all the pws are in-memory parts.
func (ddb *datadb) getDstPartType(pws []*partWrapper, isFinal bool) partType {
	size := getCompressedSize(pws)
	switch {
	case size > ddb.getMaxSmallPartSize():
		return partBig
	case isFinal, size > getMaxInmemoryPartSize():
		return partSmall
	case !areAllInmemoryParts(pws):
		// If at least a single source part is located in file,
		// then the destination part must be in file for durability reasons.
		return partSmall
	default:
		return partInmemory
	}
}
// getDstPartPath returns the path to the destination part with the given dstPartType and mergeIdx.
//
// An empty path is returned for in-memory parts, since they aren't stored on disk.
func (ddb *datadb) getDstPartPath(dstPartType partType, mergeIdx uint64) string {
	if dstPartType == partInmemory {
		return ""
	}
	partName := fmt.Sprintf("%016X", mergeIdx)
	return filepath.Join(ddb.path, partName)
}
// openCreatedPart opens the part created by the merge of pws and returns a wrapper for it.
//
// ph is the header of the created part. The part is opened from mpNew if it is non-nil.
// Otherwise it is opened from dstPartPath.
//
// nil is returned if the created part is empty. The empty file-based part is removed from disk.
func (ddb *datadb) openCreatedPart(ph *partHeader, pws []*partWrapper, mpNew *inmemoryPart, dstPartPath string) *partWrapper {
	if ph.RowsCount == 0 {
		// There is no sense in registering empty parts.
		if mpNew == nil {
			fs.MustRemoveAll(dstPartPath)
		}
		return nil
	}

	if mpNew == nil {
		// The part is stored on disk. It doesn't need flush deadline.
		p := mustOpenFilePart(ddb.pt, dstPartPath)
		return newPartWrapper(p, nil, time.Time{})
	}

	// The part is stored in memory. It inherits the flush deadline from the source parts.
	p := mustOpenInmemoryPart(ddb.pt, mpNew)
	return newPartWrapper(p, mpNew, ddb.getFlushToDiskDeadline(pws))
}
// mustAddRows adds lr to ddb as a new in-memory part.
//
// The call is a no-op if lr contains no rows.
// An in-memory parts merger is started for the added part.
func (ddb *datadb) mustAddRows(lr *LogRows) {
	if len(lr.streamIDs) == 0 {
		return
	}

	// Limit the number of concurrently created in-memory parts.
	inmemoryPartsConcurrencyCh <- struct{}{}
	mp := getInmemoryPart()
	mp.mustInitFromRows(lr)
	p := mustOpenInmemoryPart(ddb.pt, mp)
	<-inmemoryPartsConcurrencyCh

	pw := newPartWrapper(p, mp, time.Now().Add(ddb.flushInterval))

	ddb.partsLock.Lock()
	defer ddb.partsLock.Unlock()

	ddb.inmemoryParts = append(ddb.inmemoryParts, pw)
	ddb.startInmemoryPartsMergerLocked()
}
// DatadbStats contains various stats for datadb.
//
// The stats are accumulated by datadb.updateStats, which adds the values to the existing ones.
type DatadbStats struct {
// InmemoryMergesTotal is the number of in-memory merges performed in the given datadb.
InmemoryMergesTotal uint64
// InmemoryActiveMerges is the number of currently active in-memory merges performed by the given datadb.
InmemoryActiveMerges uint64
// SmallPartMergesTotal is the number of small file merges performed in the given datadb.
SmallPartMergesTotal uint64
// SmallPartActiveMerges is the number of currently active small file merges performed by the given datadb.
SmallPartActiveMerges uint64
// BigPartMergesTotal is the number of big file merges performed in the given datadb.
BigPartMergesTotal uint64
// BigPartActiveMerges is the number of currently active big file merges performed by the given datadb.
BigPartActiveMerges uint64
// InmemoryRowsCount is the number of rows, which weren't flushed to disk yet.
InmemoryRowsCount uint64
// SmallPartRowsCount is the number of rows stored on disk in small parts.
SmallPartRowsCount uint64
// BigPartRowsCount is the number of rows stored on disk in big parts.
BigPartRowsCount uint64
// InmemoryParts is the number of in-memory parts, which weren't flushed to disk yet.
InmemoryParts uint64
// SmallParts is the number of file-based small parts stored on disk.
SmallParts uint64
// BigParts is the number of file-based big parts stored on disk.
BigParts uint64
// InmemoryBlocks is the number of in-memory blocks, which weren't flushed to disk yet.
InmemoryBlocks uint64
// SmallPartBlocks is the number of blocks stored on disk in small parts.
SmallPartBlocks uint64
// BigPartBlocks is the number of blocks stored on disk in big parts.
BigPartBlocks uint64
// CompressedInmemorySize is the size of compressed data stored in memory.
CompressedInmemorySize uint64
// CompressedSmallPartSize is the size of compressed small parts data stored on disk.
CompressedSmallPartSize uint64
// CompressedBigPartSize is the size of compressed big parts data stored on disk.
CompressedBigPartSize uint64
// UncompressedInmemorySize is the size of uncompressed data stored in memory.
UncompressedInmemorySize uint64
// UncompressedSmallPartSize is the size of uncompressed small parts data stored on disk.
UncompressedSmallPartSize uint64
// UncompressedBigPartSize is the size of uncompressed big parts data stored on disk.
UncompressedBigPartSize uint64
}
// reset sets all the fields of s to zero values.
func (s *DatadbStats) reset() {
*s = DatadbStats{}
}
// RowsCount returns the number of rows stored in datadb.
//
// The returned value includes rows from in-memory parts, small parts and big parts.
func (s *DatadbStats) RowsCount() uint64 {
return s.InmemoryRowsCount + s.SmallPartRowsCount + s.BigPartRowsCount
}
// updateStats adds ddb stats to s.
//
// The existing values in s are preserved, so the function can be called
// for multiple datadb instances with the same s.
func (ddb *datadb) updateStats(s *DatadbStats) {
	// Merge counters are atomic, so they can be read without ddb.partsLock.
	s.InmemoryMergesTotal += ddb.inmemoryMergesTotal.Load()
	s.SmallPartMergesTotal += ddb.smallPartMergesTotal.Load()
	s.BigPartMergesTotal += ddb.bigPartMergesTotal.Load()

	s.InmemoryActiveMerges += uint64(ddb.inmemoryActiveMerges.Load())
	s.SmallPartActiveMerges += uint64(ddb.smallPartActiveMerges.Load())
	s.BigPartActiveMerges += uint64(ddb.bigPartActiveMerges.Load())

	ddb.partsLock.Lock()
	defer ddb.partsLock.Unlock()

	// Stats for in-memory parts.
	inmemoryParts := ddb.inmemoryParts
	s.InmemoryParts += uint64(len(inmemoryParts))
	s.InmemoryRowsCount += getRowsCount(inmemoryParts)
	s.InmemoryBlocks += getBlocksCount(inmemoryParts)
	s.CompressedInmemorySize += getCompressedSize(inmemoryParts)
	s.UncompressedInmemorySize += getUncompressedSize(inmemoryParts)

	// Stats for small file-based parts.
	smallParts := ddb.smallParts
	s.SmallParts += uint64(len(smallParts))
	s.SmallPartRowsCount += getRowsCount(smallParts)
	s.SmallPartBlocks += getBlocksCount(smallParts)
	s.CompressedSmallPartSize += getCompressedSize(smallParts)
	s.UncompressedSmallPartSize += getUncompressedSize(smallParts)

	// Stats for big file-based parts.
	bigParts := ddb.bigParts
	s.BigParts += uint64(len(bigParts))
	s.BigPartRowsCount += getRowsCount(bigParts)
	s.BigPartBlocks += getBlocksCount(bigParts)
	s.CompressedBigPartSize += getCompressedSize(bigParts)
	s.UncompressedBigPartSize += getUncompressedSize(bigParts)
}
// debugFlush makes sure that the recently ingested data is available for search.
func (ddb *datadb) debugFlush() {
// Nothing to do, since all the ingested data is available for search via ddb.inmemoryParts.
}
// swapSrcWithDstParts atomically replaces pws with pwNew in the lists of ddb parts.
//
// pwNew may be nil. In this case pws are just removed from ddb.
// Otherwise pwNew is added to the list matching dstPartType and the corresponding merger is started.
//
// The list of file-based parts is persisted to disk if it changes.
// The references to pws held by the lists of ddb parts are released before returning from the function.
func (ddb *datadb) swapSrcWithDstParts(pws []*partWrapper, pwNew *partWrapper, dstPartType partType) {
// Atomically unregister old parts and add new part to pt.
partsToRemove := partsToMap(pws)
removedInmemoryParts := 0
removedSmallParts := 0
removedBigParts := 0
ddb.partsLock.Lock()
ddb.inmemoryParts, removedInmemoryParts = removeParts(ddb.inmemoryParts, partsToRemove)
ddb.smallParts, removedSmallParts = removeParts(ddb.smallParts, partsToRemove)
ddb.bigParts, removedBigParts = removeParts(ddb.bigParts, partsToRemove)
if pwNew != nil {
switch dstPartType {
case partInmemory:
ddb.inmemoryParts = append(ddb.inmemoryParts, pwNew)
ddb.startInmemoryPartsMergerLocked()
case partSmall:
ddb.smallParts = append(ddb.smallParts, pwNew)
ddb.startSmallPartsMergerLocked()
case partBig:
ddb.bigParts = append(ddb.bigParts, pwNew)
ddb.startBigPartsMergerLocked()
default:
logger.Panicf("BUG: unknown partType=%d", dstPartType)
}
}
// Atomically store the updated list of file-based parts on disk.
// This must be performed under partsLock in order to prevent from races
// when multiple concurrently running goroutines update the list.
//
// The list is left untouched if only in-memory parts are affected by the swap.
if removedSmallParts > 0 || removedBigParts > 0 || pwNew != nil && dstPartType != partInmemory {
smallPartNames := getPartNames(ddb.smallParts)
bigPartNames := getPartNames(ddb.bigParts)
mustWritePartNames(ddb.path, smallPartNames, bigPartNames)
}
ddb.partsLock.Unlock()
// Every source part must be found in exactly one list.
removedParts := removedInmemoryParts + removedSmallParts + removedBigParts
if removedParts != len(partsToRemove) {
logger.Panicf("BUG: unexpected number of parts removed; got %d, want %d", removedParts, len(partsToRemove))
}
// Mark old parts as must be deleted and decrement reference count, so they are eventually closed and deleted.
for _, pw := range pws {
pw.mustDrop.Store(true)
pw.decRef()
}
}
// partsToMap returns a set containing all the pws.
//
// The function panics if pws contains duplicate parts.
func partsToMap(pws []*partWrapper) map[*partWrapper]struct{} {
	set := make(map[*partWrapper]struct{}, len(pws))
	for i := range pws {
		set[pws[i]] = struct{}{}
	}

	if duplicates := len(pws) - len(set); duplicates != 0 {
		logger.Panicf("BUG: %d duplicate parts found out of %d parts", duplicates, len(pws))
	}

	return set
}
// removeParts removes partsToRemove from pws in place.
//
// It returns the remaining parts and the number of removed parts.
// The returned slice shares the underlying array with pws.
func removeParts(pws []*partWrapper, partsToRemove map[*partWrapper]struct{}) ([]*partWrapper, int) {
	kept := pws[:0]
	for _, pw := range pws {
		if _, mustRemove := partsToRemove[pw]; mustRemove {
			continue
		}
		kept = append(kept, pw)
	}

	// Clear the tail, so it doesn't hold references to the removed parts.
	tail := pws[len(kept):]
	for i := range tail {
		tail[i] = nil
	}

	return kept, len(tail)
}
// mustOpenBlockStreamReaders returns block stream readers for pws.
//
// The reader at position i reads the data from pws[i].
func mustOpenBlockStreamReaders(pws []*partWrapper) []*blockStreamReader {
	bsrs := make([]*blockStreamReader, len(pws))
	for i, pw := range pws {
		bsr := getBlockStreamReader()
		if pw.mp == nil {
			// File-based part.
			bsr.MustInitFromFilePart(pw.p.path)
		} else {
			// In-memory part.
			bsr.MustInitFromInmemoryPart(pw.mp)
		}
		bsrs[i] = bsr
	}
	return bsrs
}
// newPartWrapper returns a wrapper for p with the reference counter set to one.
//
// mp must be non-nil for in-memory parts. flushDeadline is the deadline
// for flushing the in-memory part to disk.
func newPartWrapper(p *part, mp *inmemoryPart, flushDeadline time.Time) *partWrapper {
	pw := new(partWrapper)
	pw.p = p
	pw.mp = mp
	pw.flushDeadline = flushDeadline

	// The initial reference belongs to the list of open parts.
	// It is released when the part is removed from the list.
	pw.incRef()

	return pw
}
// getFlushToDiskDeadline returns the flush deadline for the part created from pws.
//
// The deadline is the minimum of the flush deadlines for in-memory parts in pws
// and the current time plus ddb.flushInterval.
func (ddb *datadb) getFlushToDiskDeadline(pws []*partWrapper) time.Time {
	deadline := time.Now().Add(ddb.flushInterval)
	for _, pw := range pws {
		if pw.mp == nil {
			// File-based parts have no flush deadline.
			continue
		}
		if pw.flushDeadline.Before(deadline) {
			deadline = pw.flushDeadline
		}
	}
	return deadline
}
// getMaxInmemoryPartSize returns the maximum size of a single in-memory part in bytes.
//
// The returned size cannot be smaller than 1e6 bytes.
func getMaxInmemoryPartSize() uint64 {
	const minSize = 1e6

	// Allocate 10% of allowed memory for in-memory parts.
	size := uint64(0.1 * float64(memory.Allowed()) / maxInmemoryPartsPerPartition)
	if size >= minSize {
		return size
	}
	return minSize
}
// areAllInmemoryParts returns true if every part in pws is an in-memory part.
//
// It returns true for empty pws.
func areAllInmemoryParts(pws []*partWrapper) bool {
	for i := range pws {
		if pws[i].mp == nil {
			return false
		}
	}
	return true
}
// releasePartsToMerge clears the isInMerge flag for pws.
//
// The function panics if the flag is missing on any part in pws.
func (ddb *datadb) releasePartsToMerge(pws []*partWrapper) {
	ddb.partsLock.Lock()
	defer ddb.partsLock.Unlock()

	for i := range pws {
		pw := pws[i]
		if !pw.isInMerge {
			logger.Panicf("BUG: missing isInMerge flag on the part %q", pw.p.path)
		}
		pw.isInMerge = false
	}
}
// getMaxBigPartSize returns the maximum size of a big part, which can be created in ddb.
//
// The size is limited by the available disk space at ddb.path and by maxBigPartSize.
func (ddb *datadb) getMaxBigPartSize() uint64 {
return getMaxOutBytes(ddb.path)
}
// getMaxSmallPartSize returns the maximum size of a small part, which can be created in ddb.
func (ddb *datadb) getMaxSmallPartSize() uint64 {
	// Small parts are cached in the OS page cache,
	// so limit their size by the remaining free RAM.
	n := uint64(memory.Remaining()) / defaultPartsToMerge
	if n < 10e6 {
		n = 10e6
	}

	// Make sure the output part fits available disk space for small parts.
	if sizeLimit := getMaxOutBytes(ddb.path); sizeLimit < n {
		return sizeLimit
	}
	return n
}
// getMaxOutBytes returns the maximum size of the part, which can be created at the given path.
//
// It equals to the available disk space at path capped by maxBigPartSize.
func getMaxOutBytes(path string) uint64 {
	if available := availableDiskSpace(path); available < maxBigPartSize {
		return available
	}
	return maxBigPartSize
}
// availableDiskSpace returns the free disk space at path minus the space reserved for running merges.
//
// Zero is returned if the reserved space exceeds the free space.
func availableDiskSpace(path string) uint64 {
	free := fs.MustGetFreeSpace(path)
	if reserved := reservedDiskSpace.Load(); reserved <= free {
		return free - reserved
	}
	return 0
}
// tryReserveDiskSpace tries reserving n bytes of disk space at the given path.
//
// It returns true on success. In this case the caller must call releaseDiskSpace(n)
// when the reserved space is no longer needed.
// It returns false if there is not enough free disk space. Nothing is reserved in this case.
func tryReserveDiskSpace(path string, n uint64) bool {
	free := fs.MustGetFreeSpace(path)
	if reserveDiskSpace(n) <= free {
		return true
	}

	// The reservation exceeds the free space. Roll it back.
	releaseDiskSpace(n)
	return false
}
// reserveDiskSpace reserves n bytes and returns the total number of reserved bytes after that.
func reserveDiskSpace(n uint64) uint64 {
	total := reservedDiskSpace.Add(n)
	return total
}

// releaseDiskSpace releases n bytes previously reserved via reserveDiskSpace.
func releaseDiskSpace(n uint64) {
	// Adding the two's complement of n subtracts n from the unsigned counter.
	reservedDiskSpace.Add(-n)
}

// reservedDiskSpace tracks global reserved disk space for currently executed
// background merges across all the partitions.
//
// It should allow avoiding background merges when there is no free disk space.
var reservedDiskSpace atomic.Uint64
// needStop returns true if stopCh is closed or has a pending item.
//
// The function never blocks. It returns false for nil stopCh.
func needStop(stopCh <-chan struct{}) bool {
	stopped := false
	select {
	case <-stopCh:
		stopped = true
	default:
	}
	return stopped
}
// mustCloseDatadb stops background workers for ddb, flushes in-memory data to disk and closes all the parts.
//
// mustCloseDatadb can be called only when nobody accesses ddb.
// The ddb mustn't be used after the call.
func mustCloseDatadb(ddb *datadb) {
// Notify background workers to stop.
// Make it under ddb.partsLock in order to prevent from calling ddb.wg.Add()
// after ddb.stopCh is closed and ddb.wg.Wait() is called.
ddb.partsLock.Lock()
close(ddb.stopCh)
ddb.partsLock.Unlock()
// Wait for background workers to stop.
ddb.wg.Wait()
// flush in-memory data to disk
ddb.mustFlushInmemoryPartsToFiles(true)
// The lists of parts are accessed without ddb.partsLock below, since background workers are already stopped.
if len(ddb.inmemoryParts) > 0 {
logger.Panicf("BUG: the number of in-memory parts must be zero after flushing them to disk; got %d", len(ddb.inmemoryParts))
}
ddb.inmemoryParts = nil
// close small parts
for _, pw := range ddb.smallParts {
pw.decRef()
// The list of parts must hold the last reference to the part at this point.
if n := pw.refCount.Load(); n != 0 {
logger.Panicf("BUG: there are %d references to smallPart", n)
}
}
ddb.smallParts = nil
// close big parts
for _, pw := range ddb.bigParts {
pw.decRef()
// The list of parts must hold the last reference to the part at this point.
if n := pw.refCount.Load(); n != 0 {
logger.Panicf("BUG: there are %d references to bigPart", n)
}
}
ddb.bigParts = nil
ddb.path = ""
ddb.pt = nil
}
// getPartNames returns sorted directory names for file-based parts from pws.
//
// In-memory parts are skipped.
func getPartNames(pws []*partWrapper) []string {
	names := make([]string, 0, len(pws))
	for _, pw := range pws {
		if pw.mp == nil {
			names = append(names, filepath.Base(pw.p.path))
		}
	}
	sort.Strings(names)
	return names
}
// mustWritePartNames atomically stores smallPartNames followed by bigPartNames
// as a JSON array into the partsFilename file inside the given path.
func mustWritePartNames(path string, smallPartNames, bigPartNames []string) {
	// Use non-nil slice, so the empty list is marshaled as an empty JSON array instead of null.
	partNames := make([]string, 0, len(smallPartNames)+len(bigPartNames))
	partNames = append(partNames, smallPartNames...)
	partNames = append(partNames, bigPartNames...)

	data, err := json.Marshal(partNames)
	if err != nil {
		logger.Panicf("BUG: cannot marshal partNames to JSON: %s", err)
	}

	fs.MustWriteAtomic(filepath.Join(path, partsFilename), data, true)
}
// mustReadPartNames reads the list of part names from the partsFilename file inside the given path.
//
// The function panics if the file cannot be read or parsed.
func mustReadPartNames(path string) []string {
	partNamesPath := filepath.Join(path, partsFilename)

	data, err := os.ReadFile(partNamesPath)
	if err != nil {
		logger.Panicf("FATAL: cannot read %s: %s", partNamesPath, err)
	}

	var partNames []string
	err = json.Unmarshal(data, &partNames)
	if err != nil {
		logger.Panicf("FATAL: cannot parse %s: %s", partNamesPath, err)
	}

	return partNames
}
// mustRemoveUnusedDirs removes directories at path, which are missing in partNames.
//
// Such directories may be left after unclean shutdown.
// The path is synced if at least a single directory is removed.
func mustRemoveUnusedDirs(path string, partNames []string) {
	des := fs.MustReadDir(path)

	usedDirs := make(map[string]struct{}, len(partNames))
	for _, partName := range partNames {
		usedDirs[partName] = struct{}{}
	}

	needSync := false
	for _, de := range des {
		if !fs.IsDirOrSymlink(de) {
			// Only directories can contain parts.
			continue
		}
		dirName := de.Name()
		if _, isUsed := usedDirs[dirName]; isUsed {
			continue
		}
		fs.MustRemoveAll(filepath.Join(path, dirName))
		needSync = true
	}

	if needSync {
		fs.MustSyncPath(path)
	}
}
// appendPartsToMerge finds optimal parts to merge from src,
// appends them to dst and returns the result.
//
// The summary compressed size of the appended parts doesn't exceed maxOutBytes.
// dst is returned as is if there are no parts worth merging.
// The src slice isn't modified, since the search is performed on its copy.
func appendPartsToMerge(dst, src []*partWrapper, maxOutBytes uint64) []*partWrapper {
if len(src) < 2 {
// There is no need in merging zero or one part :)
return dst
}
// Filter out too big parts.
// This should reduce N for O(N^2) algorithm below.
maxInPartBytes := uint64(float64(maxOutBytes) / minMergeMultiplier)
tmp := make([]*partWrapper, 0, len(src))
for _, pw := range src {
if pw.p.ph.CompressedSizeBytes > maxInPartBytes {
continue
}
tmp = append(tmp, pw)
}
src = tmp
sortPartsForOptimalMerge(src)
// Consider windows containing from minSrcParts to maxSrcParts adjacent parts in the sorted list.
maxSrcParts := defaultPartsToMerge
if maxSrcParts > len(src) {
maxSrcParts = len(src)
}
minSrcParts := (maxSrcParts + 1) / 2
if minSrcParts < 2 {
minSrcParts = 2
}
// Exhaustive search for parts giving the lowest write amplification when merged.
var pws []*partWrapper
maxM := float64(0)
for i := minSrcParts; i <= maxSrcParts; i++ {
for j := 0; j <= len(src)-i; j++ {
// a is sorted by size, so a[0] is the smallest part and a[len(a)-1] is the biggest part in the window.
a := src[j : j+i]
if a[0].p.ph.CompressedSizeBytes*uint64(len(a)) < a[len(a)-1].p.ph.CompressedSizeBytes {
// Do not merge parts with too big difference in size,
// since this results in unbalanced merges.
continue
}
outSize := getCompressedSize(a)
if outSize > maxOutBytes {
// There is no need in verifying remaining parts with bigger sizes.
break
}
// m is the ratio of the output size to the size of the biggest input part.
// Bigger m means lower write amplification.
m := float64(outSize) / float64(a[len(a)-1].p.ph.CompressedSizeBytes)
if m < maxM {
continue
}
maxM = m
pws = a
}
}
// The clamp below is a no-op for the current values of defaultPartsToMerge and minMergeMultiplier.
minM := float64(defaultPartsToMerge) / 2
if minM < minMergeMultiplier {
minM = minMergeMultiplier
}
if maxM < minM {
// There is no sense in merging parts with too small m,
// since this leads to high disk write IO.
return dst
}
return append(dst, pws...)
}
// sortPartsForOptimalMerge sorts pws in place by compressed size in ascending order.
//
// Parts with identical sizes are sorted by MinTimestamp in descending order.
func sortPartsForOptimalMerge(pws []*partWrapper) {
	// Sort src parts by size and backwards timestamp.
	// This should improve adjacent points' locality in the merged parts.
	sort.Slice(pws, func(i, j int) bool {
		phA, phB := &pws[i].p.ph, &pws[j].p.ph
		if phA.CompressedSizeBytes != phB.CompressedSizeBytes {
			return phA.CompressedSizeBytes < phB.CompressedSizeBytes
		}
		return phA.MinTimestamp > phB.MinTimestamp
	})
}
// getCompressedSize returns the summary compressed size in bytes for pws.
func getCompressedSize(pws []*partWrapper) uint64 {
	var total uint64
	for i := range pws {
		total += pws[i].p.ph.CompressedSizeBytes
	}
	return total
}
// getUncompressedSize returns the summary uncompressed size in bytes for pws.
func getUncompressedSize(pws []*partWrapper) uint64 {
	var total uint64
	for i := range pws {
		total += pws[i].p.ph.UncompressedSizeBytes
	}
	return total
}
// getRowsCount returns the summary number of rows in pws.
func getRowsCount(pws []*partWrapper) uint64 {
	var total uint64
	for i := range pws {
		total += pws[i].p.ph.RowsCount
	}
	return total
}
// getBlocksCount returns the summary number of blocks in pws.
func getBlocksCount(pws []*partWrapper) uint64 {
	var total uint64
	for i := range pws {
		total += pws[i].p.ph.BlocksCount
	}
	return total
}
// mustForceMergeAllParts flushes in-memory parts to disk and then merges all the file-based parts,
// which do not take part in other merges.
//
// The parts are split into chunks suitable for optimal merge. The chunks are merged concurrently,
// while the concurrency is limited by bigPartsConcurrencyCh.
func (ddb *datadb) mustForceMergeAllParts() {
	// In-memory parts must be converted to file parts before the forced merge.
	ddb.mustFlushInmemoryPartsToFiles(true)

	// Collect all the file parts for the forced merge.
	ddb.partsLock.Lock()
	pws := appendAllPartsForMergeLocked(nil, ddb.smallParts)
	pws = appendAllPartsForMergeLocked(pws, ddb.bigParts)
	ddb.partsLock.Unlock()

	// The merge must run even for a single part.
	// This allows applying the configured retention, removing the deleted data, etc.

	wg := getWaitGroup()
	for len(pws) > 0 {
		var chunk []*partWrapper
		chunk, pws = getPartsForOptimalMerge(pws)

		wg.Add(1)
		bigPartsConcurrencyCh <- struct{}{}
		go func() {
			// Deferred calls run in reverse order: the concurrency slot is released first.
			defer wg.Done()
			defer func() {
				<-bigPartsConcurrencyCh
			}()
			ddb.mustMergeParts(chunk, false)
		}()
	}
	wg.Wait()
	putWaitGroup(wg)
}
// appendAllPartsForMergeLocked appends all the src parts, which do not take part in merges, to dst
// and returns the result.
//
// The appended parts are marked as being merged.
// The caller must hold the lock protecting src.
func appendAllPartsForMergeLocked(dst, src []*partWrapper) []*partWrapper {
	for _, pw := range src {
		if pw.isInMerge {
			continue
		}
		pw.isInMerge = true
		dst = append(dst, pw)
	}
	return dst
}