package storage

import (
	"fmt"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/timeutil"
)

// table represents a single table with time series data.
type table struct {
	path                string
	smallPartitionsPath string
	bigPartitionsPath   string

	s *Storage

	ptws     []*partitionWrapper
	ptwsLock sync.Mutex

	stopCh chan struct{}

	retentionWatcherWG  sync.WaitGroup
	finalDedupWatcherWG sync.WaitGroup
	forceMergeWG        sync.WaitGroup
}

// partitionWrapper provides a refcounting mechanism for the partition.
type partitionWrapper struct {
	// refCount is the number of open references to partitionWrapper.
	refCount atomic.Int32

	// If mustDrop is true, then the partition must be dropped after refCount reaches zero.
	mustDrop atomic.Bool

	pt *partition
}

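// incRef increments the number of references to ptw.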
func (ptw *partitionWrapper) incRef() {
	ptw.refCount.Add(1)
}

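// decRef decrements the number of references to ptw. When the reference count
// reaches zero, the wrapped partition is closed and, if scheduleToDrop was called,
// dropped from disk.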
func (ptw *partitionWrapper) decRef() {
	n := ptw.refCount.Add(-1)
	if n < 0 {
		logger.Panicf("BUG: ptw.refCount must be positive; got %d", n)
	}
	if n > 0 {
		return
	}

	// refCount is zero. Close the partition.
	ptw.pt.MustClose()

	if !ptw.mustDrop.Load() {
		ptw.pt = nil
		return
	}

	// Drop the partition.
	ptw.pt.Drop()
	ptw.pt = nil
}

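// scheduleToDrop instructs decRef to drop the wrapped partition from disk
// once the last reference to it is released.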
func (ptw *partitionWrapper) scheduleToDrop() {
	ptw.mustDrop.Store(true)
}

// mustOpenTable opens a table on the given path.
//
// The table is created if it doesn't exist.
func mustOpenTable(path string, s *Storage) *table {
	path = filepath.Clean(path)

	// Create a directory for the table if it doesn't exist yet.
	fs.MustMkdirIfNotExist(path)

	// Create directories for small and big partitions if they don't exist yet.
	smallPartitionsPath := filepath.Join(path, smallDirname)
	fs.MustMkdirIfNotExist(smallPartitionsPath)
	fs.MustRemoveTemporaryDirs(smallPartitionsPath)

	smallSnapshotsPath := filepath.Join(smallPartitionsPath, snapshotsDirname)
	fs.MustMkdirIfNotExist(smallSnapshotsPath)
	fs.MustRemoveTemporaryDirs(smallSnapshotsPath)

	bigPartitionsPath := filepath.Join(path, bigDirname)
	fs.MustMkdirIfNotExist(bigPartitionsPath)
	fs.MustRemoveTemporaryDirs(bigPartitionsPath)

	bigSnapshotsPath := filepath.Join(bigPartitionsPath, snapshotsDirname)
	fs.MustMkdirIfNotExist(bigSnapshotsPath)
	fs.MustRemoveTemporaryDirs(bigSnapshotsPath)

	// Open partitions.
	pts := mustOpenPartitions(smallPartitionsPath, bigPartitionsPath, s)

	tb := &table{
		path:                path,
		smallPartitionsPath: smallPartitionsPath,
		bigPartitionsPath:   bigPartitionsPath,
		s:                   s,

		stopCh: make(chan struct{}),
	}
	for _, pt := range pts {
		tb.addPartitionNolock(pt)
	}
	tb.startRetentionWatcher()
	tb.startFinalDedupWatcher()
	return tb
}

// MustCreateSnapshot creates a snapshot of tb and returns the paths to its small and big parts.
func (tb *table) MustCreateSnapshot(snapshotName string) (string, string) {
	logger.Infof("creating table snapshot of %q...", tb.path)
	startTime := time.Now()

	ptws := tb.GetPartitions(nil)
	defer tb.PutPartitions(ptws)

	dstSmallDir := filepath.Join(tb.path, smallDirname, snapshotsDirname, snapshotName)
	fs.MustMkdirFailIfExist(dstSmallDir)

	dstBigDir := filepath.Join(tb.path, bigDirname, snapshotsDirname, snapshotName)
	fs.MustMkdirFailIfExist(dstBigDir)

	for _, ptw := range ptws {
		smallPath := filepath.Join(dstSmallDir, ptw.pt.name)
		bigPath := filepath.Join(dstBigDir, ptw.pt.name)
		ptw.pt.MustCreateSnapshotAt(smallPath, bigPath)
	}

	fs.MustSyncPath(dstSmallDir)
	fs.MustSyncPath(dstBigDir)
	fs.MustSyncPath(filepath.Dir(dstSmallDir))
	fs.MustSyncPath(filepath.Dir(dstBigDir))

	logger.Infof("created table snapshot for %q at (%q, %q) in %.3f seconds", tb.path, dstSmallDir, dstBigDir, time.Since(startTime).Seconds())
	return dstSmallDir, dstBigDir
}

// MustDeleteSnapshot deletes snapshot with the given snapshotName.
func (tb *table) MustDeleteSnapshot(snapshotName string) {
	smallDir := filepath.Join(tb.path, smallDirname, snapshotsDirname, snapshotName)
	fs.MustRemoveDirAtomic(smallDir)
	bigDir := filepath.Join(tb.path, bigDirname, snapshotsDirname, snapshotName)
	fs.MustRemoveDirAtomic(bigDir)
}

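// addPartitionNolock wraps pt and appends it to tb.ptws.
//
// The caller must hold tb.ptwsLock or otherwise have exclusive access to tb.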
func (tb *table) addPartitionNolock(pt *partition) {
	ptw := &partitionWrapper{
		pt: pt,
	}
	ptw.incRef()
	tb.ptws = append(tb.ptws, ptw)
}

// MustClose closes the table.
// It is expected that all the pending searches on the table are finished before calling MustClose.
func (tb *table) MustClose() {
	close(tb.stopCh)
	tb.retentionWatcherWG.Wait()
	tb.finalDedupWatcherWG.Wait()
	tb.forceMergeWG.Wait()

	tb.ptwsLock.Lock()
	ptws := tb.ptws
	tb.ptws = nil
	tb.ptwsLock.Unlock()

	for _, ptw := range ptws {
		if n := ptw.refCount.Load(); n != 1 {
			logger.Panicf("BUG: unexpected refCount=%d when closing the partition; probably there are pending searches", n)
		}
		ptw.decRef()
	}
}

// flushPendingRows flushes all the pending raw rows, so they become visible to search.
//
// This function is for debug purposes only.
func (tb *table) flushPendingRows() {
	ptws := tb.GetPartitions(nil)
	defer tb.PutPartitions(ptws)

	for _, ptw := range ptws {
		ptw.pt.flushPendingRows(true)
	}
}

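// NotifyReadWriteMode propagates the read-write mode notification to all the partitions in tb.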
func (tb *table) NotifyReadWriteMode() {
	tb.ptwsLock.Lock()
	for _, ptw := range tb.ptws {
		ptw.pt.NotifyReadWriteMode()
	}
	tb.ptwsLock.Unlock()
}

// TableMetrics contains essential metrics for the table.
type TableMetrics struct {
	partitionMetrics

	// LastPartition contains metrics for the last partition.
	// These metrics are important, since the majority of data ingestion
	// and querying goes to the last partition.
	LastPartition partitionMetrics

	PartitionsRefCount uint64
}

// UpdateMetrics updates m with metrics from tb.
func (tb *table) UpdateMetrics(m *TableMetrics) {
	ptws := tb.GetPartitions(nil)
	defer tb.PutPartitions(ptws)

	for _, ptw := range ptws {
		ptw.pt.UpdateMetrics(&m.partitionMetrics)
		m.PartitionsRefCount += uint64(ptw.refCount.Load())
	}

	// Collect separate metrics for the last partition.
	if len(ptws) > 0 {
		ptwLast := ptws[0]
		for _, ptw := range ptws[1:] {
			if ptw.pt.tr.MinTimestamp > ptwLast.pt.tr.MinTimestamp {
				ptwLast = ptw
			}
		}
		ptwLast.pt.UpdateMetrics(&m.LastPartition)
	}
}

// ForceMergePartitions force-merges partitions in tb with names starting with the given partitionNamePrefix.
//
// Partitions are merged sequentially in order to reduce load on the system.
func (tb *table) ForceMergePartitions(partitionNamePrefix string) error {
	ptws := tb.GetPartitions(nil)
	defer tb.PutPartitions(ptws)

	tb.forceMergeWG.Add(1)
	defer tb.forceMergeWG.Done()

	for _, ptw := range ptws {
		if !strings.HasPrefix(ptw.pt.name, partitionNamePrefix) {
			continue
		}
		logger.Infof("starting forced merge for partition %q", ptw.pt.name)
		startTime := time.Now()
		if err := ptw.pt.ForceMergeAllParts(tb.stopCh); err != nil {
			return fmt.Errorf("cannot complete forced merge for partition %q: %w", ptw.pt.name, err)
		}
		logger.Infof("forced merge for partition %q has been finished in %.3f seconds", ptw.pt.name, time.Since(startTime).Seconds())
	}
	return nil
}

// MustAddRows adds the given rows to the table tb.
func (tb *table) MustAddRows(rows []rawRow) {
	if len(rows) == 0 {
		return
	}

	// Verify whether all the rows may be added to a single partition.
	ptwsX := getPartitionWrappers()
	defer putPartitionWrappers(ptwsX)

	ptwsX.a = tb.GetPartitions(ptwsX.a[:0])
	ptws := ptwsX.a
	for i, ptw := range ptws {
		singlePt := true
		for j := range rows {
			if !ptw.pt.HasTimestamp(rows[j].Timestamp) {
				singlePt = false
				break
			}
		}
		if !singlePt {
			continue
		}

		if i != 0 {
			// Move the partition with the matching rows to the front of tb.ptws,
			// so it will be detected faster next time.
			tb.ptwsLock.Lock()
			for j := range tb.ptws {
				if ptw == tb.ptws[j] {
					tb.ptws[0], tb.ptws[j] = tb.ptws[j], tb.ptws[0]
					break
				}
			}
			tb.ptwsLock.Unlock()
		}

		// Fast path - add all the rows into the ptw.
		ptw.pt.AddRows(rows)
		tb.PutPartitions(ptws)
		return
	}

	// Slower path - split rows into per-partition buckets.
	ptBuckets := make(map[*partitionWrapper][]rawRow)
	var missingRows []rawRow
	for i := range rows {
		r := &rows[i]
		ptFound := false
		for _, ptw := range ptws {
			if ptw.pt.HasTimestamp(r.Timestamp) {
				ptBuckets[ptw] = append(ptBuckets[ptw], *r)
				ptFound = true
				break
			}
		}
		if !ptFound {
			missingRows = append(missingRows, *r)
		}
	}

	for ptw, ptRows := range ptBuckets {
		ptw.pt.AddRows(ptRows)
	}
	tb.PutPartitions(ptws)
	if len(missingRows) == 0 {
		return
	}

	// The slowest path - there are rows that don't fit any existing partition.
	// Create new partitions for these rows.
	// Do this under tb.ptwsLock.
	minTimestamp, maxTimestamp := tb.getMinMaxTimestamps()
	tb.ptwsLock.Lock()
	for i := range missingRows {
		r := &missingRows[i]

		if r.Timestamp < minTimestamp || r.Timestamp > maxTimestamp {
			// Silently skip rows outside the retention period, since they should be deleted anyway.
			continue
		}

		// Make sure the partition for the r hasn't been added by other goroutines.
		ptFound := false
		for _, ptw := range tb.ptws {
			if ptw.pt.HasTimestamp(r.Timestamp) {
				ptFound = true
				ptw.pt.AddRows(missingRows[i : i+1])
				break
			}
		}
		if ptFound {
			continue
		}

		pt := mustCreatePartition(r.Timestamp, tb.smallPartitionsPath, tb.bigPartitionsPath, tb.s)
		pt.AddRows(missingRows[i : i+1])
		tb.addPartitionNolock(pt)
	}
	tb.ptwsLock.Unlock()
}

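// getMinMaxTimestamps returns the timestamp range accepted for new rows:
// from the retention boundary in the past up to two days into the future.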
func (tb *table) getMinMaxTimestamps() (int64, int64) {
	now := int64(fasttime.UnixTimestamp() * 1000)
	minTimestamp := now - tb.s.retentionMsecs
	maxTimestamp := now + 2*24*3600*1000 // allow up to +2 days from now in order to account for time zone offsets
	if minTimestamp < 0 {
		// Negative timestamps aren't supported by the storage.
		minTimestamp = 0
	}
	if maxTimestamp < 0 {
		maxTimestamp = (1 << 63) - 1
	}
	return minTimestamp, maxTimestamp
}

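// startRetentionWatcher starts a background goroutine that drops partitions
// falling outside the configured retention period.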
func (tb *table) startRetentionWatcher() {
	tb.retentionWatcherWG.Add(1)
	go func() {
		tb.retentionWatcher()
		tb.retentionWatcherWG.Done()
	}()
}

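// retentionWatcher periodically removes partitions whose data is older than
// the configured retention period.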
func (tb *table) retentionWatcher() {
	d := timeutil.AddJitterToDuration(time.Minute)
	ticker := time.NewTicker(d)
	defer ticker.Stop()
	for {
		select {
		case <-tb.stopCh:
			return
		case <-ticker.C:
		}

		minTimestamp := int64(fasttime.UnixTimestamp()*1000) - tb.s.retentionMsecs
		var ptwsDrop []*partitionWrapper
		tb.ptwsLock.Lock()
		dst := tb.ptws[:0]
		for _, ptw := range tb.ptws {
			if ptw.pt.tr.MaxTimestamp < minTimestamp {
				ptwsDrop = append(ptwsDrop, ptw)
			} else {
				dst = append(dst, ptw)
			}
		}
		tb.ptws = dst
		tb.ptwsLock.Unlock()

		if len(ptwsDrop) == 0 {
			continue
		}

		// There are partitions to drop. Drop them.

		// Remove table references from partitions, so they will be eventually
		// closed and dropped after all the pending searches are done.
		for _, ptw := range ptwsDrop {
			ptw.scheduleToDrop()
			ptw.decRef()
		}
	}
}

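// startFinalDedupWatcher starts a background goroutine that runs final
// deduplication for previous partitions.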
func (tb *table) startFinalDedupWatcher() {
	tb.finalDedupWatcherWG.Add(1)
	go func() {
		tb.finalDedupWatcher()
		tb.finalDedupWatcherWG.Done()
	}()
}

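// finalDedupWatcher periodically runs final deduplication for partitions
// other than the current one, if deduplication is enabled.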
func (tb *table) finalDedupWatcher() {
	if !isDedupEnabled() {
		// Deduplication is disabled.
		return
	}
	f := func() {
		ptws := tb.GetPartitions(nil)
		defer tb.PutPartitions(ptws)
		timestamp := timestampFromTime(time.Now())
		currentPartitionName := timestampToPartitionName(timestamp)
		var ptwsToDedup []*partitionWrapper
		for _, ptw := range ptws {
			if ptw.pt.name == currentPartitionName {
				// Do not run final dedup for the current month.
				continue
			}
			if !ptw.pt.isFinalDedupNeeded() {
				// There is no need to run final dedup for the given partition.
				continue
			}
			// Mark the partition as scheduled for final deduplication.
			ptw.pt.isDedupScheduled.Store(true)
			ptwsToDedup = append(ptwsToDedup, ptw)
		}
		for _, ptw := range ptwsToDedup {
			if err := ptw.pt.runFinalDedup(tb.stopCh); err != nil {
				logger.Errorf("cannot run final dedup for partition %s: %s", ptw.pt.name, err)
			}
			ptw.pt.isDedupScheduled.Store(false)
		}
	}
	d := timeutil.AddJitterToDuration(time.Hour)
	t := time.NewTicker(d)
	defer t.Stop()
	for {
		select {
		case <-tb.stopCh:
			return
		case <-t.C:
			f()
		}
	}
}

// GetPartitions appends tb's partitions snapshot to dst and returns the result.
//
// The returned partitions must be passed to PutPartitions
// when they are no longer needed.
func (tb *table) GetPartitions(dst []*partitionWrapper) []*partitionWrapper {
	tb.ptwsLock.Lock()
	for _, ptw := range tb.ptws {
		ptw.incRef()
		dst = append(dst, ptw)
	}
	tb.ptwsLock.Unlock()

	return dst
}

// PutPartitions deregisters ptws obtained via GetPartitions.
func (tb *table) PutPartitions(ptws []*partitionWrapper) {
	for _, ptw := range ptws {
		ptw.decRef()
	}
}

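// mustOpenPartitions opens all the partitions found under smallPartitionsPath
// and bigPartitionsPath.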
func mustOpenPartitions(smallPartitionsPath, bigPartitionsPath string, s *Storage) []*partition {
	// Certain partition directories in either `big` or `small` dir may be missing
	// after restoring from backup. So populate partition names from both dirs.
	ptNames := make(map[string]bool)
	mustPopulatePartitionNames(smallPartitionsPath, ptNames)
	mustPopulatePartitionNames(bigPartitionsPath, ptNames)
	var pts []*partition
	for ptName := range ptNames {
		smallPartsPath := filepath.Join(smallPartitionsPath, ptName)
		bigPartsPath := filepath.Join(bigPartitionsPath, ptName)
		pt := mustOpenPartition(smallPartsPath, bigPartsPath, s)
		pts = append(pts, pt)
	}
	return pts
}

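// mustPopulatePartitionNames adds the names of the partition directories
// found under partitionsPath to ptNames.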
func mustPopulatePartitionNames(partitionsPath string, ptNames map[string]bool) {
	des := fs.MustReadDir(partitionsPath)
	for _, de := range des {
		if !fs.IsDirOrSymlink(de) {
			// Skip non-directories
			continue
		}
		ptName := de.Name()
		if ptName == snapshotsDirname {
			// Skip directory with snapshots
			continue
		}
		ptNames[ptName] = true
	}
}

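// partitionWrappers holds a reusable slice of partition wrappers;
// see getPartitionWrappers and putPartitionWrappers.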
type partitionWrappers struct {
	a []*partitionWrapper
}

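// getPartitionWrappers returns a partitionWrappers instance from ptwsPool.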
func getPartitionWrappers() *partitionWrappers {
	v := ptwsPool.Get()
	if v == nil {
		return &partitionWrappers{}
	}
	return v.(*partitionWrappers)
}

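// putPartitionWrappers returns ptwsX to ptwsPool after resetting its contents.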
func putPartitionWrappers(ptwsX *partitionWrappers) {
	ptwsX.a = ptwsX.a[:0]
	ptwsPool.Put(ptwsX)
}

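// ptwsPool reduces allocations of partitionWrappers on hot paths such as MustAddRows.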
var ptwsPool sync.Pool