lib/{mergeset,storage}: make background merge more responsive and scalable

- Maintain a separate worker pool per each part type (in-memory, file, big and small). Previously a shared pool was used for merging all the part types. A single merge worker could merge parts with mixed types at once. For example, it could merge simultaneously an in-memory part plus a big file part. Such a merge could take hours for big file part. During the duration of this merge the in-memory part was pinned in memory and couldn't be persisted to disk under the configured -inmemoryDataFlushInterval . Another common issue, which could happen when parts with mixed types are merged, is uncontrolled growth of in-memory parts or small parts when all the merge workers were busy with merging big files. Such growth could lead to significant performance degradataion for queries, since every query needs to check ever growing list of parts. This could also slow down the registration of new time series, since VictoriaMetrics searches for the internal series_id in the indexdb for every new time series. The third issue is graceful shutdown duration, which could be very long when a background merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted, since it merges in-memory parts. A separate pool of merge workers per every part type elegantly resolves both issues: - In-memory parts are merged to file-based parts in a timely manner, since the maximum size of in-memory parts is limited. - Long-running merges for big parts do not block merges for in-memory parts and small parts. - Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files. Merging for file parts is instantly canceled on graceful shutdown now. - Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm should automatically self-tune according to the number of available CPU cores. - Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly. It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge - Tune the number of shards for pending rows and items before the data goes to in-memory parts and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate for registration of new time series. This should reduce the duration of data ingestion slowdown in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily unavailable. - Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown. This is a follow-up for fa566c68a6 . Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2025-03-11 15:34:56 +00:00 · 2024-01-26 21:39:49 +01:00 · 2024-01-26 21:39:49 +01:00 · bb7a419cc3
commit bb7a419cc3
parent 97937d58c4
6 changed files with 925 additions and 732 deletions
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@ -38,13 +38,10 @@ var (
 	// DataPath is a path to storage data.
 	DataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to storage data")

-	finalMergeDelay = flag.Duration("finalMergeDelay", 0, "The delay before starting final merge for per-month partition after no new data is ingested into it. "+
-		"Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. "+
-		"Zero value disables final merge")
-	_ = flag.Int("bigMergeConcurrency", 0, "Deprecated: this flag does nothing. Please use -smallMergeConcurrency "+
-		"for controlling the concurrency of background merges. See https://docs.victoriametrics.com/#storage")
-	smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of workers for background merges. See https://docs.victoriametrics.com/#storage . "+
-		"It isn't recommended tuning this flag in general case, since this may lead to uncontrolled increase in the number of parts and increased CPU usage during queries")
+	_ = flag.Duration("finalMergeDelay", 0, "Deprecated: this flag does nothing")
+	_ = flag.Int("bigMergeConcurrency", 0, "Deprecated: this flag does nothing")
+	_ = flag.Int("smallMergeConcurrency", 0, "Deprecated: this flag does nothing")
+
 	retentionTimezoneOffset = flag.Duration("retentionTimezoneOffset", 0, "The offset for performing indexdb rotation. "+
 		"If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. "+
 		"If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)")
@ -96,8 +93,6 @@ func Init(resetCacheIfNeeded func(mrs []storage.MetricRow)) {

 	resetResponseCacheIfNeeded = resetCacheIfNeeded
 	storage.SetLogNewSeries(*logNewSeries)
-	storage.SetFinalMergeDelay(*finalMergeDelay)
-	storage.SetMergeWorkersCount(*smallMergeConcurrency)
 	storage.SetRetentionTimezoneOffset(*retentionTimezoneOffset)
 	storage.SetFreeDiskSpaceLimit(minFreeDiskSpaceBytes.N)
 	storage.SetTSIDCacheSize(cacheSizeStorageTSID.IntN())
@ -496,11 +491,8 @@ func writeStorageMetrics(w io.Writer, strg *storage.Storage) {
 	metrics.WriteCounterUint64(w, `vm_composite_filter_success_conversions_total`, idbm.CompositeFilterSuccessConversions)
 	metrics.WriteCounterUint64(w, `vm_composite_filter_missing_conversions_total`, idbm.CompositeFilterMissingConversions)

-	metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="storage/inmemory"}`, tm.InmemoryAssistedMergesCount)
-	metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="storage/small"}`, tm.SmallAssistedMergesCount)
-
-	metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/inmemory"}`, idbm.InmemoryAssistedMergesCount)
-	metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/file"}`, idbm.FileAssistedMergesCount)
+	// vm_assisted_merges_total name is used for backwards compatibility.
+	metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/inmemory"}`, idbm.InmemoryPartsLimitReachedCount)

 	metrics.WriteCounterUint64(w, `vm_indexdb_items_added_total`, idbm.ItemsAdded)
 	metrics.WriteCounterUint64(w, `vm_indexdb_items_added_size_bytes_total`, idbm.ItemsAddedSizeBytes)
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -31,6 +31,7 @@ The sandbox cluster installation is running under the constant load generated by
 * SECURITY: upgrade Go builder from Go1.21.5 to Go1.21.6. See [the list of issues addressed in Go1.21.6](https://github.com/golang/go/issues?q=milestone%3AGo1.21.6+label%3ACherryPickApproved).

 * FEATURE: improve new [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) registration speed on systems with high number of CPU cores. Thanks to @misutoth for the initial idea and [implementation](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212).
+* FEATURE: make [background merge](https://docs.victoriametrics.com/#storage) more responsive and scalable. This should help the following issues: [5190](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190), [3425](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425), [648](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for discovering [Hetzner Cloud](https://www.hetzner.com/cloud) and [Hetzner Robot](https://docs.hetzner.com/robot) scrape targets. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3154) and [these docs](https://docs.victoriametrics.com/sd_configs.html#hetzner_sd_configs).
 * FEATURE: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): add support for negative index in `groupByNode` and `aliasByNode` functions. Thanks to @rbizos for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5581).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for [DataDog v2 data ingestion protocol](https://docs.datadoghq.com/api/latest/metrics/#submit-metrics). See [these docs](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4451).
--- a/lib/mergeset/table.go
+++ b/lib/mergeset/table.go
--- a/lib/storage/partition.go
+++ b/lib/storage/partition.go
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@ -663,7 +663,8 @@ func (s *Storage) startFreeDiskSpaceWatcher() {
 		// Use atomic.LoadUint32 in front of atomic.CompareAndSwapUint32 in order to avoid slow inter-CPU synchronization
 		// when the storage isn't in read-only mode.
 		if atomic.LoadUint32(&s.isReadOnly) == 1 && atomic.CompareAndSwapUint32(&s.isReadOnly, 1, 0) {
-			logger.Warnf("enabling writing to the storage at %s, since it has more than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left",
+			s.notifyReadWriteMode()
+			logger.Warnf("switching the storage at %s to read-write mode, since it has more than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left",
 				s.path, freeDiskSpaceLimitBytes, freeSpaceBytes)
 		}
 	}
@ -685,6 +686,16 @@ func (s *Storage) startFreeDiskSpaceWatcher() {
 	}()
 }

+func (s *Storage) notifyReadWriteMode() {
+	s.tb.NotifyReadWriteMode()
+
+	idb := s.idb()
+	idb.tb.NotifyReadWriteMode()
+	idb.doExtDB(func(extDB *indexDB) {
+		extDB.tb.NotifyReadWriteMode()
+	})
+}
+
 func (s *Storage) startRetentionWatcher() {
 	s.retentionWatcherWG.Add(1)
 	go func() {
--- a/lib/storage/table.go
+++ b/lib/storage/table.go
@ -205,6 +205,14 @@ func (tb *table) flushPendingRows() {
 	}
 }

+func (tb *table) NotifyReadWriteMode() {
+	tb.ptwsLock.Lock()
+	for _, ptw := range tb.ptws {
+		ptw.pt.NotifyReadWriteMode()
+	}
+	tb.ptwsLock.Unlock()
+}
+
 // TableMetrics contains essential metrics for the table.
 type TableMetrics struct {
 	partitionMetrics