lib/{mergeset,storage}: make background merge more responsive and scalable

- Maintain a separate worker pool per each part type (in-memory, file, big and small).
  Previously a shared pool was used for merging all the part types.
  A single merge worker could merge parts with mixed types at once. For example,
  it could merge simultaneously an in-memory part plus a big file part.
  Such a merge could take hours for big file part. During the duration of this merge
  the in-memory part was pinned in memory and couldn't be persisted to disk
  under the configured -inmemoryDataFlushInterval .

  Another common issue, which could happen when parts with mixed types are merged,
  is uncontrolled growth of in-memory parts or small parts when all the merge workers
  were busy with merging big files. Such growth could lead to significant performance
  degradataion for queries, since every query needs to check ever growing list of parts.
  This could also slow down the registration of new time series, since VictoriaMetrics
  searches for the internal series_id in the indexdb for every new time series.

  The third issue is graceful shutdown duration, which could be very long when a background
  merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
  since it merges in-memory parts.

  A separate pool of merge workers per every part type elegantly resolves both issues:
  - In-memory parts are merged to file-based parts in a timely manner, since the maximum
    size of in-memory parts is limited.
  - Long-running merges for big parts do not block merges for in-memory parts and small parts.
  - Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
    Merging for file parts is instantly canceled on graceful shutdown now.

- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
  should automatically self-tune according to the number of available CPU cores.

- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
  It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge

- Tune the number of shards for pending rows and items before the data goes to in-memory parts
  and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
  for registration of new time series. This should reduce the duration of data ingestion slowdown
  in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
  unavailable.

- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.

This is a follow-up for fa566c68a6 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
This commit is contained in:
Aliaksandr Valialkin 2024-01-26 21:39:49 +01:00
parent 97937d58c4
commit bb7a419cc3
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
6 changed files with 925 additions and 732 deletions

View file

@ -38,13 +38,10 @@ var (
// DataPath is a path to storage data. // DataPath is a path to storage data.
DataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to storage data") DataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to storage data")
finalMergeDelay = flag.Duration("finalMergeDelay", 0, "The delay before starting final merge for per-month partition after no new data is ingested into it. "+ _ = flag.Duration("finalMergeDelay", 0, "Deprecated: this flag does nothing")
"Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. "+ _ = flag.Int("bigMergeConcurrency", 0, "Deprecated: this flag does nothing")
"Zero value disables final merge") _ = flag.Int("smallMergeConcurrency", 0, "Deprecated: this flag does nothing")
_ = flag.Int("bigMergeConcurrency", 0, "Deprecated: this flag does nothing. Please use -smallMergeConcurrency "+
"for controlling the concurrency of background merges. See https://docs.victoriametrics.com/#storage")
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of workers for background merges. See https://docs.victoriametrics.com/#storage . "+
"It isn't recommended tuning this flag in general case, since this may lead to uncontrolled increase in the number of parts and increased CPU usage during queries")
retentionTimezoneOffset = flag.Duration("retentionTimezoneOffset", 0, "The offset for performing indexdb rotation. "+ retentionTimezoneOffset = flag.Duration("retentionTimezoneOffset", 0, "The offset for performing indexdb rotation. "+
"If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. "+ "If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. "+
"If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)") "If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)")
@ -96,8 +93,6 @@ func Init(resetCacheIfNeeded func(mrs []storage.MetricRow)) {
resetResponseCacheIfNeeded = resetCacheIfNeeded resetResponseCacheIfNeeded = resetCacheIfNeeded
storage.SetLogNewSeries(*logNewSeries) storage.SetLogNewSeries(*logNewSeries)
storage.SetFinalMergeDelay(*finalMergeDelay)
storage.SetMergeWorkersCount(*smallMergeConcurrency)
storage.SetRetentionTimezoneOffset(*retentionTimezoneOffset) storage.SetRetentionTimezoneOffset(*retentionTimezoneOffset)
storage.SetFreeDiskSpaceLimit(minFreeDiskSpaceBytes.N) storage.SetFreeDiskSpaceLimit(minFreeDiskSpaceBytes.N)
storage.SetTSIDCacheSize(cacheSizeStorageTSID.IntN()) storage.SetTSIDCacheSize(cacheSizeStorageTSID.IntN())
@ -496,11 +491,8 @@ func writeStorageMetrics(w io.Writer, strg *storage.Storage) {
metrics.WriteCounterUint64(w, `vm_composite_filter_success_conversions_total`, idbm.CompositeFilterSuccessConversions) metrics.WriteCounterUint64(w, `vm_composite_filter_success_conversions_total`, idbm.CompositeFilterSuccessConversions)
metrics.WriteCounterUint64(w, `vm_composite_filter_missing_conversions_total`, idbm.CompositeFilterMissingConversions) metrics.WriteCounterUint64(w, `vm_composite_filter_missing_conversions_total`, idbm.CompositeFilterMissingConversions)
metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="storage/inmemory"}`, tm.InmemoryAssistedMergesCount) // vm_assisted_merges_total name is used for backwards compatibility.
metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="storage/small"}`, tm.SmallAssistedMergesCount) metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/inmemory"}`, idbm.InmemoryPartsLimitReachedCount)
metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/inmemory"}`, idbm.InmemoryAssistedMergesCount)
metrics.WriteCounterUint64(w, `vm_assisted_merges_total{type="indexdb/file"}`, idbm.FileAssistedMergesCount)
metrics.WriteCounterUint64(w, `vm_indexdb_items_added_total`, idbm.ItemsAdded) metrics.WriteCounterUint64(w, `vm_indexdb_items_added_total`, idbm.ItemsAdded)
metrics.WriteCounterUint64(w, `vm_indexdb_items_added_size_bytes_total`, idbm.ItemsAddedSizeBytes) metrics.WriteCounterUint64(w, `vm_indexdb_items_added_size_bytes_total`, idbm.ItemsAddedSizeBytes)

View file

@ -31,6 +31,7 @@ The sandbox cluster installation is running under the constant load generated by
* SECURITY: upgrade Go builder from Go1.21.5 to Go1.21.6. See [the list of issues addressed in Go1.21.6](https://github.com/golang/go/issues?q=milestone%3AGo1.21.6+label%3ACherryPickApproved). * SECURITY: upgrade Go builder from Go1.21.5 to Go1.21.6. See [the list of issues addressed in Go1.21.6](https://github.com/golang/go/issues?q=milestone%3AGo1.21.6+label%3ACherryPickApproved).
* FEATURE: improve new [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) registration speed on systems with high number of CPU cores. Thanks to @misutoth for the initial idea and [implementation](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212). * FEATURE: improve new [time series](https://docs.victoriametrics.com/keyConcepts.html#time-series) registration speed on systems with high number of CPU cores. Thanks to @misutoth for the initial idea and [implementation](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212).
* FEATURE: make [background merge](https://docs.victoriametrics.com/#storage) more responsive and scalable. This should help the following issues: [5190](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190), [3425](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425), [648](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for discovering [Hetzner Cloud](https://www.hetzner.com/cloud) and [Hetzner Robot](https://docs.hetzner.com/robot) scrape targets. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3154) and [these docs](https://docs.victoriametrics.com/sd_configs.html#hetzner_sd_configs). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for discovering [Hetzner Cloud](https://www.hetzner.com/cloud) and [Hetzner Robot](https://docs.hetzner.com/robot) scrape targets. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3154) and [these docs](https://docs.victoriametrics.com/sd_configs.html#hetzner_sd_configs).
* FEATURE: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): add support for negative index in `groupByNode` and `aliasByNode` functions. Thanks to @rbizos for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5581). * FEATURE: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): add support for negative index in `groupByNode` and `aliasByNode` functions. Thanks to @rbizos for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5581).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for [DataDog v2 data ingestion protocol](https://docs.datadoghq.com/api/latest/metrics/#submit-metrics). See [these docs](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4451). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for [DataDog v2 data ingestion protocol](https://docs.datadoghq.com/api/latest/metrics/#submit-metrics). See [these docs](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4451).

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -663,7 +663,8 @@ func (s *Storage) startFreeDiskSpaceWatcher() {
// Use atomic.LoadUint32 in front of atomic.CompareAndSwapUint32 in order to avoid slow inter-CPU synchronization // Use atomic.LoadUint32 in front of atomic.CompareAndSwapUint32 in order to avoid slow inter-CPU synchronization
// when the storage isn't in read-only mode. // when the storage isn't in read-only mode.
if atomic.LoadUint32(&s.isReadOnly) == 1 && atomic.CompareAndSwapUint32(&s.isReadOnly, 1, 0) { if atomic.LoadUint32(&s.isReadOnly) == 1 && atomic.CompareAndSwapUint32(&s.isReadOnly, 1, 0) {
logger.Warnf("enabling writing to the storage at %s, since it has more than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left", s.notifyReadWriteMode()
logger.Warnf("switching the storage at %s to read-write mode, since it has more than -storage.minFreeDiskSpaceBytes=%d of free space: %d bytes left",
s.path, freeDiskSpaceLimitBytes, freeSpaceBytes) s.path, freeDiskSpaceLimitBytes, freeSpaceBytes)
} }
} }
@ -685,6 +686,16 @@ func (s *Storage) startFreeDiskSpaceWatcher() {
}() }()
} }
func (s *Storage) notifyReadWriteMode() {
s.tb.NotifyReadWriteMode()
idb := s.idb()
idb.tb.NotifyReadWriteMode()
idb.doExtDB(func(extDB *indexDB) {
extDB.tb.NotifyReadWriteMode()
})
}
func (s *Storage) startRetentionWatcher() { func (s *Storage) startRetentionWatcher() {
s.retentionWatcherWG.Add(1) s.retentionWatcherWG.Add(1)
go func() { go func() {

View file

@ -205,6 +205,14 @@ func (tb *table) flushPendingRows() {
} }
} }
func (tb *table) NotifyReadWriteMode() {
tb.ptwsLock.Lock()
for _, ptw := range tb.ptws {
ptw.pt.NotifyReadWriteMode()
}
tb.ptwsLock.Unlock()
}
// TableMetrics contains essential metrics for the table. // TableMetrics contains essential metrics for the table.
type TableMetrics struct { type TableMetrics struct {
partitionMetrics partitionMetrics