From 1ad0d22e80a1347007a76a43430c8894a5de313e Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Mon, 27 Feb 2023 12:57:22 -0800
Subject: [PATCH] lib/storage: follow-up for
 39cdc546ddf794a56a306134ff814e2e3a0ea39a

- Use flag.Duration instead of flagutil.Duration for -snapshotCreateTimeout,
  since the flagutil.Duration is intended mostly for big durations, e.g. days, months and years,
  while the -snapshotCreateTimeout is usually smaller than one hour.
- Add links to https://docs.victoriametrics.com/#how-to-work-with-snapshots in docs/CHANGELOG.md,
  so readers could easily find the corresponding docs when reading the changelog.
- Properly remove all the created directories on unsuccessful attempt to create
  snapshot in Storage.CreateSnapshot().

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
---
 README.md                             |  2 ++
 app/vmstorage/main.go                 | 11 +++----
 docs/CHANGELOG.md                     |  6 ++--
 docs/Cluster-VictoriaMetrics.md       |  5 ++--
 docs/README.md                        |  2 ++
 docs/Single-server-VictoriaMetrics.md |  5 ++--
 lib/mergeset/table.go                 | 11 ++++---
 lib/storage/partition.go              |  3 ++
 lib/storage/storage.go                | 42 +++++++++++++++------------
 lib/storage/table.go                  | 13 +++++----
 10 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index bbbfeeb515..4d58fc6ba4 100644
--- a/README.md
+++ b/README.md
@@ -1357,6 +1357,8 @@ Below is the output for `/path/to/vmstorage -help`:
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
      authKey, which must be passed in query string to /snapshot* pages
+  -snapshotCreateTimeout duration
+     The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
   -snapshotsMaxAge value
      Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
      The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index 64c6fbc4ee..ab7423e252 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -37,7 +37,7 @@ var (
 	forceMergeAuthKey     = flag.String("forceMergeAuthKey", "", "authKey, which must be passed in query string to /internal/force_merge pages")
 	forceFlushAuthKey     = flag.String("forceFlushAuthKey", "", "authKey, which must be passed in query string to /internal/force_flush pages")
 	snapshotsMaxAge       = flagutil.NewDuration("snapshotsMaxAge", "0", "Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted")
-	snapshotCreateTimeout = flagutil.NewDuration("snapshotCreateTimeout", "0", "Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period.")
+	snapshotCreateTimeout = flag.Duration("snapshotCreateTimeout", 0, "The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period")
 
 	finalMergeDelay = flag.Duration("finalMergeDelay", 0, "The delay before starting final merge for per-month partition after no new data is ingested into it. "+
 		"Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. "+
@@ -213,8 +213,8 @@ func requestHandler(w http.ResponseWriter, r *http.Request, strg *storage.Storag
 		snapshotsCreateTotal.Inc()
 		w.Header().Set("Content-Type", "application/json")
 		deadline := uint64(0)
-		if snapshotCreateTimeout.Msecs > 0 {
-			deadline = fasttime.UnixTimestamp() + uint64(snapshotCreateTimeout.Msecs/1e3)
+		if *snapshotCreateTimeout > 0 {
+			deadline = fasttime.UnixTimestamp() + uint64(snapshotCreateTimeout.Seconds())
 		}
 		snapshotPath, err := strg.CreateSnapshot(deadline)
 		if err != nil {
@@ -269,7 +269,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request, strg *storage.Storag
 			}
 		}
 
-		err = fmt.Errorf("cannot find snapshot %q: %w", snapshotName, err)
+		err = fmt.Errorf("cannot find snapshot %q", snapshotName)
 		jsonResponseError(w, err)
 		return true
 	case "/delete_all":
@@ -333,7 +333,8 @@ var (
 )
 
 var (
-	activeForceMerges          = metrics.NewCounter("vm_active_force_merges")
+	activeForceMerges = metrics.NewCounter("vm_active_force_merges")
+
 	snapshotsCreateTotal       = metrics.NewCounter(`vm_http_requests_total{path="/snapshot/create"}`)
 	snapshotsCreateErrorsTotal = metrics.NewCounter(`vm_http_request_errors_total{path="/snapshot/create"}`)
 
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 433cb0fdf3..479f75ed0b 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -15,15 +15,15 @@ The following tip changes can be tested by building VictoriaMetrics components f
 
 ## tip
 
-* FEATURE: add `-snapshotCreateTimeout` flag to allow configuring timeout for snapshot process. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551).
-* FEATURE: expose `vm_http_requests_total` and `vm_http_request_errors_total` metrics for `snapshot/*` paths at [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) `vmstorage` and [VictoriaMetrics Single](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551).
+* FEATURE: add `-snapshotCreateTimeout` flag to allow configuring timeout for [snapshot process](https://docs.victoriametrics.com/#how-to-work-with-snapshots). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551).
+* FEATURE: expose `vm_http_requests_total` and `vm_http_request_errors_total` metrics for `snapshot/*` [paths](https://docs.victoriametrics.com/#how-to-work-with-snapshots) at [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) `vmstorage` and [VictoriaMetrics Single](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551).
 * FEATURE: [vmgateway](https://docs.victoriametrics.com/vmgateway.html): add the ability to discover keys for JWT verification via [OpenID discovery endpoint](https://openid.net/specs/openid-connect-discovery-1_0.html). See [these docs](https://docs.victoriametrics.com/vmgateway.html#using-openid-discovery-endpoint-for-jwt-signature-verification).
 
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): fix panic when executing the query `aggr_func(rollup*(some_value))`. The panic has been introduced in [v1.88.0](https://docs.victoriametrics.com/CHANGELOG.html#v1880).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use the provided `-remoteWrite.*` auth options when determining whether the remote storage supports [VictoriaMetrics remote write protocol](https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol). Previously the auth options were ignored. This was preventing from automatic switch to VictoriaMetrics remote write protocol.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not register `vm_promscrape_config_*` metrics if `-promscrape.config` flag is not used. Previously those metrics were registered and never updated, which was confusing and could trigger false-positive alerts.
 * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): skip measurements with no fields when migrating data from influxdb. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3837).
-* BUGFIX: delete failed snapshot contents from disk when creating snapshot fails. Previously failed snapshot contents could remain on disk in incomplete state. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3858)
+* BUGFIX: delete failed snapshot contents from disk on failed attempt to [create snapshot](https://docs.victoriametrics.com/#how-to-work-with-snapshots). Previously failed snapshot contents could remain on disk in incomplete state. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3858)
 
 ## [v1.88.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.88.0)
 
diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md
index 30c4abcf2a..29ca1819ed 100644
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@@ -1361,9 +1361,8 @@ Below is the output for `/path/to/vmstorage -help`:
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
      authKey, which must be passed in query string to /snapshot* pages
-  -snapshotCreateTimeout value
-     Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period.
-     The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
+  -snapshotCreateTimeout duration
+     The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
   -snapshotsMaxAge value
      Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
      The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
diff --git a/docs/README.md b/docs/README.md
index c86ab500e3..7e17ca5a58 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -2482,6 +2482,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
      authKey, which must be passed in query string to /snapshot* pages
+  -snapshotCreateTimeout duration
+     The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
   -snapshotsMaxAge value
      Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
      The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index 565e22cb41..143f6d151e 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -2485,9 +2485,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
      authKey, which must be passed in query string to /snapshot* pages
-  -snapshotCreateTimeout value
-     Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period.
-     The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
+  -snapshotCreateTimeout duration
+     The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period
   -snapshotsMaxAge value
      Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted
      The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0)
diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go
index 23158caa4b..9fa62bc9d6 100644
--- a/lib/mergeset/table.go
+++ b/lib/mergeset/table.go
@@ -1505,7 +1505,8 @@ func mustCloseParts(pws []*partWrapper) {
 // very quickly.
 //
 // If deadline is reached before snapshot is created error is returned.
-// If any error occurs during snapshot created data is not removed.
+//
+// The caller is responsible for data removal at dstDir on unsuccessful snapshot creation.
 func (tb *Table) CreateSnapshotAt(dstDir string, deadline uint64) error {
 	logger.Infof("creating Table snapshot of %q...", tb.path)
 	startTime := time.Now()
@@ -1547,11 +1548,9 @@ func (tb *Table) CreateSnapshotAt(dstDir string, deadline uint64) error {
 		return fmt.Errorf("cannot read directory: %w", err)
 	}
 
-	for i, fi := range fis {
-		if deadline > 0 && i%5 == 0 {
-			if fasttime.UnixTimestamp() > deadline {
-				return fmt.Errorf("cannot create snapshot for %q in time: timeout exceeded", tb.path)
-			}
+	for _, fi := range fis {
+		if deadline > 0 && fasttime.UnixTimestamp() > deadline {
+			return fmt.Errorf("cannot create snapshot for %q: timeout exceeded", tb.path)
 		}
 
 		fn := fi.Name()
diff --git a/lib/storage/partition.go b/lib/storage/partition.go
index 0b71e8bdb6..793fa6ddf4 100644
--- a/lib/storage/partition.go
+++ b/lib/storage/partition.go
@@ -1990,6 +1990,9 @@ func (pt *partition) CreateSnapshotAt(smallPath, bigPath string) error {
 	return nil
 }
 
+// createSnapshot creates a snapshot from srcDir to dstDir.
+//
+// The caller is responsible for deleting dstDir if createSnapshot() returns error.
 func (pt *partition) createSnapshot(srcDir, dstDir string) error {
 	if err := fs.MkdirAllFailIfExist(dstDir); err != nil {
 		return fmt.Errorf("cannot create snapshot dir %q: %w", dstDir, err)
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 0c0f3db566..9e835f7916 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -328,65 +328,71 @@ func (s *Storage) CreateSnapshot(deadline uint64) (string, error) {
 	s.snapshotLock.Lock()
 	defer s.snapshotLock.Unlock()
 
+	var dirsToRemoveOnError []string
+	defer func() {
+		for _, dir := range dirsToRemoveOnError {
+			fs.MustRemoveAll(dir)
+		}
+	}()
+
 	snapshotName := snapshot.NewName()
 	srcDir := s.path
 	dstDir := fmt.Sprintf("%s/snapshots/%s", srcDir, snapshotName)
 	if err := fs.MkdirAllFailIfExist(dstDir); err != nil {
 		return "", fmt.Errorf("cannot create dir %q: %w", dstDir, err)
 	}
+	dirsToRemoveOnError = append(dirsToRemoveOnError, dstDir)
+
+	smallDir, bigDir, err := s.tb.CreateSnapshot(snapshotName, deadline)
+	if err != nil {
+		return "", fmt.Errorf("cannot create table snapshot: %w", err)
+	}
+	dirsToRemoveOnError = append(dirsToRemoveOnError, smallDir, bigDir)
+
 	dstDataDir := dstDir + "/data"
 	if err := fs.MkdirAllFailIfExist(dstDataDir); err != nil {
 		return "", fmt.Errorf("cannot create dir %q: %w", dstDataDir, err)
 	}
-
-	smallDir, bigDir, err := s.tb.CreateSnapshot(snapshotName, deadline)
-	if err != nil {
-		fs.MustRemoveAll(dstDir)
-		return "", fmt.Errorf("cannot create table snapshot: %w", err)
-	}
 	dstSmallDir := dstDataDir + "/small"
 	if err := fs.SymlinkRelative(smallDir, dstSmallDir); err != nil {
-		fs.MustRemoveAll(dstDir)
 		return "", fmt.Errorf("cannot create symlink from %q to %q: %w", smallDir, dstSmallDir, err)
 	}
 	dstBigDir := dstDataDir + "/big"
 	if err := fs.SymlinkRelative(bigDir, dstBigDir); err != nil {
-		fs.MustRemoveAll(dstDir)
 		return "", fmt.Errorf("cannot create symlink from %q to %q: %w", bigDir, dstBigDir, err)
 	}
 	fs.MustSyncPath(dstDataDir)
 
+	srcMetadataDir := srcDir + "/metadata"
+	dstMetadataDir := dstDir + "/metadata"
+	if err := fs.CopyDirectory(srcMetadataDir, dstMetadataDir); err != nil {
+		return "", fmt.Errorf("cannot copy metadata: %w", err)
+	}
+
 	idbSnapshot := fmt.Sprintf("%s/indexdb/snapshots/%s", srcDir, snapshotName)
 	idb := s.idb()
 	currSnapshot := idbSnapshot + "/" + idb.name
 	if err := idb.tb.CreateSnapshotAt(currSnapshot, deadline); err != nil {
-		fs.MustRemoveAll(dstDir)
 		return "", fmt.Errorf("cannot create curr indexDB snapshot: %w", err)
 	}
+	dirsToRemoveOnError = append(dirsToRemoveOnError, idbSnapshot)
+
 	ok := idb.doExtDB(func(extDB *indexDB) {
 		prevSnapshot := idbSnapshot + "/" + extDB.name
 		err = extDB.tb.CreateSnapshotAt(prevSnapshot, deadline)
 	})
 	if ok && err != nil {
-		fs.MustRemoveAll(dstDir)
 		return "", fmt.Errorf("cannot create prev indexDB snapshot: %w", err)
 	}
 	dstIdbDir := dstDir + "/indexdb"
 	if err := fs.SymlinkRelative(idbSnapshot, dstIdbDir); err != nil {
-		fs.MustRemoveAll(dstDir)
 		return "", fmt.Errorf("cannot create symlink from %q to %q: %w", idbSnapshot, dstIdbDir, err)
 	}
 
-	srcMetadataDir := srcDir + "/metadata"
-	dstMetadataDir := dstDir + "/metadata"
-	if err := fs.CopyDirectory(srcMetadataDir, dstMetadataDir); err != nil {
-		fs.MustRemoveAll(dstDir)
-		return "", fmt.Errorf("cannot copy metadata: %w", err)
-	}
-
 	fs.MustSyncPath(dstDir)
 
 	logger.Infof("created Storage snapshot for %q at %q in %.3f seconds", srcDir, dstDir, time.Since(startTime).Seconds())
+	dirsToRemoveOnError = nil
 	return snapshotName, nil
 }
 
diff --git a/lib/storage/table.go b/lib/storage/table.go
index 508644baef..32bb033733 100644
--- a/lib/storage/table.go
+++ b/lib/storage/table.go
@@ -157,19 +157,22 @@ func (tb *table) CreateSnapshot(snapshotName string, deadline uint64) (string, s
 	}
 	dstBigDir := fmt.Sprintf("%s/big/snapshots/%s", tb.path, snapshotName)
 	if err := fs.MkdirAllFailIfExist(dstBigDir); err != nil {
+		fs.MustRemoveAll(dstSmallDir)
 		return "", "", fmt.Errorf("cannot create dir %q: %w", dstBigDir, err)
 	}
 
-	for i, ptw := range ptws {
-		if deadline > 0 && i%5 == 0 {
-			if fasttime.UnixTimestamp() > deadline {
-				return "", "", fmt.Errorf("cannot create snapshot for %q in %q in time: timeout exceeded", tb.path, snapshotName)
-			}
+	for _, ptw := range ptws {
+		if deadline > 0 && fasttime.UnixTimestamp() > deadline {
+			fs.MustRemoveAll(dstSmallDir)
+			fs.MustRemoveAll(dstBigDir)
+			return "", "", fmt.Errorf("cannot create snapshot for %q: timeout exceeded", tb.path)
 		}
 
 		smallPath := dstSmallDir + "/" + ptw.pt.name
 		bigPath := dstBigDir + "/" + ptw.pt.name
 		if err := ptw.pt.CreateSnapshotAt(smallPath, bigPath); err != nil {
+			fs.MustRemoveAll(dstSmallDir)
+			fs.MustRemoveAll(dstBigDir)
 			return "", "", fmt.Errorf("cannot create snapshot for partition %q in %q: %w", ptw.pt.name, tb.path, err)
 		}
 	}