From ab8008d6d7f66d714c5b13bf4f3aab1314a24526 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@gmail.com>
Date: Thu, 22 Apr 2021 12:58:53 +0300
Subject: [PATCH] lib/{storage,mergeset}: remove empty directories on startup.
 Such directories can be left after unclean shutdown on NFS storage

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1142
---
 docs/CHANGELOG.md        |  3 +++
 lib/fs/dir_remover.go    |  6 +++---
 lib/fs/fs.go             | 18 ++++++++++++++++++
 lib/mergeset/table.go    |  6 ++++++
 lib/storage/partition.go |  6 ++++++
 5 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index fb7ec8e295..ee953dd3e5 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -10,6 +10,9 @@ sort: 14
 * FEATURE: add OpenTSDB migration option to vmctl. See more details [here](https://docs.victoriametrics.com/vmctl#migrating-data-from-opentsdb).
 Thanks to @johnseekins!
 
+* BUGFIX: vmstorage: remove empty directories on startup. Such directories can be left after unclean shutdown on NFS storage. Previously such directories could lead to crashloop until manually removed. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1142).
+
+
 ## [v1.58.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.58.0)
 
 * FEATURE: vminsert and vmagent: add `-sortLabels` command-line flag for sorting metric labels before pushing them to `vmstorage`. This should reduce the size of `MetricName -> internal_series_id` cache (aka `vm_cache_size_bytes{type="storage/tsid"}`) when ingesting samples for the same time series with distinct order of labels. For example, `foo{k1="v1",k2="v2"}` and `foo{k2="v2",k1="v1"}` represent a single time series. Labels sorting is disabled by default, since the majority of established exporters preserve the order of labels for the exported metrics.
diff --git a/lib/fs/dir_remover.go b/lib/fs/dir_remover.go
index 7c343c2511..a2982d40f6 100644
--- a/lib/fs/dir_remover.go
+++ b/lib/fs/dir_remover.go
@@ -81,7 +81,7 @@ func dirRemover() {
 		time.Sleep(sleepTime)
 		if sleepTime < maxSleepTime {
 			sleepTime *= 2
-		} else if sleepTime > time.Second {
+		} else if sleepTime > 5*time.Second {
 			logger.Warnf("failed to remove directory %q due to NFS lock; retrying later in %.3f seconds", w.path, sleepTime.Seconds())
 		}
 	}
@@ -121,11 +121,11 @@ func MustStopDirRemover() {
 		dirRemoverWG.Wait()
 		close(doneCh)
 	}()
-	const maxWaitTime = 5 * time.Second
+	const maxWaitTime = 10 * time.Second
 	select {
 	case <-doneCh:
 		return
 	case <-time.After(maxWaitTime):
-		logger.Panicf("FATAL: cannot stop dirRemover in %s", maxWaitTime)
+		logger.Errorf("cannot stop dirRemover in %s; the remaining empty NFS directories should be automatically removed on the next startup", maxWaitTime)
 	}
 }
diff --git a/lib/fs/fs.go b/lib/fs/fs.go
index f417f82579..f2227f2db2 100644
--- a/lib/fs/fs.go
+++ b/lib/fs/fs.go
@@ -174,6 +174,24 @@ func mustSyncParentDirIfExists(path string) {
 	MustSyncPath(parentDirPath)
 }
 
+// IsEmptyDir returns true if path points to empty directory.
+func IsEmptyDir(path string) bool {
+	// See https://stackoverflow.com/a/30708914/274937
+	f, err := os.Open(path)
+	if err != nil {
+		logger.Panicf("FATAL: unexpected error when opening directory %q: %s", path, err)
+	}
+	_, err = f.Readdirnames(1)
+	MustClose(f)
+	if err != nil {
+		if err == io.EOF {
+			return true
+		}
+		logger.Panicf("FATAL: unexpected error when reading directory %q: %s", path, err)
+	}
+	return false
+}
+
 // MustRemoveAll removes path with all the contents.
 //
 // It properly fsyncs the parent directory after path removal.
diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go
index ac57e975da..a5ce4d4ff6 100644
--- a/lib/mergeset/table.go
+++ b/lib/mergeset/table.go
@@ -1014,6 +1014,12 @@ func openParts(path string) ([]*partWrapper, error) {
 			continue
 		}
 		partPath := path + "/" + fn
+		if fs.IsEmptyDir(partPath) {
+			// Remove empty directory, which can be left after unclean shutdown on NFS.
+			// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1142
+			fs.MustRemoveAll(partPath)
+			continue
+		}
 		p, err := openFilePart(partPath)
 		if err != nil {
 			mustCloseParts(pws)
diff --git a/lib/storage/partition.go b/lib/storage/partition.go
index 3d9518c552..0196b916b9 100644
--- a/lib/storage/partition.go
+++ b/lib/storage/partition.go
@@ -1546,6 +1546,12 @@ func openParts(pathPrefix1, pathPrefix2, path string) ([]*partWrapper, error) {
 			continue
 		}
 		partPath := path + "/" + fn
+		if fs.IsEmptyDir(partPath) {
+			// Remove empty directory, which can be left after unclean shutdown on NFS.
+			// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1142
+			fs.MustRemoveAll(partPath)
+			continue
+		}
 		startTime := time.Now()
 		p, err := openFilePart(partPath)
 		if err != nil {