From 87b39222befd24c083c824f1cb91da39f5d35dd0 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 12 Nov 2019 16:29:43 +0200 Subject: [PATCH] Revert "lib/fs: do not postpone directory removal on NFS error" This reverts commit 21aeb02b46649ac9906cb37733f7b155a77a0db9. --- app/vminsert/main.go | 3 ++ app/vmselect/main.go | 2 + app/vmstorage/main.go | 3 ++ lib/fs/dir_remover.go | 111 ++++++++++++++++++++++++++++++++++++++++++ lib/fs/fs.go | 43 +--------------- 5 files changed, 120 insertions(+), 42 deletions(-) create mode 100644 lib/fs/dir_remover.go diff --git a/app/vminsert/main.go b/app/vminsert/main.go index 0ed86956d..d412911e3 100644 --- a/app/vminsert/main.go +++ b/app/vminsert/main.go @@ -16,6 +16,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" @@ -88,6 +89,8 @@ func main() { netstorage.Stop() logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) + fs.MustStopDirRemover() + logger.Infof("the vminsert has been stopped") } diff --git a/app/vmselect/main.go b/app/vmselect/main.go index 47db6099e..a1fc24ba1 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -76,6 +76,8 @@ func main() { } logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) + fs.MustStopDirRemover() + logger.Infof("the vmselect has been stopped") } diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index 4951df9ac..b0ec15974 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -10,6 +10,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/transport" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" @@ -82,6 +83,8 @@ func main() { strg.MustClose() logger.Infof("successfully closed the storage in %s", time.Since(startTime)) + fs.MustStopDirRemover() + logger.Infof("the vmstorage has been stopped") } diff --git a/lib/fs/dir_remover.go b/lib/fs/dir_remover.go new file mode 100644 index 000000000..15a9daab8 --- /dev/null +++ b/lib/fs/dir_remover.go @@ -0,0 +1,111 @@ +package fs + +import ( + "os" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/metrics" +) + +func mustRemoveAll(path string) bool { + err := os.RemoveAll(path) + if err == nil { + // Make sure the parent directory doesn't contain references + // to the current directory. + mustSyncParentDirIfExists(path) + return true + } + if !isTemporaryNFSError(err) { + logger.Panicf("FATAL: cannot remove %q: %s", path, err) + } + // NFS prevents from removing directories with open files. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . + // Schedule for later directory removal. + nfsDirRemoveFailedAttempts.Inc() + select { + case removeDirCh <- path: + default: + logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh)) + } + return false +} + +var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`) + +var removeDirCh = make(chan string, 1024) + +func dirRemover() { + const minSleepTime = 100 * time.Millisecond + const maxSleepTime = time.Second + sleepTime := minSleepTime + for { + var path string + select { + case path = <-removeDirCh: + default: + if atomic.LoadUint64(&stopDirRemover) != 0 { + return + } + time.Sleep(minSleepTime) + continue + } + if mustRemoveAll(path) { + sleepTime = minSleepTime + continue + } + + // Couldn't remove the directory at the path because of NFS lock. + // Sleep for a while and try again. + // Do not limit the amount of time required for deleting the directory, + // since this may break on laggy NFS. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 . + time.Sleep(sleepTime) + if sleepTime < maxSleepTime { + sleepTime *= 2 + } else { + logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path) + } + } +} + +func isTemporaryNFSError(err error) bool { + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 for details. + errStr := err.Error() + return strings.Contains(errStr, "directory not empty") || strings.Contains(errStr, "device or resource busy") +} + +var dirRemoverWG sync.WaitGroup +var stopDirRemover uint64 + +func init() { + dirRemoverWG.Add(1) + go func() { + defer dirRemoverWG.Done() + dirRemover() + }() +} + +// MustStopDirRemover must be called in the end of graceful shutdown +// in order to wait for removing the remaining directories from removeDirCh. +// +// It is expected that nobody calls MustRemoveAll when MustStopDirRemover +// is called. +func MustStopDirRemover() { + atomic.StoreUint64(&stopDirRemover, 1) + doneCh := make(chan struct{}) + go func() { + dirRemoverWG.Wait() + close(doneCh) + }() + const maxWaitTime = 5 * time.Second + select { + case <-doneCh: + return + case <-time.After(maxWaitTime): + logger.Panicf("FATAL: cannot stop dirRemover in %s", maxWaitTime) + } +} diff --git a/lib/fs/fs.go b/lib/fs/fs.go index 863972ccd..4b0cd3619 100644 --- a/lib/fs/fs.go +++ b/lib/fs/fs.go @@ -6,9 +6,7 @@ import ( "os" "path/filepath" "regexp" - "strings" "sync/atomic" - "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" @@ -248,46 +246,7 @@ func mustSyncParentDirIfExists(path string) { // // It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . func MustRemoveAll(path string) { - startTime := time.Now() - sleepTime := 100 * time.Millisecond -again: - err := os.RemoveAll(path) - if err == nil { - // Make sure the parent directory doesn't contain references - // to the current directory. - mustSyncParentDirIfExists(path) - return - } - if !isTemporaryNFSError(err) { - logger.Panicf("FATAL: cannot remove %q: %s", path, err) - } - // NFS prevents from removing directories with open files. - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . - // Continuously try removing the directory for up to a minute before giving up. - // - // Do not postpone directory removal, since it breaks in the following case: - // - Remove the directory (the removal is postponed) - // - Scan for existing directories and open them. The scan finds - // the `removed` directory, but its contents may be already broken. - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 . - nfsDirRemoveFailedAttempts.Inc() - if time.Since(startTime) > time.Minute { - logger.Panicf("FATAL: couldn't remove NFS directory %q in %s", path, time.Minute) - } - time.Sleep(sleepTime) - sleepTime *= 2 - if sleepTime > time.Second { - sleepTime = time.Second - } - goto again -} - -var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`) - -func isTemporaryNFSError(err error) bool { - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 for details. - errStr := err.Error() - return strings.Contains(errStr, "directory not empty") || strings.Contains(errStr, "device or resource busy") + _ = mustRemoveAll(path) } // HardLinkFiles makes hard links for all the files from srcDir in dstDir.