From 82bfe818d0685c09a8247ecd683a8f7232b70619 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 4 Sep 2019 12:22:18 +0300 Subject: [PATCH] lib/fs: try harder with directory removal on NFS in the event of temporary lock Do not give up after 11 attempts of directory removal on laggy NFS. Add `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed attempts on directory removal. Log failed attempts on directory removal after long sleep times. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 --- lib/fs/fs.go | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/lib/fs/fs.go b/lib/fs/fs.go index 060c09c45..2da41e7c7 100644 --- a/lib/fs/fs.go +++ b/lib/fs/fs.go @@ -248,12 +248,16 @@ func mustSyncParentDirIfExists(path string) { // // It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . func MustRemoveAll(path string) { + _ = mustRemoveAll(path) +} + +func mustRemoveAll(path string) bool { err := os.RemoveAll(path) if err == nil { // Make sure the parent directory doesn't contain references // to the current directory. mustSyncParentDirIfExists(path) - return + return true } if !isTemporaryNFSError(err) { logger.Panicf("FATAL: cannot remove %q: %s", path, err) @@ -261,38 +265,40 @@ func MustRemoveAll(path string) { // NFS prevents from removing directories with open files. // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . // Schedule for later directory removal. + nfsDirRemoveFailedAttempts.Inc() select { case removeDirCh <- path: default: logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh)) } + return false } +var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`) + var removeDirCh = make(chan string, 1024) func dirRemover() { + const minSleepTime = 100 * time.Millisecond + const maxSleepTime = time.Second + sleepTime := minSleepTime for path := range removeDirCh { - attempts := 0 - for { - err := os.RemoveAll(path) - if err == nil { - break - } - if !isTemporaryNFSError(err) { - logger.Panicf("FATAL: cannot remove %q: %s", path, err) - } - // NFS prevents from removing directories with open files. - // Sleep for a while and try again in the hope open files will be closed. - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . - attempts++ - if attempts > 10 { - logger.Panicf("FATAL: cannot remove %q in %d attempts: %s", path, attempts, err) - } - time.Sleep(100 * time.Millisecond) + if mustRemoveAll(path) { + sleepTime = minSleepTime + continue + } + + // Couldn't remove the directory at the path because of NFS lock. + // Sleep for a while and try again. + // Do not limit the amount of time required for deleting the directory, + // since this may break on laggy NFS. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 . + time.Sleep(sleepTime) + if sleepTime < maxSleepTime { + sleepTime *= 2 + } else { + logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path) } - // Make sure the parent directory doesn't contain references - // to the current directory. - mustSyncParentDirIfExists(path) } }