diff --git a/app/vminsert/main.go b/app/vminsert/main.go index d412911e38..0ed86956d8 100644 --- a/app/vminsert/main.go +++ b/app/vminsert/main.go @@ -16,7 +16,6 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/auth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" @@ -89,8 +88,6 @@ func main() { netstorage.Stop() logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) - fs.MustStopDirRemover() - logger.Infof("the vminsert has been stopped") } diff --git a/app/vmselect/main.go b/app/vmselect/main.go index a1fc24ba12..47db6099ef 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -76,8 +76,6 @@ func main() { } logger.Infof("successfully stopped netstorage in %s", time.Since(startTime)) - fs.MustStopDirRemover() - logger.Infof("the vmselect has been stopped") } diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index 0dbbf70696..72b2914812 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -10,7 +10,6 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/transport" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" @@ -83,8 +82,6 @@ func main() { strg.MustClose() logger.Infof("successfully closed the storage in %s", time.Since(startTime)) - fs.MustStopDirRemover() - logger.Infof("the vmstorage has been stopped") } diff --git a/lib/fs/dir_remover.go b/lib/fs/dir_remover.go deleted file mode 100644 index 15a9daab8b..0000000000 --- a/lib/fs/dir_remover.go +++ /dev/null @@ -1,111 +0,0 @@ -package fs - -import ( - "os" - "strings" - "sync" - "sync/atomic" - "time" - - "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" - "github.com/VictoriaMetrics/metrics" -) - -func mustRemoveAll(path string) bool { - err := os.RemoveAll(path) - if err == nil { - // Make sure the parent directory doesn't contain references - // to the current directory. - mustSyncParentDirIfExists(path) - return true - } - if !isTemporaryNFSError(err) { - logger.Panicf("FATAL: cannot remove %q: %s", path, err) - } - // NFS prevents from removing directories with open files. - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . - // Schedule for later directory removal. - nfsDirRemoveFailedAttempts.Inc() - select { - case removeDirCh <- path: - default: - logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh)) - } - return false -} - -var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`) - -var removeDirCh = make(chan string, 1024) - -func dirRemover() { - const minSleepTime = 100 * time.Millisecond - const maxSleepTime = time.Second - sleepTime := minSleepTime - for { - var path string - select { - case path = <-removeDirCh: - default: - if atomic.LoadUint64(&stopDirRemover) != 0 { - return - } - time.Sleep(minSleepTime) - continue - } - if mustRemoveAll(path) { - sleepTime = minSleepTime - continue - } - - // Couldn't remove the directory at the path because of NFS lock. - // Sleep for a while and try again. - // Do not limit the amount of time required for deleting the directory, - // since this may break on laggy NFS. - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 . - time.Sleep(sleepTime) - if sleepTime < maxSleepTime { - sleepTime *= 2 - } else { - logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path) - } - } -} - -func isTemporaryNFSError(err error) bool { - // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 for details. - errStr := err.Error() - return strings.Contains(errStr, "directory not empty") || strings.Contains(errStr, "device or resource busy") -} - -var dirRemoverWG sync.WaitGroup -var stopDirRemover uint64 - -func init() { - dirRemoverWG.Add(1) - go func() { - defer dirRemoverWG.Done() - dirRemover() - }() -} - -// MustStopDirRemover must be called in the end of graceful shutdown -// in order to wait for removing the remaining directories from removeDirCh. -// -// It is expected that nobody calls MustRemoveAll when MustStopDirRemover -// is called. -func MustStopDirRemover() { - atomic.StoreUint64(&stopDirRemover, 1) - doneCh := make(chan struct{}) - go func() { - dirRemoverWG.Wait() - close(doneCh) - }() - const maxWaitTime = 5 * time.Second - select { - case <-doneCh: - return - case <-time.After(maxWaitTime): - logger.Panicf("FATAL: cannot stop dirRemover in %s", maxWaitTime) - } -} diff --git a/lib/fs/fs.go b/lib/fs/fs.go index 4b0cd36197..eeb5cb7dbf 100644 --- a/lib/fs/fs.go +++ b/lib/fs/fs.go @@ -6,7 +6,9 @@ import ( "os" "path/filepath" "regexp" + "strings" "sync/atomic" + "time" "github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" @@ -246,7 +248,46 @@ func mustSyncParentDirIfExists(path string) { // // It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . func MustRemoveAll(path string) { - _ = mustRemoveAll(path) + startTime := time.Now() + sleepTime := 100 * time.Millisecond +again: + err := os.RemoveAll(path) + if err == nil { + // Make sure the parent directory doesn't contain references + // to the current directory. + mustSyncParentDirIfExists(path) + return + } + if !isTemporaryNFSError(err) { + logger.Panicf("FATAL: cannot remove %q: %s", path, err) + } + // NFS prevents from removing directories with open files. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . + // Continuously try removing the directory for up to a minute before giving up. + // + // Do not postpone directory removal, since it breaks in the following case: + // - Remove the directory (the removal is postponed) + // - Scan for exsiting directories and open them. The scan finds + // the `removed` directory, but its contents may be already broken. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 . + nfsDirRemoveFailedAttempts.Inc() + if time.Since(startTime) > time.Minute { + logger.Panicf("FATAL: couldn't remove NFS directory %q in %s", path, time.Minute) + } + time.Sleep(sleepTime) + sleepTime *= 2 + if sleepTime > time.Second { + sleepTime = time.Second + } + goto again +} + +var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`) + +func isTemporaryNFSError(err error) bool { + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 for details. + errStr := err.Error() + return strings.Contains(errStr, "directory not empty") || strings.Contains(errStr, "device or resource busy") } // HardLinkFiles makes hard links for all the files from srcDir in dstDir.