lib/fs: try harder with directory removal on NFS in the event of temporary lock

Do not give up (panic) after 11 failed attempts to remove a directory on laggy NFS; keep retrying until the removal succeeds.

Add `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed attempts
on directory removal.

Log failed directory removal attempts once the retry sleep time has grown to its upper limit.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
This commit is contained in:
Aliaksandr Valialkin 2019-09-04 12:22:18 +03:00
parent 7cde25bac4
commit 82bfe818d0

View file

@ -248,12 +248,16 @@ func mustSyncParentDirIfExists(path string) {
// //
// It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . // It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
func MustRemoveAll(path string) { func MustRemoveAll(path string) {
_ = mustRemoveAll(path)
}
func mustRemoveAll(path string) bool {
err := os.RemoveAll(path) err := os.RemoveAll(path)
if err == nil { if err == nil {
// Make sure the parent directory doesn't contain references // Make sure the parent directory doesn't contain references
// to the current directory. // to the current directory.
mustSyncParentDirIfExists(path) mustSyncParentDirIfExists(path)
return return true
} }
if !isTemporaryNFSError(err) { if !isTemporaryNFSError(err) {
logger.Panicf("FATAL: cannot remove %q: %s", path, err) logger.Panicf("FATAL: cannot remove %q: %s", path, err)
@ -261,38 +265,40 @@ func MustRemoveAll(path string) {
// NFS prevents from removing directories with open files. // NFS prevents from removing directories with open files.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
// Schedule for later directory removal. // Schedule for later directory removal.
nfsDirRemoveFailedAttempts.Inc()
select { select {
case removeDirCh <- path: case removeDirCh <- path:
default: default:
logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh)) logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh))
} }
return false
} }
var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`)
var removeDirCh = make(chan string, 1024) var removeDirCh = make(chan string, 1024)
func dirRemover() { func dirRemover() {
const minSleepTime = 100 * time.Millisecond
const maxSleepTime = time.Second
sleepTime := minSleepTime
for path := range removeDirCh { for path := range removeDirCh {
attempts := 0 if mustRemoveAll(path) {
for { sleepTime = minSleepTime
err := os.RemoveAll(path) continue
if err == nil { }
break
} // Couldn't remove the directory at the path because of NFS lock.
if !isTemporaryNFSError(err) { // Sleep for a while and try again.
logger.Panicf("FATAL: cannot remove %q: %s", path, err) // Do not limit the amount of time required for deleting the directory,
} // since this may break on laggy NFS.
// NFS prevents from removing directories with open files. // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 .
// Sleep for a while and try again in the hope open files will be closed. time.Sleep(sleepTime)
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 . if sleepTime < maxSleepTime {
attempts++ sleepTime *= 2
if attempts > 10 { } else {
logger.Panicf("FATAL: cannot remove %q in %d attempts: %s", path, attempts, err) logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path)
}
time.Sleep(100 * time.Millisecond)
} }
// Make sure the parent directory doesn't contain references
// to the current directory.
mustSyncParentDirIfExists(path)
} }
} }