mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
lib/fs: try harder with directory removal on NFS in the event of a temporary lock
Do not give up after 11 failed attempts to remove a directory on laggy NFS. Add the `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed directory removal attempts. Log failed directory removal attempts after long sleep times. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
This commit is contained in:
parent
7cde25bac4
commit
82bfe818d0
1 changed files with 27 additions and 21 deletions
48
lib/fs/fs.go
48
lib/fs/fs.go
|
@ -248,12 +248,16 @@ func mustSyncParentDirIfExists(path string) {
|
||||||
//
|
//
|
||||||
// It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
// It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||||
func MustRemoveAll(path string) {
|
func MustRemoveAll(path string) {
|
||||||
|
_ = mustRemoveAll(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func mustRemoveAll(path string) bool {
|
||||||
err := os.RemoveAll(path)
|
err := os.RemoveAll(path)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Make sure the parent directory doesn't contain references
|
// Make sure the parent directory doesn't contain references
|
||||||
// to the current directory.
|
// to the current directory.
|
||||||
mustSyncParentDirIfExists(path)
|
mustSyncParentDirIfExists(path)
|
||||||
return
|
return true
|
||||||
}
|
}
|
||||||
if !isTemporaryNFSError(err) {
|
if !isTemporaryNFSError(err) {
|
||||||
logger.Panicf("FATAL: cannot remove %q: %s", path, err)
|
logger.Panicf("FATAL: cannot remove %q: %s", path, err)
|
||||||
|
@ -261,38 +265,40 @@ func MustRemoveAll(path string) {
|
||||||
// NFS prevents from removing directories with open files.
|
// NFS prevents from removing directories with open files.
|
||||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||||
// Schedule for later directory removal.
|
// Schedule for later directory removal.
|
||||||
|
nfsDirRemoveFailedAttempts.Inc()
|
||||||
select {
|
select {
|
||||||
case removeDirCh <- path:
|
case removeDirCh <- path:
|
||||||
default:
|
default:
|
||||||
logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh))
|
logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh))
|
||||||
}
|
}
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`)
|
||||||
|
|
||||||
var removeDirCh = make(chan string, 1024)
|
var removeDirCh = make(chan string, 1024)
|
||||||
|
|
||||||
func dirRemover() {
|
func dirRemover() {
|
||||||
|
const minSleepTime = 100 * time.Millisecond
|
||||||
|
const maxSleepTime = time.Second
|
||||||
|
sleepTime := minSleepTime
|
||||||
for path := range removeDirCh {
|
for path := range removeDirCh {
|
||||||
attempts := 0
|
if mustRemoveAll(path) {
|
||||||
for {
|
sleepTime = minSleepTime
|
||||||
err := os.RemoveAll(path)
|
continue
|
||||||
if err == nil {
|
}
|
||||||
break
|
|
||||||
}
|
// Couldn't remove the directory at the path because of NFS lock.
|
||||||
if !isTemporaryNFSError(err) {
|
// Sleep for a while and try again.
|
||||||
logger.Panicf("FATAL: cannot remove %q: %s", path, err)
|
// Do not limit the amount of time required for deleting the directory,
|
||||||
}
|
// since this may break on laggy NFS.
|
||||||
// NFS prevents from removing directories with open files.
|
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 .
|
||||||
// Sleep for a while and try again in the hope open files will be closed.
|
time.Sleep(sleepTime)
|
||||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
if sleepTime < maxSleepTime {
|
||||||
attempts++
|
sleepTime *= 2
|
||||||
if attempts > 10 {
|
} else {
|
||||||
logger.Panicf("FATAL: cannot remove %q in %d attempts: %s", path, attempts, err)
|
logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path)
|
||||||
}
|
|
||||||
time.Sleep(100 * time.Millisecond)
|
|
||||||
}
|
}
|
||||||
// Make sure the parent directory doesn't contain references
|
|
||||||
// to the current directory.
|
|
||||||
mustSyncParentDirIfExists(path)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue