mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
lib/{mergeset,storage}: remove transaction files only after the mentioned dirs are really removed
This should fix the issue on NFS when incompletely removed dirs may be left after unclean shutdown (OOM, kill -9, hard reset, etc.), while the corresponding transaction files are already removed. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
This commit is contained in:
parent
4e22b521c2
commit
b9616c017f
4 changed files with 51 additions and 19 deletions
|
@ -11,12 +11,13 @@ import (
|
|||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
func mustRemoveAll(path string) bool {
|
||||
func mustRemoveAll(path string, done func()) bool {
|
||||
err := os.RemoveAll(path)
|
||||
if err == nil {
|
||||
// Make sure the parent directory doesn't contain references
|
||||
// to the current directory.
|
||||
mustSyncParentDirIfExists(path)
|
||||
done()
|
||||
return true
|
||||
}
|
||||
if !isTemporaryNFSError(err) {
|
||||
|
@ -26,8 +27,12 @@ func mustRemoveAll(path string) bool {
|
|||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
// Schedule for later directory removal.
|
||||
nfsDirRemoveFailedAttempts.Inc()
|
||||
w := &removeDirWork{
|
||||
path: path,
|
||||
done: done,
|
||||
}
|
||||
select {
|
||||
case removeDirCh <- path:
|
||||
case removeDirCh <- w:
|
||||
default:
|
||||
logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh))
|
||||
}
|
||||
|
@ -36,16 +41,21 @@ func mustRemoveAll(path string) bool {
|
|||
|
||||
var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`)
|
||||
|
||||
var removeDirCh = make(chan string, 1024)
|
||||
type removeDirWork struct {
|
||||
path string
|
||||
done func()
|
||||
}
|
||||
|
||||
var removeDirCh = make(chan *removeDirWork, 1024)
|
||||
|
||||
func dirRemover() {
|
||||
const minSleepTime = 100 * time.Millisecond
|
||||
const maxSleepTime = time.Second
|
||||
sleepTime := minSleepTime
|
||||
for {
|
||||
var path string
|
||||
var w *removeDirWork
|
||||
select {
|
||||
case path = <-removeDirCh:
|
||||
case w = <-removeDirCh:
|
||||
default:
|
||||
if atomic.LoadUint64(&stopDirRemover) != 0 {
|
||||
return
|
||||
|
@ -53,7 +63,7 @@ func dirRemover() {
|
|||
time.Sleep(minSleepTime)
|
||||
continue
|
||||
}
|
||||
if mustRemoveAll(path) {
|
||||
if mustRemoveAll(w.path, w.done) {
|
||||
sleepTime = minSleepTime
|
||||
continue
|
||||
}
|
||||
|
@ -67,7 +77,7 @@ func dirRemover() {
|
|||
if sleepTime < maxSleepTime {
|
||||
sleepTime *= 2
|
||||
} else {
|
||||
logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path)
|
||||
logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", w.path)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
14
lib/fs/fs.go
14
lib/fs/fs.go
|
@ -174,7 +174,7 @@ func mkdirSync(path string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// RemoveDirContents removes all the contents of the given dir it it exists.
|
||||
// RemoveDirContents removes all the contents of the given dir if it exists.
|
||||
//
|
||||
// It doesn't remove the dir itself, so the dir may be mounted
|
||||
// to a separate partition.
|
||||
|
@ -246,7 +246,17 @@ func mustSyncParentDirIfExists(path string) {
|
|||
//
|
||||
// It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
func MustRemoveAll(path string) {
|
||||
_ = mustRemoveAll(path)
|
||||
_ = mustRemoveAll(path, func() {})
|
||||
}
|
||||
|
||||
// MustRemoveAllWithDoneCallback removes path with all the contents.
|
||||
//
|
||||
// done is called after the path is successfully removed.
|
||||
//
|
||||
// done may be called after the function returns for NFS path.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61.
|
||||
func MustRemoveAllWithDoneCallback(path string, done func()) {
|
||||
_ = mustRemoveAll(path, done)
|
||||
}
|
||||
|
||||
// HardLinkFiles makes hard links for all the files from srcDir in dstDir.
|
||||
|
|
|
@ -1121,12 +1121,14 @@ func runTransaction(txnLock *sync.RWMutex, pathPrefix, txnPath string) error {
|
|||
}
|
||||
|
||||
// Remove old paths. It is OK if certain paths don't exist.
|
||||
var removeWG sync.WaitGroup
|
||||
for _, path := range rmPaths {
|
||||
path, err := validatePath(pathPrefix, path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid path to remove: %s", err)
|
||||
}
|
||||
fs.MustRemoveAll(path)
|
||||
removeWG.Add(1)
|
||||
fs.MustRemoveAllWithDoneCallback(path, removeWG.Done)
|
||||
}
|
||||
|
||||
// Move the new part to new directory.
|
||||
|
@ -1154,10 +1156,14 @@ func runTransaction(txnLock *sync.RWMutex, pathPrefix, txnPath string) error {
|
|||
// Flush pathPrefix directory metadata to the underying storage.
|
||||
fs.MustSyncPath(pathPrefix)
|
||||
|
||||
// Remove the transaction file.
|
||||
go func() {
|
||||
// Remove the transaction file only after all the source paths are deleted.
|
||||
// This is required for NFS mounts. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
removeWG.Wait()
|
||||
if err := os.Remove(txnPath); err != nil {
|
||||
return fmt.Errorf("cannot remove transaction file %q: %s", txnPath, err)
|
||||
logger.Errorf("cannot remove transaction file %q: %s", txnPath, err)
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -1452,12 +1452,14 @@ func runTransaction(txnLock *sync.RWMutex, pathPrefix1, pathPrefix2, txnPath str
|
|||
}
|
||||
|
||||
// Remove old paths. It is OK if certain paths don't exist.
|
||||
var removeWG sync.WaitGroup
|
||||
for _, path := range rmPaths {
|
||||
path, err := validatePath(pathPrefix1, pathPrefix2, path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid path to remove: %s", err)
|
||||
}
|
||||
fs.MustRemoveAll(path)
|
||||
removeWG.Add(1)
|
||||
fs.MustRemoveAllWithDoneCallback(path, removeWG.Done)
|
||||
}
|
||||
|
||||
// Move the new part to new directory.
|
||||
|
@ -1492,10 +1494,14 @@ func runTransaction(txnLock *sync.RWMutex, pathPrefix1, pathPrefix2, txnPath str
|
|||
fs.MustSyncPath(pathPrefix1)
|
||||
fs.MustSyncPath(pathPrefix2)
|
||||
|
||||
// Remove the transaction file.
|
||||
go func() {
|
||||
// Remove the transaction file only after all the source paths are deleted.
|
||||
// This is required for NFS mounts. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
removeWG.Wait()
|
||||
if err := os.Remove(txnPath); err != nil {
|
||||
return fmt.Errorf("cannot remove transaction file: %s", err)
|
||||
logger.Errorf("cannot remove transaction file %q: %s", txnPath, err)
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue