From cb23685681d2dc42499c3aa485679964e79f70b7 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 26 Mar 2024 12:59:50 +0100 Subject: [PATCH] app/vmselect: make vmselect resilient to absence of cache folder (#5987) vmselect uses a cache folder in the file system for two purposes: 1. Storing rollup cache results on shutdown; 2. Storing temporary search results from vmstorage during query execution. It could happen that the cache folder is accidentally deleted by the user, or by the OS during cleanup routines. This would cause vmselect to: 1. panic on a /metrics call, because `MustGetFreeSpace` will fail; 2. return a query error to the user, as it won't be able to store temporary search results. The changes in this commit are the following: 1. Make `MustGetFreeSpace` try re-creating the cache folder if it is missing; 2. Make vmselect try re-creating the cache folder if it can't persist tmp search results. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5985 Signed-off-by: hagen1778 Co-authored-by: Nikolay --- app/vmselect/netstorage/tmp_blocks_file.go | 18 +++++++++++++++++- docs/CHANGELOG.md | 1 + lib/fs/fs.go | 7 +++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/app/vmselect/netstorage/tmp_blocks_file.go b/app/vmselect/netstorage/tmp_blocks_file.go index 78288d538..7e20d75fb 100644 --- a/app/vmselect/netstorage/tmp_blocks_file.go +++ b/app/vmselect/netstorage/tmp_blocks_file.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" @@ -107,7 +108,7 @@ func (tbf *tmpBlocksFile) WriteBlockRefData(b []byte) (tmpBlockAddr, error) { // Slow path: flush the data from tbf.buf to file. 
if tbf.f == nil { - f, err := os.CreateTemp(tmpBlocksDir, "") + f, err := createTemp(tmpBlocksDir) if err != nil { return addr, err } @@ -122,6 +123,21 @@ func (tbf *tmpBlocksFile) WriteBlockRefData(b []byte) (tmpBlockAddr, error) { return addr, nil } +// createTemp creates a new temporary file in the directory given by path. +// If that fails because path does not exist, it re-creates path and retries once; other errors are returned as is. +func createTemp(path string) (*os.File, error) { + f, err := os.CreateTemp(path, "") + if err == nil { + return f, nil + } + if os.IsNotExist(err) || strings.Contains(err.Error(), "no such file or directory") { + // path is missing: re-create it, then retry the file creation once + fs.MustMkdirIfNotExist(path) + return os.CreateTemp(path, "") + } + return nil, err +} + // Len() returnt tbf size in bytes. func (tbf *tmpBlocksFile) Len() uint64 { return tbf.offset diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index eb5d55989..ca545883c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -63,6 +63,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: do not drop `match[]` filter at [`/api/v1/series`](https://docs.victoriametrics.com/url-examples/#apiv1series) if `-search.ignoreExtraFiltersAtLabelsAPI` command-line flag is set, since missing `match[]` filter breaks `/api/v1/series` requests. * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): properly parse TLS key and CA files for [InfluxDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-influxdb-1x) and [OpenTSDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-opentsdb) migration modes. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix VictoriaLogs UI query handling to correctly apply `_time` filter across all queries. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5920). +* BUGFIX: [vmselect](https://docs.victoriametrics.com/): make vmselect resilient to absence of cache folder. 
If cache folder was mistakenly deleted by user or OS, vmselect will try re-creating it first. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5985). * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): limit duration of requests to /api/v1/labels, /api/v1/label/.../values or /api/v1/series with `-search.maxLabelsAPIDuration` duration. Before, `-search.maxExportDuration` value was used by mistake. Thanks to @kbweave for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5992). ## [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0) diff --git a/lib/fs/fs.go b/lib/fs/fs.go index 980066b10..7775360c3 100644 --- a/lib/fs/fs.go +++ b/lib/fs/fs.go @@ -351,6 +351,7 @@ func MustCreateFlockFile(dir string) *os.File { const FlockFilename = "flock.lock" // MustGetFreeSpace returns free space for the given directory path. +// It tries to re-create path if it doesn't exist yet. func MustGetFreeSpace(path string) uint64 { // Try obtaining cached value at first. freeSpaceMapLock.Lock() @@ -363,6 +364,12 @@ func MustGetFreeSpace(path string) uint64 { } // Slow path. + + // The path might be not available because: + // 1. We forgot to create it in the code + // 2. OS cleaned it up + MustMkdirIfNotExist(path) + // Determine the amount of free space at path. e.freeSpace = mustGetFreeSpace(path) e.updateTime = fasttime.UnixTimestamp()