From 48a5c4cb017ad0ebd449777bc0f7b28237584531 Mon Sep 17 00:00:00 2001 From: rtm0 <149964189+rtm0@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:33:38 +0200 Subject: [PATCH] Fix Date metricid cache consistency under concurrent use (#6534) ### Describe Your Changes Fix Date metricid cache consistency under concurrent use. When one goroutine calls Has() and does not find the cache entry in the immutable map it will acquire a lock and check the mutable map. And it is possible that before that lock is acquired, the entry is moved from the mutable map to the immutable map by another goroutine causing a cache miss. The fix is to check the immutable map again once the lock is acquired. ### Checklist The following checks are **mandatory**: - [x ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: Artem Fetishev Co-authored-by: Nikolay --- .gitignore | 3 ++- docs/CHANGELOG.md | 3 +++ lib/storage/storage.go | 22 ++++++++++++++++------ lib/storage/storage_test.go | 25 +++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 3dc9fba6b0..5224510fac 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ Gemfile.lock /_site _site *.tmp -/docs/.jekyll-metadata \ No newline at end of file +/docs/.jekyll-metadata +coverage.txt \ No newline at end of file diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 36d8ec5c08..877514c67f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -32,6 +32,9 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): add [Grafana dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vmauth.json) and [alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmauth.yml) for [vmauth](https://docs.victoriametrics.com/vmauth/) dashboard. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4313) for details. * BUGFIX: [docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#docker-compose-environment-for-victoriametrics): fix incorrect link to vmui from [VictoriaMetrics plugin in Grafana](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#grafana). + +* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fix the dateMetricIDCache consistency issue that leads to duplicate per-day index entries when new time series are inserted concurrently. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6534) for details. + * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix input cursor position reset in modal settings. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6530). ## [v1.102.0-rc2](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc2) diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 9aac701864..b353e2f430 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -2447,16 +2447,28 @@ func (dmc *dateMetricIDCache) SizeBytes() uint64 { } func (dmc *dateMetricIDCache) Has(generation, date, metricID uint64) bool { + if byDate := dmc.byDate.Load(); byDate.get(generation, date).Has(metricID) { + // Fast path. The majority of calls must go here. + return true + } + // Slow path. Acquire the lock and search the immutable map again and then + // also search the mutable map. + return dmc.hasSlow(generation, date, metricID) +} + +func (dmc *dateMetricIDCache) hasSlow(generation, date, metricID uint64) bool { + dmc.mu.Lock() + defer dmc.mu.Unlock() + + // First, check immutable map again because the entry may have been moved to + // the immutable map by the time the caller acquires the lock. byDate := dmc.byDate.Load() v := byDate.get(generation, date) if v.Has(metricID) { - // Fast path. - // The majority of calls must go here. return true } - // Slow path. Check mutable map. - dmc.mu.Lock() + // Then check immutable map. vMutable := dmc.byDateMutable.get(generation, date) ok := vMutable.Has(metricID) if ok { @@ -2467,8 +2479,6 @@ func (dmc *dateMetricIDCache) Has(generation, date, metricID uint64) bool { dmc.slowHits = 0 } } - dmc.mu.Unlock() - return ok } diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go index bb25bfa20b..0e0441784c 100644 --- a/lib/storage/storage_test.go +++ b/lib/storage/storage_test.go @@ -8,6 +8,7 @@ import ( "reflect" "sort" "strings" + "sync" "testing" "testing/quick" "time" @@ -156,6 +157,30 @@ func testDateMetricIDCache(c *dateMetricIDCache, concurrent bool) error { return nil } +func TestDateMetricIDCacheIsConsistent(_ *testing.T) { + const ( + generation = 1 + date = 1 + concurrency = 2 + numMetrics = 100000 + ) + dmc := newDateMetricIDCache() + var wg sync.WaitGroup + for i := range concurrency { + wg.Add(1) + go func() { + defer wg.Done() + for id := uint64(i * numMetrics); id < uint64((i+1)*numMetrics); id++ { + dmc.Set(generation, date, id) + if !dmc.Has(generation, date, id) { + panic(fmt.Errorf("dmc.Has(metricID=%d): unexpected cache miss after adding the entry to cache", id)) + } + } + }() + } + wg.Wait() +} + func TestUpdateCurrHourMetricIDs(t *testing.T) { newStorage := func() *Storage { var s Storage