From a42bd59ee4e7fdc06c75026ac4d5e6fae3d4bf58 Mon Sep 17 00:00:00 2001 From: rtm0 <149964189+rtm0@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:33:38 +0200 Subject: [PATCH] Fix Date metricid cache consistency under concurrent use (#6534) ### Describe Your Changes Fix Date metricid cache consistency under concurrent use. When one goroutine calls Has() and does not find the cache entry in the immutable map it will acquire a lock and check the mutable map. And it is possible that before that lock is acquired, the entry is moved from the mutable map to the immutable map by another goroutine causing a cache miss. The fix is to check the immutable map again once the lock is acquired. ### Checklist The following checks are **mandatory**: - [x ] My change adheres [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/). --------- Signed-off-by: Artem Fetishev Co-authored-by: Nikolay --- .gitignore | 3 ++- docs/CHANGELOG.md | 3 +++ lib/storage/storage.go | 22 ++++++++++++++++------ lib/storage/storage_test.go | 25 +++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 3dc9fba6b..5224510fa 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ Gemfile.lock /_site _site *.tmp -/docs/.jekyll-metadata \ No newline at end of file +/docs/.jekyll-metadata +coverage.txt \ No newline at end of file diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 36d8ec5c0..877514c67 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -32,6 +32,9 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): add [Grafana dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vmauth.json) and [alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmauth.yml) for [vmauth](https://docs.victoriametrics.com/vmauth/) dashboard. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4313) for details. * BUGFIX: [docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#docker-compose-environment-for-victoriametrics): fix incorrect link to vmui from [VictoriaMetrics plugin in Grafana](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#grafana). + +* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fix the dateMetricIDCache consistency issue that leads to duplicate per-day index entries when new time series are inserted concurrently. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6534) for details. + * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix input cursor position reset in modal settings. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6530). ## [v1.102.0-rc2](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc2) diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 493d7d42a..3e22bb26d 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -2328,16 +2328,28 @@ func (dmc *dateMetricIDCache) SizeBytes() uint64 { } func (dmc *dateMetricIDCache) Has(generation, date, metricID uint64) bool { + if byDate := dmc.byDate.Load(); byDate.get(generation, date).Has(metricID) { + // Fast path. The majority of calls must go here. + return true + } + // Slow path. Acquire the lock and search the immutable map again and then + // also search the mutable map. + return dmc.hasSlow(generation, date, metricID) +} + +func (dmc *dateMetricIDCache) hasSlow(generation, date, metricID uint64) bool { + dmc.mu.Lock() + defer dmc.mu.Unlock() + + // First, check immutable map again because the entry may have been moved to + // the immutable map by the time the caller acquires the lock. byDate := dmc.byDate.Load() v := byDate.get(generation, date) if v.Has(metricID) { - // Fast path. - // The majority of calls must go here. return true } - // Slow path. Check mutable map. - dmc.mu.Lock() + // Then check immutable map. vMutable := dmc.byDateMutable.get(generation, date) ok := vMutable.Has(metricID) if ok { @@ -2348,8 +2360,6 @@ func (dmc *dateMetricIDCache) Has(generation, date, metricID uint64) bool { dmc.slowHits = 0 } } - dmc.mu.Unlock() - return ok } diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go index 6763ba9d4..7ce8fa30d 100644 --- a/lib/storage/storage_test.go +++ b/lib/storage/storage_test.go @@ -8,6 +8,7 @@ import ( "reflect" "sort" "strings" + "sync" "testing" "testing/quick" "time" @@ -156,6 +157,30 @@ func testDateMetricIDCache(c *dateMetricIDCache, concurrent bool) error { return nil } +func TestDateMetricIDCacheIsConsistent(_ *testing.T) { + const ( + generation = 1 + date = 1 + concurrency = 2 + numMetrics = 100000 + ) + dmc := newDateMetricIDCache() + var wg sync.WaitGroup + for i := range concurrency { + wg.Add(1) + go func() { + defer wg.Done() + for id := uint64(i * numMetrics); id < uint64((i+1)*numMetrics); id++ { + dmc.Set(generation, date, id) + if !dmc.Has(generation, date, id) { + panic(fmt.Errorf("dmc.Has(metricID=%d): unexpected cache miss after adding the entry to cache", id)) + } + } + }() + } + wg.Wait() +} + func TestUpdateCurrHourMetricIDs(t *testing.T) { newStorage := func() *Storage { var s Storage