From 94fe7e5c0827596d71314eef3d52a471bb1483e3 Mon Sep 17 00:00:00 2001
From: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Date: Thu, 4 May 2023 19:16:48 +0400
Subject: [PATCH] lib/storage: fix indexdb rotation infinite loop (#4249)

When using `retentionTimezoneOffset` and having local timezone being more than 4 hours different from UTC indexdb retention calculation could return negative value. This caused indexdb rotation to get in loop.
Fix calculation of offset to use `retentionTimezoneOffset` value properly and add test to cover all legit timezone configs.
See:
- https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4207
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4206

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
---
 docs/CHANGELOG.md           |  1 +
 lib/storage/storage.go      | 19 ++++++++++++-------
 lib/storage/storage_test.go | 36 +++++++++++++++++++++++++++++++-----
 3 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 799ca4f2e5..2b13e9f3e3 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -23,6 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: properly limit the number of [OpenTSDB HTTP](https://docs.victoriametrics.com/#sending-opentsdb-data-via-http-apiput-requests) concurrent requests specified via `-maxConcurrentInserts` command-line flag. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4204). Thanks to @zouxiang1993 for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4208).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly return empty slices instead of nil for `/api/v1/rules` and `/api/v1/alerts` API handlers. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4221).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `__meta_kubernetes_endpoints_name` label for all ports discovered from endpoint. Previously, ports not matched by `Service` did not have this label. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4154) for details.
+* BUGFIX: fix possible infinite loop during `indexdb` rotation when `-retentionTimezoneOffset` command-line flag is set and the local timezone is not UTC. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4207). Thanks to @faceair for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4206).
 * BUGFIX: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html): suppress `series after dedup` error message in logs when `-remoteWrite.streamAggr.dedupInterval` command-line flag is set at [vmagent](https://docs.victoriametrics.com/vmgent.html) or when `-streamAggr.dedupInterval` command-line flag is set at [single-node VictoriaMetrics](https://docs.victoriametrics.com/).
 
 ## [v1.87.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.5)
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index ed15d11cdf..49c103be76 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -1111,18 +1111,23 @@ func SetRetentionTimezoneOffset(offset time.Duration) {
 var retentionTimezoneOffsetMsecs int64
 
 func nextRetentionDuration(retentionMsecs int64) time.Duration {
-	// Round retentionMsecs to days. This guarantees that per-day inverted index works as expected.
-	retentionMsecs = ((retentionMsecs + msecPerDay - 1) / msecPerDay) * msecPerDay
-	t := time.Now().UnixNano() / 1e6
-	deadline := ((t + retentionMsecs - 1) / retentionMsecs) * retentionMsecs
+	nowMsecs := time.Now().UnixNano() / 1e6
+	return nextRetentionDurationAt(nowMsecs, retentionMsecs)
+}
+
+func nextRetentionDurationAt(atMsecs int64, retentionMsecs int64) time.Duration {
 	// Schedule the deadline to +4 hours from the next retention period start.
 	// This should prevent from possible double deletion of indexdb
 	// due to time drift - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/248 .
-	deadline += 4 * 3600 * 1000
+	retentionOffsetMsecs := retentionTimezoneOffsetMsecs - int64(4*3600*1000)
+
+	// Round retentionMsecs to days. This guarantees that per-day inverted index works as expected
+	deadline := ((atMsecs + retentionMsecs + retentionOffsetMsecs - 1) / retentionMsecs) * retentionMsecs
+
 	// The effect of time zone on retention period is moved out.
 	// See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2574
-	deadline -= retentionTimezoneOffsetMsecs
-	return time.Duration(deadline-t) * time.Millisecond
+	deadline -= retentionOffsetMsecs
+	return time.Duration(deadline-atMsecs) * time.Millisecond
 }
 
 // SearchMetricNames returns marshaled metric names matching the given tfss on the given tr.
diff --git a/lib/storage/storage_test.go b/lib/storage/storage_test.go
index e22d9cad48..e43843877b 100644
--- a/lib/storage/storage_test.go
+++ b/lib/storage/storage_test.go
@@ -493,12 +493,38 @@ func TestMetricRowMarshalUnmarshal(t *testing.T) {
 }
 
 func TestNextRetentionDuration(t *testing.T) {
-	for retentionMonths := float64(0.1); retentionMonths < 120; retentionMonths += 0.3 {
-		d := nextRetentionDuration(int64(retentionMonths * msecsPerMonth))
+	validateRetention := func(now time.Time, retention float64) {
+		t.Helper()
+
+		nowMsecs := now.UnixMilli()
+		d := nextRetentionDurationAt(nowMsecs, int64(retention*msecsPerMonth))
 		if d <= 0 {
-			currTime := time.Now().UTC()
-			nextTime := time.Now().UTC().Add(d)
-			t.Fatalf("unexpected retention duration for retentionMonths=%f; got %s; must be %s + %f months", retentionMonths, nextTime, currTime, retentionMonths)
+			nextTime := now.Add(d)
+			t.Fatalf("unexpected retention duration for retentionMonths=%f; got %s; must be %s + %f months", retention, nextTime, now, retention)
+		}
+	}
+
+	for retentionMonths := float64(0.1); retentionMonths < 120; retentionMonths += 0.3 {
+		// UTC offsets are in range [-12 hours, +14 hours].
+		// Verify that any legit combination of retention timezone and local time
+		// will return valid retention duration.
+		// See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4207
+		for retentionOffset := -12; retentionOffset <= 14; retentionOffset++ {
+			for localTimeOffset := -12; localTimeOffset <= 14; localTimeOffset++ {
+				SetRetentionTimezoneOffset(time.Duration(retentionOffset) * time.Hour)
+				tz := time.FixedZone("", -1*localTimeOffset*60*60)
+				now := time.Now().In(tz)
+				validateRetention(now, retentionMonths)
+
+				now = time.Date(2023, 4, 27, 3, 58, 0, 0, tz)
+				validateRetention(now, retentionMonths)
+
+				now = time.Date(2023, 4, 27, 4, 1, 0, 0, tz)
+				validateRetention(now, retentionMonths)
+
+				now = time.Date(2023, 4, 27, 6, 0, 0, 0, tz)
+				validateRetention(now, retentionMonths)
+			}
 		}
 	}
 }