From c4fe23794ae15e1f5719a1d2910e0ffeaa74d9e6 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Fri, 18 Oct 2024 17:18:24 +0800 Subject: [PATCH] vmalert: fix blocking hot-reload process if the old rule group hasn't started yet (#7258) Group [sleeps](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/daa7183749f6ec4d386f652d2fcb224d83989963/app/vmalert/rule/group.go#L320) random duration before start the evaluation, and during the sleep, `g.updateCh <- new` will be blocked since there is no `<-g.updateCh` waiting. --------- Signed-off-by: hagen1778 Co-authored-by: hagen1778 --- app/vmalert/rule/group.go | 30 +++++++++++---- app/vmalert/rule/group_test.go | 68 ++++++++++++++++++++++++++++++++++ docs/changelog/CHANGELOG.md | 1 + 3 files changed, 91 insertions(+), 8 deletions(-) diff --git a/app/vmalert/rule/group.go b/app/vmalert/rule/group.go index e77b9b5f5..5051a6082 100644 --- a/app/vmalert/rule/group.go +++ b/app/vmalert/rule/group.go @@ -324,14 +324,28 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re g.infof("will start in %v", sleepBeforeStart) sleepTimer := time.NewTimer(sleepBeforeStart) - select { - case <-ctx.Done(): - sleepTimer.Stop() - return - case <-g.doneCh: - sleepTimer.Stop() - return - case <-sleepTimer.C: + randSleep: + for { + select { + case <-ctx.Done(): + sleepTimer.Stop() + return + case <-g.doneCh: + sleepTimer.Stop() + return + case ng := <-g.updateCh: + g.mu.Lock() + err := g.updateWith(ng) + if err != nil { + logger.Errorf("group %q: failed to update: %s", g.Name, err) + g.mu.Unlock() + continue + } + g.mu.Unlock() + g.infof("reload successfully") + case <-sleepTimer.C: + break randSleep + } } evalTS = evalTS.Add(sleepBeforeStart) } diff --git a/app/vmalert/rule/group_test.go b/app/vmalert/rule/group_test.go index 711bb277d..cbe26b5e0 100644 --- a/app/vmalert/rule/group_test.go +++ b/app/vmalert/rule/group_test.go @@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) { }) } +func TestUpdateDuringRandSleep(t *testing.T) { + // enable rand sleep to test group update during sleep + SkipRandSleepOnGroupStart = false + defer func() { + SkipRandSleepOnGroupStart = true + }() + rule := AlertingRule{ + Name: "jobDown", + Expr: "up==0", + Labels: map[string]string{ + "foo": "bar", + }, + } + g := &Group{ + Name: "test", + Rules: []Rule{ + &rule, + }, + // big interval ensures big enough randSleep during start process + Interval: 100 * time.Hour, + updateCh: make(chan *Group), + } + go g.Start(context.Background(), nil, nil, nil) + + rule1 := AlertingRule{ + Name: "jobDown", + Expr: "up{job=\"vmagent\"}==0", + Labels: map[string]string{ + "foo": "bar", + }, + } + g1 := &Group{ + Rules: []Rule{ + &rule1, + }, + } + g.updateCh <- g1 + time.Sleep(10 * time.Millisecond) + g.mu.RLock() + if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" { + t.Fatalf("expected to have updated rule expr") + } + g.mu.RUnlock() + + rule2 := AlertingRule{ + Name: "jobDown", + Expr: "up{job=\"vmagent\"}==0", + Labels: map[string]string{ + "foo": "bar", + "baz": "qux", + }, + } + g2 := &Group{ + Rules: []Rule{ + &rule2, + }, + } + g.updateCh <- g2 + time.Sleep(10 * time.Millisecond) + g.mu.RLock() + if len(g.Rules[0].(*AlertingRule).Labels) != 2 { + t.Fatalf("expected to have updated labels") + } + g.mu.RUnlock() + + g.Close() +} + func TestGroupStart(t *testing.T) { const ( rules = ` diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index 9ea85db95..9e852467a 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -34,6 +34,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly. * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration. * BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040). +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. Previously, configuration updates to such groups could have resulted into blocking all evaluations within the group, until vmalert restart. ## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)