From 2472baa9343aa54e6f48f030fed7cfbc6a733f2d Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 6 Mar 2023 14:04:43 +0100 Subject: [PATCH] app/vmalert: do not wait for group start on removal (#3891) Each group in vmalert starts with an artificial delay to avoid the thundering herd problem. For some groups with high evaluation intervals, the delay could be significant. If the user removes the group from the config and hot-reloads it during this delay, vmalert has to wait until the delay ends. This results in slow config reloading and a UI hang. The change moves the start-delay logic back to the group's `start` method. Now a group can exit from the delay immediately when the `group.close()` method is called. Signed-off-by: hagen1778 --- app/vmalert/group.go | 20 ++++++++++++++++++++ app/vmalert/manager.go | 20 -------------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 77bb23e30..94b74b27e 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -271,6 +271,26 @@ var skipRandSleepOnGroupStart bool func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) { defer func() { close(g.finishedCh) }() + // Spread group rules evaluation over time in order to reduce load on VictoriaMetrics. 
+ if !skipRandSleepOnGroupStart { + randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64))) + sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval) + if randSleep < sleepOffset { + randSleep += uint64(g.Interval) + } + randSleep -= sleepOffset + sleepTimer := time.NewTimer(time.Duration(randSleep)) + select { + case <-ctx.Done(): + sleepTimer.Stop() + return + case <-g.doneCh: + sleepTimer.Stop() + return + case <-sleepTimer.C: + } + } + e := &executor{ rw: rw, notifiers: nts, diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go index 0d7b7ba83..ac1a516f3 100644 --- a/app/vmalert/manager.go +++ b/app/vmalert/manager.go @@ -6,7 +6,6 @@ import ( "net/url" "sort" "sync" - "time" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" @@ -88,25 +87,6 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error id := g.ID() go func() { defer m.wg.Done() - // Spread group rules evaluation over time in order to reduce load on VictoriaMetrics. - if !skipRandSleepOnGroupStart { - randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64))) - sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval) - if randSleep < sleepOffset { - randSleep += uint64(g.Interval) - } - randSleep -= sleepOffset - sleepTimer := time.NewTimer(time.Duration(randSleep)) - select { - case <-ctx.Done(): - sleepTimer.Stop() - return - case <-g.doneCh: - sleepTimer.Stop() - return - case <-sleepTimer.C: - } - } if restore { g.start(ctx, m.notifiers, m.rw, m.rr) } else {