app/vmalert: do not wait for group start on removal (#3891)

Each group in vmalert starts with an artifical delay to avoid
thundering herd problem. For some groups with high evaluation
intervals, the delay could be significant.
If during this delay user will remove the group from the config
and hot-reload it - vmalert will have to wait until the delay
ends. This results into slow config reloading and UI hang.

The change moves the start-delay logic back to the group's
`start` method. Now, group can immediately exit from the
delay when `group.close()` method is called.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2023-03-06 14:04:43 +01:00 committed by Aliaksandr Valialkin
parent 4c94fe0f0a
commit fa3b2bd205
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
2 changed files with 20 additions and 20 deletions

View file

@ -271,6 +271,26 @@ var skipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
defer func() { close(g.finishedCh) }()
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
if !skipRandSleepOnGroupStart {
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
if randSleep < sleepOffset {
randSleep += uint64(g.Interval)
}
randSleep -= sleepOffset
sleepTimer := time.NewTimer(time.Duration(randSleep))
select {
case <-ctx.Done():
sleepTimer.Stop()
return
case <-g.doneCh:
sleepTimer.Stop()
return
case <-sleepTimer.C:
}
}
e := &executor{
rw: rw,
notifiers: nts,

View file

@ -6,7 +6,6 @@ import (
"net/url"
"sort"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@ -88,25 +87,6 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
id := g.ID()
go func() {
defer m.wg.Done()
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
if !skipRandSleepOnGroupStart {
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
if randSleep < sleepOffset {
randSleep += uint64(g.Interval)
}
randSleep -= sleepOffset
sleepTimer := time.NewTimer(time.Duration(randSleep))
select {
case <-ctx.Done():
sleepTimer.Stop()
return
case <-g.doneCh:
sleepTimer.Stop()
return
case <-sleepTimer.C:
}
}
if restore {
g.start(ctx, m.notifiers, m.rw, m.rr)
} else {