mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
app/vmalert: do not wait for group start on removal (#3891)
Each group in vmalert starts with an artifical delay to avoid thundering herd problem. For some groups with high evaluation intervals, the delay could be significant. If during this delay user will remove the group from the config and hot-reload it - vmalert will have to wait until the delay ends. This results into slow config reloading and UI hang. The change moves the start-delay logic back to the group's `start` method. Now, group can immediately exit from the delay when `group.close()` method is called. Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
4c94fe0f0a
commit
fa3b2bd205
2 changed files with 20 additions and 20 deletions
|
@ -271,6 +271,26 @@ var skipRandSleepOnGroupStart bool
|
|||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += uint64(g.Interval)
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
}
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: nts,
|
||||
|
|
|
@ -6,7 +6,6 @@ import (
|
|||
"net/url"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
@ -88,25 +87,6 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
|
|||
id := g.ID()
|
||||
go func() {
|
||||
defer m.wg.Done()
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += uint64(g.Interval)
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
}
|
||||
}
|
||||
if restore {
|
||||
g.start(ctx, m.notifiers, m.rw, m.rr)
|
||||
} else {
|
||||
|
|
Loading…
Reference in a new issue