mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
app/vmalert: do not wait for group start on removal (#3891)
Each group in vmalert starts with an artifical delay to avoid thundering herd problem. For some groups with high evaluation intervals, the delay could be significant. If during this delay user will remove the group from the config and hot-reload it - vmalert will have to wait until the delay ends. This results into slow config reloading and UI hang. The change moves the start-delay logic back to the group's `start` method. Now, group can immediately exit from the delay when `group.close()` method is called. Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
1b194bc6de
commit
2472baa934
2 changed files with 20 additions and 20 deletions
|
@ -271,6 +271,26 @@ var skipRandSleepOnGroupStart bool
|
|||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += uint64(g.Interval)
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
}
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: nts,
|
||||
|
|
|
@ -6,7 +6,6 @@ import (
|
|||
"net/url"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
@ -88,25 +87,6 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
|
|||
id := g.ID()
|
||||
go func() {
|
||||
defer m.wg.Done()
|
||||
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
||||
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += uint64(g.Interval)
|
||||
}
|
||||
randSleep -= sleepOffset
|
||||
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
}
|
||||
}
|
||||
if restore {
|
||||
g.start(ctx, m.notifiers, m.rw, m.rr)
|
||||
} else {
|
||||
|
|
Loading…
Reference in a new issue