vmalert: cancel in-flight requests on group's update or close (#3886)

When a group's update() or close() method is called, the group
still needs to wait for its current evaluation to finish.
Sometimes an evaluation can take a significant amount of time,
which slows down configuration updates or vmalert's graceful shutdown.

This change interrupts the current evaluation in order to speed up
the graceful shutdown and config update procedures.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Roman Khavronenko 2023-03-01 15:48:20 +01:00 committed by GitHub
parent 2e153b68cd
commit d6fa4da712
4 changed files with 88 additions and 5 deletions
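Editor's note: the mechanism behind the commit is the standard context.WithCancel pattern — the group remembers the cancel function for the context guarding the current evaluation, and update()/close() invoke it to abort in-flight work before waiting for the run loop to finish. Below is a minimal, runnable sketch of that pattern; the worker type and its methods are illustrative stand-ins, not vmalert's actual Group implementation.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// worker mimics the pattern used above: it keeps the cancel
// function of the context that guards the current evaluation.
type worker struct {
	mu         sync.Mutex
	evalCancel context.CancelFunc
	doneCh     chan struct{}
}

// evalOnce simulates a long-running evaluation that honors ctx cancellation.
func (w *worker) evalOnce(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Second): // pretend the query is slow
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// run performs one evaluation; the stored cancel fn lets close() interrupt it.
func (w *worker) run(ctx context.Context) {
	defer close(w.doneCh)

	evalCtx, cancel := context.WithCancel(ctx)
	w.mu.Lock()
	w.evalCancel = cancel
	w.mu.Unlock()
	defer cancel()

	if err := w.evalOnce(evalCtx); err != nil {
		fmt.Println("evaluation interrupted:", err)
	}
}

// close cancels the in-flight evaluation and waits for run() to return.
func (w *worker) close() {
	w.mu.Lock()
	if w.evalCancel != nil {
		w.evalCancel()
	}
	w.mu.Unlock()

	<-w.doneCh
}

func main() {
	w := &worker{doneCh: make(chan struct{})}
	go w.run(context.Background())

	time.Sleep(100 * time.Millisecond)
	w.close() // returns almost immediately instead of after 10 seconds
}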


@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"hash/fnv"
 	"net/url"
@@ -44,6 +45,9 @@ type Group struct {
 	// channel accepts new Group obj
 	// which supposed to update current group
 	updateCh chan *Group
+	// evalCancel stores the cancel fn for interrupting
+	// rules evaluation. Used on groups update() and close().
+	evalCancel context.CancelFunc
 
 	metrics *groupMetrics
 }
@@ -233,11 +237,24 @@ func (g *Group) updateWith(newGroup *Group) error {
 	return nil
 }
 
+// interruptEval interrupts in-flight rules evaluations
+// within the group. It is expected that g.evalCancel
+// will be repopulated after the call.
+func (g *Group) interruptEval() {
+	g.mu.RLock()
+	defer g.mu.RUnlock()
+
+	if g.evalCancel != nil {
+		g.evalCancel()
+	}
+}
+
 func (g *Group) close() {
 	if g.doneCh == nil {
 		return
 	}
 	close(g.doneCh)
+	g.interruptEval()
 	<-g.finishedCh
 
 	g.metrics.iterationDuration.Unregister()
@@ -263,7 +280,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
 	logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
 
-	eval := func(ts time.Time) {
+	eval := func(ctx context.Context, ts time.Time) {
 		g.metrics.iterationTotal.Inc()
 
 		start := time.Now()
@@ -285,7 +302,13 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
 		g.LastEvaluation = start
 	}
 
-	eval(evalTS)
+	evalCtx, cancel := context.WithCancel(ctx)
+	g.mu.Lock()
+	g.evalCancel = cancel
+	g.mu.Unlock()
+	defer g.evalCancel()
+
+	eval(evalCtx, evalTS)
 
 	t := time.NewTicker(g.Interval)
 	defer t.Stop()
@@ -309,6 +332,14 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
 			return
 		case ng := <-g.updateCh:
 			g.mu.Lock()
+
+			// it is expected that g.evalCancel will be evoked
+			// somewhere else to unblock group from the rules evaluation.
+			// we recreate the evalCtx and g.evalCancel, so it can
+			// be called again.
+			evalCtx, cancel = context.WithCancel(ctx)
+			g.evalCancel = cancel
+
 			err := g.updateWith(ng)
 			if err != nil {
 				logger.Errorf("group %q: failed to update: %s", g.Name, err)
@@ -333,7 +364,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
 			}
 			evalTS = evalTS.Add((missed + 1) * g.Interval)
 
-			eval(evalTS)
+			eval(evalCtx, evalTS)
 		}
 	}
 }
@@ -407,6 +438,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
 
 	tss, err := rule.Exec(ctx, ts, limit)
 	if err != nil {
+		if errors.Is(err, context.Canceled) {
+			// the context can be cancelled on graceful shutdown
+			// or on group update. So no need to handle the error as usual.
+			return nil
+		}
 		execErrors.Inc()
 		return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
 	}
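Editor's note on the errors.Is check above: the evaluation context is handed down to the datasource query, so once evalCancel() fires the query fails with an error that wraps context.Canceled, and exec() can discard it instead of counting it as an execution failure. A minimal sketch of that propagation path, using made-up names (slowQuery, exec) rather than vmalert's real ones:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// slowQuery stands in for a datasource request that respects ctx cancellation.
func slowQuery(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Second):
		return nil
	case <-ctx.Done():
		// real HTTP clients wrap the context error in a similar way
		return fmt.Errorf("datasource query failed: %w", ctx.Err())
	}
}

// exec mirrors the filtering logic added above: cancellation is not an error.
func exec(ctx context.Context) error {
	if err := slowQuery(ctx); err != nil {
		if errors.Is(err, context.Canceled) {
			// interrupted by graceful shutdown or a config update
			return nil
		}
		return fmt.Errorf("failed to execute: %w", err)
	}
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel() // plays the role of interruptEval()
	}()

	fmt.Println(exec(ctx)) // prints <nil>: the interruption is swallowed
}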


@@ -474,3 +474,31 @@ func TestFaultyRW(t *testing.T) {
 		t.Fatalf("expected to get an error from faulty RW client, got nil instead")
 	}
 }
+
+func TestCloseWithEvalInterruption(t *testing.T) {
+	groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
+	if err != nil {
+		t.Fatalf("failed to parse rules: %s", err)
+	}
+
+	const delay = time.Second * 2
+	fq := &fakeQuerierWithDelay{delay: delay}
+
+	const evalInterval = time.Millisecond
+	g := newGroup(groups[0], fq, evalInterval, nil)
+
+	go g.start(context.Background(), nil, nil, nil)
+
+	time.Sleep(evalInterval * 20)
+
+	go func() {
+		g.close()
+	}()
+
+	deadline := time.Tick(delay / 2)
+	select {
+	case <-deadline:
+		t.Fatalf("deadline for close exceeded")
+	case <-g.finishedCh:
+	}
+}


@@ -104,6 +104,24 @@ func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time
 	return cp, req, nil
 }
 
+type fakeQuerierWithDelay struct {
+	fakeQuerier
+	delay time.Duration
+}
+
+func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) ([]datasource.Metric, *http.Request, error) {
+	timer := time.NewTimer(fqd.delay)
+	select {
+	case <-ctx.Done():
+	case <-timer.C:
+	}
+	return fqd.fakeQuerier.Query(ctx, expr, ts)
+}
+
+func (fqd *fakeQuerierWithDelay) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
+	return fqd
+}
+
 type fakeNotifier struct {
 	sync.Mutex
 	alerts []notifier.Alert


@@ -87,6 +87,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
 	m.wg.Add(1)
 	id := g.ID()
 	go func() {
+		defer m.wg.Done()
 		// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
 		if !skipRandSleepOnGroupStart {
 			randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
@@ -111,8 +112,6 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
 		} else {
 			g.start(ctx, m.notifiers, m.rw, nil)
 		}
-
-		m.wg.Done()
 	}()
 	m.groups[id] = g
 	return nil
@@ -168,6 +167,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
 	}
 	for _, ng := range groupsRegistry {
 		if err := m.startGroup(ctx, ng, restore); err != nil {
+			m.groupsMu.Unlock()
 			return err
 		}
 	}
@@ -181,6 +181,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
 			old.updateCh <- new
 			wg.Done()
 		}(item.old, item.new)
+		item.old.interruptEval()
 	}
 	wg.Wait()
 }
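Editor's note on the ordering in the last hunk: the send on old.updateCh blocks until the group's run loop returns to its select, which cannot happen while the loop is stuck in a slow evaluation; calling item.old.interruptEval() alongside the send unblocks that evaluation so the update is picked up almost immediately. A small sketch of this interplay with made-up channel names, not the manager's actual code:

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	updateCh := make(chan string)
	evalCtx, evalCancel := context.WithCancel(context.Background())

	// the group's run loop: busy with a long evaluation, then ready for updates
	done := make(chan struct{})
	go func() {
		defer close(done)
		select {
		case <-time.After(10 * time.Second): // slow evaluation
		case <-evalCtx.Done(): // interrupted
		}
		fmt.Println("applied update:", <-updateCh)
	}()

	start := time.Now()
	go func() { updateCh <- "new group config" }() // like old.updateCh <- new
	evalCancel()                                   // like item.old.interruptEval()

	<-done
	fmt.Println("update took", time.Since(start).Round(time.Millisecond))
}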