mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert: fix blocking hot-reload process if the old rule group hasn't started yet (#7258)
Group
[sleeps](daa7183749/app/vmalert/rule/group.go (L320)
)
random duration before start the evaluation, and during the sleep,
`g.updateCh <- new` will be blocked since there is no `<-g.updateCh`
waiting.
---------
Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
41e0bbb6d1
commit
c4fe23794a
3 changed files with 91 additions and 8 deletions
|
@ -324,14 +324,28 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re
|
||||||
g.infof("will start in %v", sleepBeforeStart)
|
g.infof("will start in %v", sleepBeforeStart)
|
||||||
|
|
||||||
sleepTimer := time.NewTimer(sleepBeforeStart)
|
sleepTimer := time.NewTimer(sleepBeforeStart)
|
||||||
select {
|
randSleep:
|
||||||
case <-ctx.Done():
|
for {
|
||||||
sleepTimer.Stop()
|
select {
|
||||||
return
|
case <-ctx.Done():
|
||||||
case <-g.doneCh:
|
sleepTimer.Stop()
|
||||||
sleepTimer.Stop()
|
return
|
||||||
return
|
case <-g.doneCh:
|
||||||
case <-sleepTimer.C:
|
sleepTimer.Stop()
|
||||||
|
return
|
||||||
|
case ng := <-g.updateCh:
|
||||||
|
g.mu.Lock()
|
||||||
|
err := g.updateWith(ng)
|
||||||
|
if err != nil {
|
||||||
|
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||||
|
g.mu.Unlock()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
g.mu.Unlock()
|
||||||
|
g.infof("reload successfully")
|
||||||
|
case <-sleepTimer.C:
|
||||||
|
break randSleep
|
||||||
|
}
|
||||||
}
|
}
|
||||||
evalTS = evalTS.Add(sleepBeforeStart)
|
evalTS = evalTS.Add(sleepBeforeStart)
|
||||||
}
|
}
|
||||||
|
|
|
@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestUpdateDuringRandSleep(t *testing.T) {
|
||||||
|
// enable rand sleep to test group update during sleep
|
||||||
|
SkipRandSleepOnGroupStart = false
|
||||||
|
defer func() {
|
||||||
|
SkipRandSleepOnGroupStart = true
|
||||||
|
}()
|
||||||
|
rule := AlertingRule{
|
||||||
|
Name: "jobDown",
|
||||||
|
Expr: "up==0",
|
||||||
|
Labels: map[string]string{
|
||||||
|
"foo": "bar",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
g := &Group{
|
||||||
|
Name: "test",
|
||||||
|
Rules: []Rule{
|
||||||
|
&rule,
|
||||||
|
},
|
||||||
|
// big interval ensures big enough randSleep during start process
|
||||||
|
Interval: 100 * time.Hour,
|
||||||
|
updateCh: make(chan *Group),
|
||||||
|
}
|
||||||
|
go g.Start(context.Background(), nil, nil, nil)
|
||||||
|
|
||||||
|
rule1 := AlertingRule{
|
||||||
|
Name: "jobDown",
|
||||||
|
Expr: "up{job=\"vmagent\"}==0",
|
||||||
|
Labels: map[string]string{
|
||||||
|
"foo": "bar",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
g1 := &Group{
|
||||||
|
Rules: []Rule{
|
||||||
|
&rule1,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
g.updateCh <- g1
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
g.mu.RLock()
|
||||||
|
if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" {
|
||||||
|
t.Fatalf("expected to have updated rule expr")
|
||||||
|
}
|
||||||
|
g.mu.RUnlock()
|
||||||
|
|
||||||
|
rule2 := AlertingRule{
|
||||||
|
Name: "jobDown",
|
||||||
|
Expr: "up{job=\"vmagent\"}==0",
|
||||||
|
Labels: map[string]string{
|
||||||
|
"foo": "bar",
|
||||||
|
"baz": "qux",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
g2 := &Group{
|
||||||
|
Rules: []Rule{
|
||||||
|
&rule2,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
g.updateCh <- g2
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
g.mu.RLock()
|
||||||
|
if len(g.Rules[0].(*AlertingRule).Labels) != 2 {
|
||||||
|
t.Fatalf("expected to have updated labels")
|
||||||
|
}
|
||||||
|
g.mu.RUnlock()
|
||||||
|
|
||||||
|
g.Close()
|
||||||
|
}
|
||||||
|
|
||||||
func TestGroupStart(t *testing.T) {
|
func TestGroupStart(t *testing.T) {
|
||||||
const (
|
const (
|
||||||
rules = `
|
rules = `
|
||||||
|
|
|
@ -34,6 +34,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
||||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly.
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly.
|
||||||
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration.
|
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration.
|
||||||
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
|
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
|
||||||
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. Previously, configuration updates to such groups could have resulted into blocking all evaluations within the group, until vmalert restart.
|
||||||
|
|
||||||
## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)
|
## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue