diff --git a/app/vmalert/rule/group.go b/app/vmalert/rule/group.go index e77b9b5f54..5051a6082f 100644 --- a/app/vmalert/rule/group.go +++ b/app/vmalert/rule/group.go @@ -324,14 +324,28 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re g.infof("will start in %v", sleepBeforeStart) sleepTimer := time.NewTimer(sleepBeforeStart) - select { - case <-ctx.Done(): - sleepTimer.Stop() - return - case <-g.doneCh: - sleepTimer.Stop() - return - case <-sleepTimer.C: + randSleep: + for { + select { + case <-ctx.Done(): + sleepTimer.Stop() + return + case <-g.doneCh: + sleepTimer.Stop() + return + case ng := <-g.updateCh: + g.mu.Lock() + err := g.updateWith(ng) + if err != nil { + logger.Errorf("group %q: failed to update: %s", g.Name, err) + g.mu.Unlock() + continue + } + g.mu.Unlock() + g.infof("reload successfully") + case <-sleepTimer.C: + break randSleep + } } evalTS = evalTS.Add(sleepBeforeStart) } diff --git a/app/vmalert/rule/group_test.go b/app/vmalert/rule/group_test.go index 711bb277d5..cbe26b5e0a 100644 --- a/app/vmalert/rule/group_test.go +++ b/app/vmalert/rule/group_test.go @@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) { }) } +func TestUpdateDuringRandSleep(t *testing.T) { + // enable rand sleep to test group update during sleep + SkipRandSleepOnGroupStart = false + defer func() { + SkipRandSleepOnGroupStart = true + }() + rule := AlertingRule{ + Name: "jobDown", + Expr: "up==0", + Labels: map[string]string{ + "foo": "bar", + }, + } + g := &Group{ + Name: "test", + Rules: []Rule{ + &rule, + }, + // big interval ensures big enough randSleep during start process + Interval: 100 * time.Hour, + updateCh: make(chan *Group), + } + go g.Start(context.Background(), nil, nil, nil) + + rule1 := AlertingRule{ + Name: "jobDown", + Expr: "up{job=\"vmagent\"}==0", + Labels: map[string]string{ + "foo": "bar", + }, + } + g1 := &Group{ + Rules: []Rule{ + &rule1, + }, + } + g.updateCh <- g1 + time.Sleep(10 * time.Millisecond) + 
g.mu.RLock() + if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" { + t.Fatalf("expected to have updated rule expr") + } + g.mu.RUnlock() + + rule2 := AlertingRule{ + Name: "jobDown", + Expr: "up{job=\"vmagent\"}==0", + Labels: map[string]string{ + "foo": "bar", + "baz": "qux", + }, + } + g2 := &Group{ + Rules: []Rule{ + &rule2, + }, + } + g.updateCh <- g2 + time.Sleep(10 * time.Millisecond) + g.mu.RLock() + if len(g.Rules[0].(*AlertingRule).Labels) != 2 { + t.Fatalf("expected to have updated labels") + } + g.mu.RUnlock() + + g.Close() +} + func TestGroupStart(t *testing.T) { const ( rules = ` diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index df5d0deb56..9e852467ac 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -28,12 +28,13 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmgateway](https://docs.victoriametrics.com/vmgateway/): fix possible panic during parsing of a token without `vm_access` claim. This issue was introduced in v1.104.0. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix error messages rendering from overflowing the screen with long messages. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7207). -* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040). * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): properly add metrics tags for `opentsdb` migration source. Previously it could have empty values. 
See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7161). * BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): reduce the initial health check interval for datasource. This reduces the time spent on evaluating rules by vmalert-tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6970). * BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): allow specifying empty labels list `labels: '{}'` in the same way as promtool does. This improves compatibility between these tools. * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly. * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration. +* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040). +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. 
Previously, configuration updates to such groups could have resulted in blocking all evaluations within the group until vmalert was restarted. ## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)