mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
app/vmselect/netstorage: fix potential panic under high load
The panic may trigger during data blocks' processing received from vmstorage nodes when some of vmstorage nodes return an error or when `-replicationFactor` is set to values higher than 2 at `vmselect`. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058
This commit is contained in:
parent
024e2f18da
commit
9cca3a0a1b
2 changed files with 32 additions and 9 deletions
|
@ -1354,22 +1354,35 @@ func ProcessBlocks(qt *querytracer.Tracer, denyPartialResponse bool, sq *storage
|
|||
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
||||
// Use per-worker WaitGroup instead of a shared WaitGroup in order to avoid inter-CPU contention,
|
||||
// which may siginificantly slow down the rate of processBlock calls on multi-CPU systems.
|
||||
type wgWithPadding struct {
|
||||
type wgStruct struct {
|
||||
// mu prevents from calling processBlock when stop is set to true
|
||||
mu sync.Mutex
|
||||
|
||||
// wg is used for waiting until currently executed processBlock calls are finished.
|
||||
wg sync.WaitGroup
|
||||
|
||||
// stop must be set to true when no more processBlocks calls should be made.
|
||||
stop bool
|
||||
}
|
||||
type wgWithPadding struct {
|
||||
wgStruct
|
||||
// The padding prevents false sharing on widespread platforms with
|
||||
// 128 mod (cache line size) = 0 .
|
||||
_ [128 - unsafe.Sizeof(sync.WaitGroup{})%128]byte
|
||||
_ [128 - unsafe.Sizeof(wgStruct{})%128]byte
|
||||
}
|
||||
wgs := make([]wgWithPadding, len(storageNodes))
|
||||
var stopped uint32
|
||||
f := func(mb *storage.MetricBlock, workerIdx int) error {
|
||||
wg := &wgs[workerIdx].wg
|
||||
wg.Add(1)
|
||||
defer wg.Done()
|
||||
if atomic.LoadUint32(&stopped) != 0 {
|
||||
muwg := &wgs[workerIdx]
|
||||
muwg.mu.Lock()
|
||||
if muwg.stop {
|
||||
muwg.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
return processBlock(mb, workerIdx)
|
||||
muwg.wg.Add(1)
|
||||
muwg.mu.Unlock()
|
||||
err := processBlock(mb, workerIdx)
|
||||
muwg.wg.Done()
|
||||
return err
|
||||
}
|
||||
|
||||
// Send the query to all the storage nodes in parallel.
|
||||
|
@ -1389,7 +1402,12 @@ func ProcessBlocks(qt *querytracer.Tracer, denyPartialResponse bool, sq *storage
|
|||
return *errP
|
||||
})
|
||||
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
||||
atomic.StoreUint32(&stopped, 1)
|
||||
for i := range wgs {
|
||||
muwg := &wgs[i]
|
||||
muwg.mu.Lock()
|
||||
muwg.stop = true
|
||||
muwg.mu.Unlock()
|
||||
}
|
||||
for i := range wgs {
|
||||
wgs[i].wg.Wait()
|
||||
}
|
||||
|
|
|
@ -17,8 +17,13 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
|||
|
||||
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): evaluate `q1`, ..., `qN` in parallel when calculating `union(q1, .., qN)`. Previously [union](https://docs.victoriametrics.com/MetricsQL.html#union) args were evaluated sequentially. This could result in lower than expected performance.
|
||||
|
||||
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): fix potential panic at `vmselect` under high load, which has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058).
|
||||
|
||||
|
||||
## [v1.81.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.81.0)
|
||||
|
||||
**It isn't recommended to update cluster version of VictoriaMetrics to v1.81.0 because of [the bug](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058), which may result in `vmselect` crashes under high load**
|
||||
|
||||
Released at 31-08-2022
|
||||
|
||||
**Update note 1:** [vmalert](https://docs.victoriametrics.com/vmalert.html) by default hides values of `-remoteWrite.url`, `-remoteRead.url` and `-datasource.url` in logs and at `http://vmalert:8880/flags` for security reasons. See the corresponding SECURITY change in the Chagelog below for additional info.
|
||||
|
|
Loading…
Reference in a new issue