mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
app/vmselect/netstorage: fix potential panic under high load
The panic may be triggered while processing data blocks received from vmstorage nodes when some of the vmstorage nodes return an error or when `-replicationFactor` is set to values higher than 2 at `vmselect`. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058
This commit is contained in:
parent
024e2f18da
commit
9cca3a0a1b
2 changed files with 32 additions and 9 deletions
|
@ -1354,22 +1354,35 @@ func ProcessBlocks(qt *querytracer.Tracer, denyPartialResponse bool, sq *storage
|
||||||
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
||||||
// Use per-worker WaitGroup instead of a shared WaitGroup in order to avoid inter-CPU contention,
|
// Use per-worker WaitGroup instead of a shared WaitGroup in order to avoid inter-CPU contention,
|
||||||
// which may significantly slow down the rate of processBlock calls on multi-CPU systems.
|
// which may significantly slow down the rate of processBlock calls on multi-CPU systems.
|
||||||
type wgWithPadding struct {
|
type wgStruct struct {
|
||||||
|
// mu prevents processBlock from being called when stop is set to true
|
||||||
|
mu sync.Mutex
|
||||||
|
|
||||||
|
// wg is used for waiting until currently executed processBlock calls are finished.
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
|
|
||||||
|
// stop must be set to true when no more processBlock calls should be made.
|
||||||
|
stop bool
|
||||||
|
}
|
||||||
|
type wgWithPadding struct {
|
||||||
|
wgStruct
|
||||||
// The padding prevents false sharing on widespread platforms with
|
// The padding prevents false sharing on widespread platforms with
|
||||||
// 128 mod (cache line size) = 0 .
|
// 128 mod (cache line size) = 0 .
|
||||||
_ [128 - unsafe.Sizeof(sync.WaitGroup{})%128]byte
|
_ [128 - unsafe.Sizeof(wgStruct{})%128]byte
|
||||||
}
|
}
|
||||||
wgs := make([]wgWithPadding, len(storageNodes))
|
wgs := make([]wgWithPadding, len(storageNodes))
|
||||||
var stopped uint32
|
|
||||||
f := func(mb *storage.MetricBlock, workerIdx int) error {
|
f := func(mb *storage.MetricBlock, workerIdx int) error {
|
||||||
wg := &wgs[workerIdx].wg
|
muwg := &wgs[workerIdx]
|
||||||
wg.Add(1)
|
muwg.mu.Lock()
|
||||||
defer wg.Done()
|
if muwg.stop {
|
||||||
if atomic.LoadUint32(&stopped) != 0 {
|
muwg.mu.Unlock()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return processBlock(mb, workerIdx)
|
muwg.wg.Add(1)
|
||||||
|
muwg.mu.Unlock()
|
||||||
|
err := processBlock(mb, workerIdx)
|
||||||
|
muwg.wg.Done()
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send the query to all the storage nodes in parallel.
|
// Send the query to all the storage nodes in parallel.
|
||||||
|
@ -1389,7 +1402,12 @@ func ProcessBlocks(qt *querytracer.Tracer, denyPartialResponse bool, sq *storage
|
||||||
return *errP
|
return *errP
|
||||||
})
|
})
|
||||||
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
// Make sure that processBlock is no longer called after the exit from ProcessBlocks() function.
|
||||||
atomic.StoreUint32(&stopped, 1)
|
for i := range wgs {
|
||||||
|
muwg := &wgs[i]
|
||||||
|
muwg.mu.Lock()
|
||||||
|
muwg.stop = true
|
||||||
|
muwg.mu.Unlock()
|
||||||
|
}
|
||||||
for i := range wgs {
|
for i := range wgs {
|
||||||
wgs[i].wg.Wait()
|
wgs[i].wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,13 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
||||||
|
|
||||||
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): evaluate `q1`, ..., `qN` in parallel when calculating `union(q1, .., qN)`. Previously [union](https://docs.victoriametrics.com/MetricsQL.html#union) args were evaluated sequentially. This could result in lower than expected performance.
|
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): evaluate `q1`, ..., `qN` in parallel when calculating `union(q1, .., qN)`. Previously [union](https://docs.victoriametrics.com/MetricsQL.html#union) args were evaluated sequentially. This could result in lower than expected performance.
|
||||||
|
|
||||||
|
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): fix potential panic at `vmselect` under high load, which has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058).
|
||||||
|
|
||||||
|
|
||||||
## [v1.81.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.81.0)
|
## [v1.81.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.81.0)
|
||||||
|
|
||||||
|
**It isn't recommended to update cluster version of VictoriaMetrics to v1.81.0 because of [the bug](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3058), which may result in `vmselect` crashes under high load**
|
||||||
|
|
||||||
Released at 31-08-2022
|
Released at 31-08-2022
|
||||||
|
|
||||||
**Update note 1:** [vmalert](https://docs.victoriametrics.com/vmalert.html) by default hides values of `-remoteWrite.url`, `-remoteRead.url` and `-datasource.url` in logs and at `http://vmalert:8880/flags` for security reasons. See the corresponding SECURITY change in the Changelog below for additional info.
|
**Update note 1:** [vmalert](https://docs.victoriametrics.com/vmalert.html) by default hides values of `-remoteWrite.url`, `-remoteRead.url` and `-datasource.url` in logs and at `http://vmalert:8880/flags` for security reasons. See the corresponding SECURITY change in the Changelog below for additional info.
|
||||||
|
|
Loading…
Reference in a new issue