Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git, synced 2025-02-09 15:27:11 +00:00
app/vminsert/netstorage: log the error message when pending data wasn't sent to vmstorage nodes because they were unavailable at graceful shutdown
parent 7d619deacc
commit be3f5d1c64

1 changed file with 23 additions and 22 deletions
@@ -95,6 +95,7 @@ again:
 return fmt.Errorf("cannot send %d rows because of graceful shutdown", rows)
 default:
 }
+
 if !sn.isReady() {
 if len(sns) == 1 {
 // There are no other storage nodes to re-route to. So wait until the current node becomes healthy.
@@ -102,11 +103,12 @@ again:
 goto again
 }
 if *disableReroutingOnUnavailable {
-// We should not send timeseries from currently unavailable storage to alive storage nodes
+// We should not send timeseries from currently unavailable storage to alive storage nodes.
 sn.brCond.Wait()
 goto again
 }
 sn.brLock.Unlock()
+
 // The vmstorage node isn't ready for data processing. Re-route buf to healthy vmstorage nodes even if disableRerouting is set.
 rowsProcessed, err := rerouteRowsToReadyStorageNodes(snb, sn, buf)
 rows -= rowsProcessed
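Note on the context above: when -disableReroutingOnUnavailable is set, the sending goroutine does not re-route the buffered rows to other nodes; it parks on sn.brCond.Wait() and retries via goto again once the flush loop broadcasts on the same condition variable (the sn.brCond.Broadcast() call appears in a later hunk). Below is a rough, self-contained sketch of that wait/broadcast pattern; the node, appendOrWait and flushLoop names are illustrative stand-ins, not the actual vminsert types.

    package main

    import (
    	"fmt"
    	"sync"
    	"time"
    )

    // Minimal model of "wait until the buffer has room, then retry":
    // writers block on a sync.Cond while the buffer is full, and the
    // flusher broadcasts after it drains the buffer.
    type node struct {
    	mu   sync.Mutex
    	cond *sync.Cond
    	buf  []byte
    }

    const maxBufSize = 8

    func (n *node) appendOrWait(p []byte) {
    	n.mu.Lock()
    	defer n.mu.Unlock()
    	for len(n.buf)+len(p) > maxBufSize {
    		// Analogue of sn.brCond.Wait() + goto again in the diff:
    		// Wait releases the lock, sleeps, and re-acquires it on wake-up.
    		n.cond.Wait()
    	}
    	n.buf = append(n.buf, p...)
    }

    func (n *node) flushLoop() {
    	for {
    		time.Sleep(100 * time.Millisecond)
    		n.mu.Lock()
    		n.buf = n.buf[:0]
    		// Analogue of sn.brCond.Broadcast(): wake any waiting writers.
    		n.cond.Broadcast()
    		n.mu.Unlock()
    	}
    }

    func main() {
    	n := &node{}
    	n.cond = sync.NewCond(&n.mu)
    	go n.flushLoop()
    	for i := 0; i < 4; i++ {
    		n.appendOrWait([]byte("abcde"))
    	}
    	fmt.Println("all writes accepted")
    }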
@@ -115,6 +117,7 @@ again:
 }
 return nil
 }
+
 if len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
 // Fast path: the buf contents fits sn.buf.
 sn.br.buf = append(sn.br.buf, buf...)
@@ -136,12 +139,6 @@ again:
 return nil
 }

-var closedCh = func() <-chan struct{} {
-ch := make(chan struct{})
-close(ch)
-return ch
-}()
-
 func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 replicas := *replicationFactor
 if replicas <= 0 {
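The closedCh variable removed above is the pre-closed-channel idiom: receiving from a channel that has already been closed never blocks, so the old run() loop assigned it to waitCh in order to turn the subsequent select into a non-blocking poll whenever the buffer already held data (the next hunk shows that usage being removed as well). A self-contained sketch of the idiom, with illustrative values rather than the real vminsert state:

    package main

    import (
    	"fmt"
    	"time"
    )

    // closedCh is the idiom removed by this commit: a receive from an
    // already-closed channel completes immediately.
    var closedCh = func() <-chan struct{} {
    	ch := make(chan struct{})
    	close(ch)
    	return ch
    }()

    func main() {
    	ticker := time.NewTicker(time.Second)
    	defer ticker.Stop()

    	bufLen := 3 // pretend there is pending data

    	var waitCh <-chan struct{}
    	if bufLen > 0 {
    		// Do not sleep if the buffer isn't empty.
    		waitCh = closedCh
    	}

    	select {
    	case <-ticker.C:
    		fmt.Println("woke up on ticker")
    	case <-waitCh:
    		// A receive from a nil channel blocks forever, so this case can
    		// only fire when waitCh was set to the pre-closed channel above.
    		fmt.Println("buffer non-empty: proceed without sleeping")
    	}
    }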
@@ -164,29 +161,26 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 defer ticker.Stop()
 var br bufRows
 brLastResetTime := fasttime.UnixTimestamp()
-var waitCh <-chan struct{}
 mustStop := false
 for !mustStop {
 sn.brLock.Lock()
-bufLen := len(sn.br.buf)
+waitForNewData := len(sn.br.buf) == 0
 sn.brLock.Unlock()
-waitCh = nil
-if bufLen > 0 {
-// Do not sleep if sn.br.buf isn't empty.
-waitCh = closedCh
-}
-select {
-case <-sn.stopCh:
-mustStop = true
-// Make sure the sn.buf is flushed last time before returning
-// in order to send the remaining bits of data.
-case <-ticker.C:
-case <-waitCh:
+if waitForNewData {
+select {
+case <-sn.stopCh:
+mustStop = true
+// Make sure the br.buf is flushed last time before returning
+// in order to send the remaining bits of data.
+case <-ticker.C:
+}
 }
+
 sn.brLock.Lock()
 sn.br, br = br, sn.br
 sn.brCond.Broadcast()
 sn.brLock.Unlock()
+
 currentTime := fasttime.UnixTimestamp()
 if len(br.buf) < cap(br.buf)/4 && currentTime-brLastResetTime > 10 {
 // Free up capacity space occupied by br.buf in order to reduce memory usage after spikes.
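With closedCh and waitCh gone, the rewritten loop above only enters the select (and therefore only sleeps on the ticker or reacts to sn.stopCh) when the buffer is empty; when data is pending it proceeds straight to the buffer swap. A condensed, runnable sketch of that control flow; pending and stopCh are stand-ins, not the actual storageNode fields:

    package main

    import (
    	"fmt"
    	"time"
    )

    func main() {
    	stopCh := make(chan struct{})
    	ticker := time.NewTicker(200 * time.Millisecond)
    	defer ticker.Stop()

    	// Simulate a shutdown request arriving after a short while.
    	go func() {
    		time.Sleep(700 * time.Millisecond)
    		close(stopCh)
    	}()

    	pending := []string{"row1", "row2"} // pretend buffered rows
    	mustStop := false
    	for !mustStop {
    		waitForNewData := len(pending) == 0
    		if waitForNewData {
    			// Only sleep when there is nothing to flush, mirroring the
    			// `if waitForNewData { select { ... } }` shape in the diff.
    			select {
    			case <-stopCh:
    				mustStop = true
    				// Fall through so remaining data is flushed one last time.
    			case <-ticker.C:
    			}
    		}

    		if len(pending) > 0 {
    			fmt.Printf("flushing %d rows\n", len(pending))
    			pending = pending[:0]
    		}
    	}
    	fmt.Println("stopped")
    }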
@@ -205,6 +199,7 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 select {
 case <-sn.stopCh:
 timerpool.Put(t)
+logger.Errorf("dropping %d rows on graceful shutdown, since all the vmstorage nodes are unavailable", br.rows)
 return
 case <-t.C:
 timerpool.Put(t)
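This hunk is the change named in the commit message: when run() is shutting down and the retry loop gives up because no vmstorage node accepted the buffered rows, those rows were already being dropped; now the drop is reported via logger.Errorf together with the row count instead of failing silently. A rough sketch of the same "log what you drop on graceful shutdown" shape, using the standard library log package in place of VictoriaMetrics' logger; trySend and the row count are placeholders:

    package main

    import (
    	"log"
    	"time"
    )

    // trySend stands in for sendBufRowsNonblocking: it pretends every
    // vmstorage node is unavailable and always fails.
    func trySend(rows int) bool { return false }

    func main() {
    	stopCh := make(chan struct{})
    	go func() {
    		time.Sleep(300 * time.Millisecond)
    		close(stopCh) // graceful shutdown requested
    	}()

    	rows := 42 // pending rows that could not be sent
    	for !trySend(rows) {
    		t := time.NewTimer(100 * time.Millisecond)
    		select {
    		case <-stopCh:
    			t.Stop()
    			// The behavioral change in this commit: make the data loss visible.
    			log.Printf("dropping %d rows on graceful shutdown, since all the vmstorage nodes are unavailable", rows)
    			return
    		case <-t.C:
    			// Retry sending after a short pause.
    		}
    	}
    }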
@@ -294,6 +289,7 @@ func (sn *storageNode) sendBufRowsNonblocking(br *bufRows) bool {
 if !sn.isReady() {
 return false
 }
+
 sn.bcLock.Lock()
 defer sn.bcLock.Unlock()

@@ -673,7 +669,7 @@ func rerouteRowsToReadyStorageNodes(snb *storageNodesBucket, snSource *storageNo
 idx := nodesHash.getNodeIdx(h, idxsExcludeNew)
 snNew := sns[idx]
 if !snNew.trySendBuf(rowBuf, 1) {
-// The row cannot be sent to both snSource and to sn without blocking.
+// The row cannot be sent to both snSource, sn and snNew without blocking.
 // Sleep for a while and try sending the row to snSource again.
 time.Sleep(100 * time.Millisecond)
 goto again
@@ -785,6 +781,11 @@ func getNotReadyStorageNodeIdxs(snb *storageNodesBucket, dst []int, snExtra *sto
 }

 func (sn *storageNode) trySendBuf(buf []byte, rows int) bool {
+if !sn.isReady() {
+// Fast path without locking the sn.brLock.
+return false
+}
+
 sent := false
 sn.brLock.Lock()
 if sn.isReady() && len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
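The lines added to trySendBuf form a double-checked fast path: sn.isReady() is tested once without taking sn.brLock, so callers bail out cheaply while the node is unavailable, and then re-tested under the lock (the pre-existing `if sn.isReady() && ...` line) because readiness can change between the two checks. A small sketch of the pattern with an atomic flag and a mutex-guarded buffer; the names are hypothetical, not the vminsert ones:

    package main

    import (
    	"fmt"
    	"sync"
    	"sync/atomic"
    )

    const maxBufSize = 1024

    type node struct {
    	ready atomic.Bool
    	mu    sync.Mutex
    	buf   []byte
    }

    // trySendBuf mirrors the shape of the patched function: a lock-free
    // readiness check first, then the real check plus append under the lock.
    func (n *node) trySendBuf(p []byte) bool {
    	if !n.ready.Load() {
    		// Fast path without locking n.mu.
    		return false
    	}
    	sent := false
    	n.mu.Lock()
    	if n.ready.Load() && len(n.buf)+len(p) <= maxBufSize {
    		n.buf = append(n.buf, p...)
    		sent = true
    	}
    	n.mu.Unlock()
    	return sent
    }

    func main() {
    	n := &node{}
    	fmt.Println(n.trySendBuf([]byte("x"))) // false: node not ready yet
    	n.ready.Store(true)
    	fmt.Println(n.trySendBuf([]byte("x"))) // true
    }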