app/vminsert/netstorage: log the error message when pending data wasn't sent to vmstorage nodes because they were unavailable at graceful shutdown
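Previously, when all the vmstorage nodes were unavailable at graceful shutdown, the remaining buffered rows were dropped silently. Now the drop is recorded via logger.Errorf together with the number of lost rows. The commit also simplifies the flush-wait loop in storageNode.run() by removing the closedCh helper channel, adds a lock-free fast path to trySendBuf() for storage nodes that aren't ready, and fixes a couple of comments.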

commit be3f5d1c64
parent 7d619deacc
Author: Aliaksandr Valialkin
Date:   2024-02-27 14:15:15 +02:00


@@ -95,6 +95,7 @@ again:
 		return fmt.Errorf("cannot send %d rows because of graceful shutdown", rows)
 	default:
 	}
+
 	if !sn.isReady() {
 		if len(sns) == 1 {
 			// There are no other storage nodes to re-route to. So wait until the current node becomes healthy.
@@ -102,11 +103,12 @@ again:
 			goto again
 		}
 		if *disableReroutingOnUnavailable {
-			// We should not send timeseries from currently unavailable storage to alive storage nodes
+			// We should not send timeseries from currently unavailable storage to alive storage nodes.
 			sn.brCond.Wait()
 			goto again
 		}
 		sn.brLock.Unlock()
+
 		// The vmstorage node isn't ready for data processing. Re-route buf to healthy vmstorage nodes even if disableRerouting is set.
 		rowsProcessed, err := rerouteRowsToReadyStorageNodes(snb, sn, buf)
 		rows -= rowsProcessed
@@ -115,6 +117,7 @@ again:
 		}
 		return nil
 	}
+
 	if len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
 		// Fast path: the buf contents fits sn.buf.
 		sn.br.buf = append(sn.br.buf, buf...)
@@ -136,12 +139,6 @@ again:
 	return nil
 }
 
-var closedCh = func() <-chan struct{} {
-	ch := make(chan struct{})
-	close(ch)
-	return ch
-}()
-
 func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 	replicas := *replicationFactor
 	if replicas <= 0 {
@@ -164,29 +161,26 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 	defer ticker.Stop()
 	var br bufRows
 	brLastResetTime := fasttime.UnixTimestamp()
-	var waitCh <-chan struct{}
 	mustStop := false
 	for !mustStop {
 		sn.brLock.Lock()
-		bufLen := len(sn.br.buf)
+		waitForNewData := len(sn.br.buf) == 0
 		sn.brLock.Unlock()
-		waitCh = nil
-		if bufLen > 0 {
-			// Do not sleep if sn.br.buf isn't empty.
-			waitCh = closedCh
-		}
-		select {
-		case <-sn.stopCh:
-			mustStop = true
-			// Make sure the sn.buf is flushed last time before returning
-			// in order to send the remaining bits of data.
-		case <-ticker.C:
-		case <-waitCh:
+		if waitForNewData {
+			select {
+			case <-sn.stopCh:
+				mustStop = true
+				// Make sure the br.buf is flushed last time before returning
+				// in order to send the remaining bits of data.
+			case <-ticker.C:
+			}
 		}
+
 		sn.brLock.Lock()
 		sn.br, br = br, sn.br
 		sn.brCond.Broadcast()
 		sn.brLock.Unlock()
+
 		currentTime := fasttime.UnixTimestamp()
 		if len(br.buf) < cap(br.buf)/4 && currentTime-brLastResetTime > 10 {
 			// Free up capacity space occupied by br.buf in order to reduce memory usage after spikes.
@@ -205,6 +199,7 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
 		select {
 		case <-sn.stopCh:
 			timerpool.Put(t)
+			logger.Errorf("dropping %d rows on graceful shutdown, since all the vmstorage nodes are unavailable", br.rows)
 			return
 		case <-t.C:
 			timerpool.Put(t)
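The logger.Errorf call above is the behavioral change named in the commit title: the send is retried on a timer until it succeeds, and only a graceful shutdown breaks the loop, at which point the dropped row count is now logged instead of vanishing silently. Below is a minimal, self-contained sketch of this retry shape; trySend, stopCh and the row count are hypothetical stand-ins for the real netstorage machinery, not its API.

package main

import (
	"log"
	"time"
)

// trySend is a hypothetical stand-in for handing buffered rows to a healthy
// vmstorage node; it keeps failing while all the nodes are unavailable.
func trySend(rows int) bool { return false }

// flushLoop retries the send until it succeeds or a graceful shutdown begins.
func flushLoop(stopCh <-chan struct{}, pendingRows int) {
	for !trySend(pendingRows) {
		t := time.NewTimer(200 * time.Millisecond)
		select {
		case <-stopCh:
			t.Stop()
			// Before this commit the pending rows were dropped silently here.
			log.Printf("dropping %d rows on graceful shutdown, since all the vmstorage nodes are unavailable", pendingRows)
			return
		case <-t.C:
			// Retry the send after a short pause.
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	go func() {
		time.Sleep(500 * time.Millisecond)
		close(stopCh) // simulate a graceful shutdown
	}()
	flushLoop(stopCh, 1234)
}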
@@ -294,6 +289,7 @@ func (sn *storageNode) sendBufRowsNonblocking(br *bufRows) bool {
 	if !sn.isReady() {
 		return false
 	}
+
 	sn.bcLock.Lock()
 	defer sn.bcLock.Unlock()
 
@@ -673,7 +669,7 @@ func rerouteRowsToReadyStorageNodes(snb *storageNodesBucket, snSource *storageNo
 		idx := nodesHash.getNodeIdx(h, idxsExcludeNew)
 		snNew := sns[idx]
 		if !snNew.trySendBuf(rowBuf, 1) {
-			// The row cannot be sent to both snSource and to sn without blocking.
+			// The row cannot be sent to both snSource, sn and snNew without blocking.
 			// Sleep for a while and try sending the row to snSource again.
 			time.Sleep(100 * time.Millisecond)
 			goto again
@@ -785,6 +781,11 @@ func getNotReadyStorageNodeIdxs(snb *storageNodesBucket, dst []int, snExtra *sto
 }
 
 func (sn *storageNode) trySendBuf(buf []byte, rows int) bool {
+	if !sn.isReady() {
+		// Fast path without locking the sn.brLock.
+		return false
+	}
+
 	sent := false
 	sn.brLock.Lock()
 	if sn.isReady() && len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
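The new isReady() check at the top of trySendBuf() is a double-checked fast path: callers hitting an unavailable node return immediately without contending on sn.brLock, while the original check under the lock remains, since readiness can flip between the two reads. A reduced sketch of the pattern, with atomic.Bool standing in for whatever sn.isReady() actually reads:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const maxBufSize = 1024 // stand-in for maxBufSizePerStorageNode

type node struct {
	ready atomic.Bool // stand-in for sn.isReady()
	mu    sync.Mutex  // stand-in for sn.brLock
	buf   []byte
}

func (n *node) trySendBuf(buf []byte) bool {
	if !n.ready.Load() {
		// Fast path without taking the lock: unavailable nodes are common
		// during outages, so skipping the mutex avoids needless contention.
		return false
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	// Re-check under the lock: readiness may have flipped since the
	// unlocked read above.
	if !n.ready.Load() || len(n.buf)+len(buf) > maxBufSize {
		return false
	}
	n.buf = append(n.buf, buf...)
	return true
}

func main() {
	n := &node{}
	fmt.Println(n.trySendBuf([]byte("rows"))) // false: fast path, no locking
	n.ready.Store(true)
	fmt.Println(n.trySendBuf([]byte("rows"))) // true: buffered under the lock
}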