app/vminsert/netstorage: periodically check the health of each -storageNode, so it can be marked as healthy once it is ready to accept data

This fixes uneven data routing in the cluster version when `-replicationFactor` is set to 1 (the default value), i.e. when replication is disabled. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2024-11-21 14:44:00 +00:00 · 2020-06-18 20:41:33 +03:00 · 2020-06-18 20:41:33 +03:00 · 464682f380
commit 464682f380
parent 5f3a895c23
1 changed files with 21 additions and 1 deletions
--- a/app/vminsert/netstorage/netstorage.go
+++ b/app/vminsert/netstorage/netstorage.go
@ -127,7 +127,8 @@ func (sn *storageNode) run(stopCh <-chan struct{}, snIdx int) {
 			brLastResetTime = currentTime
 		}
 		if len(br.buf) == 0 {
-			// Nothing to send.
+			// Nothing to send. Just check sn health, so it could be returned to non-broken state.
+			sn.checkHealth()
 			continue
 		}

@ -183,6 +184,25 @@ func sendBufToReplicas(br *bufRows, snIdx, replicas int) bool {
 	return true
 }

+// checkHealth tries to return a broken sn to the non-broken state.
+//
+// It is a no-op when sn.isBroken() reports false. Otherwise it dials the node
+// under sn.bcLock and, only if the dial succeeds, stores the new connection
+// in sn.bc and clears the sn.broken flag. If the dial fails, a warning is
+// logged and sn is left broken, so a later call can retry.
+func (sn *storageNode) checkHealth() {
+	if !sn.isBroken() {
+		return
+	}
+
+	sn.bcLock.Lock()
+	defer sn.bcLock.Unlock()
+
+	if sn.bc != nil {
+		logger.Panicf("BUG: sn.bc must be nil when sn is broken; got %p", sn.bc)
+	}
+	bc, err := sn.dial()
+	if err != nil {
+		logger.Warnf("cannot dial storageNode %q: %s", sn.dialer.Addr(), err)
+		// Keep sn marked as broken: clearing the flag here would report
+		// an unreachable node as healthy.
+		return
+	}
+	sn.bc = bc
+	atomic.StoreUint32(&sn.broken, 0)
+}
+
 func (sn *storageNode) sendBufRows(br *bufRows) bool {
 	sn.bcLock.Lock()
 	defer sn.bcLock.Unlock()