2019-05-22 21:23:23 +00:00
package netstorage
import (
2021-10-08 09:52:56 +00:00
"errors"
2019-05-24 09:51:07 +00:00
"flag"
2019-05-22 21:23:23 +00:00
"fmt"
2020-04-27 06:32:08 +00:00
"io"
2021-09-15 15:04:28 +00:00
"net"
2021-06-23 11:00:06 +00:00
"sort"
2019-05-22 21:23:23 +00:00
"sync"
2020-05-27 12:07:16 +00:00
"sync/atomic"
2019-05-22 21:23:23 +00:00
"time"
2020-05-27 12:07:16 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
2019-06-08 19:29:25 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consts"
2019-05-22 21:23:23 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
2020-06-01 11:33:29 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
2019-05-22 21:23:23 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
2019-06-08 19:29:25 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
2019-05-22 21:23:23 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
2019-06-08 19:29:25 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
2020-09-28 18:35:40 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
2019-05-22 21:23:23 +00:00
"github.com/VictoriaMetrics/metrics"
2019-06-08 19:29:25 +00:00
xxhash "github.com/cespare/xxhash/v2"
2019-05-22 21:23:23 +00:00
)
2020-05-28 16:57:05 +00:00
var (
2021-06-04 01:33:49 +00:00
disableRPCCompression = flag . Bool ( ` rpc.disableCompression ` , false , "Whether to disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage" )
2020-05-28 16:57:05 +00:00
replicationFactor = flag . Int ( "replicationFactor" , 1 , "Replication factor for the ingested data, i.e. how many copies to make among distinct -storageNode instances. " +
"Note that vmselect must run with -dedup.minScrapeInterval=1ms for data de-duplication when replicationFactor is greater than 1. " +
"Higher values for -dedup.minScrapeInterval at vmselect is OK" )
2021-09-13 15:46:26 +00:00
disableRerouting = flag . Bool ( ` disableRerouting ` , true , "Whether to disable re-routing when some of vmstorage nodes accept incoming data at slower speed compared to other storage nodes. Disabled re-routing limits the ingestion rate by the slowest vmstorage node. On the other side, disabled re-routing minimizes the number of active time series in the cluster during rolling restarts and during spikes in series churn rate" )
2020-05-28 16:57:05 +00:00
)
2019-05-24 09:51:07 +00:00
2021-10-08 10:52:56 +00:00
var errStorageReadOnly = errors . New ( "storage node is read only" )
2021-10-08 09:52:56 +00:00
func ( sn * storageNode ) isNotReady ( ) bool {
return atomic . LoadUint32 ( & sn . broken ) != 0 || atomic . LoadUint32 ( & sn . isReadOnly ) != 0
2020-05-27 12:07:16 +00:00
}
// push pushes buf to sn internal bufs.
2019-05-22 21:23:23 +00:00
//
2020-05-27 12:07:16 +00:00
// This function doesn't block on fast path.
// It may block only if all the storageNodes cannot handle the incoming ingestion rate.
// This blocking provides backpressure to the caller.
//
// The function falls back to sending data to other vmstorage nodes
// if sn is currently unavailable or overloaded.
2019-06-08 19:29:25 +00:00
//
// rows is the number of rows in the buf.
func ( sn * storageNode ) push ( buf [ ] byte , rows int ) error {
2020-05-24 22:39:24 +00:00
if len ( buf ) > maxBufSizePerStorageNode {
logger . Panicf ( "BUG: len(buf)=%d cannot exceed %d" , len ( buf ) , maxBufSizePerStorageNode )
2019-06-08 19:29:25 +00:00
}
sn . rowsPushed . Add ( rows )
2021-06-04 01:33:49 +00:00
sn . brLock . Lock ( )
again :
select {
case <- storageNodesStopCh :
sn . brLock . Unlock ( )
return fmt . Errorf ( "cannot send %d rows because of graceful shutdown" , rows )
default :
}
2021-10-08 09:52:56 +00:00
if sn . isNotReady ( ) {
2021-06-04 01:33:49 +00:00
if len ( storageNodes ) == 1 {
// There are no other storage nodes to re-route to. So wait until the current node becomes healthy.
sn . brCond . Wait ( )
goto again
}
sn . brLock . Unlock ( )
// The vmstorage node is temporarily broken. Re-route buf to healthy vmstorage nodes even if *disableRerouting==true.
2021-06-05 13:16:16 +00:00
if err := rerouteRowsMayBlock ( sn , false , buf , rows ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "%d rows dropped because the current vsmtorage is unavailable and %w" , rows , err )
2019-06-08 19:29:25 +00:00
}
2019-05-22 21:23:23 +00:00
return nil
}
2020-05-27 12:07:16 +00:00
if len ( sn . br . buf ) + len ( buf ) <= maxBufSizePerStorageNode {
2019-06-08 19:29:25 +00:00
// Fast path: the buf contents fits sn.buf.
2020-05-27 12:07:16 +00:00
sn . br . buf = append ( sn . br . buf , buf ... )
sn . br . rows += rows
sn . brLock . Unlock ( )
2019-06-08 19:29:25 +00:00
return nil
2019-05-22 21:23:23 +00:00
}
2021-06-04 01:33:49 +00:00
if * disableRerouting || len ( storageNodes ) == 1 {
sn . brCond . Wait ( )
goto again
}
2020-05-27 12:07:16 +00:00
sn . brLock . Unlock ( )
2019-06-08 19:29:25 +00:00
2021-06-04 01:33:49 +00:00
// The buf contents doesn't fit sn.buf.
2020-05-27 12:07:16 +00:00
// This means that the current vmstorage is slow or will become broken soon.
2021-06-04 01:33:49 +00:00
// Spread buf among all the vmstorage nodes.
2021-06-05 13:16:16 +00:00
if err := rerouteRowsMayBlock ( sn , true , buf , rows ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "%d rows dropped because the current vmstorage buf is full and %w" , rows , err )
2019-06-08 19:29:25 +00:00
}
return nil
}
2020-05-27 12:07:16 +00:00
var closedCh = func ( ) <- chan struct { } {
ch := make ( chan struct { } )
close ( ch )
return ch
} ( )
2019-06-08 19:29:25 +00:00
2020-05-28 16:57:05 +00:00
func ( sn * storageNode ) run ( stopCh <- chan struct { } , snIdx int ) {
replicas := * replicationFactor
if replicas <= 0 {
replicas = 1
}
if replicas > len ( storageNodes ) {
replicas = len ( storageNodes )
}
2021-10-08 10:52:56 +00:00
sn . readOnlyCheckerWG . Add ( 1 )
go func ( ) {
defer sn . readOnlyCheckerWG . Done ( )
sn . readOnlyChecker ( stopCh )
} ( )
defer sn . readOnlyCheckerWG . Wait ( )
2020-09-28 18:35:40 +00:00
ticker := time . NewTicker ( 200 * time . Millisecond )
2020-05-27 12:07:16 +00:00
defer ticker . Stop ( )
var br bufRows
2020-06-01 11:33:29 +00:00
brLastResetTime := fasttime . UnixTimestamp ( )
2020-05-27 12:07:16 +00:00
var waitCh <- chan struct { }
mustStop := false
for ! mustStop {
sn . brLock . Lock ( )
bufLen := len ( sn . br . buf )
sn . brLock . Unlock ( )
waitCh = nil
2020-09-28 18:35:40 +00:00
if bufLen > 0 {
// Do not sleep if sn.br.buf isn't empty.
2020-05-27 12:07:16 +00:00
waitCh = closedCh
}
select {
case <- stopCh :
mustStop = true
// Make sure the sn.buf is flushed last time before returning
// in order to send the remaining bits of data.
case <- ticker . C :
case <- waitCh :
}
2020-09-28 18:35:40 +00:00
sn . brLock . Lock ( )
sn . br , br = br , sn . br
2021-06-04 01:33:49 +00:00
sn . brCond . Broadcast ( )
2020-09-28 18:35:40 +00:00
sn . brLock . Unlock ( )
2020-06-01 11:33:29 +00:00
currentTime := fasttime . UnixTimestamp ( )
if len ( br . buf ) < cap ( br . buf ) / 4 && currentTime - brLastResetTime > 10 {
// Free up capacity space occupied by br.buf in order to reduce memory usage after spikes.
br . buf = append ( br . buf [ : 0 : 0 ] , br . buf ... )
brLastResetTime = currentTime
}
2020-09-28 18:35:40 +00:00
sn . checkHealth ( )
2020-05-28 16:57:05 +00:00
if len ( br . buf ) == 0 {
2020-09-28 18:35:40 +00:00
// Nothing to send.
2020-05-27 12:07:16 +00:00
continue
}
2020-05-28 16:57:05 +00:00
// Send br to replicas storageNodes starting from snIdx.
2020-09-28 18:35:40 +00:00
for ! sendBufToReplicasNonblocking ( & br , snIdx , replicas ) {
t := timerpool . Get ( 200 * time . Millisecond )
select {
case <- stopCh :
timerpool . Put ( t )
return
case <- t . C :
timerpool . Put ( t )
sn . checkHealth ( )
}
2020-05-27 12:07:16 +00:00
}
2020-05-28 16:57:05 +00:00
br . reset ( )
}
}
2020-09-28 18:35:40 +00:00
func sendBufToReplicasNonblocking ( br * bufRows , snIdx , replicas int ) bool {
2020-05-28 16:57:05 +00:00
usedStorageNodes := make ( map [ * storageNode ] bool , replicas )
for i := 0 ; i < replicas ; i ++ {
idx := snIdx + i
attempts := 0
for {
attempts ++
if attempts > len ( storageNodes ) {
if i == 0 {
// The data wasn't replicated at all.
logger . Warnf ( "cannot push %d bytes with %d rows to storage nodes, since all the nodes are temporarily unavailable; " +
"re-trying to send the data soon" , len ( br . buf ) , br . rows )
return false
}
// The data is partially replicated, so just emit a warning and return true.
// We could retry sending the data again, but this may result in uncontrolled duplicate data.
// So it is better returning true.
rowsIncompletelyReplicatedTotal . Add ( br . rows )
logger . Warnf ( "cannot make a copy #%d out of %d copies according to -replicationFactor=%d for %d bytes with %d rows, " +
"since a part of storage nodes is temporarily unavailable" , i + 1 , replicas , * replicationFactor , len ( br . buf ) , br . rows )
return true
}
if idx >= len ( storageNodes ) {
2020-06-19 09:39:19 +00:00
idx %= len ( storageNodes )
2020-05-28 16:57:05 +00:00
}
sn := storageNodes [ idx ]
idx ++
if usedStorageNodes [ sn ] {
// The br has been already replicated to sn. Skip it.
continue
}
2020-09-28 18:35:40 +00:00
if ! sn . sendBufRowsNonblocking ( br ) {
2020-05-28 16:57:05 +00:00
// Cannot send data to sn. Go to the next sn.
continue
}
// Successfully sent data to sn.
usedStorageNodes [ sn ] = true
break
2020-05-27 12:07:16 +00:00
}
2019-05-22 21:23:23 +00:00
}
2020-05-28 16:57:05 +00:00
return true
}
2020-06-18 17:41:33 +00:00
func ( sn * storageNode ) checkHealth ( ) {
sn . bcLock . Lock ( )
defer sn . bcLock . Unlock ( )
if sn . bc != nil {
2020-09-28 18:35:40 +00:00
// The sn looks healthy.
return
2020-06-18 17:41:33 +00:00
}
bc , err := sn . dial ( )
if err != nil {
2020-11-13 22:43:32 +00:00
atomic . StoreUint32 ( & sn . broken , 1 )
2021-06-04 01:33:49 +00:00
sn . brCond . Broadcast ( )
2020-09-28 21:20:01 +00:00
if sn . lastDialErr == nil {
// Log the error only once.
sn . lastDialErr = err
logger . Warnf ( "cannot dial storageNode %q: %s" , sn . dialer . Addr ( ) , err )
}
2020-06-18 17:51:28 +00:00
return
2020-06-18 17:41:33 +00:00
}
2020-09-28 18:35:40 +00:00
logger . Infof ( "successfully dialed -storageNode=%q" , sn . dialer . Addr ( ) )
2020-09-28 21:20:01 +00:00
sn . lastDialErr = nil
2020-06-18 17:41:33 +00:00
sn . bc = bc
atomic . StoreUint32 ( & sn . broken , 0 )
2021-06-04 01:33:49 +00:00
sn . brCond . Broadcast ( )
2020-06-18 17:41:33 +00:00
}
2020-09-28 18:35:40 +00:00
func ( sn * storageNode ) sendBufRowsNonblocking ( br * bufRows ) bool {
2021-10-08 09:52:56 +00:00
if sn . isNotReady ( ) {
2020-09-28 18:35:40 +00:00
return false
}
2020-05-28 16:57:05 +00:00
sn . bcLock . Lock ( )
defer sn . bcLock . Unlock ( )
if sn . bc == nil {
2020-09-28 18:35:40 +00:00
// Do not call sn.dial() here in order to prevent long blocking on sn.bcLock.Lock(),
// which can negatively impact data sending in sendBufToReplicasNonblocking().
2020-11-13 22:43:32 +00:00
// sn.dial() should be called by sn.checkHealth() on unsuccessful call to sendBufToReplicasNonblocking().
2020-09-28 18:35:40 +00:00
return false
2020-05-28 16:57:05 +00:00
}
2021-08-11 08:40:52 +00:00
startTime := time . Now ( )
2020-05-28 16:57:05 +00:00
err := sendToConn ( sn . bc , br . buf )
2021-08-11 08:40:52 +00:00
duration := time . Since ( startTime )
sn . sendDurationSeconds . Add ( duration . Seconds ( ) )
2020-05-28 16:57:05 +00:00
if err == nil {
2020-09-28 18:35:40 +00:00
// Successfully sent buf to bc.
2020-05-28 16:57:05 +00:00
sn . rowsSent . Add ( br . rows )
return true
}
2021-10-08 10:52:56 +00:00
if errors . Is ( err , errStorageReadOnly ) {
// The vmstorage is transitioned to readonly mode.
2021-10-08 09:52:56 +00:00
atomic . StoreUint32 ( & sn . isReadOnly , 1 )
sn . brCond . Broadcast ( )
2021-10-08 10:52:56 +00:00
// Signal the caller that the data wasn't accepted by the vmstorage,
// so it will be re-routed to the remaining vmstorage nodes.
2021-10-08 09:52:56 +00:00
return false
}
2020-05-28 16:57:05 +00:00
// Couldn't flush buf to sn. Mark sn as broken.
2020-09-28 18:35:40 +00:00
logger . Warnf ( "cannot send %d bytes with %d rows to -storageNode=%q: %s; closing the connection to storageNode and " +
"re-routing this data to healthy storage nodes" , len ( br . buf ) , br . rows , sn . dialer . Addr ( ) , err )
2020-05-28 16:57:05 +00:00
if err = sn . bc . Close ( ) ; err != nil {
logger . Warnf ( "cannot close connection to storageNode %q: %s" , sn . dialer . Addr ( ) , err )
}
sn . bc = nil
atomic . StoreUint32 ( & sn . broken , 1 )
2021-06-04 01:33:49 +00:00
sn . brCond . Broadcast ( )
2020-09-28 18:35:40 +00:00
sn . connectionErrors . Inc ( )
2020-05-28 16:57:05 +00:00
return false
2019-05-22 21:23:23 +00:00
}
2020-05-27 12:07:16 +00:00
func sendToConn ( bc * handshake . BufferedConn , buf [ ] byte ) error {
2019-09-11 11:25:53 +00:00
if len ( buf ) == 0 {
2020-05-27 12:07:16 +00:00
// Nothing to send
2019-09-11 11:25:53 +00:00
return nil
}
2020-01-21 16:20:14 +00:00
timeoutSeconds := len ( buf ) / 3e5
2019-09-13 19:25:15 +00:00
if timeoutSeconds < 60 {
timeoutSeconds = 60
2019-09-11 10:37:03 +00:00
}
timeout := time . Duration ( timeoutSeconds ) * time . Second
deadline := time . Now ( ) . Add ( timeout )
2020-04-27 06:32:08 +00:00
if err := bc . SetWriteDeadline ( deadline ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot set write deadline to %s: %w" , deadline , err )
2019-05-22 21:23:23 +00:00
}
2019-09-11 10:37:03 +00:00
// sizeBuf guarantees that the rows batch will be either fully
// read or fully discarded on the vmstorage side.
// sizeBuf is used for read optimization in vmstorage.
2020-05-27 12:07:16 +00:00
sizeBuf := sizeBufPool . Get ( )
defer sizeBufPool . Put ( sizeBuf )
sizeBuf . B = encoding . MarshalUint64 ( sizeBuf . B [ : 0 ] , uint64 ( len ( buf ) ) )
if _ , err := bc . Write ( sizeBuf . B ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot write data size %d: %w" , len ( buf ) , err )
2019-05-22 21:23:23 +00:00
}
2020-04-27 06:32:08 +00:00
if _ , err := bc . Write ( buf ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot write data with size %d: %w" , len ( buf ) , err )
2019-05-22 21:23:23 +00:00
}
2020-04-27 06:32:08 +00:00
if err := bc . Flush ( ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot flush data with size %d: %w" , len ( buf ) , err )
2019-06-08 19:29:25 +00:00
}
2020-04-27 06:32:08 +00:00
// Wait for `ack` from vmstorage.
// This guarantees that the message has been fully received by vmstorage.
2020-04-28 08:18:41 +00:00
deadline = time . Now ( ) . Add ( timeout )
2020-04-27 06:32:08 +00:00
if err := bc . SetReadDeadline ( deadline ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot set read deadline for reading `ack` to vmstorage: %w" , err )
2020-04-27 06:32:08 +00:00
}
2020-05-27 12:07:16 +00:00
if _ , err := io . ReadFull ( bc , sizeBuf . B [ : 1 ] ) ; err != nil {
2020-06-30 19:58:18 +00:00
return fmt . Errorf ( "cannot read `ack` from vmstorage: %w" , err )
2020-04-27 06:32:08 +00:00
}
2021-10-08 09:52:56 +00:00
ackResp := sizeBuf . B [ 0 ]
switch ackResp {
case 1 :
2021-10-08 10:52:56 +00:00
// ok response, data successfully accepted by vmstorage
2021-10-08 09:52:56 +00:00
case 2 :
2021-10-08 10:52:56 +00:00
// vmstorage is in readonly mode
return errStorageReadOnly
2021-10-08 09:52:56 +00:00
default :
return fmt . Errorf ( "unexpected `ack` received from vmstorage; got %d; want 1 or 2" , sizeBuf . B [ 0 ] )
2020-04-27 06:32:08 +00:00
}
2021-10-08 09:52:56 +00:00
2019-05-22 21:23:23 +00:00
return nil
}
2020-05-27 12:07:16 +00:00
var sizeBufPool bytesutil . ByteBufferPool
func ( sn * storageNode ) dial ( ) ( * handshake . BufferedConn , error ) {
2019-05-22 21:23:23 +00:00
c , err := sn . dialer . Dial ( )
if err != nil {
sn . dialErrors . Inc ( )
2020-05-27 12:07:16 +00:00
return nil , err
2019-05-22 21:23:23 +00:00
}
compressionLevel := 1
2019-05-24 09:51:07 +00:00
if * disableRPCCompression {
compressionLevel = 0
}
2019-05-22 21:23:23 +00:00
bc , err := handshake . VMInsertClient ( c , compressionLevel )
if err != nil {
_ = c . Close ( )
sn . handshakeErrors . Inc ( )
2020-06-30 19:58:18 +00:00
return nil , fmt . Errorf ( "handshake error: %w" , err )
2019-05-22 21:23:23 +00:00
}
2020-05-27 12:07:16 +00:00
return bc , nil
2019-05-22 21:23:23 +00:00
}
2019-06-08 19:29:25 +00:00
// storageNode is a client sending data to vmstorage node.
2019-05-22 21:23:23 +00:00
type storageNode struct {
2021-06-04 01:33:49 +00:00
// The last time for the re-routing.
lastRerouteTime uint64
2020-05-27 12:07:16 +00:00
// broken is set to non-zero if the given vmstorage node is temporarily unhealthy.
// In this case the data is re-routed to the remaining healthy vmstorage nodes.
broken uint32
2019-06-08 19:29:25 +00:00
2021-10-08 09:52:56 +00:00
// isReadOnly is set to non-zero if the given vmstorage node is read only
// In this case the data is re-routed to the remaining healthy vmstorage nodes.
isReadOnly uint32
2020-05-27 12:07:16 +00:00
// brLock protects br.
brLock sync . Mutex
2019-05-22 21:23:23 +00:00
2021-06-04 01:33:49 +00:00
// brCond is used for waiting for free space in br.
brCond * sync . Cond
2020-05-27 12:07:16 +00:00
// Buffer with data that needs to be written to the storage node.
2020-05-28 16:57:05 +00:00
// It must be accessed under brLock.
2020-05-27 12:07:16 +00:00
br bufRows
2019-05-22 21:23:23 +00:00
2020-05-28 16:57:05 +00:00
// bcLock protects bc.
bcLock sync . Mutex
2021-10-08 10:52:56 +00:00
// waitGroup for readOnlyChecker
readOnlyCheckerWG sync . WaitGroup
2020-05-28 16:57:05 +00:00
// bc is a single connection to vmstorage for data transfer.
// It must be accessed under bcLock.
bc * handshake . BufferedConn
2019-06-08 19:29:25 +00:00
dialer * netutil . TCPDialer
2020-09-28 21:20:01 +00:00
// last error during dial.
lastDialErr error
2019-06-08 19:29:25 +00:00
// The number of dial errors to vmstorage node.
2019-05-22 21:23:23 +00:00
dialErrors * metrics . Counter
2019-06-08 19:29:25 +00:00
// The number of handshake errors to vmstorage node.
2019-05-22 21:23:23 +00:00
handshakeErrors * metrics . Counter
2019-06-08 19:29:25 +00:00
// The number of connection errors to vmstorage node.
2019-05-22 21:23:23 +00:00
connectionErrors * metrics . Counter
2019-06-08 19:29:25 +00:00
// The number of rows pushed to storageNode with push method.
rowsPushed * metrics . Counter
// The number of rows sent to vmstorage node.
rowsSent * metrics . Counter
2019-05-22 21:23:23 +00:00
2019-06-08 19:29:25 +00:00
// The number of rows rerouted from the given vmstorage node
// to healthy nodes when the given node was unhealthy.
rowsReroutedFromHere * metrics . Counter
// The number of rows rerouted to the given vmstorage node
// from other nodes when they were unhealthy.
rowsReroutedToHere * metrics . Counter
2021-08-11 08:40:52 +00:00
// The total duration spent for sending data to vmstorage node.
// This metric is useful for determining the saturation of vminsert->vmstorage link.
sendDurationSeconds * metrics . FloatCounter
2019-05-22 21:23:23 +00:00
}
2019-06-08 19:29:25 +00:00
// storageNodes contains a list of vmstorage node clients.
2019-05-22 21:23:23 +00:00
var storageNodes [ ] * storageNode
2021-06-04 01:33:49 +00:00
var storageNodesWG sync . WaitGroup
2019-05-22 21:23:23 +00:00
2021-06-04 01:33:49 +00:00
var storageNodesStopCh = make ( chan struct { } )
2019-05-22 21:23:23 +00:00
2021-10-07 09:21:42 +00:00
// hashSeed is a seed for distributing time series amont storage nodes.
var hashSeed byte
2019-06-08 19:29:25 +00:00
// InitStorageNodes initializes vmstorage nodes' connections to the given addrs.
2021-10-07 09:21:42 +00:00
//
// eed is used for changing the distribution of input time series among addrs.
func InitStorageNodes ( addrs [ ] string , seed byte ) {
2019-05-22 21:23:23 +00:00
if len ( addrs ) == 0 {
logger . Panicf ( "BUG: addrs must be non-empty" )
}
2021-10-07 09:21:42 +00:00
hashSeed = seed
2019-05-22 21:23:23 +00:00
2021-06-23 11:00:06 +00:00
// Sort addrs in order to guarantee identical series->vmstorage mapping across all the vminsert nodes.
addrsCopy := append ( [ ] string { } , addrs ... )
sort . Strings ( addrsCopy )
addrs = addrsCopy
2020-05-28 16:57:05 +00:00
storageNodes = storageNodes [ : 0 ]
2019-05-22 21:23:23 +00:00
for _ , addr := range addrs {
2021-09-15 15:04:28 +00:00
if _ , _ , err := net . SplitHostPort ( addr ) ; err != nil {
// Automatically add missing port.
addr += ":8400"
}
2019-05-22 21:23:23 +00:00
sn := & storageNode {
dialer : netutil . NewTCPDialer ( "vminsert" , addr ) ,
2019-06-08 19:29:25 +00:00
dialErrors : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_dial_errors_total { name="vminsert", addr=%q} ` , addr ) ) ,
handshakeErrors : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_handshake_errors_total { name="vminsert", addr=%q} ` , addr ) ) ,
connectionErrors : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_connection_errors_total { name="vminsert", addr=%q} ` , addr ) ) ,
rowsPushed : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_rows_pushed_total { name="vminsert", addr=%q} ` , addr ) ) ,
rowsSent : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_rows_sent_total { name="vminsert", addr=%q} ` , addr ) ) ,
rowsReroutedFromHere : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_rows_rerouted_from_here_total { name="vminsert", addr=%q} ` , addr ) ) ,
rowsReroutedToHere : metrics . NewCounter ( fmt . Sprintf ( ` vm_rpc_rows_rerouted_to_here_total { name="vminsert", addr=%q} ` , addr ) ) ,
2021-08-11 08:40:52 +00:00
sendDurationSeconds : metrics . NewFloatCounter ( fmt . Sprintf ( ` vm_rpc_send_duration_seconds_total { name="vminsert", addr=%q} ` , addr ) ) ,
2019-05-22 21:23:23 +00:00
}
2021-06-04 01:33:49 +00:00
sn . brCond = sync . NewCond ( & sn . brLock )
2019-06-08 19:29:25 +00:00
_ = metrics . NewGauge ( fmt . Sprintf ( ` vm_rpc_rows_pending { name="vminsert", addr=%q} ` , addr ) , func ( ) float64 {
2020-05-27 12:07:16 +00:00
sn . brLock . Lock ( )
n := sn . br . rows
sn . brLock . Unlock ( )
2019-06-08 19:29:25 +00:00
return float64 ( n )
} )
_ = metrics . NewGauge ( fmt . Sprintf ( ` vm_rpc_buf_pending_bytes { name="vminsert", addr=%q} ` , addr ) , func ( ) float64 {
2020-05-27 12:07:16 +00:00
sn . brLock . Lock ( )
n := len ( sn . br . buf )
sn . brLock . Unlock ( )
2019-06-08 19:29:25 +00:00
return float64 ( n )
} )
2020-11-17 20:13:20 +00:00
_ = metrics . NewGauge ( fmt . Sprintf ( ` vm_rpc_vmstorage_is_reachable { name="vminsert", addr=%q} ` , addr ) , func ( ) float64 {
2021-10-08 09:52:56 +00:00
if atomic . LoadUint32 ( & sn . broken ) != 0 {
2020-11-17 20:13:20 +00:00
return 0
}
return 1
} )
2021-10-08 09:52:56 +00:00
_ = metrics . NewGauge ( fmt . Sprintf ( ` vm_rpc_vmstorage_is_read_only { name="vminsert", addr=%q} ` , addr ) , func ( ) float64 {
return float64 ( atomic . LoadUint32 ( & sn . isReadOnly ) )
} )
2019-05-22 21:23:23 +00:00
storageNodes = append ( storageNodes , sn )
}
2019-06-08 19:29:25 +00:00
2020-05-27 12:07:16 +00:00
maxBufSizePerStorageNode = memory . Allowed ( ) / 8 / len ( storageNodes )
2020-05-24 22:39:24 +00:00
if maxBufSizePerStorageNode > consts . MaxInsertPacketSize {
maxBufSizePerStorageNode = consts . MaxInsertPacketSize
}
2020-05-28 16:57:05 +00:00
for idx , sn := range storageNodes {
storageNodesWG . Add ( 1 )
go func ( sn * storageNode , idx int ) {
sn . run ( storageNodesStopCh , idx )
storageNodesWG . Done ( )
} ( sn , idx )
}
2019-05-22 21:23:23 +00:00
}
// Stop gracefully stops netstorage.
func Stop ( ) {
2019-06-08 19:29:25 +00:00
close ( storageNodesStopCh )
2021-06-04 01:33:49 +00:00
for _ , sn := range storageNodes {
sn . brCond . Broadcast ( )
}
2019-05-22 21:23:23 +00:00
storageNodesWG . Wait ( )
}
2019-06-08 19:29:25 +00:00
2021-06-05 13:16:16 +00:00
// rerouteRowsMayBlock re-routes rows from buf among healthy storage nodes.
2020-05-27 12:07:16 +00:00
//
2021-06-04 01:33:49 +00:00
// It waits until healthy storage nodes have enough space for the re-routed rows.
2020-05-27 12:07:16 +00:00
// This guarantees backpressure if the ingestion rate exceeds vmstorage nodes'
// ingestion rate capacity.
//
2021-06-04 01:33:49 +00:00
// It returns non-nil error only if Stop is called.
2021-06-05 13:16:16 +00:00
func rerouteRowsMayBlock ( snSource * storageNode , mayUseSNSource bool , buf [ ] byte , rows int ) error {
2021-06-04 01:33:49 +00:00
if len ( storageNodes ) < 2 {
logger . Panicf ( "BUG: re-routing can work only if at least 2 storage nodes are configured; got %d nodes" , len ( storageNodes ) )
2020-05-27 12:07:16 +00:00
}
2021-06-05 13:16:16 +00:00
reroutesTotal . Inc ( )
2021-06-08 08:01:12 +00:00
atomic . StoreUint64 ( & snSource . lastRerouteTime , fasttime . UnixTimestamp ( ) )
2021-06-05 13:16:16 +00:00
sns := getStorageNodesMapForRerouting ( snSource , mayUseSNSource )
if areStorageNodesEqual ( sns ) {
// Fast path - all the storage nodes are the same - send the buf to them.
2021-06-04 01:33:49 +00:00
sn := sns [ 0 ]
2021-06-05 13:16:16 +00:00
if ! sn . sendBufMayBlock ( buf ) {
return fmt . Errorf ( "cannot re-route data because of graceful shutdown" )
}
if sn != snSource {
snSource . rowsReroutedFromHere . Add ( rows )
sn . rowsReroutedToHere . Add ( rows )
2020-09-28 18:35:40 +00:00
}
2021-06-05 13:16:16 +00:00
return nil
2020-09-28 18:35:40 +00:00
}
2021-06-04 01:33:49 +00:00
src := buf
2021-06-05 13:16:16 +00:00
var mr storage . MetricRow
2019-06-08 19:29:25 +00:00
for len ( src ) > 0 {
2021-05-08 14:55:44 +00:00
tail , err := mr . UnmarshalX ( src )
2019-06-08 19:29:25 +00:00
if err != nil {
2021-06-04 01:33:49 +00:00
logger . Panicf ( "BUG: cannot unmarshal MetricRow: %s" , err )
2019-06-08 19:29:25 +00:00
}
rowBuf := src [ : len ( src ) - len ( tail ) ]
src = tail
2021-06-04 01:33:49 +00:00
reroutedRowsProcessed . Inc ( )
h := xxhash . Sum64 ( mr . MetricNameRaw )
2021-05-08 14:55:44 +00:00
mr . ResetX ( )
2021-06-05 13:16:16 +00:00
idx := h % uint64 ( len ( sns ) )
sn := sns [ idx ]
if ! sn . sendBufMayBlock ( rowBuf ) {
return fmt . Errorf ( "cannot re-route data because of graceful shutdown" )
2021-06-04 01:33:49 +00:00
}
2021-06-05 13:16:16 +00:00
if sn != snSource {
snSource . rowsReroutedFromHere . Inc ( )
sn . rowsReroutedToHere . Inc ( )
2019-06-08 19:29:25 +00:00
}
}
2021-06-05 13:16:16 +00:00
return nil
2020-05-27 12:07:16 +00:00
}
2021-06-04 01:33:49 +00:00
func ( sn * storageNode ) sendBufMayBlock ( buf [ ] byte ) bool {
2020-05-27 12:07:16 +00:00
sn . brLock . Lock ( )
2021-06-04 01:33:49 +00:00
for len ( sn . br . buf ) + len ( buf ) > maxBufSizePerStorageNode {
select {
case <- storageNodesStopCh :
sn . brLock . Unlock ( )
return false
default :
}
sn . brCond . Wait ( )
2020-05-27 12:07:16 +00:00
}
2021-06-04 01:33:49 +00:00
sn . br . buf = append ( sn . br . buf , buf ... )
sn . br . rows ++
2020-05-27 12:07:16 +00:00
sn . brLock . Unlock ( )
2021-06-04 01:33:49 +00:00
return true
2019-06-08 19:29:25 +00:00
}
2021-10-08 10:52:56 +00:00
func ( sn * storageNode ) readOnlyChecker ( stop <- chan struct { } ) {
ticker := time . NewTicker ( time . Second * 30 )
defer ticker . Stop ( )
for {
select {
case <- stop :
2021-10-08 09:52:56 +00:00
return
2021-10-08 10:52:56 +00:00
case <- ticker . C :
sn . checkReadOnlyMode ( )
2021-10-08 09:52:56 +00:00
}
}
2021-10-08 10:52:56 +00:00
}
func ( sn * storageNode ) checkReadOnlyMode ( ) {
if atomic . LoadUint32 ( & sn . isReadOnly ) == 0 {
// fast path - the sn isn't in readonly mode
return
}
// Check whether the storage remains in readonly mode
sn . bcLock . Lock ( )
defer sn . bcLock . Unlock ( )
if sn . bc == nil {
return
}
// send nil buff to check ack response from storage
err := sendToConn ( sn . bc , nil )
if err == nil {
// The storage switched from readonly to non-readonly mode
atomic . StoreUint32 ( & sn . isReadOnly , 0 )
return
}
if ! errors . Is ( err , errStorageReadOnly ) {
logger . Errorf ( "cannot check storage readonly mode for -storageNode=%q: %s" , sn . dialer . Addr ( ) , err )
}
2021-10-08 09:52:56 +00:00
}
2021-06-05 13:16:16 +00:00
func getStorageNodesMapForRerouting ( snExclude * storageNode , mayUseSNExclude bool ) [ ] * storageNode {
sns := getStorageNodesForRerouting ( snExclude , true )
if len ( sns ) == len ( storageNodes ) {
return sns
}
if ! mayUseSNExclude {
sns = getStorageNodesForRerouting ( snExclude , false )
}
for len ( sns ) < len ( storageNodes ) {
sns = append ( sns , snExclude )
}
return sns
}
func areStorageNodesEqual ( sns [ ] * storageNode ) bool {
snOrigin := sns [ 0 ]
for _ , sn := range sns [ 1 : ] {
if sn != snOrigin {
return false
}
}
return true
}
func getStorageNodesForRerouting ( snExclude * storageNode , skipRecentlyReroutedNodes bool ) [ ] * storageNode {
sns := make ( [ ] * storageNode , 0 , len ( storageNodes ) )
currentTime := fasttime . UnixTimestamp ( )
for i , sn := range storageNodes {
2021-10-08 09:52:56 +00:00
if sn == snExclude || sn . isNotReady ( ) {
2021-06-05 13:16:16 +00:00
// Skip snExclude and broken storage nodes.
continue
}
if skipRecentlyReroutedNodes && currentTime <= atomic . LoadUint64 ( & sn . lastRerouteTime ) + 5 {
// Skip nodes, which were re-routed recently.
continue
}
for len ( sns ) <= i {
sns = append ( sns , sn )
}
}
if len ( sns ) > 0 {
for len ( sns ) < len ( storageNodes ) {
sns = append ( sns , sns [ 0 ] )
}
}
return sns
}
2019-06-08 19:29:25 +00:00
var (
2020-05-24 22:39:24 +00:00
maxBufSizePerStorageNode int
2021-06-04 01:33:49 +00:00
reroutedRowsProcessed = metrics . NewCounter ( ` vm_rpc_rerouted_rows_processed_total { name="vminsert"} ` )
reroutesTotal = metrics . NewCounter ( ` vm_rpc_reroutes_total { name="vminsert"} ` )
2020-05-28 16:57:05 +00:00
rowsIncompletelyReplicatedTotal = metrics . NewCounter ( ` vm_rpc_rows_incompletely_replicated_total { name="vminsert"} ` )
2019-06-08 19:29:25 +00:00
)