VictoriaMetrics/app/vminsert/netstorage/netstorage.go

489 lines
14 KiB
Go

package netstorage
import (
"flag"
"fmt"
"io"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consts"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
var disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
// push pushes buf to sn.
//
// It falls back to sending data to another vmstorage node if sn is currently
// unavailable.
//
// rows is the number of rows in the buf.
func (sn *storageNode) push(buf []byte, rows int) error {
if len(buf) > consts.MaxInsertPacketSize {
logger.Panicf("BUG: len(buf)=%d cannot exceed %d", len(buf), consts.MaxInsertPacketSize)
}
sn.rowsPushed.Add(rows)
sn.mu.Lock()
defer sn.mu.Unlock()
if sn.broken {
// The vmstorage node is broken. Re-route buf to healthy vmstorage nodes.
if !addToReroutedBuf(buf, rows) {
rowsLostTotal.Add(rows)
return fmt.Errorf("%d rows dropped because of reroutedBuf overflows %d bytes", rows, reroutedBufMaxSize)
}
sn.rowsReroutedFromHere.Add(rows)
return nil
}
if len(sn.buf)+len(buf) <= consts.MaxInsertPacketSize {
// Fast path: the buf contents fits sn.buf.
sn.buf = append(sn.buf, buf...)
sn.rows += rows
return nil
}
// Slow path: the buf contents doesn't fit sn.buf.
// Flush sn.buf to vmstorage and then add buf to sn.buf.
if err := sn.flushBufLocked(); err != nil {
// Failed to flush or re-route sn.buf to vmstorage nodes.
// The sn.buf is already dropped by flushBufLocked.
// Drop buf too, since there is litte sense in trying to rescue it.
rowsLostTotal.Add(rows)
return err
}
// Successful flush.
sn.buf = append(sn.buf, buf...)
sn.rows += rows
return nil
}
func (sn *storageNode) sendReroutedRow(buf []byte) error {
sn.mu.Lock()
defer sn.mu.Unlock()
if sn.broken {
return errBrokenStorageNode
}
if len(sn.buf)+len(buf) > consts.MaxInsertPacketSize {
return fmt.Errorf("cannot put %d bytes into vmstorage buffer, since its size cannot exceed %d bytes", len(sn.buf)+len(buf), consts.MaxInsertPacketSize)
}
sn.buf = append(sn.buf, buf...)
sn.rows++
return nil
}
var errBrokenStorageNode = fmt.Errorf("the vmstorage node is temporarily broken")
func (sn *storageNode) flushBufLocked() error {
err := sn.sendBufLocked(sn.buf)
if err == nil {
// Successful flush. Remove broken flag.
sn.broken = false
sn.rowsSent.Add(sn.rows)
sn.buf = sn.buf[:0]
sn.rows = 0
return nil
}
// Couldn't flush sn.buf to vmstorage. Mark sn as broken
// and try re-routing sn.buf to healthy vmstorage nodes.
sn.broken = true
if !addToReroutedBuf(sn.buf, sn.rows) {
// Preserve sn.buf when it cannot be sent to healthy nodes
// in the hope the error will disappear on the next call to flushBufLocked.
//
// This should fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/294 .
return err
}
sn.buf = sn.buf[:0]
sn.rows = 0
return err
}
func (sn *storageNode) sendBufLocked(buf []byte) error {
if len(buf) == 0 {
return nil
}
if sn.bc == nil {
if err := sn.dial(); err != nil {
return fmt.Errorf("cannot dial %q: %s", sn.dialer.Addr(), err)
}
}
if err := sn.sendToConn(sn.bc, buf); err != nil {
sn.closeBrokenConn()
return err
}
return nil
}
func (sn *storageNode) sendToConn(bc *handshake.BufferedConn, buf []byte) error {
timeoutSeconds := len(buf) / 3e5
if timeoutSeconds < 60 {
timeoutSeconds = 60
}
timeout := time.Duration(timeoutSeconds) * time.Second
deadline := time.Now().Add(timeout)
if err := bc.SetWriteDeadline(deadline); err != nil {
return fmt.Errorf("cannot set write deadline to %s: %s", deadline, err)
}
// sizeBuf guarantees that the rows batch will be either fully
// read or fully discarded on the vmstorage side.
// sizeBuf is used for read optimization in vmstorage.
sn.sizeBuf = encoding.MarshalUint64(sn.sizeBuf[:0], uint64(len(buf)))
if _, err := bc.Write(sn.sizeBuf); err != nil {
return fmt.Errorf("cannot write data size %d: %s", len(buf), err)
}
if _, err := bc.Write(buf); err != nil {
return fmt.Errorf("cannot write data with size %d: %s", len(buf), err)
}
if err := bc.Flush(); err != nil {
return fmt.Errorf("cannot flush data with size %d: %s", len(buf), err)
}
// Wait for `ack` from vmstorage.
// This guarantees that the message has been fully received by vmstorage.
deadline = time.Now().Add(timeout)
if err := bc.SetReadDeadline(deadline); err != nil {
return fmt.Errorf("cannot set read deadline for reading `ack` to vmstorage: %s", err)
}
if _, err := io.ReadFull(bc, sn.sizeBuf[:1]); err != nil {
return fmt.Errorf("cannot read `ack` from vmstorage: %s", err)
}
if sn.sizeBuf[0] != 1 {
return fmt.Errorf("unexpected `ack` received from vmstorage; got %d; want %d", sn.sizeBuf[0], 1)
}
return nil
}
func (sn *storageNode) dial() error {
c, err := sn.dialer.Dial()
if err != nil {
sn.dialErrors.Inc()
return err
}
compressionLevel := 1
if *disableRPCCompression {
compressionLevel = 0
}
bc, err := handshake.VMInsertClient(c, compressionLevel)
if err != nil {
_ = c.Close()
sn.handshakeErrors.Inc()
return fmt.Errorf("handshake error: %s", err)
}
sn.bc = bc
return nil
}
func (sn *storageNode) closeBrokenConn() {
if sn.bc == nil {
return
}
_ = sn.bc.Close()
sn.bc = nil
sn.connectionErrors.Inc()
}
func (sn *storageNode) run(stopCh <-chan struct{}) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
mustStop := false
for !mustStop {
select {
case <-stopCh:
mustStop = true
// Make sure flushBufLocked is called last time before returning
// in order to send the remaining bits of data.
case <-ticker.C:
}
sn.mu.Lock()
if err := sn.flushBufLocked(); err != nil {
sn.closeBrokenConn()
logger.Errorf("cannot flush data to storageNode %q: %s", sn.dialer.Addr(), err)
}
sn.mu.Unlock()
}
}
func rerouteWorker(stopCh <-chan struct{}) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
var buf []byte
mustStop := false
for !mustStop {
select {
case <-stopCh:
mustStop = true
// Make sure spreadReroutedBufToStorageNodes is called last time before returning
// in order to reroute the remaining data to healthy vmstorage nodes.
case <-ticker.C:
}
var err error
buf, err = spreadReroutedBufToStorageNodes(buf[:0])
if err != nil {
rerouteErrors.Inc()
logger.Errorf("cannot reroute data among healthy vmstorage nodes: %s", err)
}
}
}
// storageNode is a client sending data to vmstorage node.
type storageNode struct {
mu sync.Mutex
// Buffer with data that needs to be written to vmstorage node.
buf []byte
// The number of rows buf contains at the moment.
rows int
// Temporary buffer for encoding marshaled buf size.
sizeBuf []byte
// broken is set to true if the given vmstorage node is temporarily unhealthy.
// In this case the data is re-routed to the remaining healthy vmstorage nodes.
broken bool
dialer *netutil.TCPDialer
bc *handshake.BufferedConn
// The number of dial errors to vmstorage node.
dialErrors *metrics.Counter
// The number of handshake errors to vmstorage node.
handshakeErrors *metrics.Counter
// The number of connection errors to vmstorage node.
connectionErrors *metrics.Counter
// The number of rows pushed to storageNode with push method.
rowsPushed *metrics.Counter
// The number of rows sent to vmstorage node.
rowsSent *metrics.Counter
// The number of rows rerouted from the given vmstorage node
// to healthy nodes when the given node was unhealthy.
rowsReroutedFromHere *metrics.Counter
// The number of rows rerouted to the given vmstorage node
// from other nodes when they were unhealthy.
rowsReroutedToHere *metrics.Counter
}
// storageNodes contains a list of vmstorage node clients.
var storageNodes []*storageNode
var (
storageNodesWG sync.WaitGroup
rerouteWorkerWG sync.WaitGroup
)
var (
storageNodesStopCh = make(chan struct{})
rerouteWorkerStopCh = make(chan struct{})
)
// InitStorageNodes initializes vmstorage nodes' connections to the given addrs.
func InitStorageNodes(addrs []string) {
if len(addrs) == 0 {
logger.Panicf("BUG: addrs must be non-empty")
}
if len(addrs) > 255 {
logger.Panicf("BUG: too much addresses: %d; max supported %d addresses", len(addrs), 255)
}
for _, addr := range addrs {
sn := &storageNode{
dialer: netutil.NewTCPDialer("vminsert", addr),
dialErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_dial_errors_total{name="vminsert", addr=%q}`, addr)),
handshakeErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_handshake_errors_total{name="vminsert", addr=%q}`, addr)),
connectionErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_connection_errors_total{name="vminsert", addr=%q}`, addr)),
rowsPushed: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_pushed_total{name="vminsert", addr=%q}`, addr)),
rowsSent: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_sent_total{name="vminsert", addr=%q}`, addr)),
rowsReroutedFromHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_from_here_total{name="vminsert", addr=%q}`, addr)),
rowsReroutedToHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_to_here_total{name="vminsert", addr=%q}`, addr)),
}
_ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_rows_pending{name="vminsert", addr=%q}`, addr), func() float64 {
sn.mu.Lock()
n := sn.rows
sn.mu.Unlock()
return float64(n)
})
_ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_buf_pending_bytes{name="vminsert", addr=%q}`, addr), func() float64 {
sn.mu.Lock()
n := len(sn.buf)
sn.mu.Unlock()
return float64(n)
})
storageNodes = append(storageNodes, sn)
storageNodesWG.Add(1)
go func(addr string) {
sn.run(storageNodesStopCh)
storageNodesWG.Done()
}(addr)
}
reroutedBufMaxSize = memory.Allowed() / 16
rerouteWorkerWG.Add(1)
go func() {
rerouteWorker(rerouteWorkerStopCh)
rerouteWorkerWG.Done()
}()
}
// Stop gracefully stops netstorage.
func Stop() {
close(rerouteWorkerStopCh)
rerouteWorkerWG.Wait()
close(storageNodesStopCh)
storageNodesWG.Wait()
}
func addToReroutedBuf(buf []byte, rows int) bool {
reroutedLock.Lock()
defer reroutedLock.Unlock()
if len(reroutedBuf)+len(buf) > reroutedBufMaxSize {
reroutedBufOverflows.Inc()
return false
}
reroutedBuf = append(reroutedBuf, buf...)
reroutedRows += rows
reroutesTotal.Inc()
return true
}
func spreadReroutedBufToStorageNodes(swapBuf []byte) ([]byte, error) {
healthyStorageNodes := getHealthyStorageNodes()
if len(healthyStorageNodes) == 0 {
// No more vmstorage nodes to write data to.
return swapBuf, fmt.Errorf("all the storage nodes are unhealthy")
}
reroutedLock.Lock()
reroutedBuf, swapBuf = swapBuf[:0], reroutedBuf
rows := reroutedRows
reroutedRows = 0
reroutedLock.Unlock()
if len(swapBuf) == 0 {
// Nothing to re-route.
return swapBuf, nil
}
var mr storage.MetricRow
src := swapBuf
rowsProcessed := 0
for len(src) > 0 {
tail, err := mr.Unmarshal(src)
if err != nil {
logger.Panicf("BUG: cannot unmarshal recently marshaled MetricRow: %s", err)
}
rowBuf := src[:len(src)-len(tail)]
src = tail
// Use non-consistent hashing instead of jump hash in order to re-route rows
// equally among healthy vmstorage nodes.
// This should spread the increased load among healthy vmstorage nodes.
h := xxhash.Sum64(mr.MetricNameRaw)
idx := h % uint64(len(healthyStorageNodes))
attempts := 0
for {
sn := healthyStorageNodes[idx]
err := sn.sendReroutedRow(rowBuf)
if err == nil {
sn.rowsReroutedToHere.Inc()
break
}
// Cannot send data to sn. Try sending to the next vmstorage node.
idx++
if idx >= uint64(len(healthyStorageNodes)) {
idx = 0
}
attempts++
if attempts < len(healthyStorageNodes) {
continue
}
// There are no healthy nodes.
// Try returning the remaining data to reroutedBuf if it has enough free space.
rowsRemaining := rows - rowsProcessed
recovered := false
reroutedLock.Lock()
if len(rowBuf)+len(tail)+len(reroutedBuf) <= reroutedBufMaxSize {
swapBuf = append(swapBuf[:0], rowBuf...)
swapBuf = append(swapBuf, tail...)
swapBuf = append(swapBuf, reroutedBuf...)
reroutedBuf, swapBuf = swapBuf, reroutedBuf[:0]
reroutedRows += rowsRemaining
recovered = true
}
reroutedLock.Unlock()
if recovered {
return swapBuf, nil
}
rowsLostTotal.Add(rowsRemaining)
return swapBuf, fmt.Errorf("all the %d vmstorage nodes are unavailable; lost %d rows; last error: %s", len(storageNodes), rowsRemaining, err)
}
rowsProcessed++
}
if rowsProcessed != rows {
logger.Panicf("BUG: unexpected number of rows processed; got %d; want %d", rowsProcessed, rows)
}
reroutedRowsProcessed.Add(rowsProcessed)
return swapBuf, nil
}
var (
reroutedLock sync.Mutex
reroutedBuf []byte
reroutedRows int
reroutedBufMaxSize int
reroutedRowsProcessed = metrics.NewCounter(`vm_rpc_rerouted_rows_processed_total{name="vminsert"}`)
reroutedBufOverflows = metrics.NewCounter(`vm_rpc_rerouted_buf_overflows_total{name="vminsert"}`)
reroutesTotal = metrics.NewCounter(`vm_rpc_reroutes_total{name="vminsert"}`)
_ = metrics.NewGauge(`vm_rpc_rerouted_rows_pending{name="vminsert"}`, func() float64 {
reroutedLock.Lock()
n := reroutedRows
reroutedLock.Unlock()
return float64(n)
})
_ = metrics.NewGauge(`vm_rpc_rerouted_buf_pending_bytes{name="vminsert"}`, func() float64 {
reroutedLock.Lock()
n := len(reroutedBuf)
reroutedLock.Unlock()
return float64(n)
})
rerouteErrors = metrics.NewCounter(`vm_rpc_reroute_errors_total{name="vminsert"}`)
rowsLostTotal = metrics.NewCounter(`vm_rpc_rows_lost_total{name="vminsert"}`)
)
func getHealthyStorageNodes() []*storageNode {
sns := make([]*storageNode, 0, len(storageNodes)-1)
for _, sn := range storageNodes {
sn.mu.Lock()
if !sn.broken {
sns = append(sns, sn)
}
sn.mu.Unlock()
}
return sns
}