app/vminsert: moves replication at datapointWrite level
* It allows tolerating a slow storage node when replicationFactor > 1 and disableRerouting=false
* It allows partial replication when replicationFactor > 1 and dropSamplesOnOverload=true
* It increases CPU and memory usage by 30-50% per replica

Signed-off-by: f41gh7 <nik@victoriametrics.com>
Parent: 08cbbf8134
Commit: 1f80c3ce08
2 changed files with 85 additions and 51 deletions
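The diff below moves replication from the storage-node send loop into the per-datapoint write path. As a rough illustration of the index arithmetic it relies on, here is a minimal, self-contained Go sketch of how replica targets are picked with wrap-around; replicaIndexes, startIdx and nodeCount are illustrative names, not part of the patch.

// Minimal sketch (not part of the diff): choosing replica target indexes with
// wrap-around, the way writeDataPointToReplicas does below.
package main

import "fmt"

// replicaIndexes returns the storage node indexes that should receive a copy
// of a data point, starting from startIdx and wrapping around nodeCount.
func replicaIndexes(startIdx, replicas, nodeCount int) []int {
	idxs := make([]int, 0, replicas)
	for i := 0; i < replicas; i++ {
		idxs = append(idxs, (startIdx+i)%nodeCount)
	}
	return idxs
}

func main() {
	// With 5 storage nodes and -replicationFactor=3, a data point hashed to
	// node 4 also gets copies on nodes 0 and 1.
	fmt.Println(replicaIndexes(4, 3, 5)) // [4 0 1]
}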
@@ -122,11 +122,48 @@ func (ctx *InsertCtx) ApplyRelabeling() {
 func (ctx *InsertCtx) WriteDataPoint(at *auth.Token, labels []prompb.Label, timestamp int64, value float64) error {
     ctx.MetricNameBuf = storage.MarshalMetricNameRaw(ctx.MetricNameBuf[:0], at.AccountID, at.ProjectID, labels)
     storageNodeIdx := ctx.GetStorageNodeIdx(at, labels)
-    return ctx.WriteDataPointExt(storageNodeIdx, ctx.MetricNameBuf, timestamp, value)
+    return ctx.writeDataPointToReplicas(storageNodeIdx, ctx.MetricNameBuf, timestamp, value)
 }
 
 // WriteDataPointExt writes the given metricNameRaw with (timestamp, value) to ctx buffer with the given storageNodeIdx.
 func (ctx *InsertCtx) WriteDataPointExt(storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
+    return ctx.writeDataPointToReplicas(storageNodeIdx, metricNameRaw, timestamp, value)
+}
+
+func (ctx *InsertCtx) writeDataPointToReplicas(storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
+    var firstErr error
+    var failsCount int
+    for i := 0; i < replicas; i++ {
+        snIdx := storageNodeIdx + i
+        if snIdx >= len(ctx.snb.sns) {
+            snIdx %= len(ctx.snb.sns)
+        }
+
+        if err := ctx.writeDataPointExt(snIdx, metricNameRaw, timestamp, value); err != nil {
+            if replicas == 1 {
+                return fmt.Errorf("cannot write datapoint: %w", err)
+            }
+            if firstErr == nil {
+                firstErr = err
+            }
+            failsCount++
+            // The data is partially replicated, so just emit a warning and return true.
+            // We could retry sending the data again, but this may result in uncontrolled duplicate data.
+            // So it is better returning true.
+            br := &ctx.bufRowss[snIdx]
+            rowsIncompletelyReplicatedTotal.Add(br.rows)
+            incompleteReplicationLogger.Warnf("cannot make a copy #%d out of %d copies according to -replicationFactor=%d, used_nodes=%d for %d bytes with %d rows, "+
+                "since a part of storage nodes is temporarily unavailable", i+1, replicas, *replicationFactor, len(br.buf), br.rows)
+            continue
+        }
+    }
+    if failsCount == replicas {
+        return fmt.Errorf("cannot write datapoint to any replicas: %w", firstErr)
+    }
+    return nil
+}
+
+func (ctx *InsertCtx) writeDataPointExt(storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
     br := &ctx.bufRowss[storageNodeIdx]
     snb := ctx.snb
     sn := snb.sns[storageNodeIdx]
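To make the error handling above concrete: a write now fails only when every replica write fails, while partial failures are counted and logged. The standalone sketch below (hypothetical names; writeOne stands in for ctx.writeDataPointExt) reproduces that pattern under those assumptions.

// Minimal sketch (not part of the diff): the "fail only if every replica
// fails" pattern used by writeDataPointToReplicas above.
package main

import (
	"errors"
	"fmt"
)

var errNodeDown = errors.New("storage node is temporarily unavailable")

// writeToReplicas returns an error only when no replica accepted the data;
// partial failures are tolerated (vminsert counts and logs them instead).
func writeToReplicas(writeOne func(idx int) error, startIdx, replicas, nodeCount int) error {
	var firstErr error
	failsCount := 0
	for i := 0; i < replicas; i++ {
		idx := (startIdx + i) % nodeCount
		if err := writeOne(idx); err != nil {
			if firstErr == nil {
				firstErr = err
			}
			failsCount++
		}
	}
	if failsCount == replicas {
		return fmt.Errorf("cannot write datapoint to any replicas: %w", firstErr)
	}
	return nil
}

func main() {
	// Node 1 is down; with replicas=2 the write still succeeds.
	writeOne := func(idx int) error {
		if idx == 1 {
			return errNodeDown
		}
		return nil
	}
	fmt.Println(writeToReplicas(writeOne, 0, 2, 3)) // <nil>
}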
@@ -45,6 +45,8 @@ var (
     "See also -disableRerouting")
 )
 
+var replicas int
+
 var errStorageReadOnly = errors.New("storage node is read only")
 
 func (sn *storageNode) isReady() bool {
@@ -140,15 +142,6 @@ again:
 }
 
 func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
-    replicas := *replicationFactor
-    if replicas <= 0 {
-        replicas = 1
-    }
-    sns := snb.sns
-    if replicas > len(sns) {
-        replicas = len(sns)
-    }
-
     sn.readOnlyCheckerWG.Add(1)
     go func() {
         defer sn.readOnlyCheckerWG.Done()
@@ -193,7 +186,7 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
             continue
         }
         // Send br to replicas storage nodes starting from snIdx.
-        for !sendBufToReplicasNonblocking(snb, &br, snIdx, replicas) {
+        for !sendBufToSnNonblocking(snb, &br, snIdx) {
             d := timeutil.AddJitterToDuration(time.Millisecond * 200)
             t := timerpool.Get(d)
             select {
@@ -206,51 +199,39 @@ func (sn *storageNode) run(snb *storageNodesBucket, snIdx int) {
                 sn.checkHealth()
             }
         }
+        if sn.isBufferFull.CompareAndSwap(true, false) {
+            logger.Infof("transited node=%s to non-full", sn.dialer.Addr())
+        }
+
         br.reset()
     }
 }
 
-func sendBufToReplicasNonblocking(snb *storageNodesBucket, br *bufRows, snIdx, replicas int) bool {
-    usedStorageNodes := make(map[*storageNode]struct{}, replicas)
+func sendBufToSnNonblocking(snb *storageNodesBucket, br *bufRows, snIdx int) bool {
     sns := snb.sns
-    for i := 0; i < replicas; i++ {
-        idx := snIdx + i
+    idx := snIdx
     attempts := 0
     for {
         attempts++
         if attempts > len(sns) {
-            if i == 0 {
             // The data wasn't replicated at all.
             cannotReplicateLogger.Warnf("cannot push %d bytes with %d rows to storage nodes, since all the nodes are temporarily unavailable; "+
                 "re-trying to send the data soon", len(br.buf), br.rows)
             return false
         }
-            // The data is partially replicated, so just emit a warning and return true.
-            // We could retry sending the data again, but this may result in uncontrolled duplicate data.
-            // So it is better returning true.
-            rowsIncompletelyReplicatedTotal.Add(br.rows)
-            incompleteReplicationLogger.Warnf("cannot make a copy #%d out of %d copies according to -replicationFactor=%d for %d bytes with %d rows, "+
-                "since a part of storage nodes is temporarily unavailable", i+1, replicas, *replicationFactor, len(br.buf), br.rows)
-            return true
-        }
         if idx >= len(sns) {
             idx %= len(sns)
         }
         sn := sns[idx]
         idx++
-        if _, ok := usedStorageNodes[sn]; ok {
-            // The br has been already replicated to sn. Skip it.
-            continue
-        }
         if !sn.sendBufRowsNonblocking(br) {
             // Cannot send data to sn. Go to the next sn.
             continue
         }
         // Successfully sent data to sn.
-        usedStorageNodes[sn] = struct{}{}
         break
     }
-    }
     return true
 }
 
@@ -413,6 +394,8 @@ type storageNode struct {
     // In this case the data is re-routed to the remaining healthy vmstorage nodes.
     isBroken atomic.Bool
 
+    isBufferFull atomic.Bool
+
     // isReadOnly is set to true if the given vmstorage node is read only
     // In this case the data is re-routed to the remaining healthy vmstorage nodes.
     isReadOnly atomic.Bool
@@ -505,6 +488,13 @@ func setStorageNodesBucket(snb *storageNodesBucket) {
 //
 // Call MustStop when the initialized vmstorage connections are no longer needed.
 func Init(addrs []string, hashSeed uint64) {
+    replicas = *replicationFactor
+    if replicas <= 0 {
+        replicas = 1
+    }
+    if replicas > len(addrs) {
+        replicas = len(addrs)
+    }
     snb := initStorageNodes(addrs, hashSeed)
     setStorageNodesBucket(snb)
 }
@@ -772,7 +762,7 @@ var noStorageNodesLogger = logger.WithThrottler("storageNodesUnavailable", 5*tim
 func getNotReadyStorageNodeIdxs(snb *storageNodesBucket, dst []int, snExtra *storageNode) []int {
     dst = dst[:0]
     for i, sn := range snb.sns {
-        if sn == snExtra || !sn.isReady() {
+        if sn == snExtra || !sn.isReady() || sn.isBufferFull.Load() {
             dst = append(dst, i)
         }
     }
@@ -787,10 +777,17 @@ func (sn *storageNode) trySendBuf(buf []byte, rows int) bool {
 
     sent := false
     sn.brLock.Lock()
-    if sn.isReady() && len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
+    if !sn.isReady() {
+        return sent
+    }
+    if len(sn.br.buf)+len(buf) <= maxBufSizePerStorageNode {
         sn.br.buf = append(sn.br.buf, buf...)
         sn.br.rows += rows
         sent = true
+    } else {
+        if sn.isBufferFull.CompareAndSwap(false, true) {
+            logger.Infof("node: %s transited to full", sn.dialer.Addr())
+        }
     }
     sn.brLock.Unlock()
     return sent
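The second file's changes add an isBufferFull flag: trySendBuf sets it when a node's buffer cannot accept more data, the run loop clears it after a flush, and getNotReadyStorageNodeIdxs treats a full node like a not-ready one so its data can be rerouted. A toy sketch of that handshake, with a simplified node type and an illustrative maxBufSize instead of the real locking and rerouting machinery, is shown below.

// Minimal sketch (not part of the diff): the isBufferFull handshake added by
// this commit, reduced to a toy node type. Field names mirror the diff, but
// maxBufSize and flush are simplified stand-ins.
package main

import (
	"fmt"
	"sync/atomic"
)

const maxBufSize = 8 // illustrative stand-in for maxBufSizePerStorageNode

type node struct {
	buf          []byte
	isBufferFull atomic.Bool
}

// trySendBuf appends buf when it fits; otherwise it marks the node as full so
// rerouting can treat it as temporarily unavailable.
func (n *node) trySendBuf(buf []byte) bool {
	if len(n.buf)+len(buf) <= maxBufSize {
		n.buf = append(n.buf, buf...)
		return true
	}
	n.isBufferFull.CompareAndSwap(false, true)
	return false
}

// flush drains the buffer and clears the flag, mirroring the
// CompareAndSwap(true, false) in the storage node run loop.
func (n *node) flush() {
	n.buf = n.buf[:0]
	n.isBufferFull.CompareAndSwap(true, false)
}

func main() {
	var n node
	fmt.Println(n.trySendBuf(make([]byte, 6))) // true
	fmt.Println(n.trySendBuf(make([]byte, 6))) // false: node marked full
	fmt.Println(n.isBufferFull.Load())         // true
	n.flush()
	fmt.Println(n.isBufferFull.Load())         // false
}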