Add vmstorageUserTimeout flags to configure TCP user timeout (Linux) (#4423)

`TCP_USER_TIMEOUT` (since Linux 2.6.37) specifies the maximum amount of
time that transmitted data may remain unacknowledged before TCP will
forcibly close the connection and return `ETIMEDOUT` to the application.

Setting a low TCP user timeout allows RPC connections quickly reroute
around unavailable storage nodes during network interruptions.
This commit is contained in:
Will Jordan 2023-08-29 02:46:39 -07:00 committed by GitHub
parent 0aa0435d17
commit 2b7b3293c1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 51 additions and 6 deletions

View file

@ -32,6 +32,10 @@ var (
disableRerouting = flag.Bool("disableRerouting", true, "Whether to disable re-routing when some of vmstorage nodes accept incoming data at slower speed compared to other storage nodes. Disabled re-routing limits the ingestion rate by the slowest vmstorage node. On the other side, disabled re-routing minimizes the number of active time series in the cluster during rolling restarts and during spikes in series churn rate. See also -dropSamplesOnOverload")
dropSamplesOnOverload = flag.Bool("dropSamplesOnOverload", false, "Whether to drop incoming samples if the destination vmstorage node is overloaded and/or unavailable. This prioritizes cluster availability over consistency, e.g. the cluster continues accepting all the ingested samples, but some of them may be dropped if vmstorage nodes are temporarily unavailable and/or overloaded. The drop of samples happens before the replication, so it's not recommended to use this flag with -replicationFactor enabled.")
vmstorageDialTimeout = flag.Duration("vmstorageDialTimeout", 5*time.Second, "Timeout for establishing RPC connections from vminsert to vmstorage")
vmstorageUserTimeout = flag.Duration("vmstorageUserTimeout", 0, "TCP user timeout for RPC connections from vminsert to vmstorage (Linux only). "+
"When greater than 0, it specifies the maximum amount of time transmitted data may remain unacknowledged before the TCP connection is closed."+
"Setting a low TCP user timeout allows inserts to reroute around unresponsive storage nodes faster than the full insert timeout (at least 60 seconds)."+
"By default, this timeout is disabled.")
)
var errStorageReadOnly = errors.New("storage node is read only")
@ -516,7 +520,7 @@ func initStorageNodes(addrs []string, hashSeed uint64) *storageNodesBucket {
addr += ":8400"
}
sn := &storageNode{
dialer: netutil.NewTCPDialer(ms, "vminsert", addr, *vmstorageDialTimeout),
dialer: netutil.NewTCPDialer(ms, "vminsert", addr, *vmstorageDialTimeout, *vmstorageUserTimeout),
stopCh: stopCh,

View file

@ -42,6 +42,11 @@ var (
maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery")
maxSamplesPerQuery = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
vmstorageDialTimeout = flag.Duration("vmstorageDialTimeout", 5*time.Second, "Timeout for establishing RPC connections from vmselect to vmstorage")
vmstorageUserTimeout = flag.Duration("vmstorageUserTimeout", 0, "TCP user timeout for RPC connections from vmselect to vmstorage (Linux only). "+
"When greater than 0, it specifies the maximum amount of time transmitted data may remain unacknowledged before the TCP connection is closed. "+
"Setting a low TCP user timeout allows queries to ignore unresponsive storage nodes faster than the max query duration. "+
"By default, this timeout is disabled. "+
"See also -search.maxQueryDuration")
)
// Result is a single timeseries result.
@ -2753,7 +2758,7 @@ func newStorageNode(ms *metrics.Set, addr string) *storageNode {
addr += ":8401"
}
// There is no need in requests compression, since vmselect requests are usually very small.
connPool := netutil.NewConnPool(ms, "vmselect", addr, handshake.VMSelectClient, 0, *vmstorageDialTimeout)
connPool := netutil.NewConnPool(ms, "vmselect", addr, handshake.VMSelectClient, 0, *vmstorageDialTimeout, *vmstorageUserTimeout)
sn := &storageNode{
connPool: connPool,

View file

@ -49,9 +49,9 @@ type connWithTimestamp struct {
// The compression is disabled if compressionLevel <= 0.
//
// Call ConnPool.MustStop when the returned ConnPool is no longer needed.
func NewConnPool(ms *metrics.Set, name, addr string, handshakeFunc handshake.Func, compressionLevel int, dialTimeout time.Duration) *ConnPool {
func NewConnPool(ms *metrics.Set, name, addr string, handshakeFunc handshake.Func, compressionLevel int, dialTimeout time.Duration, userTimeout time.Duration) *ConnPool {
cp := &ConnPool{
d: NewTCPDialer(ms, name, addr, dialTimeout),
d: NewTCPDialer(ms, name, addr, dialTimeout, userTimeout),
concurrentDialsCh: make(chan struct{}, 8),
name: name,

View file

@ -44,7 +44,7 @@ func testConnPoolStartStop(t *testing.T, name string, ms *metrics.Set) {
var cps []*ConnPool
for i := 0; i < 5; i++ {
addr := fmt.Sprintf("host-%d", i)
cp := NewConnPool(ms, name, addr, handshake.VMSelectClient, compressLevel, dialTimeout)
cp := NewConnPool(ms, name, addr, handshake.VMSelectClient, compressLevel, dialTimeout, 0)
cps = append(cps, cp)
}
for _, cp := range cps {

View file

@ -3,6 +3,7 @@ package netutil
import (
"fmt"
"net"
"syscall"
"time"
"github.com/VictoriaMetrics/metrics"
@ -12,7 +13,7 @@ import (
//
// The name is used in metric tags for the returned dialer.
// The name must be unique among dialers.
func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout time.Duration) *TCPDialer {
func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout time.Duration, userTimeout time.Duration) *TCPDialer {
d := &TCPDialer{
d: &net.Dialer{
Timeout: dialTimeout,
@ -27,6 +28,17 @@ func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout time.Duration)
dialErrors: ms.NewCounter(fmt.Sprintf(`vm_tcpdialer_errors_total{name=%q, addr=%q, type="dial"}`, name, addr)),
}
d.connMetrics.init(ms, "vm_tcpdialer", name, addr)
if userTimeout > 0 {
d.d.Control = func(network, address string, c syscall.RawConn) (err error) {
controlErr := c.Control(func(fd uintptr) {
err = setTCPUserTimeout(fd, userTimeout)
})
if controlErr != nil {
return controlErr
}
return err
}
}
return d
}

View file

@ -0,0 +1,12 @@
//go:build !linux
// +build !linux
package netutil
import (
"time"
)
func setTCPUserTimeout(fd uintptr, timeout time.Duration) error {
return nil
}

View file

@ -0,0 +1,12 @@
package netutil
import (
"golang.org/x/sys/unix"
"syscall"
"time"
)
func setTCPUserTimeout(fd uintptr, timeout time.Duration) error {
return syscall.SetsockoptInt(
int(fd), syscall.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int(timeout.Milliseconds()))
}