VictoriaMetrics/lib/netutil/conn_pool.go
Aliaksandr Valialkin 288d13af8d
lib/netutil: parallelize background pings for remote addresses
This should improve the time needed for determining unavailable remote addresses
across a big number of ConnPools.

This is a follow-up for a1629bd3be

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2022-06-21 13:32:27 +03:00

package netutil

import (
	"fmt"
	"sync"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake"
	"github.com/VictoriaMetrics/metrics"
)

// ConnPool is a connection pool with ZSTD-compressed connections.
type ConnPool struct {
mu sync.Mutex
d *TCPDialer
// concurrentDialsCh limits the number of concurrent dials the ConnPool can make.
	// This should prevent creating an excess number of connections during temporary
// spikes in workload at vmselect and vmstorage nodes.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2552
concurrentDialsCh chan struct{}
name string
handshakeFunc handshake.Func
compressionLevel int
conns []connWithTimestamp
// lastDialError contains the last error seen when dialing remote addr.
	// When it is non-nil and conns is empty, ConnPool.Get() returns this error.
// This reduces the time needed for dialing unavailable remote storage systems.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711#issuecomment-1160363187
lastDialError error
}

type connWithTimestamp struct {
bc *handshake.BufferedConn
lastActiveTime uint64
}

// NewConnPool creates a new connection pool for the given addr.
//
// Name is used in exported metrics.
// handshakeFunc is used for handshaking after the connection is established.
// The compression is disabled if compressionLevel <= 0.
func NewConnPool(name, addr string, handshakeFunc handshake.Func, compressionLevel int, dialTimeout time.Duration) *ConnPool {
cp := &ConnPool{
d: NewTCPDialer(name, addr, dialTimeout),
concurrentDialsCh: make(chan struct{}, 8),
name: name,
handshakeFunc: handshakeFunc,
compressionLevel: compressionLevel,
}
cp.checkAvailability(true)
_ = metrics.NewGauge(fmt.Sprintf(`vm_tcpdialer_conns_idle{name=%q, addr=%q}`, name, addr), func() float64 {
cp.mu.Lock()
n := len(cp.conns)
cp.mu.Unlock()
return float64(n)
})
_ = metrics.NewGauge(fmt.Sprintf(`vm_tcpdialer_addr_available{name=%q, addr=%q}`, name, addr), func() float64 {
cp.mu.Lock()
isAvailable := len(cp.conns) > 0 || cp.lastDialError == nil
cp.mu.Unlock()
if isAvailable {
return 1
}
return 0
})
connPoolsMu.Lock()
connPools = append(connPools, cp)
connPoolsMu.Unlock()
return cp
}
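
// The sketch below illustrates how a ConnPool is typically created and used.
// It is a minimal example and not part of this file; myHandshakeFunc stands in
// for a real handshake.Func implementation and the address is hypothetical.
//
//	cp := netutil.NewConnPool("vmselect", "vmstorage-1:8401", myHandshakeFunc, 1, time.Second)
//	bc, err := cp.Get()
//	if err != nil {
//		// The remote addr is unavailable - see lastDialError above.
//		return err
//	}
//	// ... use bc ...
//	cp.Put(bc) // return healthy connections to the pool; close broken ones instead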

// Addr returns the address where connections are established.
func (cp *ConnPool) Addr() string {
return cp.d.addr
}

// Get returns a free connection from the pool.
func (cp *ConnPool) Get() (*handshake.BufferedConn, error) {
bc, err := cp.tryGetConn()
if err != nil {
return nil, err
}
if bc != nil {
// Fast path - obtained the connection from pool.
return bc, nil
}
return cp.getConnSlow()
}

func (cp *ConnPool) getConnSlow() (*handshake.BufferedConn, error) {
// Limit the number of concurrent dials.
// This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2552
cp.concurrentDialsCh <- struct{}{}
defer func() {
<-cp.concurrentDialsCh
}()
	// Make another attempt to get an already established connection from the pool.
	// One may have been returned to the pool while we were waiting on cp.concurrentDialsCh.
bc, err := cp.tryGetConn()
if err != nil {
return nil, err
}
if bc != nil {
return bc, nil
}
// Pool is empty. Create new connection.
return cp.dialAndHandshake()
}
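
// concurrentDialsCh above is a buffered channel used as a counting semaphore:
// sending acquires one of the 8 slots, receiving releases it. A minimal sketch
// of the same pattern (illustrative only, not part of this package):
//
//	sem := make(chan struct{}, 8) // at most 8 concurrent dials
//	sem <- struct{}{}             // acquire a slot (blocks while 8 dials are in flight)
//	defer func() { <-sem }()      // release the slot when done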

func (cp *ConnPool) dialAndHandshake() (*handshake.BufferedConn, error) {
c, err := cp.d.Dial()
if err != nil {
err = fmt.Errorf("cannot dial %s: %w", cp.d.Addr(), err)
}
cp.mu.Lock()
cp.lastDialError = err
cp.mu.Unlock()
if err != nil {
return nil, err
}
bc, err := cp.handshakeFunc(c, cp.compressionLevel)
if err != nil {
		// Do not put the handshake error into cp.lastDialError, because the handshake
		// is performed on an already established connection.
err = fmt.Errorf("cannot perform %q handshake with server %q: %w", cp.name, cp.d.Addr(), err)
_ = c.Close()
return nil, err
}
return bc, err
}

func (cp *ConnPool) tryGetConn() (*handshake.BufferedConn, error) {
cp.mu.Lock()
defer cp.mu.Unlock()
if len(cp.conns) == 0 {
return nil, cp.lastDialError
}
	bc := cp.conns[len(cp.conns)-1].bc
	// Clear the reference in the pool slice, so the handed-out connection
	// isn't retained by the slice's backing array.
	cp.conns[len(cp.conns)-1].bc = nil
	cp.conns = cp.conns[:len(cp.conns)-1]
	return bc, nil
}

// Put puts bc back into the pool.
//
// Do not put broken or closed connections into the pool!
func (cp *ConnPool) Put(bc *handshake.BufferedConn) {
if err := bc.SetDeadline(time.Time{}); err != nil {
// Close the connection instead of returning it to the pool,
// since it may be broken.
_ = bc.Close()
return
}
cp.mu.Lock()
cp.conns = append(cp.conns, connWithTimestamp{
bc: bc,
lastActiveTime: fasttime.UnixTimestamp(),
})
cp.mu.Unlock()
}
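
// A typical caller pairs Get with Put, closing the connection instead of
// returning it when the request failed. This is an illustrative sketch only;
// sendRequest is a hypothetical helper, not part of this package:
//
//	bc, err := cp.Get()
//	if err != nil {
//		return err
//	}
//	if err := sendRequest(bc); err != nil {
//		_ = bc.Close() // do not put broken connections back into the pool
//		return err
//	}
//	cp.Put(bc)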

func (cp *ConnPool) closeIdleConns() {
	// Close connections that have been idle for more than 30 seconds.
// This should reduce the number of connections after sudden spikes in query rate.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2508
deadline := fasttime.UnixTimestamp() - 30
var activeConns []connWithTimestamp
cp.mu.Lock()
conns := cp.conns
for _, c := range conns {
if c.lastActiveTime > deadline {
activeConns = append(activeConns, c)
} else {
_ = c.bc.Close()
c.bc = nil
}
}
cp.conns = activeConns
cp.mu.Unlock()
}

// checkAvailability dials the configured addr and puts the established connection
// into the pool. The dial is performed only if the previous dial failed, unless
// force is set.
func (cp *ConnPool) checkAvailability(force bool) {
cp.mu.Lock()
hasDialError := cp.lastDialError != nil
cp.mu.Unlock()
if hasDialError || force {
bc, _ := cp.dialAndHandshake()
if bc != nil {
cp.Put(bc)
}
}
}

func init() {
	// Periodically close idle connections in all the registered pools.
	go func() {
for {
time.Sleep(17 * time.Second)
forEachConnPool(func(cp *ConnPool) {
cp.closeIdleConns()
})
}
}()
	// Periodically check the availability of remote addresses in all the registered
	// pools, so a previously failed dial (lastDialError) is retried in the background.
	go func() {
for {
time.Sleep(time.Second)
forEachConnPool(func(cp *ConnPool) {
cp.checkAvailability(false)
})
}
}()
}

var connPoolsMu sync.Mutex
var connPools []*ConnPool

// forEachConnPool concurrently runs f on each registered ConnPool
// and waits until all the calls complete.
func forEachConnPool(f func(cp *ConnPool)) {
connPoolsMu.Lock()
var wg sync.WaitGroup
for _, cp := range connPools {
wg.Add(1)
go func(cp *ConnPool) {
defer wg.Done()
f(cp)
}(cp)
}
wg.Wait()
connPoolsMu.Unlock()
}
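
// Running the per-pool callbacks in parallel (see forEachConnPool above) bounds the
// background ping duration by the slowest pool rather than by the sum over all pools.
// A rough sketch of the difference, assuming a checkAvailability call that may block
// for up to dialTimeout on an unavailable addr (numbers are illustrative):
//
//	// sequential: total time ~ N * dialTimeout for N unavailable addrs
//	for _, cp := range connPools {
//		cp.checkAvailability(false)
//	}
//
//	// parallel: total time ~ dialTimeout, independent of N
//	forEachConnPool(func(cp *ConnPool) { cp.checkAvailability(false) })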