app/vminsert: add vm_rpc_send_duration_seconds_total metric for each vminsert->vmstorage link

This metric is useful for determining high link saturation with the following alerting rule: rate(vm_rpc_send_duration_seconds_total) > 0.9s
2025-01-10 15:14:09 +00:00 · 2021-08-11 11:40:52 +03:00 · 2021-08-11 11:40:52 +03:00 · 9eb828b2c2
commit 9eb828b2c2
parent 3df6550153
2 changed files with 9 additions and 0 deletions
--- a/app/vminsert/netstorage/netstorage.go
+++ b/app/vminsert/netstorage/netstorage.go
@ -245,7 +245,10 @@ func (sn *storageNode) sendBufRowsNonblocking(br *bufRows) bool {
 		// sn.dial() should be called by sn.checkHealth() on unsuccessful call to sendBufToReplicasNonblocking().
 		return false
 	}
+	startTime := time.Now()
 	err := sendToConn(sn.bc, br.buf)
+	duration := time.Since(startTime)
+	sn.sendDurationSeconds.Add(duration.Seconds())
 	if err == nil {
 		// Successfully sent buf to bc.
 		sn.rowsSent.Add(br.rows)
@ -383,6 +386,10 @@ type storageNode struct {
 	// The number of rows rerouted to the given vmstorage node
 	// from other nodes when they were unhealthy.
 	rowsReroutedToHere *metrics.Counter
+
+	// The total duration spent for sending data to vmstorage node.
+	// This metric is useful for determining the saturation of vminsert->vmstorage link.
+	sendDurationSeconds *metrics.FloatCounter
 }

 // storageNodes contains a list of vmstorage node clients.
@ -418,6 +425,7 @@ func InitStorageNodes(addrs []string) {
 			rowsSent:             metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_sent_total{name="vminsert", addr=%q}`, addr)),
 			rowsReroutedFromHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_from_here_total{name="vminsert", addr=%q}`, addr)),
 			rowsReroutedToHere:   metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_to_here_total{name="vminsert", addr=%q}`, addr)),
+			sendDurationSeconds:  metrics.NewFloatCounter(fmt.Sprintf(`vm_rpc_send_duration_seconds_total{name="vminsert", addr=%q}`, addr)),
 		}
 		sn.brCond = sync.NewCond(&sn.brLock)
 		_ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_rows_pending{name="vminsert", addr=%q}`, addr), func() float64 {
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -14,6 +14,7 @@ sort: 15
 * FEATURE: add `-search.maxSamplesPerQuery` command-line flag for limiting the number of raw samples a single query can process across all the time series. This option can protect from heavy queries, which select too many raw samples. Thanks to @jiangxinlingdu for [the initial pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1478).
 * FEATURE: improve performance for queries that process big number of time series and/or samples on systems with big number of CPU cores.
 * FEATURE: vmalert: expose `vmalert_alerting_rules_last_evaluation_samples` and `vmalert_recording_rules_last_evaluation_samples` metrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1494).
+* FEATURE: vminsert: expose `vm_rpc_send_duration_seconds_total` counter, which can be used for detecting high saturation of each `vminsert -> vmstorage` link with the alerting query `rate(vm_rpc_send_duration_seconds_total) > 0.9s`. This query triggers when the link is saturated by more than 90%.

 * BUGFIX: fix corner cases for queries on time ranges exceeding 40 days. Previously some series could be missing in query results. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1486).
 * BUGFIX: vmselect: return dummy response at `/rules` page in the same way as for `/api/v1/rules` page. The `/rules` page is requested by Grafana 8. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1493) for details.