From 9eb828b2c20be4f4685ffb277dec37604b81b02b Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 11 Aug 2021 11:40:52 +0300 Subject: [PATCH] app/vminsert: add vm_rpc_send_duration_seconds_total metric per each `vminsert->vmstorage` link This metric is useful for determining high link saturation with the following alerting rule: rate(vm_rpc_send_duration_seconds_total) > 0.9s --- app/vminsert/netstorage/netstorage.go | 8 ++++++++ docs/CHANGELOG.md | 1 + 2 files changed, 9 insertions(+) diff --git a/app/vminsert/netstorage/netstorage.go b/app/vminsert/netstorage/netstorage.go index 70c678d76..c22b0d52f 100644 --- a/app/vminsert/netstorage/netstorage.go +++ b/app/vminsert/netstorage/netstorage.go @@ -245,7 +245,10 @@ func (sn *storageNode) sendBufRowsNonblocking(br *bufRows) bool { // sn.dial() should be called by sn.checkHealth() on unsuccessful call to sendBufToReplicasNonblocking(). return false } + startTime := time.Now() err := sendToConn(sn.bc, br.buf) + duration := time.Since(startTime) + sn.sendDurationSeconds.Add(duration.Seconds()) if err == nil { // Successfully sent buf to bc. sn.rowsSent.Add(br.rows) @@ -383,6 +386,10 @@ type storageNode struct { // The number of rows rerouted to the given vmstorage node // from other nodes when they were unhealthy. rowsReroutedToHere *metrics.Counter + + // The total duration spent for sending data to vmstorage node. + // This metric is useful for determining the saturation of vminsert->vmstorage link. + sendDurationSeconds *metrics.FloatCounter } // storageNodes contains a list of vmstorage node clients. 
@@ -418,6 +425,7 @@ func InitStorageNodes(addrs []string) { rowsSent: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_sent_total{name="vminsert", addr=%q}`, addr)), rowsReroutedFromHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_from_here_total{name="vminsert", addr=%q}`, addr)), rowsReroutedToHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_to_here_total{name="vminsert", addr=%q}`, addr)), + sendDurationSeconds: metrics.NewFloatCounter(fmt.Sprintf(`vm_rpc_send_duration_seconds_total{name="vminsert", addr=%q}`, addr)), } sn.brCond = sync.NewCond(&sn.brLock) _ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_rows_pending{name="vminsert", addr=%q}`, addr), func() float64 { diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index b9e048217..f6b9af2de 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -14,6 +14,7 @@ sort: 15 * FEATURE: add `-search.maxSamplesPerQuery` command-line flag for limiting the number of raw samples a single query can process across all the time series. This option can protect from heavy queries, which select too big number of raw samples. Thanks to @jiangxinlingdu for [the initial pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1478). * FEATURE: improve performance for queries that process big number of time series and/or samples on systems with big number of CPU cores. * FEATURE: vmalert: expose `vmalert_alerting_rules_last_evaluation_samples` and `vmalert_recording_rules_last_evaluation_samples` metrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1494). +* FEATURE: vminsert: expose `vm_rpc_send_duration_seconds_total` counter, which can be used for determining high saturation of every `vminsert -> vmstorage` link with an alerting query `rate(vm_rpc_send_duration_seconds_total) > 0.9s`. This query triggers when the link is saturated by more than 90%. * BUGFIX: fix corner cases for queries on time ranges exceeding 40 days. 
Previously some series could be missing in query results. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1486). * BUGFIX: vmselect: return dummy response at `/rules` page in the same way as for `/api/v1/rules` page. The `/rules` page is requested by Grafana 8. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1493) for details.