From a844b9794247c2a9c6042dd1f6e1a0af8c4b19f2 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Tue, 17 Jan 2023 23:09:34 -0800
Subject: [PATCH] lib/promscrape: follow-up for d79f1b106c2fd60d8760419fc4acb57d39cfd1c2

- Document the fix at docs/CHANGELOG.md
- Limit the concurrency for the sendStaleSeries() function in order to limit
  its memory usage when a big number of targets disappear and staleness
  markers are sent for all the metrics exposed by these targets.
- Make sure that the writeRequestCtx is returned to the pool
  when there is no need to send staleness markers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
---
 docs/CHANGELOG.md            |  1 +
 lib/promscrape/scrapework.go | 24 ++++++++++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index f6919a69d..a71e0fb0c 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -28,6 +28,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): consistently put the scrape url with scrape target labels to all error logs for failed scrapes. Previously some failed scrapes were logged without this information.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not send stale markers to remote storage for series exceeding the configured [series limit](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3660).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly apply [series limit](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter) when [staleness tracking](https://docs.victoriametrics.com/vmagent.html#prometheus-staleness-markers) is disabled.
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): reduce memory usage spikes when a big number of scrape targets disappear at once. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668). Thanks to @lzfhust for [the initial fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3669).
 * BUGFIX: [Pushgateway import](https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format): properly return `200 OK` HTTP response code. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3636).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly parse `M` and `Mi` suffixes as `1e6` multipliers in `1M` and `1Mi` numeric constants. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3664). The issue has been introduced in [v1.86.0](https://docs.victoriametrics.com/CHANGELOG.html#v1860).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): properly display range query results at `Table` view. For example, `up[5m]` query now shows all the raw samples for the last 5 minutes for the `up` metric at the `Table` view. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3516).
diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go
index 095138b0b..18ddb3905 100644
--- a/lib/promscrape/scrapework.go
+++ b/lib/promscrape/scrapework.go
@@ -431,15 +431,15 @@ func (sw *scrapeWork) scrapeInternal(scrapeTimestamp, realTimestamp int64) error
 	return err
 }
 
-var concurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
+var processScrapedDataConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
 
 func (sw *scrapeWork) processScrapedData(scrapeTimestamp, realTimestamp int64, body *bytesutil.ByteBuffer, err error) (bool, error) {
 	// This function is CPU-bound, while it may allocate big amounts of memory.
 	// That's why it is a good idea to limit the number of concurrent calls to this function
 	// in order to limit memory usage under high load without sacrificing the performance.
-	concurrencyLimitCh <- struct{}{}
+	processScrapedDataConcurrencyLimitCh <- struct{}{}
 	defer func() {
-		<-concurrencyLimitCh
+		<-processScrapedDataConcurrencyLimitCh
 	}()
 	endTimestamp := time.Now().UnixNano() / 1e6
@@ -765,7 +765,17 @@ func (sw *scrapeWork) applySeriesLimit(wc *writeRequestCtx) int {
 	return samplesDropped
 }
 
+var sendStaleSeriesConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
+
 func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp int64, addAutoSeries bool) {
+	// This function is CPU-bound, while it may allocate big amounts of memory.
+	// That's why it is a good idea to limit the number of concurrent calls to this function
+	// in order to limit memory usage under high load without sacrificing the performance.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
+	sendStaleSeriesConcurrencyLimitCh <- struct{}{}
+	defer func() {
+		<-sendStaleSeriesConcurrencyLimitCh
+	}()
 	if sw.Config.NoStaleMarkers {
 		return
 	}
@@ -773,7 +783,11 @@ func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp i
 	if currScrape != "" {
 		bodyString = parser.GetRowsDiff(lastScrape, currScrape)
 	}
-	wc := writeRequestCtxPool.Get(sw.prevLabelsLen)
+	wc := writeRequestCtxPool.Get(sw.prevLabelsLen)
+	defer func() {
+		wc.reset()
+		writeRequestCtxPool.Put(wc)
+	}()
 	if bodyString != "" {
 		wc.rows.UnmarshalWithErrLogger(bodyString, sw.logError)
 		srcRows := wc.rows.Rows
@@ -805,8 +819,6 @@ func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp i
 		staleSamplesCreated.Add(len(samples))
 	}
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
-	wc.reset()
-	writeRequestCtxPool.Put(wc)
 }
 
 var staleSamplesCreated = metrics.NewCounter(`vm_promscrape_stale_samples_created_total`)
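
For context on the pattern used in this patch: both changes rely on a buffered channel acting as a counting semaphore (sized by the number of available CPUs) combined with deferred cleanup, so that the semaphore slot and the pooled object are released on every return path. Below is a minimal, self-contained Go sketch of that pattern; the names (limitCh, bufPool, processTarget) and the use of runtime.NumCPU() instead of cgroup.AvailableCPUs() are illustrative assumptions, not code from this patch.

```go
package main

import (
	"bytes"
	"fmt"
	"runtime"
	"sync"
)

// limitCh is a counting semaphore: at most runtime.NumCPU() goroutines may
// hold a slot at once. The patch sizes its channels with cgroup.AvailableCPUs(),
// which also respects container CPU limits.
var limitCh = make(chan struct{}, runtime.NumCPU())

// bufPool recycles scratch buffers between calls, in the spirit of
// writeRequestCtxPool in the patch.
var bufPool = sync.Pool{
	New: func() any { return new(bytes.Buffer) },
}

// processTarget (hypothetical) sketches the shape of sendStaleSeries after the
// patch: acquire a semaphore slot, take a pooled buffer, and release both via
// defer so every return path, including early returns, cleans up.
func processTarget(name string) {
	limitCh <- struct{}{} // acquire a slot; blocks when the limit is reached
	defer func() {
		<-limitCh // release the slot
	}()

	buf := bufPool.Get().(*bytes.Buffer)
	defer func() {
		buf.Reset()      // analogous to wc.reset()
		bufPool.Put(buf) // analogous to writeRequestCtxPool.Put(wc)
	}()

	if name == "" {
		// Early return: the deferred calls above still run, so neither the
		// semaphore slot nor the pooled buffer is leaked.
		return
	}
	buf.WriteString(name)
	fmt.Printf("processed %d bytes for %s\n", buf.Len(), name)
}

func main() {
	var wg sync.WaitGroup
	for _, target := range []string{"job1", "", "job2"} {
		wg.Add(1)
		go func(t string) {
			defer wg.Done()
			processTarget(t)
		}(target)
	}
	wg.Wait()
}
```

Moving the pool Put into a defer is what guarantees the pooled object is returned even when the function exits early, which is the writeRequestCtx fix described in the commit message; the semaphore channel is what bounds peak memory usage when many targets disappear at once.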