lib/promscrape: follow-up for d79f1b106c

- Document the fix at docs/CHANGELOG.md - Limit the concurrency for sendStaleMarkers() function in order to limit its memory usage when big number of targets disappear and staleness markers are sent for all the metrics exposed by these targets. - Make sure that the writeRequestCtx is returned to the pool when there is no need to send staleness markers. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
2025-01-10 15:14:09 +00:00 · 2023-01-17 23:09:34 -08:00 · 2023-01-17 23:09:34 -08:00 · a844b97942
commit a844b97942
parent 5ac0f18ca8
2 changed files with 19 additions and 6 deletions
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -28,6 +28,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): consistently put the scrape url with scrape target labels to all error logs for failed scrapes. Previously some failed scrapes were logged without this information.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not send stale markers to remote storage for series exceeding the configured [series limit](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3660).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly apply [series limit](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter) when [staleness tracking](https://docs.victoriametrics.com/vmagent.html#prometheus-staleness-markers) is disabled.
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): reduce memory usage spikes when big number of scrape targets disappear at once. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668). Thanks to @lzfhust for [the initial fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3669).
 * BUGFIX: [Pushgateway import](https://docs.victoriametrics.com/#how-to-import-data-in-prometheus-exposition-format): properly return `200 OK` HTTP response code. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3636).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly parse `M` and `Mi` suffixes as `1e6` multipliers in `1M` and `1Mi` numeric constants. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3664). The issue has been introduced in [v1.86.0](https://docs.victoriametrics.com/CHANGELOG.html#v1860).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): properly display range query results at `Table` view. For example, `up[5m]` query now shows all the raw samples for the last 5 minutes for the `up` metric at the `Table` view. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3516).
--- a/lib/promscrape/scrapework.go
+++ b/lib/promscrape/scrapework.go
@ -431,15 +431,15 @@ func (sw *scrapeWork) scrapeInternal(scrapeTimestamp, realTimestamp int64) error
 	return err
 }

-var concurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
+var processScrapedDataConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())

 func (sw *scrapeWork) processScrapedData(scrapeTimestamp, realTimestamp int64, body *bytesutil.ByteBuffer, err error) (bool, error) {
 	// This function is CPU-bound, while it may allocate big amounts of memory.
 	// That's why it is a good idea to limit the number of concurrent calls to this function
 	// in order to limit memory usage under high load without sacrificing the performance.
-	concurrencyLimitCh <- struct{}{}
+	processScrapedDataConcurrencyLimitCh <- struct{}{}
 	defer func() {
-		<-concurrencyLimitCh
+		<-processScrapedDataConcurrencyLimitCh
 	}()

 	endTimestamp := time.Now().UnixNano() / 1e6
@ -765,7 +765,17 @@ func (sw *scrapeWork) applySeriesLimit(wc *writeRequestCtx) int {
 	return samplesDropped
 }

+var sendStaleSeriesConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
+
 func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp int64, addAutoSeries bool) {
+	// This function is CPU-bound, while it may allocate big amounts of memory.
+	// That's why it is a good idea to limit the number of concurrent calls to this function
+	// in order to limit memory usage under high load without sacrificing the performance.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
+	sendStaleSeriesConcurrencyLimitCh <- struct{}{}
+	defer func() {
+		<-sendStaleSeriesConcurrencyLimitCh
+	}()
 	if sw.Config.NoStaleMarkers {
 		return
 	}
@ -773,7 +783,11 @@ func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp i
 	if currScrape != "" {
 		bodyString = parser.GetRowsDiff(lastScrape, currScrape)
 	}
-        wc := writeRequestCtxPool.Get(sw.prevLabelsLen)
+	wc := writeRequestCtxPool.Get(sw.prevLabelsLen)
+	defer func() {
+		wc.reset()
+		writeRequestCtxPool.Put(wc)
+	}()
 	if bodyString != "" {
 		wc.rows.UnmarshalWithErrLogger(bodyString, sw.logError)
 		srcRows := wc.rows.Rows
@ -805,8 +819,6 @@ func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp i
 		staleSamplesCreated.Add(len(samples))
 	}
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
-        wc.reset()
-	writeRequestCtxPool.Put(wc)
 }

 var staleSamplesCreated = metrics.NewCounter(`vm_promscrape_stale_samples_created_total`)