From 57000f5105fb360f8b6a04eb13e97f7efed078b5 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Tue, 16 Jul 2024 12:24:14 +0200
Subject: [PATCH] lib/promscrape: follow-up for 1e83598be330b844b58041966129ce9a728027ac

- Clarify that the -promscrape.maxScrapeSize value is used for limiting the maximum scrape size if the max_scrape_size option isn't set at https://docs.victoriametrics.com/sd_configs/#scrape_configs
- Fix the query example for the scrape_response_size_bytes metric at https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics
- Mention the max_scrape_size option in the -help description for the -promscrape.maxScrapeSize command-line flag
- Treat zero value of the max_scrape_size option as 'use the limit from the -promscrape.maxScrapeSize command-line flag'
- Change the type of scrapeResponseSize struct fields and function args from float64 to int, since the response size cannot be fractional
- Optimize the isAutoMetric() function a bit
- Sort auto metrics in alphabetical order in the isAutoMetric() and scrapeWork.addAutoMetrics() functions for easier future maintenance

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6434
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429
---
 docs/CHANGELOG.md                   |  2 +-
 docs/sd_configs.md                  |  2 +-
 docs/vmagent.md                     | 17 ++++-----
 lib/promscrape/client.go            |  4 +--
 lib/promscrape/config.go            | 12 ++++---
 lib/promscrape/config_test.go       |  4 +--
 lib/promscrape/scrapework.go        | 56 ++++++++++++++++++-----------
 lib/promscrape/targetstatus.go      |  6 ++--
 lib/promscrape/targetstatus.qtpl    |  4 +--
 lib/promscrape/targetstatus.qtpl.go |  4 +--
 10 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 92626d0b0..c3d704f36 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -81,7 +81,7 @@ Released at 2024-06-24
 * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `-idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs.
 * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add automatic retry for requests to backend for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): increase default value of `-promscrape.maxDroppedTargets` command-line flag to 10_000 from 1000. This makes it easier to track down large number of dropped targets. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6381).
-* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` parameter to a scrape config for setting a custom scrape limit for a job. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` was added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
+* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` option to [scrape config](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for setting a custom limit on the response size a target can send. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` is added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
 * FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): check for ranged vector arguments in non-rollup expressions when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `sum(up[5m])` or `absent(up[5m])` will fail to execute if these flags are set.
 * FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): validate that rollup expressions has ranged vector arguments passed when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `rate(metric)` or `count_over_time(metric)` will fail to execute if these flags are set.
 * FEATURE: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): support file path with hierarchical patterns and regexpes, and http url in unittest cmd-line flag `-files`, e.g. `-files="http:///path/to/rules"` or `-files="dir/**/*.yaml"`.
diff --git a/docs/sd_configs.md b/docs/sd_configs.md
index 9a959d833..dda1248f2 100644
--- a/docs/sd_configs.md
+++ b/docs/sd_configs.md
@@ -1693,7 +1693,7 @@ scrape_configs:
   # scrape_timeout: <duration>
 
   # max_scrape_size is an optional parameter for limiting the response size in bytes from scraped targets.
-  # By default, uses limit from -promscrape.maxScrapeSize command-line flag.
+  # If max_scrape_size isn't set, then the limit from the -promscrape.maxScrapeSize command-line flag is used.
   # Example values:
   # - "10MiB" - 10 * 1024 * 1024 bytes
   # - "100MB" - 100 * 1000 * 1000 bytes
diff --git a/docs/vmagent.md b/docs/vmagent.md
index 4954ac0a5..35d509058 100644
--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@@ -486,14 +486,6 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
   scrape_duration_seconds > 1.5
   ```
 
-* `scrape_response_size_bytes` - response size in bytes for the given target. This allows to monitor amount of data scraped
-  and to adjust `max_scrape_size` for scraped targets. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
-  returns targets with scrape response > 10MiB:
-
-  ```metricsql
-  max_scrape_size > 10MiB
-  ```
-
 * `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets
   with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
   returns targets (identified by `instance` label),
@@ -503,6 +495,15 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
   scrape_duration_seconds / scrape_timeout_seconds > 0.8
   ```
 
+* `scrape_response_size_bytes` - response size in bytes for the given target. This allows monitoring the amount of scraped data
+  and adjusting the [`max_scrape_size` option](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for scraped targets.
+  For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets with scrape responses
+  bigger than `10MiB`:
+
+  ```metricsql
+  scrape_response_size_bytes > 10MiB
+  ```
+
 * `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets,
   which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
   returns targets, which expose more than 10000 metrics:
diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go
index fce6d8dce..bf3bb6e21 100644
--- a/lib/promscrape/client.go
+++ b/lib/promscrape/client.go
@@ -155,9 +155,9 @@ func (c *client) ReadData(dst *bytesutil.ByteBuffer) error {
 	}
 	if int64(len(dst.B)) >= c.maxScrapeSize {
 		maxScrapeSizeExceeded.Inc()
-		return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize=%d or max_scrape_size in a scrape config. "+
+		return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize or max_scrape_size in the scrape config (%d bytes). "+
 			"Possible solutions are: reduce the response size for the target, increase -promscrape.maxScrapeSize command-line flag, "+
-			"increase max_scrape_size value in scrape config", c.scrapeURL, maxScrapeSize.N)
+			"increase the max_scrape_size value in the scrape config for the given target", c.scrapeURL, c.maxScrapeSize)
 	}
 	return nil
 }
diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go
index 2c1ac2051..7c94f6ae5 100644
--- a/lib/promscrape/config.go
+++ b/lib/promscrape/config.go
@@ -78,7 +78,7 @@ var (
 		"then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. "+
 		"See https://docs.victoriametrics.com/vmagent/#scraping-big-number-of-targets for more info")
 	maxScrapeSize = flagutil.NewBytes("promscrape.maxScrapeSize", 16*1024*1024, "The maximum size of scrape response in bytes to process from Prometheus targets. "+
-		"Bigger responses are rejected")
+		"Bigger responses are rejected. See also the max_scrape_size option at https://docs.victoriametrics.com/sd_configs/#scrape_configs")
 )
 
 var clusterMemberID int
@@ -852,12 +852,14 @@ func getScrapeWorkConfig(sc *ScrapeConfig, baseDir string, globalCfg *GlobalConf
 		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1281#issuecomment-840538907
 		scrapeTimeout = scrapeInterval
 	}
-	var err error
 	mss := maxScrapeSize.N
-	if len(sc.MaxScrapeSize) > 0 {
-		mss, err = flagutil.ParseBytes(sc.MaxScrapeSize)
+	if sc.MaxScrapeSize != "" {
+		n, err := flagutil.ParseBytes(sc.MaxScrapeSize)
 		if err != nil {
-			return nil, fmt.Errorf("unexpected `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err)
+			return nil, fmt.Errorf("cannot parse `max_scrape_size` value %q for `job_name` %q: %w", sc.MaxScrapeSize, jobName, err)
+		}
+		if n > 0 {
+			mss = n
 		}
 	}
 	honorLabels := sc.HonorLabels
diff --git a/lib/promscrape/config_test.go b/lib/promscrape/config_test.go
index ac1b12146..bd4a39c44 100644
--- a/lib/promscrape/config_test.go
+++ b/lib/promscrape/config_test.go
@@ -993,7 +993,7 @@ scrape_configs:
 scrape_configs:
 - job_name: foo
   scheme: https
-  max_scrape_size: 0
+  max_scrape_size: 1
   relabel_configs:
   - action: keep
     source_labels: [__address__]
@@ -1015,7 +1015,7 @@
 		ScrapeURL:       "http://foo.bar:1234/metrics",
 		ScrapeInterval:  defaultScrapeInterval,
 		ScrapeTimeout:   defaultScrapeTimeout,
-		MaxScrapeSize:   0,
+		MaxScrapeSize:   1,
 		Labels: promutils.NewLabelsFromMap(map[string]string{
 			"instance": "foo.bar:1234",
 			"job":      "3",
diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go
index 24e882bc5..32f365334 100644
--- a/lib/promscrape/scrapework.go
+++ b/lib/promscrape/scrapework.go
@@ -500,10 +500,11 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 	if sw.seriesLimitExceeded || !areIdenticalSeries {
 		samplesDropped = sw.applySeriesLimit(wc)
 	}
+	responseSize := len(bodyString)
 	am := &autoMetrics{
 		up:                        up,
 		scrapeDurationSeconds:     scrapeDurationSeconds,
-		scrapeResponseSize:        float64(len(bodyString)),
+		scrapeResponseSize:        responseSize,
 		samplesScraped:            samplesScraped,
 		samplesPostRelabeling:     samplesPostRelabeling,
 		seriesAdded:               seriesAdded,
@@ -512,7 +513,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 	sw.addAutoMetrics(am, wc, scrapeTimestamp)
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
 	sw.prevLabelsLen = len(wc.labels)
-	sw.prevBodyLen = len(bodyString)
+	sw.prevBodyLen = responseSize
 	wc.reset()
 	writeRequestCtxPool.Put(wc)
 	// body must be released only after wc is released, since wc refers to body.
@@ -523,7 +524,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 		sw.storeLastScrape(body)
 	}
 	sw.finalizeLastScrape()
-	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
+	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
 	return err
 }
 
@@ -581,10 +582,11 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 		// This is a trade-off between performance and accuracy.
 		seriesAdded = sw.getSeriesAdded(lastScrape, bodyString)
 	}
+	responseSize := len(bodyString)
 	am := &autoMetrics{
 		up:                        up,
 		scrapeDurationSeconds:     scrapeDurationSeconds,
-		scrapeResponseSize:        float64(len(bodyString)),
+		scrapeResponseSize:        responseSize,
 		samplesScraped:            samplesScraped,
 		samplesPostRelabeling:     samplesPostRelabeling,
 		seriesAdded:               seriesAdded,
@@ -593,7 +595,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 	sw.addAutoMetrics(am, wc, scrapeTimestamp)
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
 	sw.prevLabelsLen = len(wc.labels)
-	sw.prevBodyLen = len(bodyString)
+	sw.prevBodyLen = responseSize
 	wc.reset()
 	writeRequestCtxPool.Put(wc)
 	if !areIdenticalSeries {
@@ -603,7 +605,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 		sw.storeLastScrape(body.B)
 	}
 	sw.finalizeLastScrape()
-	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
+	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
 	// Do not track active series in streaming mode, since this may need too big amounts of memory
 	// when the target exports too big number of metrics.
 	return err
@@ -815,7 +817,7 @@ func (sw *scrapeWork) getLabelsHash(labels []prompbmarshal.Label) uint64 {
 
 type autoMetrics struct {
 	up                        int
 	scrapeDurationSeconds     float64
-	scrapeResponseSize        float64
+	scrapeResponseSize        int
 	samplesScraped            int
 	samplesPostRelabeling     int
 	seriesAdded               int
@@ -823,35 +825,47 @@
 }
 
 func isAutoMetric(s string) bool {
-	switch s {
-	case "up", "scrape_duration_seconds", "scrape_samples_scraped",
-		"scrape_samples_post_metric_relabeling", "scrape_series_added",
-		"scrape_timeout_seconds", "scrape_samples_limit",
-		"scrape_series_limit_samples_dropped", "scrape_series_limit",
-		"scrape_series_current", "scrape_response_size_bytes":
+	if s == "up" {
 		return true
 	}
-	return false
+	if !strings.HasPrefix(s, "scrape_") {
+		return false
+	}
+	switch s {
+	case "scrape_duration_seconds",
+		"scrape_response_size_bytes",
+		"scrape_samples_limit",
+		"scrape_samples_post_metric_relabeling",
+		"scrape_samples_scraped",
+		"scrape_series_added",
+		"scrape_series_current",
+		"scrape_series_limit",
+		"scrape_series_limit_samples_dropped",
+		"scrape_timeout_seconds":
+		return true
+	default:
+		return false
+	}
 }
 
 func (sw *scrapeWork) addAutoMetrics(am *autoMetrics, wc *writeRequestCtx, timestamp int64) {
-	sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
 	sw.addAutoTimeseries(wc, "scrape_duration_seconds", am.scrapeDurationSeconds, timestamp)
-	sw.addAutoTimeseries(wc, "scrape_response_size_bytes", am.scrapeResponseSize, timestamp)
-	sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_response_size_bytes", float64(am.scrapeResponseSize), timestamp)
 	if sampleLimit := sw.Config.SampleLimit; sampleLimit > 0 {
 		// Expose scrape_samples_limit metric if sample_limit config is set for the target.
 		// See https://github.com/VictoriaMetrics/operator/issues/497
 		sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sampleLimit), timestamp)
 	}
+	sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
 	if sl := sw.seriesLimiter; sl != nil {
+		sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
 		sw.addAutoTimeseries(wc, "scrape_series_limit_samples_dropped", float64(am.seriesLimitSamplesDropped), timestamp)
 		sw.addAutoTimeseries(wc, "scrape_series_limit", float64(sl.MaxItems()), timestamp)
-		sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
 	}
+	sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
+	sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
 }
 
 // addAutoTimeseries adds automatically generated time series with the given name, value and timestamp.
diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go
index 859fb68d8..2a74a3f69 100644
--- a/lib/promscrape/targetstatus.go
+++ b/lib/promscrape/targetstatus.go
@@ -178,7 +178,7 @@ func (tsm *targetStatusMap) Unregister(sw *scrapeWork) {
 	tsm.mu.Unlock()
 }
 
-func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize float64, samplesScraped int, err error) {
+func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize, samplesScraped int, err error) {
 	jobName := sw.Config.jobNameOriginal
 
 	tsm.mu.Lock()
@@ -300,7 +300,7 @@ type targetStatus struct {
 	up                 bool
 	scrapeTime         int64
 	scrapeDuration     int64
-	scrapeResponseSize float64
+	scrapeResponseSize int
 	samplesScraped     int
 	scrapesTotal       int
 	scrapesFailed      int
@@ -319,7 +319,7 @@ func (ts *targetStatus) getSizeFromLastScrape() string {
 	if ts.scrapeResponseSize <= 0 {
 		return "never scraped"
 	}
-	return fmt.Sprintf("%.3f kb", float64(ts.scrapeResponseSize)/1024)
+	return fmt.Sprintf("%.3fKiB", float64(ts.scrapeResponseSize)/1024)
 }
 
 type droppedTargets struct {
diff --git a/lib/promscrape/targetstatus.qtpl b/lib/promscrape/targetstatus.qtpl
index 8bacbcba1..6f0cd7a95 100644
--- a/lib/promscrape/targetstatus.qtpl
+++ b/lib/promscrape/targetstatus.qtpl
@@ -27,9 +27,9 @@
 		{% if filter.showOriginalLabels %}originalLabels={%s= ts.sw.Config.OriginalLabels.String() %},{% space %}{% endif %}
 		scrapes_total={%d ts.scrapesTotal %},{% space %}
 		scrapes_failed={%d ts.scrapesFailed %},{% space %}
-		last_scrape={%s ts.getDurationFromLastScrape() %},{% space %}
+		last_scrape={%s= ts.getDurationFromLastScrape() %},{% space %}
 		scrape_duration={%d int(ts.scrapeDuration) %}ms,{% space %}
-		scrape_response_size={%s ts.getSizeFromLastScrape() %},{% space %}
+		scrape_response_size={%s= ts.getSizeFromLastScrape() %},{% space %}
 		samples_scraped={%d ts.samplesScraped %},{% space %}
 		error={% if ts.err != nil %}{%s= ts.err.Error() %}{% endif %}
 		{% newline %}
diff --git a/lib/promscrape/targetstatus.qtpl.go b/lib/promscrape/targetstatus.qtpl.go
index 53e820a52..10a15586d 100644
--- a/lib/promscrape/targetstatus.qtpl.go
+++ b/lib/promscrape/targetstatus.qtpl.go
@@ -127,7 +127,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
 //line lib/promscrape/targetstatus.qtpl:29
 	qw422016.N().S(`last_scrape=`)
 //line lib/promscrape/targetstatus.qtpl:30
-	qw422016.E().S(ts.getDurationFromLastScrape())
+	qw422016.N().S(ts.getDurationFromLastScrape())
 //line lib/promscrape/targetstatus.qtpl:30
 	qw422016.N().S(`,`)
 //line lib/promscrape/targetstatus.qtpl:30
@@ -143,7 +143,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
 //line lib/promscrape/targetstatus.qtpl:31
 	qw422016.N().S(`scrape_response_size=`)
 //line lib/promscrape/targetstatus.qtpl:32
-	qw422016.E().S(ts.getSizeFromLastScrape())
+	qw422016.N().S(ts.getSizeFromLastScrape())
 //line lib/promscrape/targetstatus.qtpl:32
 	qw422016.N().S(`,`)
 //line lib/promscrape/targetstatus.qtpl:32
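
The net effect of the getScrapeWorkConfig() change in lib/promscrape/config.go is the following limit-resolution rule, shown here as a minimal standalone sketch (the effectiveMaxScrapeSize helper and its flagLimit argument are illustrative names, not code from the patch):

```go
package main

import "fmt"

// effectiveMaxScrapeSize mirrors the fallback rule from getScrapeWorkConfig():
// a positive per-job max_scrape_size overrides the global limit, while an unset
// or zero max_scrape_size falls back to the -promscrape.maxScrapeSize flag value.
func effectiveMaxScrapeSize(jobLimit, flagLimit int64) int64 {
	if jobLimit > 0 {
		return jobLimit // per-job override wins
	}
	return flagLimit // unset or zero: use the command-line flag limit
}

func main() {
	const flagLimit = 16 * 1024 * 1024 // default -promscrape.maxScrapeSize (16MiB)

	fmt.Println(effectiveMaxScrapeSize(0, flagLimit))            // 16777216: falls back to the flag
	fmt.Println(effectiveMaxScrapeSize(10*1024*1024, flagLimit)) // 10485760: per-job value wins
}
```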
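
The isAutoMetric() rewrite in lib/promscrape/scrapework.go gains mostly on ordinary metric names, which are now rejected by the cheap "scrape_" prefix check before the string switch runs. A micro-benchmark along these lines (hypothetical, not part of the patch; it would live next to scrapework.go in package promscrape) exercises both the fast path and the switch:

```go
package promscrape

import "testing"

// BenchmarkIsAutoMetric feeds isAutoMetric() a mix dominated by ordinary metric
// names, which the new strings.HasPrefix(s, "scrape_") check rejects without
// evaluating the switch. Illustrative benchmark only.
func BenchmarkIsAutoMetric(b *testing.B) {
	names := []string{
		"up",                      // fast-path true
		"http_requests_total",     // rejected by the prefix check
		"node_cpu_seconds_total",  // rejected by the prefix check
		"scrape_duration_seconds", // matched by the switch
	}
	var sink bool
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, name := range names {
			sink = isAutoMetric(name)
		}
	}
	_ = sink
}
```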