From 57000f5105fb360f8b6a04eb13e97f7efed078b5 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Tue, 16 Jul 2024 12:24:14 +0200
Subject: [PATCH] lib/promscrape: follow-up for 1e83598be330b844b58041966129ce9a728027ac

- Clarify that the -promscrape.maxScrapeSize value is used for limiting the maximum scrape size if the max_scrape_size option isn't set at https://docs.victoriametrics.com/sd_configs/#scrape_configs
- Fix the query example for the scrape_response_size_bytes metric at https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics
- Mention the max_scrape_size option in the -help description for the -promscrape.maxScrapeSize command-line flag
- Treat zero value of the max_scrape_size option as 'use the limit from the -promscrape.maxScrapeSize command-line flag'
- Change the type of scrapeResponseSize struct fields and function args from float64 to int, since the response size cannot be fractional
- Optimize the isAutoMetric() function a bit
- Sort auto metrics in alphabetical order in the isAutoMetric() and scrapeWork.addAutoMetrics() functions for easier future maintenance

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6434
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429
---
 docs/CHANGELOG.md                   |  2 +-
 docs/sd_configs.md                  |  2 +-
 docs/vmagent.md                     | 17 ++++-----
 lib/promscrape/client.go            |  4 +--
 lib/promscrape/config.go            | 12 ++++---
 lib/promscrape/config_test.go       |  4 +--
 lib/promscrape/scrapework.go        | 56 ++++++++++++++++++-----------
 lib/promscrape/targetstatus.go      |  6 ++--
 lib/promscrape/targetstatus.qtpl    |  4 +--
 lib/promscrape/targetstatus.qtpl.go |  4 +--
 10 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 92626d0b0..c3d704f36 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -81,7 +81,7 @@ Released at 2024-06-24
 * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `-idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs.
 * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add automatic retry for requests to backend for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): increase default value of `-promscrape.maxDroppedTargets` command-line flag to 10_000 from 1000. This makes it easier to track down large number of dropped targets. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6381).
-* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` parameter to a scrape config for setting a custom scrape limit for a job. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` was added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
+* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` option to [scrape config](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for setting a custom limit on the response size a target can send. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` is added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
 * FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): check for ranged vector arguments in non-rollup expressions when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `sum(up[5m])` or `absent(up[5m])` will fail to execute if these flags are set.
 * FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): validate that rollup expressions has ranged vector arguments passed when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `rate(metric)` or `count_over_time(metric)` will fail to execute if these flags are set.
 * FEATURE: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): support file path with hierarchical patterns and regexpes, and http url in unittest cmd-line flag `-files`, e.g. `-files="http:///path/to/rules"` or `-files="dir/**/*.yaml"`.
diff --git a/docs/sd_configs.md b/docs/sd_configs.md
index 9a959d833..dda1248f2 100644
--- a/docs/sd_configs.md
+++ b/docs/sd_configs.md
@@ -1693,7 +1693,7 @@ scrape_configs:
   # scrape_timeout: <duration>
 
   # max_scrape_size is an optional parameter for limiting the response size in bytes from scraped targets.
-  # By default, uses limit from -promscrape.maxScrapeSize command-line flag.
+  # If max_scrape_size isn't set, then the limit from the -promscrape.maxScrapeSize command-line flag is used.
   # Example values:
   # - "10MiB" - 10 * 1024 * 1024 bytes
   # - "100MB" - 100 * 1000 * 1000 bytes
diff --git a/docs/vmagent.md b/docs/vmagent.md
index 4954ac0a5..35d509058 100644
--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@@ -486,14 +486,6 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
   scrape_duration_seconds > 1.5
   ```
 
-* `scrape_response_size_bytes` - response size in bytes for the given target. This allows to monitor amount of data scraped
-  and to adjust `max_scrape_size` for scraped targets. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
-  returns targets with scrape response > 10MiB:
-
-  ```metricsql
-  max_scrape_size > 10MiB
-  ```
-
 * `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets
   with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
   returns targets (identified by `instance` label),
@@ -503,6 +495,15 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
   scrape_duration_seconds / scrape_timeout_seconds > 0.8
   ```
 
+* `scrape_response_size_bytes` - response size in bytes for the given target. This allows monitoring the amount of scraped data
+  and adjusting the [`max_scrape_size` option](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for scraped targets.
+  For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets with scrape responses
+  bigger than `10MiB`:
+
+  ```metricsql
+  scrape_response_size_bytes > 10MiB
+  ```
+
 * `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets,
   which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
   returns targets, which expose more than 10000 metrics:
diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go
index fce6d8dce..bf3bb6e21 100644
--- a/lib/promscrape/client.go
+++ b/lib/promscrape/client.go
@@ -155,9 +155,9 @@ func (c *client) ReadData(dst *bytesutil.ByteBuffer) error {
 	}
 	if int64(len(dst.B)) >= c.maxScrapeSize {
 		maxScrapeSizeExceeded.Inc()
-		return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize=%d or max_scrape_size in a scrape config. "+
+		return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize or max_scrape_size in the scrape config (%d bytes). "+
 			"Possible solutions are: reduce the response size for the target, increase -promscrape.maxScrapeSize command-line flag, "+
-			"increase max_scrape_size value in scrape config", c.scrapeURL, maxScrapeSize.N)
+			"increase the max_scrape_size value in the scrape config for the given target", c.scrapeURL, c.maxScrapeSize)
 	}
 	return nil
 }
diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go
index 2c1ac2051..7c94f6ae5 100644
--- a/lib/promscrape/config.go
+++ b/lib/promscrape/config.go
@@ -78,7 +78,7 @@ var (
 		"then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. "+
 		"See https://docs.victoriametrics.com/vmagent/#scraping-big-number-of-targets for more info")
 	maxScrapeSize = flagutil.NewBytes("promscrape.maxScrapeSize", 16*1024*1024, "The maximum size of scrape response in bytes to process from Prometheus targets. "+
-		"Bigger responses are rejected")
+		"Bigger responses are rejected. See also the max_scrape_size option at https://docs.victoriametrics.com/sd_configs/#scrape_configs")
 )
 
 var clusterMemberID int
@@ -852,12 +852,14 @@ func getScrapeWorkConfig(sc *ScrapeConfig, baseDir string, globalCfg *GlobalConf
 		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1281#issuecomment-840538907
 		scrapeTimeout = scrapeInterval
 	}
-	var err error
 	mss := maxScrapeSize.N
-	if len(sc.MaxScrapeSize) > 0 {
-		mss, err = flagutil.ParseBytes(sc.MaxScrapeSize)
+	if sc.MaxScrapeSize != "" {
+		n, err := flagutil.ParseBytes(sc.MaxScrapeSize)
 		if err != nil {
-			return nil, fmt.Errorf("unexpected `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err)
+			return nil, fmt.Errorf("cannot parse `max_scrape_size` value %q for `job_name` %q: %w", sc.MaxScrapeSize, jobName, err)
+		}
+		if n > 0 {
+			mss = n
 		}
 	}
 	honorLabels := sc.HonorLabels
diff --git a/lib/promscrape/config_test.go b/lib/promscrape/config_test.go
index ac1b12146..bd4a39c44 100644
--- a/lib/promscrape/config_test.go
+++ b/lib/promscrape/config_test.go
@@ -993,7 +993,7 @@ scrape_configs:
 scrape_configs:
 - job_name: foo
   scheme: https
-  max_scrape_size: 0
+  max_scrape_size: 1
   relabel_configs:
   - action: keep
     source_labels: [__address__]
@@ -1015,7 +1015,7 @@
 		ScrapeURL:       "http://foo.bar:1234/metrics",
 		ScrapeInterval:  defaultScrapeInterval,
 		ScrapeTimeout:   defaultScrapeTimeout,
-		MaxScrapeSize:   0,
+		MaxScrapeSize:   1,
 		Labels: promutils.NewLabelsFromMap(map[string]string{
 			"instance": "foo.bar:1234",
 			"job":      "3",
diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go
index 24e882bc5..32f365334 100644
--- a/lib/promscrape/scrapework.go
+++ b/lib/promscrape/scrapework.go
@@ -500,10 +500,11 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 	if sw.seriesLimitExceeded || !areIdenticalSeries {
 		samplesDropped = sw.applySeriesLimit(wc)
 	}
+	responseSize := len(bodyString)
 	am := &autoMetrics{
 		up:                        up,
 		scrapeDurationSeconds:     scrapeDurationSeconds,
-		scrapeResponseSize:        float64(len(bodyString)),
+		scrapeResponseSize:        responseSize,
 		samplesScraped:            samplesScraped,
 		samplesPostRelabeling:     samplesPostRelabeling,
 		seriesAdded:               seriesAdded,
@@ -512,7 +513,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 	sw.addAutoMetrics(am, wc, scrapeTimestamp)
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
 	sw.prevLabelsLen = len(wc.labels)
-	sw.prevBodyLen = len(bodyString)
+	sw.prevBodyLen = responseSize
 	wc.reset()
 	writeRequestCtxPool.Put(wc)
 	// body must be released only after wc is released, since wc refers to body.
@@ -523,7 +524,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
 		sw.storeLastScrape(body)
 	}
 	sw.finalizeLastScrape()
-	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
+	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
 	return err
 }
 
@@ -581,10 +582,11 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 		// This is a trade-off between performance and accuracy.
 		seriesAdded = sw.getSeriesAdded(lastScrape, bodyString)
 	}
+	responseSize := len(bodyString)
 	am := &autoMetrics{
 		up:                        up,
 		scrapeDurationSeconds:     scrapeDurationSeconds,
-		scrapeResponseSize:        float64(len(bodyString)),
+		scrapeResponseSize:        responseSize,
 		samplesScraped:            samplesScraped,
 		samplesPostRelabeling:     samplesPostRelabeling,
 		seriesAdded:               seriesAdded,
@@ -593,7 +595,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 	sw.addAutoMetrics(am, wc, scrapeTimestamp)
 	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
 	sw.prevLabelsLen = len(wc.labels)
-	sw.prevBodyLen = len(bodyString)
+	sw.prevBodyLen = responseSize
 	wc.reset()
 	writeRequestCtxPool.Put(wc)
 	if !areIdenticalSeries {
@@ -603,7 +605,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
 		sw.storeLastScrape(body.B)
 	}
 	sw.finalizeLastScrape()
-	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
+	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
 	// Do not track active series in streaming mode, since this may need too big amounts of memory
 	// when the target exports too big number of metrics.
 	return err
@@ -815,7 +817,7 @@ func (sw *scrapeWork) getLabelsHash(labels []prompbmarshal.Label) uint64 {
 
 type autoMetrics struct {
 	up                        int
 	scrapeDurationSeconds     float64
-	scrapeResponseSize        float64
+	scrapeResponseSize        int
 	samplesScraped            int
 	samplesPostRelabeling     int
 	seriesAdded               int
@@ -823,35 +825,47 @@
 }
 
 func isAutoMetric(s string) bool {
-	switch s {
-	case "up", "scrape_duration_seconds", "scrape_samples_scraped",
-		"scrape_samples_post_metric_relabeling", "scrape_series_added",
-		"scrape_timeout_seconds", "scrape_samples_limit",
-		"scrape_series_limit_samples_dropped", "scrape_series_limit",
-		"scrape_series_current", "scrape_response_size_bytes":
+	if s == "up" {
 		return true
 	}
-	return false
+	if !strings.HasPrefix(s, "scrape_") {
+		return false
+	}
+	switch s {
+	case "scrape_duration_seconds",
+		"scrape_response_size_bytes",
+		"scrape_samples_limit",
+		"scrape_samples_post_metric_relabeling",
+		"scrape_samples_scraped",
+		"scrape_series_added",
+		"scrape_series_current",
+		"scrape_series_limit",
+		"scrape_series_limit_samples_dropped",
+		"scrape_timeout_seconds":
+		return true
+	default:
+		return false
+	}
 }
 
 func (sw *scrapeWork) addAutoMetrics(am *autoMetrics, wc *writeRequestCtx, timestamp int64) {
-	sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
 	sw.addAutoTimeseries(wc, "scrape_duration_seconds", am.scrapeDurationSeconds, timestamp)
-	sw.addAutoTimeseries(wc, "scrape_response_size_bytes", am.scrapeResponseSize, timestamp)
-	sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
-	sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_response_size_bytes", float64(am.scrapeResponseSize), timestamp)
 	if sampleLimit := sw.Config.SampleLimit; sampleLimit > 0 {
 		// Expose scrape_samples_limit metric if sample_limit config is set for the target.
 		// See https://github.com/VictoriaMetrics/operator/issues/497
 		sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sampleLimit), timestamp)
 	}
+	sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
+	sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
 	if sl := sw.seriesLimiter; sl != nil {
+		sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
 		sw.addAutoTimeseries(wc, "scrape_series_limit_samples_dropped", float64(am.seriesLimitSamplesDropped), timestamp)
 		sw.addAutoTimeseries(wc, "scrape_series_limit", float64(sl.MaxItems()), timestamp)
-		sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
 	}
+	sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
+	sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
 }
 
 // addAutoTimeseries adds automatically generated time series with the given name, value and timestamp.
diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go
index 859fb68d8..2a74a3f69 100644
--- a/lib/promscrape/targetstatus.go
+++ b/lib/promscrape/targetstatus.go
@@ -178,7 +178,7 @@ func (tsm *targetStatusMap) Unregister(sw *scrapeWork) {
 	tsm.mu.Unlock()
 }
 
-func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize float64, samplesScraped int, err error) {
+func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize, samplesScraped int, err error) {
 	jobName := sw.Config.jobNameOriginal
 
 	tsm.mu.Lock()
@@ -300,7 +300,7 @@ type targetStatus struct {
 	up                 bool
 	scrapeTime         int64
 	scrapeDuration     int64
-	scrapeResponseSize float64
+	scrapeResponseSize int
 	samplesScraped     int
 	scrapesTotal       int
 	scrapesFailed      int
@@ -319,7 +319,7 @@ func (ts *targetStatus) getSizeFromLastScrape() string {
 	if ts.scrapeResponseSize <= 0 {
 		return "never scraped"
 	}
-	return fmt.Sprintf("%.3f kb", float64(ts.scrapeResponseSize)/1024)
+	return fmt.Sprintf("%.3fKiB", float64(ts.scrapeResponseSize)/1024)
 }
 
 type droppedTargets struct {
diff --git a/lib/promscrape/targetstatus.qtpl b/lib/promscrape/targetstatus.qtpl
index 8bacbcba1..6f0cd7a95 100644
--- a/lib/promscrape/targetstatus.qtpl
+++ b/lib/promscrape/targetstatus.qtpl
@@ -27,9 +27,9 @@
 		{% if filter.showOriginalLabels %}originalLabels={%s= ts.sw.Config.OriginalLabels.String() %},{% space %}{% endif %}
 		scrapes_total={%d ts.scrapesTotal %},{% space %}
 		scrapes_failed={%d ts.scrapesFailed %},{% space %}
-		last_scrape={%s ts.getDurationFromLastScrape() %},{% space %}
+		last_scrape={%s= ts.getDurationFromLastScrape() %},{% space %}
 		scrape_duration={%d int(ts.scrapeDuration) %}ms,{% space %}
-		scrape_response_size={%s ts.getSizeFromLastScrape() %},{% space %}
+		scrape_response_size={%s= ts.getSizeFromLastScrape() %},{% space %}
 		samples_scraped={%d ts.samplesScraped %},{% space %}
 		error={% if ts.err != nil %}{%s= ts.err.Error() %}{% endif %}
 		{% newline %}
diff --git a/lib/promscrape/targetstatus.qtpl.go b/lib/promscrape/targetstatus.qtpl.go
index 53e820a52..10a15586d 100644
--- a/lib/promscrape/targetstatus.qtpl.go
+++ b/lib/promscrape/targetstatus.qtpl.go
@@ -127,7 +127,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
 //line lib/promscrape/targetstatus.qtpl:29
 	qw422016.N().S(`last_scrape=`)
 //line lib/promscrape/targetstatus.qtpl:30
-	qw422016.E().S(ts.getDurationFromLastScrape())
+	qw422016.N().S(ts.getDurationFromLastScrape())
 //line lib/promscrape/targetstatus.qtpl:30
 	qw422016.N().S(`,`)
 //line lib/promscrape/targetstatus.qtpl:30
@@ -143,7 +143,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
 //line lib/promscrape/targetstatus.qtpl:31
 	qw422016.N().S(`scrape_response_size=`)
 //line lib/promscrape/targetstatus.qtpl:32
-	qw422016.E().S(ts.getSizeFromLastScrape())
+	qw422016.N().S(ts.getSizeFromLastScrape())
 //line lib/promscrape/targetstatus.qtpl:32
 	qw422016.N().S(`,`)
 //line lib/promscrape/targetstatus.qtpl:32
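
The net effect of the getScrapeWorkConfig() change in lib/promscrape/config.go is the following limit-resolution rule, shown here as a minimal standalone sketch (the effectiveMaxScrapeSize helper and its flagLimit argument are illustrative names, not code from the patch):

```go
package main

import "fmt"

// effectiveMaxScrapeSize mirrors the fallback rule from getScrapeWorkConfig():
// a positive per-job max_scrape_size overrides the global limit, while an unset
// or zero max_scrape_size falls back to the -promscrape.maxScrapeSize flag value.
func effectiveMaxScrapeSize(jobLimit, flagLimit int64) int64 {
	if jobLimit > 0 {
		return jobLimit // per-job override wins
	}
	return flagLimit // unset or zero: use the command-line flag limit
}

func main() {
	const flagLimit = 16 * 1024 * 1024 // default -promscrape.maxScrapeSize (16MiB)

	fmt.Println(effectiveMaxScrapeSize(0, flagLimit))            // 16777216: falls back to the flag
	fmt.Println(effectiveMaxScrapeSize(10*1024*1024, flagLimit)) // 10485760: per-job value wins
}
```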
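
The isAutoMetric() rewrite in lib/promscrape/scrapework.go gains mostly on ordinary metric names, which are now rejected by the cheap "scrape_" prefix check before the string switch runs. A micro-benchmark along these lines (hypothetical, not part of the patch; it would live next to scrapework.go in package promscrape) exercises both the fast path and the switch:

```go
package promscrape

import "testing"

// BenchmarkIsAutoMetric feeds isAutoMetric() a mix dominated by ordinary metric
// names, which the new strings.HasPrefix(s, "scrape_") check rejects without
// evaluating the switch. Illustrative benchmark only.
func BenchmarkIsAutoMetric(b *testing.B) {
	names := []string{
		"up",                      // fast-path true
		"http_requests_total",     // rejected by the prefix check
		"node_cpu_seconds_total",  // rejected by the prefix check
		"scrape_duration_seconds", // matched by the switch
	}
	var sink bool
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, name := range names {
			sink = isAutoMetric(name)
		}
	}
	_ = sink
}
```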