mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
lib/promscrape: follow-up for 1e83598be3
- Clarify at https://docs.victoriametrics.com/sd_configs/#scrape_configs that the -promscrape.maxScrapeSize value is used for limiting the maximum scrape size if the max_scrape_size option isn't set
- Fix the query example for the scrape_response_size_bytes metric at https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics
- Mention the max_scrape_size option in the -help description for the -promscrape.maxScrapeSize command-line flag
- Treat zero value for the max_scrape_size option as 'no scrape size limit' (the limit from -promscrape.maxScrapeSize is used in this case)
- Change float64 to int type for scrapeResponseSize struct fields and function args, since response size cannot be fractional
- Optimize isAutoMetric() function a bit
- Sort auto metrics in alphabetical order in isAutoMetric() and in scrapeWork.addAutoMetrics() functions for better maintainability in the future

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6434
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429
This commit is contained in:
parent
9c02f27ef9
commit
57000f5105
10 changed files with 64 additions and 47 deletions
|
@ -81,7 +81,7 @@ Released at 2024-06-24
|
|||
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `-idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs.
|
||||
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add automatic retry for requests to backend for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): increase default value of `-promscrape.maxDroppedTargets` command-line flag to 10_000 from 1000. This makes it easier to track down large number of dropped targets. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6381).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` parameter to a scrape config for setting a custom scrape limit for a job. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` was added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` option to [scrape config](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for setting a custom limit on the response size a target can send. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` is added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612).
|
||||
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): check for ranged vector arguments in non-rollup expressions when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `sum(up[5m])` or `absent(up[5m])` will fail to execute if these flags are set.
|
||||
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): validate that rollup expressions have ranged vector arguments passed when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `rate(metric)` or `count_over_time(metric)` will fail to execute if these flags are set.
|
||||
* FEATURE: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): support file paths with hierarchical patterns and regexps, as well as HTTP URLs, in the unittest command-line flag `-files`, e.g. `-files="http://<some-server-addr>/path/to/rules"` or `-files="dir/**/*.yaml"`.
|
||||
|
|
|
@ -1693,7 +1693,7 @@ scrape_configs:
|
|||
# scrape_timeout: <duration>
|
||||
|
||||
# max_scrape_size is an optional parameter for limiting the response size in bytes from scraped targets.
|
||||
# By default, uses limit from -promscrape.maxScrapeSize command-line flag.
|
||||
# If max_scrape_size isn't set, then the limit from -promscrape.maxScrapeSize command-line flag is used instead.
|
||||
# Example values:
|
||||
# - "10MiB" - 10 * 1024 * 1024 bytes
|
||||
# - "100MB" - 100 * 1000 * 1000 bytes
|
||||
|
|
|
@ -486,14 +486,6 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
|
|||
scrape_duration_seconds > 1.5
|
||||
```
|
||||
|
||||
* `scrape_response_size_bytes` - response size in bytes for the given target. This allows to monitor amount of data scraped
|
||||
and to adjust `max_scrape_size` for scraped targets. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
|
||||
returns targets with scrape response > 10MiB:
|
||||
|
||||
```metricsql
|
||||
max_scrape_size > 10MiB
|
||||
```
|
||||
|
||||
* `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`).
|
||||
This allows detecting targets with scrape durations close to the configured scrape timeout.
|
||||
For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets (identified by `instance` label),
|
||||
|
@ -503,6 +495,15 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
|
|||
scrape_duration_seconds / scrape_timeout_seconds > 0.8
|
||||
```
|
||||
|
||||
* `scrape_response_size_bytes` - response size in bytes for the given target. This allows monitoring the amount of data scraped
|
||||
and to adjust [`max_scrape_size` option](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for scraped targets.
|
||||
For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets with scrape response
|
||||
bigger than `10MiB`:
|
||||
|
||||
```metricsql
|
||||
scrape_response_size_bytes > 10MiB
|
||||
```
|
||||
|
||||
* `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets,
|
||||
which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/)
|
||||
returns targets, which expose more than 10000 metrics:
|
||||
|
|
|
@ -155,9 +155,9 @@ func (c *client) ReadData(dst *bytesutil.ByteBuffer) error {
|
|||
}
|
||||
if int64(len(dst.B)) >= c.maxScrapeSize {
|
||||
maxScrapeSizeExceeded.Inc()
|
||||
return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize=%d or max_scrape_size in a scrape config. "+
|
||||
return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize or max_scrape_size in the scrape config (%d bytes). "+
|
||||
"Possible solutions are: reduce the response size for the target, increase -promscrape.maxScrapeSize command-line flag, "+
|
||||
"increase max_scrape_size value in scrape config", c.scrapeURL, maxScrapeSize.N)
|
||||
"increase max_scrape_size value in scrape config for the given target", c.scrapeURL, maxScrapeSize.N)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -78,7 +78,7 @@ var (
|
|||
"then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. "+
|
||||
"See https://docs.victoriametrics.com/vmagent/#scraping-big-number-of-targets for more info")
|
||||
maxScrapeSize = flagutil.NewBytes("promscrape.maxScrapeSize", 16*1024*1024, "The maximum size of scrape response in bytes to process from Prometheus targets. "+
|
||||
"Bigger responses are rejected")
|
||||
"Bigger responses are rejected. See also max_scrape_size option at https://docs.victoriametrics.com/sd_configs/#scrape_configs")
|
||||
)
|
||||
|
||||
var clusterMemberID int
|
||||
|
@ -852,12 +852,14 @@ func getScrapeWorkConfig(sc *ScrapeConfig, baseDir string, globalCfg *GlobalConf
|
|||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1281#issuecomment-840538907
|
||||
scrapeTimeout = scrapeInterval
|
||||
}
|
||||
var err error
|
||||
mss := maxScrapeSize.N
|
||||
if len(sc.MaxScrapeSize) > 0 {
|
||||
mss, err = flagutil.ParseBytes(sc.MaxScrapeSize)
|
||||
if sc.MaxScrapeSize != "" {
|
||||
n, err := flagutil.ParseBytes(sc.MaxScrapeSize)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unexpected `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err)
|
||||
return nil, fmt.Errorf("cannot parse `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err)
|
||||
}
|
||||
if n > 0 {
|
||||
mss = n
|
||||
}
|
||||
}
|
||||
honorLabels := sc.HonorLabels
|
||||
|
|
|
@ -993,7 +993,7 @@ scrape_configs:
|
|||
scrape_configs:
|
||||
- job_name: foo
|
||||
scheme: https
|
||||
max_scrape_size: 0
|
||||
max_scrape_size: 1
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [__address__]
|
||||
|
@ -1015,7 +1015,7 @@ scrape_configs:
|
|||
ScrapeURL: "http://foo.bar:1234/metrics",
|
||||
ScrapeInterval: defaultScrapeInterval,
|
||||
ScrapeTimeout: defaultScrapeTimeout,
|
||||
MaxScrapeSize: 0,
|
||||
MaxScrapeSize: 1,
|
||||
Labels: promutils.NewLabelsFromMap(map[string]string{
|
||||
"instance": "foo.bar:1234",
|
||||
"job": "3",
|
||||
|
|
|
@ -500,10 +500,11 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
|
|||
if sw.seriesLimitExceeded || !areIdenticalSeries {
|
||||
samplesDropped = sw.applySeriesLimit(wc)
|
||||
}
|
||||
responseSize := len(bodyString)
|
||||
am := &autoMetrics{
|
||||
up: up,
|
||||
scrapeDurationSeconds: scrapeDurationSeconds,
|
||||
scrapeResponseSize: float64(len(bodyString)),
|
||||
scrapeResponseSize: responseSize,
|
||||
samplesScraped: samplesScraped,
|
||||
samplesPostRelabeling: samplesPostRelabeling,
|
||||
seriesAdded: seriesAdded,
|
||||
|
@ -512,7 +513,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
|
|||
sw.addAutoMetrics(am, wc, scrapeTimestamp)
|
||||
sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
|
||||
sw.prevLabelsLen = len(wc.labels)
|
||||
sw.prevBodyLen = len(bodyString)
|
||||
sw.prevBodyLen = responseSize
|
||||
wc.reset()
|
||||
writeRequestCtxPool.Put(wc)
|
||||
// body must be released only after wc is released, since wc refers to body.
|
||||
|
@ -523,7 +524,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b
|
|||
sw.storeLastScrape(body)
|
||||
}
|
||||
sw.finalizeLastScrape()
|
||||
tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
|
||||
tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -581,10 +582,11 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
|
|||
// This is a trade-off between performance and accuracy.
|
||||
seriesAdded = sw.getSeriesAdded(lastScrape, bodyString)
|
||||
}
|
||||
responseSize := len(bodyString)
|
||||
am := &autoMetrics{
|
||||
up: up,
|
||||
scrapeDurationSeconds: scrapeDurationSeconds,
|
||||
scrapeResponseSize: float64(len(bodyString)),
|
||||
scrapeResponseSize: responseSize,
|
||||
samplesScraped: samplesScraped,
|
||||
samplesPostRelabeling: samplesPostRelabeling,
|
||||
seriesAdded: seriesAdded,
|
||||
|
@ -593,7 +595,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
|
|||
sw.addAutoMetrics(am, wc, scrapeTimestamp)
|
||||
sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
|
||||
sw.prevLabelsLen = len(wc.labels)
|
||||
sw.prevBodyLen = len(bodyString)
|
||||
sw.prevBodyLen = responseSize
|
||||
wc.reset()
|
||||
writeRequestCtxPool.Put(wc)
|
||||
if !areIdenticalSeries {
|
||||
|
@ -603,7 +605,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int
|
|||
sw.storeLastScrape(body.B)
|
||||
}
|
||||
sw.finalizeLastScrape()
|
||||
tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err)
|
||||
tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
|
||||
// Do not track active series in streaming mode, since this may need too big amounts of memory
|
||||
// when the target exports too big number of metrics.
|
||||
return err
|
||||
|
@ -815,7 +817,7 @@ func (sw *scrapeWork) getLabelsHash(labels []prompbmarshal.Label) uint64 {
|
|||
type autoMetrics struct {
|
||||
up int
|
||||
scrapeDurationSeconds float64
|
||||
scrapeResponseSize float64
|
||||
scrapeResponseSize int
|
||||
samplesScraped int
|
||||
samplesPostRelabeling int
|
||||
seriesAdded int
|
||||
|
@ -823,35 +825,47 @@ type autoMetrics struct {
|
|||
}
|
||||
|
||||
func isAutoMetric(s string) bool {
|
||||
switch s {
|
||||
case "up", "scrape_duration_seconds", "scrape_samples_scraped",
|
||||
"scrape_samples_post_metric_relabeling", "scrape_series_added",
|
||||
"scrape_timeout_seconds", "scrape_samples_limit",
|
||||
"scrape_series_limit_samples_dropped", "scrape_series_limit",
|
||||
"scrape_series_current", "scrape_response_size_bytes":
|
||||
if s == "up" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
if !strings.HasPrefix(s, "scrape_") {
|
||||
return false
|
||||
}
|
||||
switch s {
|
||||
case "scrape_duration_seconds",
|
||||
"scrape_response_size_bytes",
|
||||
"scrape_samples_limit",
|
||||
"scrape_samples_post_metric_relabeling",
|
||||
"scrape_samples_scraped",
|
||||
"scrape_series_added",
|
||||
"scrape_series_current",
|
||||
"scrape_series_limit",
|
||||
"scrape_series_limit_samples_dropped",
|
||||
"scrape_timeout_seconds":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (sw *scrapeWork) addAutoMetrics(am *autoMetrics, wc *writeRequestCtx, timestamp int64) {
|
||||
sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_duration_seconds", am.scrapeDurationSeconds, timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_response_size_bytes", am.scrapeResponseSize, timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_response_size_bytes", float64(am.scrapeResponseSize), timestamp)
|
||||
if sampleLimit := sw.Config.SampleLimit; sampleLimit > 0 {
|
||||
// Expose scrape_samples_limit metric if sample_limit config is set for the target.
|
||||
// See https://github.com/VictoriaMetrics/operator/issues/497
|
||||
sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sampleLimit), timestamp)
|
||||
}
|
||||
sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp)
|
||||
if sl := sw.seriesLimiter; sl != nil {
|
||||
sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_series_limit_samples_dropped", float64(am.seriesLimitSamplesDropped), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_series_limit", float64(sl.MaxItems()), timestamp)
|
||||
sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp)
|
||||
}
|
||||
sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp)
|
||||
sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp)
|
||||
}
|
||||
|
||||
// addAutoTimeseries adds automatically generated time series with the given name, value and timestamp.
|
||||
|
|
|
@ -178,7 +178,7 @@ func (tsm *targetStatusMap) Unregister(sw *scrapeWork) {
|
|||
tsm.mu.Unlock()
|
||||
}
|
||||
|
||||
func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize float64, samplesScraped int, err error) {
|
||||
func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize, samplesScraped int, err error) {
|
||||
jobName := sw.Config.jobNameOriginal
|
||||
|
||||
tsm.mu.Lock()
|
||||
|
@ -300,7 +300,7 @@ type targetStatus struct {
|
|||
up bool
|
||||
scrapeTime int64
|
||||
scrapeDuration int64
|
||||
scrapeResponseSize float64
|
||||
scrapeResponseSize int
|
||||
samplesScraped int
|
||||
scrapesTotal int
|
||||
scrapesFailed int
|
||||
|
@ -319,7 +319,7 @@ func (ts *targetStatus) getSizeFromLastScrape() string {
|
|||
if ts.scrapeResponseSize <= 0 {
|
||||
return "never scraped"
|
||||
}
|
||||
return fmt.Sprintf("%.3f kb", float64(ts.scrapeResponseSize)/1024)
|
||||
return fmt.Sprintf("%.3fKiB", float64(ts.scrapeResponseSize)/1024)
|
||||
}
|
||||
|
||||
type droppedTargets struct {
|
||||
|
|
|
@ -27,9 +27,9 @@
|
|||
{% if filter.showOriginalLabels %}originalLabels={%s= ts.sw.Config.OriginalLabels.String() %},{% space %}{% endif %}
|
||||
scrapes_total={%d ts.scrapesTotal %},{% space %}
|
||||
scrapes_failed={%d ts.scrapesFailed %},{% space %}
|
||||
last_scrape={%s ts.getDurationFromLastScrape() %},{% space %}
|
||||
last_scrape={%s= ts.getDurationFromLastScrape() %},{% space %}
|
||||
scrape_duration={%d int(ts.scrapeDuration) %}ms,{% space %}
|
||||
scrape_response_size={%s ts.getSizeFromLastScrape() %},{% space %}
|
||||
scrape_response_size={%s= ts.getSizeFromLastScrape() %},{% space %}
|
||||
samples_scraped={%d ts.samplesScraped %},{% space %}
|
||||
error={% if ts.err != nil %}{%s= ts.err.Error() %}{% endif %}
|
||||
{% newline %}
|
||||
|
|
|
@ -127,7 +127,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
|
|||
//line lib/promscrape/targetstatus.qtpl:29
|
||||
qw422016.N().S(`last_scrape=`)
|
||||
//line lib/promscrape/targetstatus.qtpl:30
|
||||
qw422016.E().S(ts.getDurationFromLastScrape())
|
||||
qw422016.N().S(ts.getDurationFromLastScrape())
|
||||
//line lib/promscrape/targetstatus.qtpl:30
|
||||
qw422016.N().S(`,`)
|
||||
//line lib/promscrape/targetstatus.qtpl:30
|
||||
|
@ -143,7 +143,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes
|
|||
//line lib/promscrape/targetstatus.qtpl:31
|
||||
qw422016.N().S(`scrape_response_size=`)
|
||||
//line lib/promscrape/targetstatus.qtpl:32
|
||||
qw422016.E().S(ts.getSizeFromLastScrape())
|
||||
qw422016.N().S(ts.getSizeFromLastScrape())
|
||||
//line lib/promscrape/targetstatus.qtpl:32
|
||||
qw422016.N().S(`,`)
|
||||
//line lib/promscrape/targetstatus.qtpl:32
|
||||
|
|
Loading…
Reference in a new issue