From 7d26414b2e98b0a949df207c1100293499c3db3b Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 17 Aug 2022 13:18:47 +0300 Subject: [PATCH] lib/promscrape: automatically generate additional per-target labels for targets with non-zero series limit The following metrics are generated: - scrape_series_limit - scrape_series_current - scrape_series_limit_samples_dropped These metrics simplify alerting on targets, which expose too many time series See https://docs.victoriametrics.com/vmagent.html#automatically-generated-metrics and https://docs.victoriametrics.com/vmagent.html#cardinality-limiter for more details --- app/vmagent/README.md | 50 ++++++++++--- docs/CHANGELOG.md | 1 + docs/vmagent.md | 50 ++++++++++--- lib/promscrape/scrapework.go | 112 ++++++++++++++++-------------- lib/promscrape/scrapework_test.go | 61 ++++++++++++++++ 5 files changed, 202 insertions(+), 72 deletions(-) diff --git a/app/vmagent/README.md b/app/vmagent/README.md index ea63ac7fb..30ef44779 100644 --- a/app/vmagent/README.md +++ b/app/vmagent/README.md @@ -259,37 +259,37 @@ Extra labels can be added to metrics collected by `vmagent` via the following me up == 0 ``` -* `scrape_duration_seconds` - this metric exposes scrape duration. This allows monitoring slow scrapes. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns scrapes, which take more than 1.5 seconds to complete: +* `scrape_duration_seconds` - the duration of the scrape for the given target. This allows monitoring slow scrapes. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns scrapes, which take more than 1.5 seconds to complete: ```metricsql scrape_duration_seconds > 1.5 ``` -* `scrape_timeout_seconds` - this metric exposes the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets (identified by `instance` label), which take more than 80% of the configured `scrape_timeout` during scrapes: +* `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets (identified by `instance` label), which take more than 80% of the configured `scrape_timeout` during scrapes: ```metricsql scrape_duration_seconds / scrape_timeout_seconds > 0.8 ``` -* `scrape_samples_scraped` - this metric exposes the number of samples (aka metrics) parsed per each scrape. This allows detecting targets, which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which expose more than 10000 metrics: +* `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets, which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which expose more than 10000 metrics: ```metricsql scrape_samples_scraped > 10000 ``` -* `scrape_samples_limit` - this metric exposes the configured limit on the number of metrics the given target can expose. 
The limit can be set via `sample_limit` option at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This allows detecting targets, which expose too many metrics compared to the configured `sample_limit`. For example, the following query returns targets (identified by `instance` label), which expose more than 80% metrics compared to the configed `sample_limit`:
+* `scrape_samples_limit` - the configured limit on the number of metrics the given target can expose. The limit can be set via `sample_limit` option at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This metric is exposed only if the `sample_limit` is set. This allows detecting targets, which expose too many metrics compared to the configured `sample_limit`. For example, the following query returns targets (identified by `instance` label), which expose more than 80% metrics compared to the configured `sample_limit`:

  ```metricsql
  scrape_samples_scraped / scrape_samples_limit > 0.8
  ```

-* `scrape_samples_post_metric_relabeling` - this metric exposes the number of samples (aka metrics) left after applying metric-level relabeling from `metric_relabel_configs` section (see [relabeling docs](#relabeling) for more details). This allows detecting targets with too many metrics after the relabeling. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets with more than 10000 metrics after the relabeling:
+* `scrape_samples_post_metric_relabeling` - the number of samples (aka metrics) left after applying metric-level relabeling from `metric_relabel_configs` section (see [relabeling docs](#relabeling) for more details). This allows detecting targets with too many metrics after the relabeling. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets with more than 10000 metrics after the relabeling:

  ```metricsql
  scrape_samples_post_metric_relabeling > 10000
  ```

-* `scrape_series_added` - this metric exposes **an approximate** number of new series the given target generates during the current scrape. This metric allows detecting targets (identified by `instance` label), which lead to [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which generate more than 1000 new series during the last hour:
+* `scrape_series_added` - **an approximate** number of new series the given target generates during the current scrape. This metric allows detecting targets (identified by `instance` label), which lead to [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which generate more than 1000 new series during the last hour:

  ```metricsql
  sum_over_time(scrape_series_added[1h]) > 1000
@@ -297,6 +297,20 @@ Extra labels can be added to metrics collected by `vmagent` via the following me

  `vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line option (e.g. when [staleness markers](#prometheus-staleness-markers) are disabled).

+* `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter).
This metric is exposed only if the series limit is set.
+
+* `scrape_series_current` - the number of unique series the given target exposed so far. This metric is exposed only if the series limit is set according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This metric allows alerting when the number of series exposed by the given target reaches the limit. For example, the following query would alert when the target exposes more than 90% of unique series compared to the configured limit:
+
+  ```metricsql
+  scrape_series_current / scrape_series_limit > 0.9
+  ```
+
+* `scrape_series_limit_samples_dropped` - the number of dropped samples during the scrape because of the exceeded limit on the number of unique series. This metric is exposed only if the series limit is set according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This metric allows alerting when scraped samples are dropped because of the exceeded limit. For example, the following query alerts when at least a single sample is dropped because of the exceeded limit during the last hour:
+
+  ```metricsql
+  sum_over_time(scrape_series_limit_samples_dropped[1h]) > 0
+  ```
+

 ## Relabeling

@@ -562,10 +576,24 @@ By default `vmagent` doesn't limit the number of time series each scrape target
 * Via `series_limit` config option at `scrape_config` section. This limit is applied individually to all the scrape targets defined in the given `scrape_config`.
 * Via `__series_limit__` label, which can be set with [relabeling](#relabeling) at `relabel_configs` section. This limit is applied to the corresponding scrape targets. Typical use case: to set the limit via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets, which may expose too high number of time series.

-All the scraped metrics are dropped for time series exceeding the given limit. The exceeded limit can be [monitored](#monitoring) via `promscrape_series_limit_rows_dropped_total` metric.
-
 See also `sample_limit` option at [scrape_config section](https://docs.victoriametrics.com/sd_configs.html#scrape_configs).

+Scraped metrics are dropped for time series exceeding the given limit.
+
+`vmagent` creates the following additional per-target metrics for targets with non-zero series limit:
+
+- `scrape_series_limit_samples_dropped` - the number of dropped samples during the scrape when the unique series limit is exceeded.
+- `scrape_series_limit` - the series limit for the given target.
+- `scrape_series_current` - the current number of series for the given target.
+
+These metrics are automatically sent to the configured `-remoteWrite.url` along with the scraped per-target metrics.
+
+These metrics allow building the following alerting rules:
+
+- `scrape_series_current / scrape_series_limit > 0.9` - alerts when the number of series exposed by the target reaches 90% of the limit.
+- `sum_over_time(scrape_series_limit_samples_dropped[1h]) > 0` - alerts when some samples are dropped because the series limit on a particular target is reached.
+
+
 By default `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`. The limit can be enforced by setting the following command-line flags:

 * `-remoteWrite.maxHourlySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last hour. Useful for limiting the number of active time series.
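To make the accounting behind the three per-target metrics concrete, here is a minimal Go sketch of per-target series limiting. It is an illustration only, with hypothetical names: the actual change in this patch (see `applySeriesLimit` and `addAutoMetrics` in the `lib/promscrape/scrapework.go` hunks below) tracks series with a time-bounded `bloomfilter.Limiter` rather than a plain map.

```go
// Minimal sketch of per-target series limiting. Hypothetical, simplified code:
// the patch itself uses lib/bloomfilter.Limiter with a 24h window instead of a map.
package main

import (
	"fmt"
	"sort"
	"strings"
)

type seriesLimiter struct {
	limit int
	seen  map[string]struct{}
}

func newSeriesLimiter(limit int) *seriesLimiter {
	return &seriesLimiter{limit: limit, seen: make(map[string]struct{})}
}

// add registers the series with the given labels and reports whether its samples are kept.
func (sl *seriesLimiter) add(labels map[string]string) bool {
	key := seriesKey(labels)
	if _, ok := sl.seen[key]; ok {
		return true // an already tracked series is always kept
	}
	if len(sl.seen) >= sl.limit {
		return false // a new series above the limit - its samples are dropped
	}
	sl.seen[key] = struct{}{}
	return true
}

// seriesKey builds an order-independent identity for the label set.
func seriesKey(labels map[string]string) string {
	pairs := make([]string, 0, len(labels))
	for k, v := range labels {
		pairs = append(pairs, k+"="+v)
	}
	sort.Strings(pairs)
	return strings.Join(pairs, ",")
}

func main() {
	sl := newSeriesLimiter(1)
	scraped := []map[string]string{
		{"__name__": "foo", "bar": "baz"},
		{"__name__": "bar", "a": "b", "c": "d"},
	}
	dropped := 0
	for _, labels := range scraped {
		if !sl.add(labels) {
			dropped++
		}
	}
	// These three values are what vmagent reports as scrape_series_limit,
	// scrape_series_current and scrape_series_limit_samples_dropped.
	fmt.Println("scrape_series_limit:", sl.limit)                // 1
	fmt.Println("scrape_series_current:", len(sl.seen))          // 1
	fmt.Println("scrape_series_limit_samples_dropped:", dropped) // 1
}
```

With a limit of 1 and two scraped series, one sample is dropped and the three values come out as 1/1/1, which matches the `SeriesLimit: 1` test case added to `scrapework_test.go` at the end of this patch.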
@@ -573,10 +601,14 @@ By default `vmagent` doesn't limit the number of time series written to remote s Both limits can be set simultaneously. If any of these limits is reached, then samples for new time series are dropped instead of sending them to remote storage systems. A sample of dropped series is put in the log with `WARNING` level. -The exceeded limits can be [monitored](#monitoring) with the following metrics: +`vmagent` exposes the following metrics at `http://vmagent:8429/metrics` page (see [monitoring docs](#monitoring) for details): * `vmagent_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series. +* `vmagent_hourly_series_limit_max_series` - the hourly series limit set via `-remoteWrite.maxHourlySeries`. +* `vmagent_hourly_series_limit_current_series` - the current number of unique series registered during the last hour. * `vmagent_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series. +* `vmagent_daily_series_limit_max_series` - the daily series limit set via `-remoteWrite.maxDailySeries`. +* `vmagent_daily_series_limit_current_series` - the current number of unique series registered during the last day. These limits are approximate, so `vmagent` can underflow/overflow the limit by a small percentage (usually less than 1%). diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 580f40658..34fa854a8 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -21,6 +21,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) and [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs. * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance for heavy queries on systems with many CPU cores. +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): generate additional per-target metrics - `scrape_series_limit`, `scrape_series_current` and `scrape_series_limit_samples_dropped` if series limit is set according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This simplifies alerting on targets with the exceeded series limit. See [these docs](https://docs.victoriametrics.com/vmagent.html#automatically-generated-metrics) for details on these metrics. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for MX record types in [dns_sd_configs](https://docs.victoriametrics.com/sd_configs.html#dns_sd_configs) in the same way as Prometheus 2.38 [does](https://github.com/prometheus/prometheus/pull/10099). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `__meta_kubernetes_service_port_number` meta-label for `role: service` in [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs) in the same way as Prometheus 2.38 [does](https://github.com/prometheus/prometheus/pull/11002). 
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `__meta_kubernetes_pod_container_image` meta-label for `role: pod` in [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs) in the same way as Prometheus 2.38 [does](https://github.com/prometheus/prometheus/pull/11034). diff --git a/docs/vmagent.md b/docs/vmagent.md index 511670be2..987f43fd3 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -263,37 +263,37 @@ Extra labels can be added to metrics collected by `vmagent` via the following me up == 0 ``` -* `scrape_duration_seconds` - this metric exposes scrape duration. This allows monitoring slow scrapes. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns scrapes, which take more than 1.5 seconds to complete: +* `scrape_duration_seconds` - the duration of the scrape for the given target. This allows monitoring slow scrapes. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns scrapes, which take more than 1.5 seconds to complete: ```metricsql scrape_duration_seconds > 1.5 ``` -* `scrape_timeout_seconds` - this metric exposes the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets (identified by `instance` label), which take more than 80% of the configured `scrape_timeout` during scrapes: +* `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets (identified by `instance` label), which take more than 80% of the configured `scrape_timeout` during scrapes: ```metricsql scrape_duration_seconds / scrape_timeout_seconds > 0.8 ``` -* `scrape_samples_scraped` - this metric exposes the number of samples (aka metrics) parsed per each scrape. This allows detecting targets, which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which expose more than 10000 metrics: +* `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets, which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which expose more than 10000 metrics: ```metricsql scrape_samples_scraped > 10000 ``` -* `scrape_samples_limit` - this metric exposes the configured limit on the number of metrics the given target can expose. The limit can be set via `sample_limit` option at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). This allows detecting targets, which expose too many metrics compared to the configured `sample_limit`. For example, the following query returns targets (identified by `instance` label), which expose more than 80% metrics compared to the configed `sample_limit`: +* `scrape_samples_limit` - the configured limit on the number of metrics the given target can expose. The limit can be set via `sample_limit` option at [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs). 
This metric is exposed only if the `sample_limit` is set. This allows detecting targets, which expose too many metrics compared to the configured `sample_limit`. For example, the following query returns targets (identified by `instance` label), which expose more than 80% metrics compared to the configured `sample_limit`:

  ```metricsql
  scrape_samples_scraped / scrape_samples_limit > 0.8
  ```

-* `scrape_samples_post_metric_relabeling` - this metric exposes the number of samples (aka metrics) left after applying metric-level relabeling from `metric_relabel_configs` section (see [relabeling docs](#relabeling) for more details). This allows detecting targets with too many metrics after the relabeling. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets with more than 10000 metrics after the relabeling:
+* `scrape_samples_post_metric_relabeling` - the number of samples (aka metrics) left after applying metric-level relabeling from `metric_relabel_configs` section (see [relabeling docs](#relabeling) for more details). This allows detecting targets with too many metrics after the relabeling. For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets with more than 10000 metrics after the relabeling:

  ```metricsql
  scrape_samples_post_metric_relabeling > 10000
  ```

-* `scrape_series_added` - this metric exposes **an approximate** number of new series the given target generates during the current scrape. This metric allows detecting targets (identified by `instance` label), which lead to [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which generate more than 1000 new series during the last hour:
+* `scrape_series_added` - **an approximate** number of new series the given target generates during the current scrape. This metric allows detecting targets (identified by `instance` label), which lead to [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). For example, the following [MetricsQL query](https://docs.victoriametrics.com/MetricsQL.html) returns targets, which generate more than 1000 new series during the last hour:

  ```metricsql
  sum_over_time(scrape_series_added[1h]) > 1000
@@ -301,6 +301,20 @@ Extra labels can be added to metrics collected by `vmagent` via the following me

  `vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line option (e.g. when [staleness markers](#prometheus-staleness-markers) are disabled).

+* `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This metric is exposed only if the series limit is set.
+
+* `scrape_series_current` - the number of unique series the given target exposed so far. This metric is exposed only if the series limit is set according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This metric allows alerting when the number of series exposed by the given target reaches the limit. For example, the following query would alert when the target exposes more than 90% of unique series compared to the configured limit:
+
+  ```metricsql
+  scrape_series_current / scrape_series_limit > 0.9
+  ```
+
+* `scrape_series_limit_samples_dropped` - the number of dropped samples during the scrape because of the exceeded limit on the number of unique series. This metric is exposed only if the series limit is set according to [these docs](https://docs.victoriametrics.com/vmagent.html#cardinality-limiter). This metric allows alerting when scraped samples are dropped because of the exceeded limit. For example, the following query alerts when at least a single sample is dropped because of the exceeded limit during the last hour:
+
+  ```metricsql
+  sum_over_time(scrape_series_limit_samples_dropped[1h]) > 0
+  ```
+

 ## Relabeling

@@ -566,10 +580,24 @@ By default `vmagent` doesn't limit the number of time series each scrape target
 * Via `series_limit` config option at `scrape_config` section. This limit is applied individually to all the scrape targets defined in the given `scrape_config`.
 * Via `__series_limit__` label, which can be set with [relabeling](#relabeling) at `relabel_configs` section. This limit is applied to the corresponding scrape targets. Typical use case: to set the limit via [Kubernetes annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) for targets, which may expose too high number of time series.

-All the scraped metrics are dropped for time series exceeding the given limit. The exceeded limit can be [monitored](#monitoring) via `promscrape_series_limit_rows_dropped_total` metric.
-
 See also `sample_limit` option at [scrape_config section](https://docs.victoriametrics.com/sd_configs.html#scrape_configs).

+Scraped metrics are dropped for time series exceeding the given limit.
+
+`vmagent` creates the following additional per-target metrics for targets with non-zero series limit:
+
+- `scrape_series_limit_samples_dropped` - the number of dropped samples during the scrape when the unique series limit is exceeded.
+- `scrape_series_limit` - the series limit for the given target.
+- `scrape_series_current` - the current number of series for the given target.
+
+These metrics are automatically sent to the configured `-remoteWrite.url` along with the scraped per-target metrics.
+
+These metrics allow building the following alerting rules:
+
+- `scrape_series_current / scrape_series_limit > 0.9` - alerts when the number of series exposed by the target reaches 90% of the limit.
+- `sum_over_time(scrape_series_limit_samples_dropped[1h]) > 0` - alerts when some samples are dropped because the series limit on a particular target is reached.
+
+
 By default `vmagent` doesn't limit the number of time series written to remote storage systems specified at `-remoteWrite.url`. The limit can be enforced by setting the following command-line flags:

 * `-remoteWrite.maxHourlySeries` - limits the number of unique time series `vmagent` can write to remote storage systems during the last hour. Useful for limiting the number of active time series.
@@ -577,10 +605,14 @@ By default `vmagent` doesn't limit the number of time series written to remote s

 Both limits can be set simultaneously. If any of these limits is reached, then samples for new time series are dropped instead of sending them to remote storage systems. A sample of dropped series is put in the log with `WARNING` level.
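As a mental model for the hourly limit described in the paragraph above, here is a hypothetical sketch of a windowed unique-series limiter and how its fields would map onto the `vmagent_hourly_series_limit_*` metrics listed just below. It is not vmagent's implementation; as the docs note further down, the real limits are approximate.

```go
// Hypothetical sketch of an hourly unique-series limiter of the kind enabled by
// -remoteWrite.maxHourlySeries. vmagent's real limiter is approximate and is
// not part of this patch; all names here are illustrative.
package main

import (
	"fmt"
	"time"
)

type hourlyLimiter struct {
	maxSeries   int                 // vmagent_hourly_series_limit_max_series
	current     map[uint64]struct{} // len() -> vmagent_hourly_series_limit_current_series
	dropped     int                 // vmagent_hourly_series_limit_rows_dropped_total
	windowStart time.Time
}

func newHourlyLimiter(maxSeries int) *hourlyLimiter {
	return &hourlyLimiter{maxSeries: maxSeries, current: map[uint64]struct{}{}, windowStart: time.Now()}
}

// accept reports whether a sample for the series with the given hash may be written.
func (l *hourlyLimiter) accept(seriesHash uint64) bool {
	if time.Since(l.windowStart) >= time.Hour {
		// A new hourly window starts; previously seen series are forgotten.
		l.windowStart = time.Now()
		l.current = map[uint64]struct{}{}
	}
	if _, ok := l.current[seriesHash]; ok {
		return true
	}
	if len(l.current) >= l.maxSeries {
		l.dropped++
		return false
	}
	l.current[seriesHash] = struct{}{}
	return true
}

func main() {
	l := newHourlyLimiter(2)
	for h := uint64(1); h <= 4; h++ {
		fmt.Printf("series %d accepted: %v\n", h, l.accept(h))
	}
	fmt.Println("max:", l.maxSeries, "current:", len(l.current), "dropped:", l.dropped)
}
```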
-The exceeded limits can be [monitored](#monitoring) with the following metrics: +`vmagent` exposes the following metrics at `http://vmagent:8429/metrics` page (see [monitoring docs](#monitoring) for details): * `vmagent_hourly_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded hourly limit on the number of unique time series. +* `vmagent_hourly_series_limit_max_series` - the hourly series limit set via `-remoteWrite.maxHourlySeries`. +* `vmagent_hourly_series_limit_current_series` - the current number of unique series registered during the last hour. * `vmagent_daily_series_limit_rows_dropped_total` - the number of metrics dropped due to exceeded daily limit on the number of unique time series. +* `vmagent_daily_series_limit_max_series` - the daily series limit set via `-remoteWrite.maxDailySeries`. +* `vmagent_daily_series_limit_current_series` - the current number of unique series registered during the last day. These limits are approximate, so `vmagent` can underflow/overflow the limit by a small percentage (usually less than 1%). diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go index eb411a30c..c9e1fea7a 100644 --- a/lib/promscrape/scrapework.go +++ b/lib/promscrape/scrapework.go @@ -209,9 +209,6 @@ type scrapeWork struct { // Optional limiter on the number of unique series per scrape target. seriesLimiter *bloomfilter.Limiter - // Optional counter on the number of dropped samples if the limit on the number of unique series is set. - seriesLimiterRowsDroppedTotal *metrics.Counter - // prevBodyLen contains the previous response body length for the given scrape work. // It is used as a hint in order to reduce memory usage for body buffers. prevBodyLen int @@ -343,14 +340,8 @@ func (sw *scrapeWork) run(stopCh <-chan struct{}, globalStopCh <-chan struct{}) sw.sendStaleSeries(lastScrape, "", t, true) } if sw.seriesLimiter != nil { - job := sw.Config.Job() - metrics.UnregisterMetric(fmt.Sprintf(`promscrape_series_limit_rows_dropped_total{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL)) - metrics.UnregisterMetric(fmt.Sprintf(`promscrape_series_limit_max_series{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL)) - metrics.UnregisterMetric(fmt.Sprintf(`promscrape_series_limit_current_series{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL)) sw.seriesLimiter.MustStop() + sw.seriesLimiter = nil } return case tt := <-ticker.C: @@ -475,22 +466,22 @@ func (sw *scrapeWork) scrapeInternal(scrapeTimestamp, realTimestamp int64) error // This is a trade-off between performance and accuracy. 
seriesAdded = sw.getSeriesAdded(lastScrape, bodyString) } + samplesDropped := 0 if sw.seriesLimitExceeded || !areIdenticalSeries { - if sw.applySeriesLimit(wc) { + samplesDropped = sw.applySeriesLimit(wc) + if samplesDropped > 0 { sw.seriesLimitExceeded = true } } - sw.addAutoTimeseries(wc, "up", float64(up), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_duration_seconds", duration, scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(samplesScraped), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(samplesPostRelabeling), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_series_added", float64(seriesAdded), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), scrapeTimestamp) - if sw.Config.SampleLimit > 0 { - // Expose scrape_samples_limit metric if sample_limt config is set for the target. - // See https://github.com/VictoriaMetrics/operator/issues/497 - sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sw.Config.SampleLimit), scrapeTimestamp) + am := &autoMetrics{ + up: up, + scrapeDurationSeconds: duration, + samplesScraped: samplesScraped, + samplesPostRelabeling: samplesPostRelabeling, + seriesAdded: seriesAdded, + seriesLimitSamplesDropped: samplesDropped, } + sw.addAutoMetrics(am, wc, scrapeTimestamp) sw.pushData(sw.Config.AuthToken, &wc.writeRequest) sw.prevLabelsLen = len(wc.labels) sw.prevBodyLen = len(bodyString) @@ -601,12 +592,14 @@ func (sw *scrapeWork) scrapeStream(scrapeTimestamp, realTimestamp int64) error { // This is a trade-off between performance and accuracy. seriesAdded = sw.getSeriesAdded(lastScrape, bodyString) } - sw.addAutoTimeseries(wc, "up", float64(up), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_duration_seconds", duration, scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(samplesScraped), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(samplesPostRelabeling), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_series_added", float64(seriesAdded), scrapeTimestamp) - sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), scrapeTimestamp) + am := &autoMetrics{ + up: up, + scrapeDurationSeconds: duration, + samplesScraped: samplesScraped, + samplesPostRelabeling: samplesPostRelabeling, + seriesAdded: seriesAdded, + } + sw.addAutoMetrics(am, wc, scrapeTimestamp) sw.pushData(sw.Config.AuthToken, &wc.writeRequest) sw.prevLabelsLen = len(wc.labels) sw.prevBodyLen = sbr.bodyLen @@ -699,44 +692,30 @@ func (sw *scrapeWork) getSeriesAdded(lastScrape, currScrape string) int { return strings.Count(bodyString, "\n") } -func (sw *scrapeWork) applySeriesLimit(wc *writeRequestCtx) bool { +func (sw *scrapeWork) applySeriesLimit(wc *writeRequestCtx) int { seriesLimit := *seriesLimitPerTarget if sw.Config.SeriesLimit > 0 { seriesLimit = sw.Config.SeriesLimit } if sw.seriesLimiter == nil && seriesLimit > 0 { - job := sw.Config.Job() sw.seriesLimiter = bloomfilter.NewLimiter(seriesLimit, 24*time.Hour) - sw.seriesLimiterRowsDroppedTotal = metrics.GetOrCreateCounter(fmt.Sprintf(`promscrape_series_limit_rows_dropped_total{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL)) - _ = metrics.GetOrCreateGauge(fmt.Sprintf(`promscrape_series_limit_max_series{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL), func() float64 { 
- return float64(sw.seriesLimiter.MaxItems()) - }) - _ = metrics.GetOrCreateGauge(fmt.Sprintf(`promscrape_series_limit_current_series{scrape_job_original=%q,scrape_job=%q,scrape_target=%q}`, - sw.Config.jobNameOriginal, job, sw.Config.ScrapeURL), func() float64 { - return float64(sw.seriesLimiter.CurrentItems()) - }) } - hsl := sw.seriesLimiter - if hsl == nil { - return false + sl := sw.seriesLimiter + if sl == nil { + return 0 } dstSeries := wc.writeRequest.Timeseries[:0] - limitExceeded := false + samplesDropped := 0 for _, ts := range wc.writeRequest.Timeseries { h := sw.getLabelsHash(ts.Labels) - if !hsl.Add(h) { - // The limit on the number of hourly unique series per scrape target has been exceeded. - // Drop the metric. - sw.seriesLimiterRowsDroppedTotal.Inc() - limitExceeded = true + if !sl.Add(h) { + samplesDropped++ continue } dstSeries = append(dstSeries, ts) } wc.writeRequest.Timeseries = dstSeries - return limitExceeded + return samplesDropped } func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp int64, addAutoSeries bool) { @@ -756,11 +735,8 @@ func (sw *scrapeWork) sendStaleSeries(lastScrape, currScrape string, timestamp i } } if addAutoSeries { - sw.addAutoTimeseries(wc, "up", 0, timestamp) - sw.addAutoTimeseries(wc, "scrape_duration_seconds", 0, timestamp) - sw.addAutoTimeseries(wc, "scrape_samples_scraped", 0, timestamp) - sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", 0, timestamp) - sw.addAutoTimeseries(wc, "scrape_series_added", 0, timestamp) + am := &autoMetrics{} + sw.addAutoMetrics(am, wc, timestamp) } series := wc.writeRequest.Timeseries if len(series) == 0 { @@ -791,6 +767,34 @@ func (sw *scrapeWork) getLabelsHash(labels []prompbmarshal.Label) uint64 { return xxhash.Sum64(b) } +type autoMetrics struct { + up int + scrapeDurationSeconds float64 + samplesScraped int + samplesPostRelabeling int + seriesAdded int + seriesLimitSamplesDropped int +} + +func (sw *scrapeWork) addAutoMetrics(am *autoMetrics, wc *writeRequestCtx, timestamp int64) { + sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp) + sw.addAutoTimeseries(wc, "scrape_duration_seconds", am.scrapeDurationSeconds, timestamp) + sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp) + sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp) + sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp) + sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp) + if sampleLimit := sw.Config.SampleLimit; sampleLimit > 0 { + // Expose scrape_samples_limit metric if sample_limt config is set for the target. + // See https://github.com/VictoriaMetrics/operator/issues/497 + sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sampleLimit), timestamp) + } + if sl := sw.seriesLimiter; sl != nil { + sw.addAutoTimeseries(wc, "scrape_series_limit_samples_dropped", float64(am.seriesLimitSamplesDropped), timestamp) + sw.addAutoTimeseries(wc, "scrape_series_limit", float64(sl.MaxItems()), timestamp) + sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp) + } +} + // addAutoTimeseries adds automatically generated time series with the given name, value and timestamp. 
// // See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series diff --git a/lib/promscrape/scrapework_test.go b/lib/promscrape/scrapework_test.go index 1e96449ea..e81b22c02 100644 --- a/lib/promscrape/scrapework_test.go +++ b/lib/promscrape/scrapework_test.go @@ -352,6 +352,25 @@ func TestScrapeWorkScrapeInternalSuccess(t *testing.T) { scrape_series_added{job="xx",instance="foo.com"} 4 123 scrape_timeout_seconds{job="xx",instance="foo.com"} 42 123 `) + // Scrape success with the given SampleLimit. + f(` + foo{bar="baz"} 34.44 + bar{a="b",c="d"} -3e4 + `, &ScrapeWork{ + ScrapeTimeout: time.Second * 42, + SampleLimit: 2, + }, ` + foo{bar="baz"} 34.44 123 + bar{a="b",c="d"} -3e4 123 + up 1 123 + scrape_samples_limit 2 123 + scrape_samples_scraped 2 123 + scrape_duration_seconds 0 123 + scrape_samples_post_metric_relabeling 2 123 + scrape_series_added 2 123 + scrape_timeout_seconds 42 123 + `) + // Scrape failure because of the exceeded SampleLimit f(` foo{bar="baz"} 34.44 bar{a="b",c="d"} -3e4 @@ -367,6 +386,48 @@ func TestScrapeWorkScrapeInternalSuccess(t *testing.T) { scrape_samples_post_metric_relabeling 2 123 scrape_samples_limit 1 123 scrape_series_added 0 123 + scrape_series_current 0 123 + scrape_series_limit 123 123 + scrape_series_limit_samples_dropped 0 123 + scrape_timeout_seconds 42 123 + `) + // Scrape success with the given SeriesLimit. + f(` + foo{bar="baz"} 34.44 + bar{a="b",c="d"} -3e4 + `, &ScrapeWork{ + ScrapeTimeout: time.Second * 42, + SeriesLimit: 123, + }, ` + foo{bar="baz"} 34.44 123 + bar{a="b",c="d"} -3e4 123 + up 1 123 + scrape_samples_scraped 2 123 + scrape_duration_seconds 0 123 + scrape_samples_post_metric_relabeling 2 123 + scrape_series_added 2 123 + scrape_series_current 2 123 + scrape_series_limit 123 123 + scrape_series_limit_samples_dropped 0 123 + scrape_timeout_seconds 42 123 + `) + // Exceed SeriesLimit. + f(` + foo{bar="baz"} 34.44 + bar{a="b",c="d"} -3e4 + `, &ScrapeWork{ + ScrapeTimeout: time.Second * 42, + SeriesLimit: 1, + }, ` + foo{bar="baz"} 34.44 123 + up 1 123 + scrape_samples_scraped 2 123 + scrape_duration_seconds 0 123 + scrape_samples_post_metric_relabeling 2 123 + scrape_series_added 2 123 + scrape_series_current 1 123 + scrape_series_limit 1 123 + scrape_series_limit_samples_dropped 1 123 scrape_timeout_seconds 42 123 `) }
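A closing note on series identity: the limiter keys each series on a hash of its full label set (`getLabelsHash`, visible as unchanged context in the `scrapework.go` hunk above, hashes the flattened label pairs with `xxhash.Sum64`). The hypothetical standalone sketch below mimics that idea with a plain map; the explicit sorting step and the `github.com/cespare/xxhash/v2` import path are assumptions made for the sketch, not claims about the original code.

```go
// Hypothetical standalone illustration of series identity for the limiter:
// the same label set always hashes to the same value, regardless of order.
package main

import (
	"fmt"
	"sort"

	"github.com/cespare/xxhash/v2"
)

func labelsHash(labels map[string]string) uint64 {
	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	sort.Strings(keys) // sketch-only: make the hash independent of map iteration order
	b := make([]byte, 0, 64)
	for _, k := range keys {
		b = append(b, k...)
		b = append(b, labels[k]...)
	}
	return xxhash.Sum64(b)
}

func main() {
	a := labelsHash(map[string]string{"__name__": "foo", "bar": "baz"})
	b := labelsHash(map[string]string{"bar": "baz", "__name__": "foo"})
	fmt.Println(a == b) // true
}
```

Because an already-seen series maps to the same hash, its samples keep being accepted on subsequent scrapes, and only genuinely new series count against `scrape_series_limit`.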