app/vmselect: expose additional histograms at /metrics page, which may help get more insights for the query workload

This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2792
This commit is contained in:
Aliaksandr Valialkin 2022-06-28 20:18:08 +03:00
parent 9adff0b686
commit daefb64f38
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
4 changed files with 36 additions and 21 deletions

View file

@ -251,12 +251,13 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke
// Return just the first error, since other errors are likely duplicate the first error. // Return just the first error, since other errors are likely duplicate the first error.
firstErr = err firstErr = err
} }
rowsReadPerSeries.Update(float64(tsw.rowsProcessed))
rowsProcessedTotal += tsw.rowsProcessed rowsProcessedTotal += tsw.rowsProcessed
putTimeseriesWork(tsw) putTimeseriesWork(tsw)
} }
perQueryRowsProcessed.Update(float64(rowsProcessedTotal)) rowsReadPerQuery.Update(float64(rowsProcessedTotal))
perQuerySeriesProcessed.Update(float64(seriesProcessedTotal)) seriesReadPerQuery.Update(float64(seriesProcessedTotal))
// Shut down local workers // Shut down local workers
for _, workCh := range workChs { for _, workCh := range workChs {
@ -268,8 +269,11 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke
return firstErr return firstErr
} }
var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`) var (
var perQuerySeriesProcessed = metrics.NewHistogram(`vm_per_query_series_processed_count`) rowsReadPerSeries = metrics.NewHistogram(`vm_rows_read_per_series`)
rowsReadPerQuery = metrics.NewHistogram(`vm_rows_read_per_query`)
seriesReadPerQuery = metrics.NewHistogram(`vm_series_read_per_query`)
)
var gomaxprocs = cgroup.AvailableCPUs() var gomaxprocs = cgroup.AvailableCPUs()

View file

@ -805,10 +805,13 @@ func evalRollupFuncWithSubquery(qt *querytracer.Tracer, ec *EvalConfig, funcName
} }
return values, timestamps return values, timestamps
}) })
rowsScannedPerQuery.Update(float64(samplesScannedTotal))
qt.Printf("rollup %s() over %d series returned by subquery: series=%d, samplesScanned=%d", funcName, len(tssSQ), len(tss), samplesScannedTotal) qt.Printf("rollup %s() over %d series returned by subquery: series=%d, samplesScanned=%d", funcName, len(tssSQ), len(tss), samplesScannedTotal)
return tss, nil return tss, nil
} }
var rowsScannedPerQuery = metrics.NewHistogram(`vm_rows_scanned_per_query`)
func getKeepMetricNames(expr metricsql.Expr) bool { func getKeepMetricNames(expr metricsql.Expr) bool {
if ae, ok := expr.(*metricsql.AggrFuncExpr); ok { if ae, ok := expr.(*metricsql.AggrFuncExpr); ok {
// Extract rollupFunc(...) from aggrFunc(rollupFunc(...)). // Extract rollupFunc(...) from aggrFunc(rollupFunc(...)).
@ -1039,6 +1042,7 @@ func evalRollupWithIncrementalAggregate(qt *querytracer.Tracer, funcName string,
return nil, err return nil, err
} }
tss := iafc.finalizeTimeseries() tss := iafc.finalizeTimeseries()
rowsScannedPerQuery.Update(float64(samplesScannedTotal))
qt.Printf("series after aggregation with %s(): %d; samplesScanned=%d", iafc.ae.Name, len(tss), samplesScannedTotal) qt.Printf("series after aggregation with %s(): %d; samplesScanned=%d", iafc.ae.Name, len(tss), samplesScannedTotal)
return tss, nil return tss, nil
} }
@ -1071,10 +1075,11 @@ func evalRollupNoIncrementalAggregate(qt *querytracer.Tracer, funcName string, k
} }
return nil return nil
}) })
qt.Printf("samplesScanned=%d", samplesScannedTotal)
if err != nil { if err != nil {
return nil, err return nil, err
} }
rowsScannedPerQuery.Update(float64(samplesScannedTotal))
qt.Printf("samplesScanned=%d", samplesScannedTotal)
return tss, nil return tss, nil
} }

View file

@ -34,6 +34,12 @@ scrape_configs:
* FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`. * FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`.
* FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return big number of time series. * FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return big number of time series.
* FEATURE: expose additional histogram metrics at `http://victoriametrics:8428/metrics`, which may help understanding query workload:
* `vm_rows_read_per_query` - the number of raw samples read per query.
* `vm_rows_scanned_per_query` - the number of raw samples scanned per query. This number can exceed `vm_rows_read_per_query` if `step` query arg passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) is smaller than the lookbehind window set in square brackets of [rollup function](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, if `increase(some_metric[1h])` is executed with the `step=5m`, then the same raw samples on a hour time range are scanned `1h/5m=12` times. See [this article](https://valyala.medium.com/how-to-optimize-promql-and-metricsql-queries-85a1b75bf986) for details.
* `vm_rows_read_per_series` - the number of raw samples read per queried series.
* `vm_series_read_per_query` - the number of series read per query.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow using `__name__` label (aka [metric name](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)) in alerting annotations. For example `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}`. * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow using `__name__` label (aka [metric name](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)) in alerting annotations. For example `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}`.
* BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple of gigabytes). Now the max cache size for parsed regexps is limited to a a few megabytes. * BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple of gigabytes). Now the max cache size for parsed regexps is limited to a a few megabytes.

View file

@ -131,36 +131,36 @@ functions used with gauges are [aggregation and grouping functions](#aggregation
Histogram is a set of [counter](#counter) metrics with different labels for tracking the dispersion Histogram is a set of [counter](#counter) metrics with different labels for tracking the dispersion
and [quantiles](https://prometheus.io/docs/practices/histograms/#quantiles) of the observed value. For example, in and [quantiles](https://prometheus.io/docs/practices/histograms/#quantiles) of the observed value. For example, in
VictoriaMetrics we track how many rows is processed per query using the histogram with the VictoriaMetrics we track how many rows is processed per query using the histogram with the
name `vm_per_query_rows_processed_count`. The exposition format for this histogram has the following form: name `vm_rows_read_per_query`. The exposition format for this histogram has the following form:
``` ```
vm_per_query_rows_processed_count_bucket{vmrange="4.084e+02...4.642e+02"} 2 vm_rows_read_per_query_bucket{vmrange="4.084e+02...4.642e+02"} 2
vm_per_query_rows_processed_count_bucket{vmrange="5.275e+02...5.995e+02"} 1 vm_rows_read_per_query_bucket{vmrange="5.275e+02...5.995e+02"} 1
vm_per_query_rows_processed_count_bucket{vmrange="8.799e+02...1.000e+03"} 1 vm_rows_read_per_query_bucket{vmrange="8.799e+02...1.000e+03"} 1
vm_per_query_rows_processed_count_bucket{vmrange="1.468e+03...1.668e+03"} 3 vm_rows_read_per_query_bucket{vmrange="1.468e+03...1.668e+03"} 3
vm_per_query_rows_processed_count_bucket{vmrange="1.896e+03...2.154e+03"} 4 vm_rows_read_per_query_bucket{vmrange="1.896e+03...2.154e+03"} 4
vm_per_query_rows_processed_count_sum 15582 vm_rows_read_per_query_sum 15582
vm_per_query_rows_processed_count_count 11 vm_rows_read_per_query_count 11
``` ```
In practice, histogram `vm_per_query_rows_processed_count` may be used in the following way: In practice, histogram `vm_rows_read_per_query` may be used in the following way:
```go ```go
// define the histogram // define the histogram
perQueryRowsProcessed := metrics.NewHistogram(`vm_per_query_rows_processed_count`) rowsReadPerQuery := metrics.NewHistogram(`vm_rows_read_per_query`)
// use the histogram during processing // use the histogram during processing
for _, query := range queries { for _, query := range queries {
perQueryRowsProcessed.Update(len(query.Rows)) rowsReadPerQuery.Update(float64(len(query.Rows)))
} }
``` ```
Now let's see what happens each time when `perQueryRowsProcessed.Update` is called: Now let's see what happens each time when `rowsReadPerQuery.Update` is called:
* counter `vm_per_query_rows_processed_count_sum` increments by value of `len(query.Rows)` expression and accounts for * counter `vm_rows_read_per_query_sum` increments by value of `len(query.Rows)` expression and accounts for
total sum of all observed values; total sum of all observed values;
* counter `vm_per_query_rows_processed_count_count` increments by 1 and accounts for total number of observations; * counter `vm_rows_read_per_query_count` increments by 1 and accounts for total number of observations;
* counter `vm_per_query_rows_processed_count_bucket` gets incremented only if observed value is within the * counter `vm_rows_read_per_query_bucket` gets incremented only if observed value is within the
range (`bucket`) defined in `vmrange`. range (`bucket`) defined in `vmrange`.
Such a combination of `counter` metrics allows Such a combination of `counter` metrics allows