app/vmselect: expose additional histograms at the /metrics page, which may help to get more insights into the query workload.

This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2792
Parent: 9adff0b686
Commit: daefb64f38
4 changed files with 36 additions and 21 deletions
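For context, here is a minimal, self-contained sketch (not taken from this commit) of how a histogram from the github.com/VictoriaMetrics/metrics package is registered, updated and exposed on a /metrics page. The HTTP handlers, port and sample value below are illustrative assumptions, not code from the repository:

```go
package main

import (
	"log"
	"net/http"

	"github.com/VictoriaMetrics/metrics"
)

// rowsReadPerQuery mimics one of the histograms added by this commit.
var rowsReadPerQuery = metrics.NewHistogram(`vm_rows_read_per_query`)

func main() {
	// Record one observation per processed query (the value here is a placeholder).
	http.HandleFunc("/query", func(w http.ResponseWriter, r *http.Request) {
		rowsProcessed := 1500
		rowsReadPerQuery.Update(float64(rowsProcessed))
	})
	// Expose all registered metrics, including the *_bucket, *_sum and *_count series.
	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, true)
	})
	log.Fatal(http.ListenAndServe(":8428", nil))
}
```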
@@ -251,12 +251,13 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke
 			// Return just the first error, since other errors are likely duplicate the first error.
 			firstErr = err
 		}
+		rowsReadPerSeries.Update(float64(tsw.rowsProcessed))
 		rowsProcessedTotal += tsw.rowsProcessed
 		putTimeseriesWork(tsw)
 	}
 
-	perQueryRowsProcessed.Update(float64(rowsProcessedTotal))
-	perQuerySeriesProcessed.Update(float64(seriesProcessedTotal))
+	rowsReadPerQuery.Update(float64(rowsProcessedTotal))
+	seriesReadPerQuery.Update(float64(seriesProcessedTotal))
 
 	// Shut down local workers
 	for _, workCh := range workChs {
@@ -268,8 +269,11 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke
 	return firstErr
 }
 
-var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`)
-var perQuerySeriesProcessed = metrics.NewHistogram(`vm_per_query_series_processed_count`)
+var (
+	rowsReadPerSeries  = metrics.NewHistogram(`vm_rows_read_per_series`)
+	rowsReadPerQuery   = metrics.NewHistogram(`vm_rows_read_per_query`)
+	seriesReadPerQuery = metrics.NewHistogram(`vm_series_read_per_query`)
+)
 
 var gomaxprocs = cgroup.AvailableCPUs()
 
@@ -805,10 +805,13 @@ func evalRollupFuncWithSubquery(qt *querytracer.Tracer, ec *EvalConfig, funcName
 		}
 		return values, timestamps
 	})
+	rowsScannedPerQuery.Update(float64(samplesScannedTotal))
 	qt.Printf("rollup %s() over %d series returned by subquery: series=%d, samplesScanned=%d", funcName, len(tssSQ), len(tss), samplesScannedTotal)
 	return tss, nil
 }
 
+var rowsScannedPerQuery = metrics.NewHistogram(`vm_rows_scanned_per_query`)
+
 func getKeepMetricNames(expr metricsql.Expr) bool {
 	if ae, ok := expr.(*metricsql.AggrFuncExpr); ok {
 		// Extract rollupFunc(...) from aggrFunc(rollupFunc(...)).
@@ -1039,6 +1042,7 @@ func evalRollupWithIncrementalAggregate(qt *querytracer.Tracer, funcName string,
 		return nil, err
 	}
 	tss := iafc.finalizeTimeseries()
+	rowsScannedPerQuery.Update(float64(samplesScannedTotal))
 	qt.Printf("series after aggregation with %s(): %d; samplesScanned=%d", iafc.ae.Name, len(tss), samplesScannedTotal)
 	return tss, nil
 }
@@ -1071,10 +1075,11 @@ func evalRollupNoIncrementalAggregate(qt *querytracer.Tracer, funcName string, k
 		}
 		return nil
 	})
-	qt.Printf("samplesScanned=%d", samplesScannedTotal)
 	if err != nil {
 		return nil, err
 	}
+	rowsScannedPerQuery.Update(float64(samplesScannedTotal))
+	qt.Printf("samplesScanned=%d", samplesScannedTotal)
 	return tss, nil
 }
 
@@ -34,6 +34,12 @@ scrape_configs:
 
 * FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`.
 * FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return big number of time series.
+* FEATURE: expose additional histogram metrics at `http://victoriametrics:8428/metrics`, which may help understanding query workload:
+
+  * `vm_rows_read_per_query` - the number of raw samples read per query.
+  * `vm_rows_scanned_per_query` - the number of raw samples scanned per query. This number can exceed `vm_rows_read_per_query` if the `step` query arg passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) is smaller than the lookbehind window set in square brackets of the [rollup function](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, if `increase(some_metric[1h])` is executed with `step=5m`, then the same raw samples on an hour time range are scanned `1h/5m=12` times. See [this article](https://valyala.medium.com/how-to-optimize-promql-and-metricsql-queries-85a1b75bf986) for details.
+  * `vm_rows_read_per_series` - the number of raw samples read per queried series.
+  * `vm_series_read_per_query` - the number of series read per query.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow using `__name__` label (aka [metric name](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)) in alerting annotations. For example `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}`.
 * BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple of gigabytes). Now the max cache size for parsed regexps is limited to a few megabytes.
 
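To ground the `vm_rows_scanned_per_query` note in the changelog entry above, here is a small illustrative Go helper (an assumption of this edit, not code from the repository) that computes the expected scan amplification for a given lookbehind window and `step`:

```go
package main

import (
	"fmt"
	"time"
)

// scanAmplification estimates how many times each raw sample is scanned when
// a rollup with the given lookbehind window is evaluated at the given step.
func scanAmplification(window, step time.Duration) float64 {
	if step <= 0 || window <= step {
		return 1
	}
	return float64(window) / float64(step)
}

func main() {
	// increase(some_metric[1h]) evaluated with step=5m scans each sample ~12 times,
	// so vm_rows_scanned_per_query can be ~12x larger than vm_rows_read_per_query.
	fmt.Println(scanAmplification(time.Hour, 5*time.Minute)) // 12
}
```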
@@ -131,36 +131,36 @@ functions used with gauges are [aggregation and grouping functions](#aggregation
 Histogram is a set of [counter](#counter) metrics with different labels for tracking the dispersion
 and [quantiles](https://prometheus.io/docs/practices/histograms/#quantiles) of the observed value. For example, in
 VictoriaMetrics we track how many rows is processed per query using the histogram with the
-name `vm_per_query_rows_processed_count`. The exposition format for this histogram has the following form:
+name `vm_rows_read_per_query`. The exposition format for this histogram has the following form:
 
 ```
-vm_per_query_rows_processed_count_bucket{vmrange="4.084e+02...4.642e+02"} 2
-vm_per_query_rows_processed_count_bucket{vmrange="5.275e+02...5.995e+02"} 1
-vm_per_query_rows_processed_count_bucket{vmrange="8.799e+02...1.000e+03"} 1
-vm_per_query_rows_processed_count_bucket{vmrange="1.468e+03...1.668e+03"} 3
-vm_per_query_rows_processed_count_bucket{vmrange="1.896e+03...2.154e+03"} 4
-vm_per_query_rows_processed_count_sum 15582
-vm_per_query_rows_processed_count_count 11
+vm_rows_read_per_query_bucket{vmrange="4.084e+02...4.642e+02"} 2
+vm_rows_read_per_query_bucket{vmrange="5.275e+02...5.995e+02"} 1
+vm_rows_read_per_query_bucket{vmrange="8.799e+02...1.000e+03"} 1
+vm_rows_read_per_query_bucket{vmrange="1.468e+03...1.668e+03"} 3
+vm_rows_read_per_query_bucket{vmrange="1.896e+03...2.154e+03"} 4
+vm_rows_read_per_query_sum 15582
+vm_rows_read_per_query_count 11
 ```
 
-In practice, histogram `vm_per_query_rows_processed_count` may be used in the following way:
+In practice, histogram `vm_rows_read_per_query` may be used in the following way:
 
 ```go
 // define the histogram
-perQueryRowsProcessed := metrics.NewHistogram(`vm_per_query_rows_processed_count`)
+rowsReadPerQuery := metrics.NewHistogram(`vm_rows_read_per_query`)
 
 // use the histogram during processing
 for _, query := range queries {
-	perQueryRowsProcessed.Update(len(query.Rows))
+	rowsReadPerQuery.Update(float64(len(query.Rows)))
 }
 ```
 
-Now let's see what happens each time when `perQueryRowsProcessed.Update` is called:
+Now let's see what happens each time when `rowsReadPerQuery.Update` is called:
 
-* counter `vm_per_query_rows_processed_count_sum` increments by value of `len(query.Rows)` expression and accounts for
+* counter `vm_rows_read_per_query_sum` increments by value of `len(query.Rows)` expression and accounts for
   total sum of all observed values;
-* counter `vm_per_query_rows_processed_count_count` increments by 1 and accounts for total number of observations;
+* counter `vm_rows_read_per_query_count` increments by 1 and accounts for total number of observations;
-* counter `vm_per_query_rows_processed_count_bucket` gets incremented only if observed value is within the
+* counter `vm_rows_read_per_query_bucket` gets incremented only if observed value is within the
   range (`bucket`) defined in `vmrange`.
 
 Such a combination of `counter` metrics allows
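As a complement to the documentation snippet above, the following hedged sketch (assuming the github.com/VictoriaMetrics/metrics package and its VisitNonZeroBuckets method; the sample values are made up) shows how a few Update calls populate the `_sum`, `_count` and per-`vmrange` `_bucket` series described for `vm_rows_read_per_query`:

```go
package main

import (
	"fmt"

	"github.com/VictoriaMetrics/metrics"
)

func main() {
	h := metrics.NewHistogram(`vm_rows_read_per_query`)
	// Each Update adds the value to the _sum series, adds 1 to the _count series
	// and increments the _bucket series whose vmrange covers the value.
	for _, rows := range []float64{420, 550, 900, 1500, 2000} {
		h.Update(rows)
	}
	// Print the non-empty vmrange buckets, mirroring the exposition format shown above.
	h.VisitNonZeroBuckets(func(vmrange string, count uint64) {
		fmt.Printf("vm_rows_read_per_query_bucket{vmrange=%q} %d\n", vmrange, count)
	})
}
```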