From 83a4db813e965ddfd197963f37b5e06d16d94128 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Fri, 18 Jun 2021 19:04:42 +0300 Subject: [PATCH] app/vmselect: log slow requests to all the `/api/v1/*` handlers if their execution time exceeds `-search.logSlowQueryDuration` --- app/vmselect/main.go | 36 +++++++++++++++++++++++++++++++++++- app/vmselect/promql/exec.go | 15 --------------- docs/CHANGELOG.md | 1 + 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/app/vmselect/main.go b/app/vmselect/main.go index cac8f2934..c2d620421 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -28,9 +28,12 @@ var ( "It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration") maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests "+ "limit is reached; see also -search.maxQueryDuration") - resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call") + resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call") + logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging") ) +var slowQueries = metrics.NewCounter(`vm_slow_queries_total`) + func getDefaultMaxConcurrentRequests() int { n := cgroup.AvailableCPUs() if n <= 4 { @@ -108,6 +111,20 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } } + if *logSlowQueryDuration > 0 { + actualStartTime := time.Now() + defer func() { + d := time.Since(actualStartTime) + if d >= *logSlowQueryDuration { + remoteAddr := httpserver.GetQuotedRemoteAddr(r) + requestURI := getRequestURI(r) + logger.Warnf("slow query according to -search.logSlowQueryDuration=%s: remoteAddr=%s, duration=%.3f seconds; requestURI: %q", + *logSlowQueryDuration, remoteAddr, d.Seconds(), requestURI) + slowQueries.Inc() + } + }() + } + path := strings.Replace(r.URL.Path, "//", "/", -1) if path == "/internal/resetRollupResultCache" { if len(*resetCacheAuthKey) > 0 && r.FormValue("authKey") != *resetCacheAuthKey { @@ -418,6 +435,23 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) { prometheus.WriteErrorResponse(w, statusCode, err) } +func getRequestURI(r *http.Request) string { + requestURI := r.RequestURI + if r.Method != "POST" { + return requestURI + } + _ = r.ParseForm() + queryArgs := r.PostForm.Encode() + if len(queryArgs) == 0 { + return requestURI + } + delimiter := "?" + if strings.Contains(requestURI, delimiter) { + delimiter = "&" + } + return requestURI + delimiter + queryArgs +} + var ( labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/label/{}/values"}`) labelValuesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/label/{}/values"}`) diff --git a/app/vmselect/promql/exec.go b/app/vmselect/promql/exec.go index 5d594ab90..daa5b4f22 100644 --- a/app/vmselect/promql/exec.go +++ b/app/vmselect/promql/exec.go @@ -13,34 +13,19 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/querystats" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/metrics" "github.com/VictoriaMetrics/metricsql" ) var ( - logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging") treatDotsAsIsInRegexps = flag.Bool("search.treatDotsAsIsInRegexps", false, "Whether to treat dots as is in regexp label filters used in queries. "+ `For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped `+ `in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. `+ `This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter`) ) -var slowQueries = metrics.NewCounter(`vm_slow_queries_total`) - // Exec executes q for the given ec. func Exec(ec *EvalConfig, q string, isFirstPointOnly bool) ([]netstorage.Result, error) { - if *logSlowQueryDuration > 0 { - startTime := time.Now() - defer func() { - d := time.Since(startTime) - if d >= *logSlowQueryDuration { - logger.Warnf("slow query according to -search.logSlowQueryDuration=%s: remoteAddr=%s, duration=%.3f seconds, start=%d, end=%d, step=%d, query=%q", - *logSlowQueryDuration, ec.QuotedRemoteAddr, d.Seconds(), ec.Start/1000, ec.End/1000, ec.Step/1000, q) - slowQueries.Inc() - } - }() - } if querystats.Enabled() { startTime := time.Now() defer querystats.RegisterQuery(q, ec.End-ec.Start, startTime) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index fc7236676..38edfba55 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -11,6 +11,7 @@ sort: 15 * FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too big or too low number of samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377). * FEATURE: vmagent: show jobs with zero discovered targets on `/targets` page. This should help debugging improperly configured scrape configs. * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): limit the number of samples per each imported JSON line. This should limit the memory usage at VictoriaMetrics when importing time series with big number of samples to it. +* FEATURE: vmselect: log slow queries across all the `/api/v1/*` handlers (aka [Prometheus query API](https://prometheus.io/docs/prometheus/latest/querying/api)) if their execution duration exceeds `-search.logSlowQueryDuration`. This should simplify debugging slow requests to such handlers as `/api/v1/labels` or `/api/v1/series` additionally to `/api/v1/query` and `/api/v1/query_range`, which were logged in the previous releases. * BUGFIX: prevent from adding new samples to deleted time series after the rotation of the inverted index (the rotation is performed once per `-retentionPeriod`). See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136) for details. * BUGFIX: vmstorage: reduce disk write IO usage on systems with big number of CPU cores. The issue has been introduced in the release [v1.59.0](#v1590). See [this commit](aa9b56a046b6ae8083fa659df35dd5e994bf9115) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338#issuecomment-863046999) for details.