From 83a4db813e965ddfd197963f37b5e06d16d94128 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@gmail.com>
Date: Fri, 18 Jun 2021 19:04:42 +0300
Subject: [PATCH] app/vmselect: log slow requests to all the `/api/v1/*`
 handlers if their execution time exceeds `-search.logSlowQueryDuration`

---
 app/vmselect/main.go        | 36 +++++++++++++++++++++++++++++++++++-
 app/vmselect/promql/exec.go | 15 ---------------
 docs/CHANGELOG.md           |  1 +
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/app/vmselect/main.go b/app/vmselect/main.go
index cac8f2934..c2d620421 100644
--- a/app/vmselect/main.go
+++ b/app/vmselect/main.go
@@ -28,9 +28,12 @@ var (
 		"It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration")
 	maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests "+
 		"limit is reached; see also -search.maxQueryDuration")
-	resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call")
+	resetCacheAuthKey    = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call")
+	logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging")
 )
 
+var slowQueries = metrics.NewCounter(`vm_slow_queries_total`)
+
 func getDefaultMaxConcurrentRequests() int {
 	n := cgroup.AvailableCPUs()
 	if n <= 4 {
@@ -108,6 +111,20 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
 		}
 	}
 
+	if *logSlowQueryDuration > 0 {
+		actualStartTime := time.Now()
+		defer func() {
+			d := time.Since(actualStartTime)
+			if d >= *logSlowQueryDuration {
+				remoteAddr := httpserver.GetQuotedRemoteAddr(r)
+				requestURI := getRequestURI(r)
+				logger.Warnf("slow query according to -search.logSlowQueryDuration=%s: remoteAddr=%s, duration=%.3f seconds; requestURI: %q",
+					*logSlowQueryDuration, remoteAddr, d.Seconds(), requestURI)
+				slowQueries.Inc()
+			}
+		}()
+	}
+
 	path := strings.Replace(r.URL.Path, "//", "/", -1)
 	if path == "/internal/resetRollupResultCache" {
 		if len(*resetCacheAuthKey) > 0 && r.FormValue("authKey") != *resetCacheAuthKey {
@@ -418,6 +435,23 @@ func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
 	prometheus.WriteErrorResponse(w, statusCode, err)
 }
 
+func getRequestURI(r *http.Request) string {
+	requestURI := r.RequestURI
+	if r.Method != "POST" {
+		return requestURI
+	}
+	_ = r.ParseForm()
+	queryArgs := r.PostForm.Encode()
+	if len(queryArgs) == 0 {
+		return requestURI
+	}
+	delimiter := "?"
+	if strings.Contains(requestURI, delimiter) {
+		delimiter = "&"
+	}
+	return requestURI + delimiter + queryArgs
+}
+
 var (
 	labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/label/{}/values"}`)
 	labelValuesErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/label/{}/values"}`)
diff --git a/app/vmselect/promql/exec.go b/app/vmselect/promql/exec.go
index 5d594ab90..daa5b4f22 100644
--- a/app/vmselect/promql/exec.go
+++ b/app/vmselect/promql/exec.go
@@ -13,34 +13,19 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/querystats"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
-	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/metrics"
 	"github.com/VictoriaMetrics/metricsql"
 )
 
 var (
-	logSlowQueryDuration   = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging")
 	treatDotsAsIsInRegexps = flag.Bool("search.treatDotsAsIsInRegexps", false, "Whether to treat dots as is in regexp label filters used in queries. "+
 		`For example, foo{bar=~"a.b.c"} will be automatically converted to foo{bar=~"a\\.b\\.c"}, i.e. all the dots in regexp filters will be automatically escaped `+
 		`in order to match only dot char instead of matching any char. Dots in ".+", ".*" and ".{n}" regexps aren't escaped. `+
 		`This option is DEPRECATED in favor of {__graphite__="a.*.c"} syntax for selecting metrics matching the given Graphite metrics filter`)
 )
 
-var slowQueries = metrics.NewCounter(`vm_slow_queries_total`)
-
 // Exec executes q for the given ec.
 func Exec(ec *EvalConfig, q string, isFirstPointOnly bool) ([]netstorage.Result, error) {
-	if *logSlowQueryDuration > 0 {
-		startTime := time.Now()
-		defer func() {
-			d := time.Since(startTime)
-			if d >= *logSlowQueryDuration {
-				logger.Warnf("slow query according to -search.logSlowQueryDuration=%s: remoteAddr=%s, duration=%.3f seconds, start=%d, end=%d, step=%d, query=%q",
-					*logSlowQueryDuration, ec.QuotedRemoteAddr, d.Seconds(), ec.Start/1000, ec.End/1000, ec.Step/1000, q)
-				slowQueries.Inc()
-			}
-		}()
-	}
 	if querystats.Enabled() {
 		startTime := time.Now()
 		defer querystats.RegisterQuery(q, ec.End-ec.Start, startTime)
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index fc7236676..38edfba55 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -11,6 +11,7 @@ sort: 15
 * FEATURE: vmagent: show the number of samples the target returned during the last scrape on `/targets` and `/api/v1/targets` pages. This should simplify debugging targets, which may return too big or too low number of samples. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1377).
 * FEATURE: vmagent: show jobs with zero discovered targets on `/targets` page. This should help debugging improperly configured scrape configs.
 * FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): limit the number of samples per each imported JSON line. This should limit the memory usage at VictoriaMetrics when importing time series with big number of samples to it.
+* FEATURE: vmselect: log slow queries across all the `/api/v1/*` handlers (aka [Prometheus query API](https://prometheus.io/docs/prometheus/latest/querying/api)) if their execution duration exceeds `-search.logSlowQueryDuration`. This should simplify debugging slow requests to such handlers as `/api/v1/labels` or `/api/v1/series` additionally to `/api/v1/query` and `/api/v1/query_range`, which were logged in the previous releases.
 
 * BUGFIX: prevent from adding new samples to deleted time series after the rotation of the inverted index (the rotation is performed once per `-retentionPeriod`). See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1347#issuecomment-861232136) for details.
 * BUGFIX: vmstorage: reduce disk write IO usage on systems with big number of CPU cores. The issue has been introduced in the release [v1.59.0](#v1590). See [this commit](aa9b56a046b6ae8083fa659df35dd5e994bf9115) and [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1338#issuecomment-863046999) for details.