From 58326dbf25548e9715e7043e3619fe2234af47d9 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 7 Sep 2023 16:07:03 +0200 Subject: [PATCH] app/vmselect: return 503 status code when partial responses are denied and some of vmstorage nodes are temporarily unavailable This should help detecting this case and automatic retrying the query at healthy cluster replica in another availability zone. This commit is needed as a preparation for automatic query retry at another backend at vmauth on 5xx errors as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561 --- app/vmselect/netstorage/netstorage.go | 13 ++++++++++++- docs/CHANGELOG.md | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index 0760e6bcf..e4173d7ae 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -1754,6 +1754,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co // and the number of partial responses reach -replicationFactor, // since this means that the response is partial. snr.finishQueryTracers("cancel request because partial responses are denied and some vmstorage nodes failed to return response") + + // Returns 503 status code for partial response, so the caller could retry it if needed. + err = &httpserver.ErrorWithStatusCode{ + Err: err, + StatusCode: http.StatusServiceUnavailable, + } return false, err } continue @@ -1780,7 +1786,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co if len(errsPartial) == len(sns) { // All the vmstorage nodes returned error. // Return only the first error, since it has no sense in returning all errors. - return false, errsPartial[0] + // Returns 503 status code for partial response, so the caller could retry it if needed. 
+ err := &httpserver.ErrorWithStatusCode{ + Err: errsPartial[0], + StatusCode: http.StatusServiceUnavailable, + } + return false, err } // Return partial results. // This allows gracefully degrade vmselect in the case diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 726110a63..6eeaf0533 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -49,6 +49,7 @@ ssue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4825) and [these * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix the bug causing render looping when switching to heatmap. * BUGFIX: [vminsert enterprise](https://docs.victoriametrics.com/enterprise.html): properly parse `/insert/multitenant/*` urls, which have been broken since [v1.93.2](#v1932). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947). * BUGFIX: properly build production armv5 binaries for `GOARCH=arm`. This has been broken after the upgrading of Go builder to Go1.21.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965). +* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): return `503 Service Unavailable` status code when [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability) are denied and some of `vmstorage` nodes are temporarily unavailable. Previously `422 Unprocessable Entity` status code was mistakenly returned in this case, which could prevent automatic recovery by re-sending the request to a healthy cluster replica in another availability zone. ## [v1.93.3](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.3)