From 8ff9235717fbdb8d96545e7a6d3ed50a6f7cfb84 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 7 Sep 2023 16:07:03 +0200 Subject: [PATCH] app/vmselect: return 503 status code when partial responses are denied and some of vmstorage nodes are temporarily unavailable This should help detect this case and automatically retry the query at a healthy cluster replica in another availability zone. This commit is needed as preparation for automatic query retries at another backend in vmauth on 5xx errors, as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561 --- app/vmselect/netstorage/netstorage.go | 13 ++++++++++++- docs/CHANGELOG.md | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index ba11b5d19..6081cc893 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -1745,6 +1745,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co // and the number of partial responses reach -replicationFactor, // since this means that the response is partial. snr.finishQueryTracers("cancel request because partial responses are denied and some vmstorage nodes failed to return response") + + // Returns 503 status code for partial response, so the caller could retry it if needed. + err = &httpserver.ErrorWithStatusCode{ + Err: err, + StatusCode: http.StatusServiceUnavailable, + } return false, err } continue @@ -1771,7 +1777,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co if len(errsPartial) == len(sns) { // All the vmstorage nodes returned error. // Return only the first error, since it has no sense in returning all errors. - return false, errsPartial[0] + // Returns 503 status code for partial response, so the caller could retry it if needed. 
+ err := &httpserver.ErrorWithStatusCode{ + Err: errsPartial[0], + StatusCode: http.StatusServiceUnavailable, + } + return false, err } // Return partial results. // This allows gracefully degrade vmselect in the case diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 2297289cd..7c9c1e56c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -19,6 +19,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * BUGFIX: [vminsert enterprise](https://docs.victoriametrics.com/enterprise.html): properly parse `/insert/multitenant/*` urls, which have been broken since [v1.93.2](#v1932). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947). * BUGFIX: properly build production armv5 binaries for `GOARCH=arm`. This has been broken after the upgrading of Go builder to Go1.21.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965). +* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): return `503 Service Unavailable` status code when [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability) are denied and some of the `vmstorage` nodes are temporarily unavailable. Previously the `422 Unprocessable Entity` status code was mistakenly returned in this case, which could prevent automatic recovery by re-sending the request to a healthy cluster replica in another availability zone. ## [v1.87.8](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.8)