app/vmselect: return 503 status code when partial responses are denied and some of vmstorage nodes are temporarily unavailable

This should help detecting this case and automatic retrying the query at healthy cluster replica
in another availability zone.

This commit is needed as a preparation for automatic query retry at another backend at vmauth on 5xx errors
as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561
This commit is contained in:
Aliaksandr Valialkin 2023-09-07 16:07:03 +02:00
parent 841465d98a
commit 8ff9235717
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
2 changed files with 13 additions and 1 deletions

View file

@ -1745,6 +1745,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
// and the number of partial responses reach -replicationFactor,
// since this means that the response is partial.
snr.finishQueryTracers("cancel request because partial responses are denied and some vmstorage nodes failed to return response")
// Returns 503 status code for partial response, so the caller could retry it if needed.
err = &httpserver.ErrorWithStatusCode{
Err: err,
StatusCode: http.StatusServiceUnavailable,
}
return false, err
}
continue
@ -1771,7 +1777,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
if len(errsPartial) == len(sns) {
// All the vmstorage nodes returned error.
// Return only the first error, since it has no sense in returning all errors.
return false, errsPartial[0]
// Returns 503 status code for partial response, so the caller could retry it if needed.
err := &httpserver.ErrorWithStatusCode{
Err: errsPartial[0],
StatusCode: http.StatusServiceUnavailable,
}
return false, err
}
// Return partial results.
// This allows gracefully degrade vmselect in the case

View file

@ -19,6 +19,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* BUGFIX: [vminsert enterprise](https://docs.victoriametrics.com/enterprise.html): properly parse `/insert/multitenant/*` urls, which have been broken since [v1.93.2](#v1932). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947).
* BUGFIX: properly build production armv5 binaries for `GOARCH=arm`. This has been broken after the upgrading of Go builder to Go1.21.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965).
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): return `503 Service Unavailable` status code when [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability) are denied and some of `vmstorage` nodes are temporarily unavailable. Previously `422 Unprocessable Entiry` status code was mistakenly returned in this case, which could prevent from automatic recovery by re-sending the request to healthy cluster replica in another availability zone.
## [v1.87.8](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.8)