app/vmselect/netstorage: report vmstorage errors to vmselect clients even if partial responses are allowed

If a vmstorage is reachable and returns an application-level error to vmselect,
then such error must be returned to the caller even if partial responses are allowed,
since it usually means cluster mis-configuration.

Partial responses may be returned only if some vmstorage nodes are temporarily unavailable.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678
This commit is contained in:
Aliaksandr Valialkin 2022-02-21 21:15:02 +02:00
parent fcaa0c5202
commit 89ead3daca
No known key found for this signature in database
GPG key ID: A72BEC6CD3D0DED1
2 changed files with 36 additions and 28 deletions

View file

@ -1514,29 +1514,38 @@ func startStorageNodesRequest(denyPartialResponse bool, f func(idx int, sn *stor
}
func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) error) error {
var errors []error
for i := 0; i < len(storageNodes); i++ {
result := <-snr.resultsCh
if err := f(result); err != nil {
errors = append(errors, err)
continue
// Immediately return the error to the caller without waiting for responses from other vmstorage nodes -
// they will be processed in brackground.
return err
}
}
if len(errors) > 0 {
return errors[0]
}
return nil
}
func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) {
var errors []error
var errsPartial []error
resultsCollected := 0
for i := 0; i < len(storageNodes); i++ {
// There is no need in timer here, since all the goroutines executing the f function
// passed to startStorageNodesRequest must be finished until the deadline.
result := <-snr.resultsCh
if err := f(result); err != nil {
errors = append(errors, err)
if snr.denyPartialResponse {
// Immediately return the error to the caller if partial responses are denied.
// There is no need to wait for responses from other vmstorage nodes - they will be processed in background.
return false, err
}
var er *errRemote
if errors.As(err, &er) {
// Immediately return the error reported by vmstorage to the caller,
// since such errors usually mean misconfiguration at vmstorage.
// The misconfiguration must be known by the caller, so it is fixed ASAP.
return false, err
}
errsPartial = append(errsPartial, err)
continue
}
resultsCollected++
@ -1550,26 +1559,21 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
return false, nil
}
}
isPartial := false
if len(errors) > 0 {
if len(errors) == len(storageNodes) {
// All the vmstorage nodes returned error.
// Return only the first error, since it has no sense in returning all errors.
return false, errors[0]
}
// Return partial results.
// This allows gracefully degrade vmselect in the case
// if a part of storageNodes are temporarily unavailable.
// Do not return the error, since it may spam logs on busy vmselect
// serving high amounts of requests.
partialResultsCounter.Inc()
if snr.denyPartialResponse {
return true, errors[0]
}
isPartial = true
if len(errsPartial) == 0 {
return false, nil
}
return isPartial, nil
if len(errsPartial) == len(storageNodes) {
// All the vmstorage nodes returned error.
// Return only the first error, since it has no sense in returning all errors.
return false, errsPartial[0]
}
// Return partial results.
// This allows gracefully degrade vmselect in the case
// if a part of storageNodes are temporarily unavailable.
// Do not return the error, since it may spam logs on busy vmselect
// serving high amounts of requests.
partialResultsCounter.Inc()
return true, nil
}
type storageNode struct {
@ -1925,7 +1929,10 @@ func (sn *storageNode) execOnConn(rpcName string, f func(bc *handshake.BufferedC
// since it may be broken.
_ = bc.Close()
}
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
if deadline.Exceeded() {
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
}
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q: %w", rpcName, remoteAddr, err)
}
// Return the connection back to the pool, assuming it is healthy.
sn.connPool.Put(bc)

View file

@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* BUGFIX: vmalert: add support for `$externalLabels` and `$externalURL` template vars in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2193).
* BUGFIX: vmalert: make sure notifiers are discovered during initialization if they are configured via `consul_sd_configs`. Previously they could be discovered in 30 seconds (the default value for `-promscrape.consulSDCheckInterval` command-line flag) after the initialization. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2202).
* BUGFIX: update default value for `-promscrape.fileSDCheckInterval`, so it matches default duration used by Prometheus for checking for updates in `file_sd_configs`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2187). Thanks to @corporate-gadfly for the fix.
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): do not return partial responses from `vmselect` if at least a single `vmstorage` node was reachable and returned an app-level error. Such errors are usually related to cluster mis-configuration, so they must be returned to the caller instead of being masked by [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability). Partial responses can be returned only if some of `vmstorage` nodes are unreachable during the query. This may help the following issues: [one](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941), [two](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678).
## [v1.73.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.73.0)