mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
app/vmselect/netstorage: report vmstorage errors to vmselect clients even if partial responses are allowed
If a vmstorage is reachable and returns an application-level error to vmselect, then such error must be returned to the caller even if partial responses are allowed, since it usually means cluster mis-configuration. Partial responses may be returned only if some vmstorage nodes are temporarily unavailable. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678
This commit is contained in:
parent
fcaa0c5202
commit
89ead3daca
2 changed files with 36 additions and 28 deletions
|
@ -1514,29 +1514,38 @@ func startStorageNodesRequest(denyPartialResponse bool, f func(idx int, sn *stor
|
|||
}
|
||||
|
||||
func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) error) error {
|
||||
var errors []error
|
||||
for i := 0; i < len(storageNodes); i++ {
|
||||
result := <-snr.resultsCh
|
||||
if err := f(result); err != nil {
|
||||
errors = append(errors, err)
|
||||
continue
|
||||
// Immediately return the error to the caller without waiting for responses from other vmstorage nodes -
|
||||
// they will be processed in brackground.
|
||||
return err
|
||||
}
|
||||
}
|
||||
if len(errors) > 0 {
|
||||
return errors[0]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) {
|
||||
var errors []error
|
||||
var errsPartial []error
|
||||
resultsCollected := 0
|
||||
for i := 0; i < len(storageNodes); i++ {
|
||||
// There is no need in timer here, since all the goroutines executing the f function
|
||||
// passed to startStorageNodesRequest must be finished until the deadline.
|
||||
result := <-snr.resultsCh
|
||||
if err := f(result); err != nil {
|
||||
errors = append(errors, err)
|
||||
if snr.denyPartialResponse {
|
||||
// Immediately return the error to the caller if partial responses are denied.
|
||||
// There is no need to wait for responses from other vmstorage nodes - they will be processed in background.
|
||||
return false, err
|
||||
}
|
||||
var er *errRemote
|
||||
if errors.As(err, &er) {
|
||||
// Immediately return the error reported by vmstorage to the caller,
|
||||
// since such errors usually mean misconfiguration at vmstorage.
|
||||
// The misconfiguration must be known by the caller, so it is fixed ASAP.
|
||||
return false, err
|
||||
}
|
||||
errsPartial = append(errsPartial, err)
|
||||
continue
|
||||
}
|
||||
resultsCollected++
|
||||
|
@ -1550,26 +1559,21 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
|
|||
return false, nil
|
||||
}
|
||||
}
|
||||
isPartial := false
|
||||
if len(errors) > 0 {
|
||||
if len(errors) == len(storageNodes) {
|
||||
// All the vmstorage nodes returned error.
|
||||
// Return only the first error, since it has no sense in returning all errors.
|
||||
return false, errors[0]
|
||||
}
|
||||
|
||||
// Return partial results.
|
||||
// This allows gracefully degrade vmselect in the case
|
||||
// if a part of storageNodes are temporarily unavailable.
|
||||
// Do not return the error, since it may spam logs on busy vmselect
|
||||
// serving high amounts of requests.
|
||||
partialResultsCounter.Inc()
|
||||
if snr.denyPartialResponse {
|
||||
return true, errors[0]
|
||||
}
|
||||
isPartial = true
|
||||
if len(errsPartial) == 0 {
|
||||
return false, nil
|
||||
}
|
||||
return isPartial, nil
|
||||
if len(errsPartial) == len(storageNodes) {
|
||||
// All the vmstorage nodes returned error.
|
||||
// Return only the first error, since it has no sense in returning all errors.
|
||||
return false, errsPartial[0]
|
||||
}
|
||||
// Return partial results.
|
||||
// This allows gracefully degrade vmselect in the case
|
||||
// if a part of storageNodes are temporarily unavailable.
|
||||
// Do not return the error, since it may spam logs on busy vmselect
|
||||
// serving high amounts of requests.
|
||||
partialResultsCounter.Inc()
|
||||
return true, nil
|
||||
}
|
||||
|
||||
type storageNode struct {
|
||||
|
@ -1925,7 +1929,10 @@ func (sn *storageNode) execOnConn(rpcName string, f func(bc *handshake.BufferedC
|
|||
// since it may be broken.
|
||||
_ = bc.Close()
|
||||
}
|
||||
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
|
||||
if deadline.Exceeded() {
|
||||
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
|
||||
}
|
||||
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q: %w", rpcName, remoteAddr, err)
|
||||
}
|
||||
// Return the connection back to the pool, assuming it is healthy.
|
||||
sn.connPool.Put(bc)
|
||||
|
|
|
@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
|||
* BUGFIX: vmalert: add support for `$externalLabels` and `$externalURL` template vars in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2193).
|
||||
* BUGFIX: vmalert: make sure notifiers are discovered during initialization if they are configured via `consul_sd_configs`. Previously they could be discovered in 30 seconds (the default value for `-promscrape.consulSDCheckInterval` command-line flag) after the initialization. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2202).
|
||||
* BUGFIX: update default value for `-promscrape.fileSDCheckInterval`, so it matches default duration used by Prometheus for checking for updates in `file_sd_configs`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2187). Thanks to @corporate-gadfly for the fix.
|
||||
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): do not return partial responses from `vmselect` if at least a single `vmstorage` node was reachable and returned an app-level error. Such errors are usually related to cluster mis-configuration, so they must be returned to the caller instead of being masked by [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability). Partial responses can be returned only if some of `vmstorage` nodes are unreachable during the query. This may help the following issues: [one](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941), [two](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678).
|
||||
|
||||
|
||||
## [v1.73.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.73.0)
|
||||
|
|
Loading…
Reference in a new issue