mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-03-11 15:34:56 +00:00
app/vmselect/netstorage: report vmstorage errors to vmselect clients even if partial responses are allowed
If a vmstorage is reachable and returns an application-level error to vmselect, then such error must be returned to the caller even if partial responses are allowed, since it usually means cluster mis-configuration. Partial responses may be returned only if some vmstorage nodes are temporarily unavailable. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678
This commit is contained in:
parent
fcaa0c5202
commit
89ead3daca
2 changed files with 36 additions and 28 deletions
|
@ -1514,29 +1514,38 @@ func startStorageNodesRequest(denyPartialResponse bool, f func(idx int, sn *stor
|
||||||
}
|
}
|
||||||
|
|
||||||
func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) error) error {
|
func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) error) error {
|
||||||
var errors []error
|
|
||||||
for i := 0; i < len(storageNodes); i++ {
|
for i := 0; i < len(storageNodes); i++ {
|
||||||
result := <-snr.resultsCh
|
result := <-snr.resultsCh
|
||||||
if err := f(result); err != nil {
|
if err := f(result); err != nil {
|
||||||
errors = append(errors, err)
|
// Immediately return the error to the caller without waiting for responses from other vmstorage nodes -
|
||||||
continue
|
// they will be processed in brackground.
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(errors) > 0 {
|
|
||||||
return errors[0]
|
|
||||||
}
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) {
|
func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) {
|
||||||
var errors []error
|
var errsPartial []error
|
||||||
resultsCollected := 0
|
resultsCollected := 0
|
||||||
for i := 0; i < len(storageNodes); i++ {
|
for i := 0; i < len(storageNodes); i++ {
|
||||||
// There is no need in timer here, since all the goroutines executing the f function
|
// There is no need in timer here, since all the goroutines executing the f function
|
||||||
// passed to startStorageNodesRequest must be finished until the deadline.
|
// passed to startStorageNodesRequest must be finished until the deadline.
|
||||||
result := <-snr.resultsCh
|
result := <-snr.resultsCh
|
||||||
if err := f(result); err != nil {
|
if err := f(result); err != nil {
|
||||||
errors = append(errors, err)
|
if snr.denyPartialResponse {
|
||||||
|
// Immediately return the error to the caller if partial responses are denied.
|
||||||
|
// There is no need to wait for responses from other vmstorage nodes - they will be processed in background.
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
var er *errRemote
|
||||||
|
if errors.As(err, &er) {
|
||||||
|
// Immediately return the error reported by vmstorage to the caller,
|
||||||
|
// since such errors usually mean misconfiguration at vmstorage.
|
||||||
|
// The misconfiguration must be known by the caller, so it is fixed ASAP.
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
errsPartial = append(errsPartial, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
resultsCollected++
|
resultsCollected++
|
||||||
|
@ -1550,26 +1559,21 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
isPartial := false
|
if len(errsPartial) == 0 {
|
||||||
if len(errors) > 0 {
|
return false, nil
|
||||||
if len(errors) == len(storageNodes) {
|
|
||||||
// All the vmstorage nodes returned error.
|
|
||||||
// Return only the first error, since it has no sense in returning all errors.
|
|
||||||
return false, errors[0]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return partial results.
|
|
||||||
// This allows gracefully degrade vmselect in the case
|
|
||||||
// if a part of storageNodes are temporarily unavailable.
|
|
||||||
// Do not return the error, since it may spam logs on busy vmselect
|
|
||||||
// serving high amounts of requests.
|
|
||||||
partialResultsCounter.Inc()
|
|
||||||
if snr.denyPartialResponse {
|
|
||||||
return true, errors[0]
|
|
||||||
}
|
|
||||||
isPartial = true
|
|
||||||
}
|
}
|
||||||
return isPartial, nil
|
if len(errsPartial) == len(storageNodes) {
|
||||||
|
// All the vmstorage nodes returned error.
|
||||||
|
// Return only the first error, since it has no sense in returning all errors.
|
||||||
|
return false, errsPartial[0]
|
||||||
|
}
|
||||||
|
// Return partial results.
|
||||||
|
// This allows gracefully degrade vmselect in the case
|
||||||
|
// if a part of storageNodes are temporarily unavailable.
|
||||||
|
// Do not return the error, since it may spam logs on busy vmselect
|
||||||
|
// serving high amounts of requests.
|
||||||
|
partialResultsCounter.Inc()
|
||||||
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type storageNode struct {
|
type storageNode struct {
|
||||||
|
@ -1925,7 +1929,10 @@ func (sn *storageNode) execOnConn(rpcName string, f func(bc *handshake.BufferedC
|
||||||
// since it may be broken.
|
// since it may be broken.
|
||||||
_ = bc.Close()
|
_ = bc.Close()
|
||||||
}
|
}
|
||||||
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
|
if deadline.Exceeded() {
|
||||||
|
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q: %w", rpcName, remoteAddr, err)
|
||||||
}
|
}
|
||||||
// Return the connection back to the pool, assuming it is healthy.
|
// Return the connection back to the pool, assuming it is healthy.
|
||||||
sn.connPool.Put(bc)
|
sn.connPool.Put(bc)
|
||||||
|
|
|
@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
||||||
* BUGFIX: vmalert: add support for `$externalLabels` and `$externalURL` template vars in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2193).
|
* BUGFIX: vmalert: add support for `$externalLabels` and `$externalURL` template vars in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2193).
|
||||||
* BUGFIX: vmalert: make sure notifiers are discovered during initialization if they are configured via `consul_sd_configs`. Previously they could be discovered in 30 seconds (the default value for `-promscrape.consulSDCheckInterval` command-line flag) after the initialization. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2202).
|
* BUGFIX: vmalert: make sure notifiers are discovered during initialization if they are configured via `consul_sd_configs`. Previously they could be discovered in 30 seconds (the default value for `-promscrape.consulSDCheckInterval` command-line flag) after the initialization. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2202).
|
||||||
* BUGFIX: update default value for `-promscrape.fileSDCheckInterval`, so it matches default duration used by Prometheus for checking for updates in `file_sd_configs`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2187). Thanks to @corporate-gadfly for the fix.
|
* BUGFIX: update default value for `-promscrape.fileSDCheckInterval`, so it matches default duration used by Prometheus for checking for updates in `file_sd_configs`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2187). Thanks to @corporate-gadfly for the fix.
|
||||||
|
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): do not return partial responses from `vmselect` if at least a single `vmstorage` node was reachable and returned an app-level error. Such errors are usually related to cluster mis-configuration, so they must be returned to the caller instead of being masked by [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability). Partial responses can be returned only if some of `vmstorage` nodes are unreachable during the query. This may help the following issues: [one](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941), [two](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678).
|
||||||
|
|
||||||
|
|
||||||
## [v1.73.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.73.0)
|
## [v1.73.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.73.0)
|
||||||
|
|
Loading…
Reference in a new issue