diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index b00318a27..7d74f5fec 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -1514,29 +1514,38 @@ func startStorageNodesRequest(denyPartialResponse bool, f func(idx int, sn *stor } func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) error) error { - var errors []error for i := 0; i < len(storageNodes); i++ { result := <-snr.resultsCh if err := f(result); err != nil { - errors = append(errors, err) - continue + // Immediately return the error to the caller without waiting for responses from other vmstorage nodes - + // they will be processed in brackground. + return err } } - if len(errors) > 0 { - return errors[0] - } return nil } func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) { - var errors []error + var errsPartial []error resultsCollected := 0 for i := 0; i < len(storageNodes); i++ { // There is no need in timer here, since all the goroutines executing the f function // passed to startStorageNodesRequest must be finished until the deadline. result := <-snr.resultsCh if err := f(result); err != nil { - errors = append(errors, err) + if snr.denyPartialResponse { + // Immediately return the error to the caller if partial responses are denied. + // There is no need to wait for responses from other vmstorage nodes - they will be processed in background. + return false, err + } + var er *errRemote + if errors.As(err, &er) { + // Immediately return the error reported by vmstorage to the caller, + // since such errors usually mean misconfiguration at vmstorage. + // The misconfiguration must be known by the caller, so it is fixed ASAP. + return false, err + } + errsPartial = append(errsPartial, err) continue } resultsCollected++ @@ -1550,26 +1559,21 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co return false, nil } } - isPartial := false - if len(errors) > 0 { - if len(errors) == len(storageNodes) { - // All the vmstorage nodes returned error. - // Return only the first error, since it has no sense in returning all errors. - return false, errors[0] - } - - // Return partial results. - // This allows gracefully degrade vmselect in the case - // if a part of storageNodes are temporarily unavailable. - // Do not return the error, since it may spam logs on busy vmselect - // serving high amounts of requests. - partialResultsCounter.Inc() - if snr.denyPartialResponse { - return true, errors[0] - } - isPartial = true + if len(errsPartial) == 0 { + return false, nil } - return isPartial, nil + if len(errsPartial) == len(storageNodes) { + // All the vmstorage nodes returned error. + // Return only the first error, since it has no sense in returning all errors. + return false, errsPartial[0] + } + // Return partial results. + // This allows gracefully degrade vmselect in the case + // if a part of storageNodes are temporarily unavailable. + // Do not return the error, since it may spam logs on busy vmselect + // serving high amounts of requests. + partialResultsCounter.Inc() + return true, nil } type storageNode struct { @@ -1925,7 +1929,10 @@ func (sn *storageNode) execOnConn(rpcName string, f func(bc *handshake.BufferedC // since it may be broken. _ = bc.Close() } - return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err) + if deadline.Exceeded() { + return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q with timeout %s: %w", rpcName, remoteAddr, deadline.String(), err) + } + return fmt.Errorf("cannot execute rpcName=%q on vmstorage %q: %w", rpcName, remoteAddr, err) } // Return the connection back to the pool, assuming it is healthy. sn.connPool.Put(bc) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index cdbaba37e..1165f4e51 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * BUGFIX: vmalert: add support for `$externalLabels` and `$externalURL` template vars in the same way as Prometheus does. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2193). * BUGFIX: vmalert: make sure notifiers are discovered during initialization if they are configured via `consul_sd_configs`. Previously they could be discovered in 30 seconds (the default value for `-promscrape.consulSDCheckInterval` command-line flag) after the initialization. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2202). * BUGFIX: update default value for `-promscrape.fileSDCheckInterval`, so it matches default duration used by Prometheus for checking for updates in `file_sd_configs`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2187). Thanks to @corporate-gadfly for the fix. +* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): do not return partial responses from `vmselect` if at least a single `vmstorage` node was reachable and returned an app-level error. Such errors are usually related to cluster mis-configuration, so they must be returned to the caller instead of being masked by [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability). Partial responses can be returned only if some of `vmstorage` nodes are unreachable during the query. This may help the following issues: [one](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1941), [two](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/678). ## [v1.73.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.73.0)