app/vmselect/netstorage: make sure that at least a single result is collected from every storage group before deciding whether it is OK to skip results from the remaining storage nodes

This commit is contained in:
Aliaksandr Valialkin 2023-12-20 19:53:46 +02:00
parent 46a335aa1d
commit c888d76c4b
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
2 changed files with 18 additions and 4 deletions

View file

@ -1754,10 +1754,14 @@ func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) err
}
func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) {
errsPartialPerGroup := make(map[*storageNodesGroup][]error)
resultsCollectedPerGroup := make(map[*storageNodesGroup]int)
sns := snr.sns
for i := 0; i < len(sns); i++ {
if len(sns) == 0 {
return false, nil
}
groupsCount := sns[0].group.groupsCount
resultsCollectedPerGroup := make(map[*storageNodesGroup]int, groupsCount)
errsPartialPerGroup := make(map[*storageNodesGroup][]error)
for range sns {
// There is no need for a timer here, since all the goroutines executing the f function
// passed to startStorageNodesRequest must finish before the deadline.
result := <-snr.resultsCh
@ -1799,7 +1803,7 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
}
snr.finishQueryTracer(result.qt, "")
resultsCollectedPerGroup[group]++
if *skipSlowReplicas {
if *skipSlowReplicas && len(resultsCollectedPerGroup) == groupsCount {
canSkipSlowReplicas := true
for g, n := range resultsCollectedPerGroup {
if n <= g.nodesCount-g.replicationFactor {
@ -1870,6 +1874,9 @@ type storageNodesGroup struct {
// the number of nodes in the group
nodesCount int
// groupsCount is the number of groups in the list the given group belongs to
groupsCount int
}
func initStorageNodeGroups(addrs []string) map[string]*storageNodesGroup {
@ -1886,6 +1893,12 @@ func initStorageNodeGroups(addrs []string) map[string]*storageNodesGroup {
}
g.nodesCount++
}
groupsCount := len(m)
for _, g := range m {
g.groupsCount = groupsCount
}
return m
}

View file

@ -33,6 +33,7 @@ The sandbox cluster installation is running under the constant load generated by
* FEATURE: all VictoriaMetrics components: add ability to specify arbitrary HTTP headers to send with every request to `-pushmetrics.url`. See [`push metrics` docs](https://docs.victoriametrics.com/#push-metrics).
* FEATURE: all VictoriaMetrics components: add `-metrics.exposeMetadata` command-line flag, which allows displaying `TYPE` and `HELP` metadata at the `/metrics` page exposed at `-httpListenAddr`. This may be needed when the `/metrics` page is scraped by a collector that requires the `TYPE` and `HELP` metadata, such as [Google Cloud Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type).
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly return full results when `-search.skipSlowReplicas` command-line flag is passed to `vmselect` and when [vmstorage groups](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmstorage-groups-at-vmselect) are in use. Previously partial results could be returned in this case.
* BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped.
* BUGFIX: `vmstorage`: add the `-inmemoryDataFlushInterval` command-line flag, which has been missing in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) since [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337) was implemented in [v1.85.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.0).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check the `-external.url` scheme when starting vmalert; it must be `http` or `https`. Previously, alertmanager could reject alert notifications if `-external.url` contained no scheme or a wrong one.