diff --git a/app/vmauth/auth_config.go b/app/vmauth/auth_config.go index 237bd18e6..018dce6ba 100644 --- a/app/vmauth/auth_config.go +++ b/app/vmauth/auth_config.go @@ -462,13 +462,17 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) * // Slow path - select other backend urls. n := atomicCounter.Add(1) - 1 - + buMin := bus[n%uint32(len(bus))] for i := uint32(0); i < uint32(len(bus)); i++ { idx := (n + i) % uint32(len(bus)) bu := bus[idx] if bu.isBroken() { continue } + if buMin.isBroken() { + // verify that buMin isn't set as broken + buMin = bu + } if bu.concurrentRequests.Load() == 0 { // Fast path - return the backend with zero concurrently executed requests. // Do not use CompareAndSwap() instead of Load(), since it is much slower on systems with many CPU cores. @@ -478,7 +482,6 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) * } // Slow path - return the backend with the minimum number of concurrently executed requests. - buMin := bus[n%uint32(len(bus))] minRequests := buMin.concurrentRequests.Load() for _, bu := range bus { if bu.isBroken() { diff --git a/app/vmauth/auth_config_test.go b/app/vmauth/auth_config_test.go index 8ee7c0d18..5e686224f 100644 --- a/app/vmauth/auth_config_test.go +++ b/app/vmauth/auth_config_test.go @@ -777,6 +777,28 @@ func TestGetLeastLoadedBackendURL(t *testing.T) { fn(7, 7, 7) } +func TestBrokenBackend(t *testing.T) { + up := mustParseURLs([]string{ + "http://node1:343", + "http://node2:343", + "http://node3:343", + }) + up.loadBalancingPolicy = "least_loaded" + pbus := up.bus.Load() + bus := *pbus + + // explicitly mark one of the backends as broken + bus[1].setBroken() + + // broken backend should never return while there are healthy backends + for i := 0; i < 1e3; i++ { + b := up.getBackendURL() + if b.isBroken() { + t.Fatalf("unexpected broken backend %q", b.url) + } + } +} + func getRegexs(paths []string) []*Regex { var sps []*Regex for _, path := range paths { diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index 218b6c1ed..76f19c5fe 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -21,6 +21,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): drop rows that do not belong to the current series during import. The dropped rows should belong to another series whose tags are a superset of the current series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7301) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7330). Thanks to @dpedu for reporting and cooperating with the test. * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068). * BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details. +* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth): properly check availability of all the backends before giving up when proxying requests. Previously, vmauth could return an error even if there were healthy backends available. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061) for details. ## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0)