From a0a154511a1f26035cd714987fc2924e70087f96 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Fri, 8 Nov 2024 11:45:16 +0100 Subject: [PATCH] app/vmauth: properly check for backend health before leastLoaded policy (#7489) Previously, vmauth could have pick `buMin` as least loaded backend without checking its status. In result, vmauth could have respond to the user with an error even if there were healthy backends. That could happen if healthy backends already had non-zero amount of concurrent requests executing at the moment of least-loaded backend choosing logic. Steps to reproduce: 1. Setup vmauth with two backends: healthy and non-healthy 2. Execute a bunch of concurrent requests against vmauth (i.e. Grafana dash reload) 3. Observe that some requests will fail with message that all backends are unavailable Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061 --- Signed-off-by: hagen1778 --- app/vmauth/auth_config.go | 7 +++++-- app/vmauth/auth_config_test.go | 22 ++++++++++++++++++++++ docs/changelog/CHANGELOG.md | 1 + 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/app/vmauth/auth_config.go b/app/vmauth/auth_config.go index 237bd18e6..018dce6ba 100644 --- a/app/vmauth/auth_config.go +++ b/app/vmauth/auth_config.go @@ -462,13 +462,17 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) * // Slow path - select other backend urls. n := atomicCounter.Add(1) - 1 - + buMin := bus[n%uint32(len(bus))] for i := uint32(0); i < uint32(len(bus)); i++ { idx := (n + i) % uint32(len(bus)) bu := bus[idx] if bu.isBroken() { continue } + if buMin.isBroken() { + // verify that buMin isn't set as broken + buMin = bu + } if bu.concurrentRequests.Load() == 0 { // Fast path - return the backend with zero concurrently executed requests. // Do not use CompareAndSwap() instead of Load(), since it is much slower on systems with many CPU cores. @@ -478,7 +482,6 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) * } // Slow path - return the backend with the minimum number of concurrently executed requests. - buMin := bus[n%uint32(len(bus))] minRequests := buMin.concurrentRequests.Load() for _, bu := range bus { if bu.isBroken() { diff --git a/app/vmauth/auth_config_test.go b/app/vmauth/auth_config_test.go index 8ee7c0d18..5e686224f 100644 --- a/app/vmauth/auth_config_test.go +++ b/app/vmauth/auth_config_test.go @@ -777,6 +777,28 @@ func TestGetLeastLoadedBackendURL(t *testing.T) { fn(7, 7, 7) } +func TestBrokenBackend(t *testing.T) { + up := mustParseURLs([]string{ + "http://node1:343", + "http://node2:343", + "http://node3:343", + }) + up.loadBalancingPolicy = "least_loaded" + pbus := up.bus.Load() + bus := *pbus + + // explicitly mark one of the backends as broken + bus[1].setBroken() + + // broken backend should never return while there are healthy backends + for i := 0; i < 1e3; i++ { + b := up.getBackendURL() + if b.isBroken() { + t.Fatalf("unexpected broken backend %q", b.url) + } + } +} + func getRegexs(paths []string) []*Regex { var sps []*Regex for _, path := range paths { diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index 218b6c1ed..76f19c5fe 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -21,6 +21,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): drop rows that do not belong to the current series during import. The dropped rows should belong to another series whose tags are a superset of the current series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7301) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7330). Thanks to @dpedu for reporting and cooperating with the test. * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068). * BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details. +* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth): properly check availability of all the backends before giving up when proxying requests. Previously, vmauth could return an error even if there were healthy backends available. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061) for details. ## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0)