mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-30 15:22:07 +00:00
lib/promscrape/discovery/kubernetes: fix watcher start order for roles endpoints and endpointslice (#5557)
* lib/promscrape/discovery/kubernetes: fix watcher start order for roles endpoints and endpointslice. Previously the groupWatcher could be mistakenly stopped when requests for pod or service resources took too long. * remove misleading comment * docs/sd_configs.md: mention -promscrape.kubernetes.attachNodeMetadataAll flag in the description of the attach_metadata section. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4640 * wip * lib/promscrape/kubernetes: prevent groupWatcher from being stopped when there are in-flight apiWatcher.mustStart() calls. groupWatcher is stopped if it has zero registered apiWatchers for 14 seconds. But such a groupWatcher can still be in use if apiWatcher for `role: endpoints` or `role: endpointslice` is being registered and the discovery of the associated `pod` and/or `service` objects takes longer than 14 seconds - see the beginning of groupWatcher.startWatchersForRole() function for details. Track the number of in-flight calls to apiWatcher.mustStart() and prevent the associated groupWatcher from being stopped if the number of in-flight calls is non-zero. P.S. postponing the discovery of `pod` and/or `service` objects associated with `endpoints` or `endpointslice` roles isn't the best solution, since it slows down initial discovery of `endpoints` and `endpointslice` targets. * typo fix --------- Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
This commit is contained in:
parent
18f0776ade
commit
26980cd76d
2 changed files with 20 additions and 11 deletions
|
@ -13,6 +13,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
|
|||
|
||||
* BUGFIX: [vmselect](https://docs.victoriametrics.com/vmselect.html): properly determine the time range to search for instant queries with an overly large look-behind window such as `foo[100y]`. Previously, such queries could return empty responses even if `foo` is present in the database.
|
||||
* BUGFIX: properly return errors from [export APIs](https://docs.victoriametrics.com/#how-to-export-time-series). Previously these errors were silently suppressed. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5649).
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover targets for `role: endpoints` and `role: endpointslice` in [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs). Previously some `endpoints` and `endpointslice` targets could be left undiscovered or some targets could have missing `__meta_*` labels when performing service discovery in busy Kubernetes clusters with a large number of pods. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5557).
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle possible negative results caused by floating-point precision errors in rollup functions like rate() or increase(). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5571).
|
||||
|
||||
## [v1.93.10](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.10)
|
||||
|
|
|
@ -92,7 +92,9 @@ func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc
|
|||
}
|
||||
|
||||
// mustStart starts the watchers for aw.role at the groupWatcher aw.gw.
//
// The call is bracketed by an increment and a decrement of
// aw.gw.apiWatcherInflightStartCalls, so the counter is non-zero while
// startWatchersForRole is running. groupWatchersCleaner reads this counter
// and does not stop a groupWatcher while it is non-zero.
func (aw *apiWatcher) mustStart() {
|
||||
// Mark the start call as in-flight before doing any work.
atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, 1)
|
||||
aw.gw.startWatchersForRole(aw.role, aw)
|
||||
// The start call has finished; it no longer counts as in-flight.
atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, -1)
|
||||
}
|
||||
|
||||
func (aw *apiWatcher) updateSwosCount(multiplier int, swosByKey map[string][]interface{}) {
|
||||
|
@ -198,6 +200,10 @@ func (aw *apiWatcher) getScrapeWorkObjects() []interface{} {
|
|||
// groupWatcher watches for Kubernetes objects on the given apiServer with the given namespaces,
|
||||
// selectors and attachNodeMetadata using the given client.
|
||||
type groupWatcher struct {
|
||||
// The number of in-flight apiWatcher.mustStart() calls for the given groupWatcher.
|
||||
// This field is used by groupWatchersCleaner() in order to determine when the given groupWatcher can be stopped.
|
||||
apiWatcherInflightStartCalls int32
|
||||
|
||||
// Old Kubernetes doesn't support /apis/networking.k8s.io/v1/, so /apis/networking.k8s.io/v1beta1/ must be used instead.
|
||||
// This flag is used for automatic substitution of v1 API path with v1beta1 API path during requests to apiServer.
|
||||
useNetworkingV1Beta1 uint32
|
||||
|
@ -288,11 +294,7 @@ func selectorsKey(selectors []Selector) string {
|
|||
|
||||
var (
|
||||
groupWatchersLock sync.Mutex
|
||||
groupWatchers = func() map[string]*groupWatcher {
|
||||
gws := make(map[string]*groupWatcher)
|
||||
go groupWatchersCleaner(gws)
|
||||
return gws
|
||||
}()
|
||||
groupWatchers map[string]*groupWatcher
|
||||
|
||||
_ = metrics.NewGauge(`vm_promscrape_discovery_kubernetes_group_watchers`, func() float64 {
|
||||
groupWatchersLock.Lock()
|
||||
|
@ -302,11 +304,16 @@ var (
|
|||
})
|
||||
)
|
||||
|
||||
func groupWatchersCleaner(gws map[string]*groupWatcher) {
|
||||
// init allocates the package-level groupWatchers map and launches
// groupWatchersCleaner in a separate goroutine.
func init() {
|
||||
groupWatchers = make(map[string]*groupWatcher)
|
||||
// The cleaner loops forever, so it runs in its own goroutine.
go groupWatchersCleaner()
|
||||
}
|
||||
|
||||
func groupWatchersCleaner() {
|
||||
for {
|
||||
time.Sleep(7 * time.Second)
|
||||
groupWatchersLock.Lock()
|
||||
for key, gw := range gws {
|
||||
for key, gw := range groupWatchers {
|
||||
gw.mu.Lock()
|
||||
// Calculate the number of apiWatcher instances subscribed to gw.
|
||||
awsTotal := 0
|
||||
|
@ -314,14 +321,14 @@ func groupWatchersCleaner(gws map[string]*groupWatcher) {
|
|||
awsTotal += len(uw.aws) + len(uw.awsPending)
|
||||
}
|
||||
|
||||
if awsTotal == 0 {
|
||||
// There are no API watchers subscribed to gw.
|
||||
// Stop all the urlWatcher instances at gw and drop gw from gws in this case,
|
||||
if awsTotal == 0 && atomic.LoadInt32(&gw.apiWatcherInflightStartCalls) == 0 {
|
||||
// There are no API watchers subscribed to gw and there are no in-flight apiWatcher.mustStart() calls.
|
||||
// Stop all the urlWatcher instances at gw and drop gw from groupWatchers in this case,
|
||||
// but do it only on the second iteration in order to reduce urlWatcher churn
|
||||
// during scrape config reloads.
|
||||
if gw.noAPIWatchers {
|
||||
gw.cancel()
|
||||
delete(gws, key)
|
||||
delete(groupWatchers, key)
|
||||
} else {
|
||||
gw.noAPIWatchers = true
|
||||
}
|
||||
|
@ -411,6 +418,7 @@ func (gw *groupWatcher) startWatchersForRole(role string, aw *apiWatcher) {
|
|||
if gw.attachNodeMetadata && (role == "pod" || role == "endpoints" || role == "endpointslice") {
|
||||
gw.startWatchersForRole("node", nil)
|
||||
}
|
||||
|
||||
paths := getAPIPathsWithNamespaces(role, gw.namespaces, gw.selectors)
|
||||
for _, path := range paths {
|
||||
apiURL := gw.apiServer + path
|
||||
|
|
Loading…
Reference in a new issue