lib/promscrape: retry service discovery in a few seconds if it starts returning 0 targets

This should reduce recovery time from temporary issues during service discovery
This commit is contained in:
Aliaksandr Valialkin 2021-03-14 21:53:20 +02:00
parent bf15d6a6a2
commit 83edbb7cab

View file

@ -231,11 +231,17 @@ func (scfg *scrapeConfig) run() {
cfg := <-scfg.cfgCh
var swsPrev []*ScrapeWork
updateScrapeWork := func(cfg *Config) {
startTime := time.Now()
sws := scfg.getScrapeWork(cfg, swsPrev)
sg.update(sws)
swsPrev = sws
scfg.discoveryDuration.UpdateDuration(startTime)
for {
startTime := time.Now()
sws := scfg.getScrapeWork(cfg, swsPrev)
retry := sg.update(sws)
swsPrev = sws
scfg.discoveryDuration.UpdateDuration(startTime)
if !retry {
return
}
time.Sleep(2 * time.Second)
}
}
updateScrapeWork(cfg)
atomic.AddInt32(&PendingScrapeConfigs, -1)
@ -295,7 +301,7 @@ func (sg *scraperGroup) stop() {
sg.wg.Wait()
}
func (sg *scraperGroup) update(sws []*ScrapeWork) {
func (sg *scraperGroup) update(sws []*ScrapeWork) (retry bool) {
sg.mLock.Lock()
defer sg.mLock.Unlock()
@ -352,6 +358,7 @@ func (sg *scraperGroup) update(sws []*ScrapeWork) {
sg.changesCount.Add(additionsCount + deletionsCount)
logger.Infof("%s: added targets: %d, removed targets: %d; total targets: %d", sg.name, additionsCount, deletionsCount, len(sg.m))
}
return deletionsCount > 0 && len(sg.m) == 0
}
type scraper struct {