From 4c808d58bfb737a528a8e9ee0ed022d2ea9a412b Mon Sep 17 00:00:00 2001
From: Nikolay <nik@victoriametrics.com>
Date: Wed, 4 Nov 2020 21:29:18 +0300
Subject: [PATCH] Adds ready probe (#874)

* adds leading forward slash check for scrapeURL path
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/835

* adds a ready probe for scrape config initialization.
It should prevent metrics loss during vmagent rolling updates:
the /ready API returns HTTP 425 (Too Early) while any scrape config is still waiting for initialization.

* updates docs

* Update app/vmagent/README.md

* renames var

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
---
 app/vmagent/README.md     |  2 ++
 app/vmagent/main.go       | 11 +++++++++++
 app/vminsert/main.go      | 10 ++++++++++
 docs/vmagent.md           |  2 ++
 lib/promscrape/scraper.go | 13 ++++++++++++-
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/app/vmagent/README.md b/app/vmagent/README.md
index 6d132ad5a5..099947c5b6 100644
--- a/app/vmagent/README.md
+++ b/app/vmagent/README.md
@@ -219,6 +219,8 @@ It accepts optional `show_original_labels=1` query arg, which shows the original
 This information may be useful for debugging target relabeling.
 * `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
 
+* `http://vmagent-host:8429/ready` - this handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
+  It may be useful, when you have many entries at `-promscrape.config` and want to perform `vmagent` rolling update without scrape loss.
 
 ### Troubleshooting
 
diff --git a/app/vmagent/main.go b/app/vmagent/main.go
index d3fa1d6b9f..4d2b7acb57 100644
--- a/app/vmagent/main.go
+++ b/app/vmagent/main.go
@@ -7,6 +7,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
@@ -222,6 +223,16 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
 		procutil.SelfSIGHUP()
 		w.WriteHeader(http.StatusOK)
 		return true
+	case "/ready":
+		if rdy := atomic.LoadInt32(&promscrape.PendingScrapeConfigs); rdy > 0 {
+			errMsg := fmt.Sprintf("waiting for scrapes to init, left: %d", rdy)
+			http.Error(w, errMsg, http.StatusTooEarly)
+		} else {
+			w.Header().Set("Content-Type", "text/plain")
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte("OK"))
+		}
+		return true
 	}
 	return false
 }
diff --git a/app/vminsert/main.go b/app/vminsert/main.go
index 504922c946..1ab37655d0 100644
--- a/app/vminsert/main.go
+++ b/app/vminsert/main.go
@@ -170,6 +170,16 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
 		procutil.SelfSIGHUP()
 		w.WriteHeader(http.StatusNoContent)
 		return true
+	case "/ready":
+		if rdy := atomic.LoadInt32(&promscrape.PendingScrapeConfigs); rdy > 0 {
+			errMsg := fmt.Sprintf("waiting for scrape config to init targets, configs left: %d", rdy)
+			http.Error(w, errMsg, http.StatusTooEarly)
+		} else {
+			w.Header().Set("Content-Type", "text/plain")
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte("OK"))
+		}
+		return true
 	default:
 		// This is not our link
 		return false
diff --git a/docs/vmagent.md b/docs/vmagent.md
index 6d132ad5a5..87559d8880 100644
--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@@ -219,6 +219,8 @@ It accepts optional `show_original_labels=1` query arg, which shows the original
 This information may be useful for debugging target relabeling.
 * `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
 
+* `http://vmagent-host:8429/ready` - this handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
+  It may be useful, when you have many entries at `-promscrape.config` and want to perform `vmagent` rolling update without scrape loss.
 
 ### Troubleshooting
 
diff --git a/lib/promscrape/scraper.go b/lib/promscrape/scraper.go
index 341e66462a..52f99a97eb 100644
--- a/lib/promscrape/scraper.go
+++ b/lib/promscrape/scraper.go
@@ -5,6 +5,7 @@ import (
 	"flag"
 	"fmt"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -75,6 +76,9 @@ func Stop() {
 var (
 	globalStopCh chan struct{}
 	scraperWG    sync.WaitGroup
+	// PendingScrapeConfigs is the number of scrape configs still waiting for
+	// their initial update; zero means all scrapeConfigs are initialized and ready.
+	PendingScrapeConfigs int32
 )
 
 func runScraper(configFile string, pushData func(wr *prompbmarshal.WriteRequest), globalStopCh <-chan struct{}) {
@@ -166,6 +170,7 @@ func newScrapeConfigs(pushData func(wr *prompbmarshal.WriteRequest)) *scrapeConf
 }
 
 func (scs *scrapeConfigs) add(name string, checkInterval time.Duration, getScrapeWork func(cfg *Config, swsPrev []ScrapeWork) []ScrapeWork) {
+	atomic.AddInt32(&PendingScrapeConfigs, 1)
 	scfg := &scrapeConfig{
 		name:          name,
 		pushData:      scs.pushData,
@@ -216,10 +221,15 @@ func (scfg *scrapeConfig) run() {
 
 	cfg := <-scfg.cfgCh
 	var swsPrev []ScrapeWork
-	for {
+	updateScrapeWork := func(cfg *Config) {
 		sws := scfg.getScrapeWork(cfg, swsPrev)
 		sg.update(sws)
 		swsPrev = sws
+	}
+	updateScrapeWork(cfg)
+	atomic.AddInt32(&PendingScrapeConfigs, -1)
+
+	for {
 
 		select {
 		case <-scfg.stopCh:
@@ -227,6 +237,7 @@ func (scfg *scrapeConfig) run() {
 		case cfg = <-scfg.cfgCh:
 		case <-tickerCh:
 		}
+		updateScrapeWork(cfg)
 	}
 }