From 15dda54e7938872bf4de24845680747ddd6d2c28 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Fri, 27 Oct 2023 20:22:44 +0200
Subject: [PATCH] lib/promscrape/discovery/kubernetes: propagate possible
 errors at newAPIWatcher() to the caller

This replaces FATAL panics with recoverable runtime errors for conditions such as a missing or invalid TLS CA file
and/or a missing or invalid /var/run/secrets/kubernetes.io/serviceaccount/namespace file.
These errors are now logged instead of triggering a panic, so they can be fixed by updating the corresponding files
without the need to restart vmagent.

This is a follow-up for 90427abc65a048aa70196223a810b7e208fb8e0b
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5243
---
 lib/promscrape/discovery/kubernetes/api.go    |  7 ++--
 .../discovery/kubernetes/api_watcher.go       | 33 ++++++++++++-------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/lib/promscrape/discovery/kubernetes/api.go b/lib/promscrape/discovery/kubernetes/api.go
index a01feaad1f..7ef6c4464d 100644
--- a/lib/promscrape/discovery/kubernetes/api.go
+++ b/lib/promscrape/discovery/kubernetes/api.go
@@ -90,11 +90,10 @@ func newAPIConfig(sdc *SDConfig, baseDir string, swcFunc ScrapeWorkConstructorFu
 	for strings.HasSuffix(apiServer, "/") {
 		apiServer = apiServer[:len(apiServer)-1]
 	}
-	// pre-check tls config
-	if _, err := ac.NewTLSConfig(); err != nil {
-		return nil, fmt.Errorf("cannot initialize tls config: %w", err)
+	aw, err := newAPIWatcher(apiServer, ac, sdc, swcFunc)
+	if err != nil {
+		return nil, fmt.Errorf("cannot initialize Kubernetes API watcher: %w", err)
 	}
-	aw := newAPIWatcher(apiServer, ac, sdc, swcFunc)
 	cfg := &apiConfig{
 		aw: aw,
 	}
diff --git a/lib/promscrape/discovery/kubernetes/api_watcher.go b/lib/promscrape/discovery/kubernetes/api_watcher.go
index d44b60cd64..dab6c1feff 100644
--- a/lib/promscrape/discovery/kubernetes/api_watcher.go
+++ b/lib/promscrape/discovery/kubernetes/api_watcher.go
@@ -66,13 +66,13 @@ type apiWatcher struct {
 	swosCount *metrics.Counter
 }
 
-func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc ScrapeWorkConstructorFunc) *apiWatcher {
+func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc ScrapeWorkConstructorFunc) (*apiWatcher, error) {
 	namespaces := sdc.Namespaces.Names
 	if len(namespaces) == 0 {
 		if sdc.Namespaces.OwnNamespace {
 			namespace, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace")
 			if err != nil {
-				logger.Panicf("FATAL: cannot determine namespace for the current pod according to `own_namespace: true` option in kubernetes_sd_config: %s", err)
+				return nil, fmt.Errorf("cannot determine namespace for the current pod according to `own_namespace: true` option in kubernetes_sd_config: %w", err)
 			}
 			namespaces = []string{string(namespace)}
 		}
@@ -80,15 +80,19 @@ func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc
 	selectors := sdc.Selectors
 	attachNodeMetadata := sdc.AttachMetadata.Node
 	proxyURL := sdc.ProxyURL.GetURL()
-	gw := getGroupWatcher(apiServer, ac, namespaces, selectors, attachNodeMetadata, proxyURL)
+	gw, err := getGroupWatcher(apiServer, ac, namespaces, selectors, attachNodeMetadata, proxyURL)
+	if err != nil {
+		return nil, err
+	}
 	role := sdc.role()
-	return &apiWatcher{
+	aw := &apiWatcher{
 		role:             role,
 		swcFunc:          swcFunc,
 		gw:               gw,
 		swosByURLWatcher: make(map[*urlWatcher]map[string][]interface{}),
 		swosCount:        metrics.GetOrCreateCounter(fmt.Sprintf(`vm_promscrape_discovery_kubernetes_scrape_works{role=%q}`, role)),
 	}
+	return aw, nil
 }
 
 func (aw *apiWatcher) mustStart() {
@@ -228,15 +232,14 @@ type groupWatcher struct {
 	noAPIWatchers bool
 }
 
-func newGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string, selectors []Selector, attachNodeMetadata bool, proxyURL *url.URL) *groupWatcher {
+func newGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string, selectors []Selector, attachNodeMetadata bool, proxyURL *url.URL) (*groupWatcher, error) {
 	var proxy func(*http.Request) (*url.URL, error)
 	if proxyURL != nil {
 		proxy = http.ProxyURL(proxyURL)
 	}
 	tlsConfig, err := ac.NewTLSConfig()
-	// we should always check tlsconfig in advance to avoid panic here
 	if err != nil {
-		logger.Panicf("FATAL: cannot initialize tls config: %s", err)
+		return nil, fmt.Errorf("cannot initialize tls config: %w", err)
 	}
 	client := &http.Client{
 		Transport: &http.Transport{
@@ -249,7 +252,7 @@ func newGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string,
 		Timeout: *apiServerTimeout,
 	}
 	ctx, cancel := context.WithCancel(context.Background())
-	return &groupWatcher{
+	gw := &groupWatcher{
 		apiServer:          apiServer,
 		namespaces:         namespaces,
 		selectors:          selectors,
@@ -264,9 +267,10 @@ func newGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string,
 		ctx:    ctx,
 		cancel: cancel,
 	}
+	return gw, nil
 }
 
-func getGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string, selectors []Selector, attachNodeMetadata bool, proxyURL *url.URL) *groupWatcher {
+func getGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string, selectors []Selector, attachNodeMetadata bool, proxyURL *url.URL) (*groupWatcher, error) {
 	proxyURLStr := "<nil>"
 	if proxyURL != nil {
 		proxyURLStr = proxyURL.String()
@@ -275,12 +279,17 @@ func getGroupWatcher(apiServer string, ac *promauth.Config, namespaces []string,
 		apiServer, namespaces, selectorsKey(selectors), attachNodeMetadata, proxyURLStr, ac.String())
 	groupWatchersLock.Lock()
 	gw := groupWatchers[key]
+	var err error
 	if gw == nil {
-		gw = newGroupWatcher(apiServer, ac, namespaces, selectors, attachNodeMetadata, proxyURL)
-		groupWatchers[key] = gw
+		gw, err = newGroupWatcher(apiServer, ac, namespaces, selectors, attachNodeMetadata, proxyURL)
+		if err != nil {
+			err = fmt.Errorf("cannot initialize watcher for key={%s}: %w", key, err)
+		} else {
+			groupWatchers[key] = gw
+		}
 	}
 	groupWatchersLock.Unlock()
-	return gw
+	return gw, err
 }
 
 func selectorsKey(selectors []Selector) string {