lib/promscrape: show dropped targets because of sharding at /service-discovery page

Previously the /service-discovery page didn't show targets dropped because of sharding
( https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets ).

Also show the reason why every target is dropped on the /service-discovery page.
This should simplify debugging why particular targets are dropped.

While at it, do not remove dropped targets from the list at /service-discovery page
until the total number of targets exceeds the limit passed to -promscrape.maxDroppedTargets .
Previously the list was cleaned up every minute from the entries, which weren't updated
during the last 10 minutes. This could complicate debugging of dropped targets.
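The new retention policy can be sketched as a standalone Go snippet. This is an illustrative model, not the actual vmagent code: `register` and the hardcoded limit stand in for `droppedTargetsMap.Register` and `-promscrape.maxDroppedTargets`. Entries are kept until the map reaches the limit; then arbitrary entries are evicted (Go map iteration order is unspecified), instead of expiring entries on a timer.

```go
package main

import "fmt"

// maxDroppedTargets mirrors the -promscrape.maxDroppedTargets limit (illustrative value).
const maxDroppedTargets = 3

// register inserts a dropped target keyed by a label hash. When the map reaches
// the limit, arbitrary entries are deleted until the map is below the limit again.
func register(m map[uint64]string, key uint64, reason string) {
	m[key] = reason
	if len(m) >= maxDroppedTargets {
		for k := range m {
			delete(m, k)
			if len(m) < maxDroppedTargets {
				break
			}
		}
	}
}

func main() {
	m := make(map[uint64]string)
	for i := uint64(0); i < 10; i++ {
		register(m, i, "relabeling")
	}
	fmt.Println(len(m)) // the map never grows past the limit
}
```

Note the trade-off: recently dropped targets stay visible indefinitely while the map is small, at the cost of evicting random (not oldest) entries once the limit is hit.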

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389
Aliaksandr Valialkin 2023-12-01 16:37:57 +02:00
parent e1359c904c
commit 487f6380d0
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
6 changed files with 61 additions and 39 deletions


@@ -31,6 +31,7 @@ The sandbox cluster installation is running under the constant load generated by
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.disableOnDiskQueue` command-line flag, which can be used for disabling data queueing to disk when the remote storage cannot keep up with the data ingestion rate. See [these docs](https://docs.victoriametrics.com/vmagent.html#disabling-on-disk-persistence) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2110).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for reading and writing samples via [Google PubSub](https://cloud.google.com/pubsub). See [these docs](https://docs.victoriametrics.com/vmagent.html#google-pubsub-integration).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for Datadog `/api/v2/series` and `/api/beta/sketches` ingestion protocols to vmagent/vminsert components. See this [doc](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) for examples. Thanks to @AndrewChubatiuk for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5094).
+* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): show all the dropped targets together with the reason why they are dropped at `http://vmagent:8429/service-discovery` page. Previously targets, which were dropped because of [target sharding](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets) weren't displayed on this page. This could complicate service discovery debugging. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389).
 * FEATURE: reduce the default value for `-import.maxLineLen` command-line flag from 100MB to 10MB in order to prevent excessive memory usage during data import via [/api/v1/import](https://docs.victoriametrics.com/#how-to-import-data-in-json-line-format).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/).


@@ -1049,14 +1049,18 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	defer promutils.PutLabels(labels)
 	mergeLabels(labels, swc, target, extraLabels, metaLabels)
-	var originalLabels *promutils.Labels
-	if !*dropOriginalLabels {
-		originalLabels = labels.Clone()
-	}
+	originalLabels := labels.Clone()
 	labels.Labels = swc.relabelConfigs.Apply(labels.Labels, 0)
 	// Remove labels starting from "__meta_" prefix according to https://www.robustperception.io/life-of-a-label/
 	labels.RemoveMetaLabels()
+	if labels.Len() == 0 {
+		// Drop target without labels.
+		originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonRelabeling)
+		return nil, nil
+	}
 	// Verify whether the scrape work must be skipped because of `-promscrape.cluster.*` configs.
 	// Perform the verification on labels after the relabeling in order to guarantee that targets with the same set of labels
 	// go to the same vmagent shard.
@@ -1067,23 +1071,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 		needSkip := needSkipScrapeWork(bytesutil.ToUnsafeString(bb.B), *clusterMembersCount, *clusterReplicationFactor, clusterMemberID)
 		scrapeWorkKeyBufPool.Put(bb)
 		if needSkip {
+			originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+			droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonSharding)
 			return nil, nil
 		}
 	}
-	if !*dropOriginalLabels {
-		originalLabels.Sort()
-		// Reduce memory usage by interning all the strings in originalLabels.
-		originalLabels.InternStrings()
-	}
-	if labels.Len() == 0 {
-		// Drop target without labels.
-		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
-		return nil, nil
-	}
 	scrapeURL, address := promrelabel.GetScrapeURL(labels, swc.params)
 	if scrapeURL == "" {
 		// Drop target without URL.
-		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
+		originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonMissingScrapeURL)
 		return nil, nil
 	}
 	if _, err := url.Parse(scrapeURL); err != nil {
@@ -1155,6 +1152,7 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	// Reduce memory usage by interning all the strings in labels.
 	labelsCopy.InternStrings()
+	originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
 	sw := &ScrapeWork{
 		ScrapeURL:      scrapeURL,
 		ScrapeInterval: scrapeInterval,
@@ -1185,6 +1183,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	return sw, nil
 }

+func sortOriginalLabelsIfNeeded(originalLabels *promutils.Labels) *promutils.Labels {
+	if *dropOriginalLabels {
+		return nil
+	}
+	originalLabels.Sort()
+	// Reduce memory usage by interning all the strings in originalLabels.
+	originalLabels.InternStrings()
+	return originalLabels
+}
+
 func mergeLabels(dst *promutils.Labels, swc *scrapeWorkConfig, target string, extraLabels, metaLabels *promutils.Labels) {
 	if n := dst.Len(); n > 0 {
 		logger.Panicf("BUG: len(dst.Labels) must be 0; got %d", n)


@@ -370,7 +370,7 @@ func (sg *scraperGroup) update(sws []*ScrapeWork) {
 				"original labels for target1: %s; original labels for target2: %s",
 				sw.ScrapeURL, sw.Labels.String(), originalLabels.String(), sw.OriginalLabels.String())
 			}
-			droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs)
+			droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs, targetDropReasonDuplicate)
 			continue
 		}
 		swsMap[key] = sw.OriginalLabels


@@ -13,7 +13,6 @@ import (
 	"time"
 	"unsafe"

-	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
 	"github.com/cespare/xxhash/v2"
@@ -248,17 +247,25 @@ func (ts *targetStatus) getDurationFromLastScrape() time.Duration {
 }

 type droppedTargets struct {
-	mu              sync.Mutex
-	m               map[uint64]droppedTarget
-	lastCleanupTime uint64
+	mu sync.Mutex
+	m  map[uint64]droppedTarget
 }

 type droppedTarget struct {
 	originalLabels *promutils.Labels
 	relabelConfigs *promrelabel.ParsedConfigs
-	deadline       uint64
+	dropReason     targetDropReason
 }

+type targetDropReason string
+
+const (
+	targetDropReasonRelabeling       = targetDropReason("relabeling")         // target dropped because of relabeling
+	targetDropReasonMissingScrapeURL = targetDropReason("missing scrape URL") // target dropped because of missing scrape URL
+	targetDropReasonDuplicate        = targetDropReason("duplicate")          // target with the given set of labels already exists
+	targetDropReasonSharding         = targetDropReason("sharding")           // target is dropped because of sharding https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets
+)
+
 func (dt *droppedTargets) getTargetsList() []droppedTarget {
 	dt.mu.Lock()
 	dts := make([]droppedTarget, 0, len(dt.m))
@@ -275,30 +282,30 @@ func (dt *droppedTargets) getTargetsList() []droppedTarget {
 	return dts
 }

-func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs) {
-	if *dropOriginalLabels {
-		// The originalLabels must be dropped, so do not register it.
+// Register registers dropped target with the given originalLabels.
+//
+// The relabelConfigs must contain relabel configs, which were applied to originalLabels.
+// The reason must contain the reason why the target has been dropped.
+func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs, reason targetDropReason) {
+	if originalLabels == nil {
+		// Do not register target without originalLabels. This is the case when *dropOriginalLabels is set to true.
 		return
 	}
 	// It is better to have hash collisions instead of spending additional CPU on originalLabels.String() call.
 	key := labelsHash(originalLabels)
-	currentTime := fasttime.UnixTimestamp()
 	dt.mu.Lock()
-	_, ok := dt.m[key]
-	if ok || len(dt.m) < *maxDroppedTargets {
-		dt.m[key] = droppedTarget{
-			originalLabels: originalLabels,
-			relabelConfigs: relabelConfigs,
-			deadline:       currentTime + 10*60,
-		}
+	dt.m[key] = droppedTarget{
+		originalLabels: originalLabels,
+		relabelConfigs: relabelConfigs,
+		dropReason:     reason,
 	}
-	if currentTime-dt.lastCleanupTime > 60 {
-		for k, v := range dt.m {
-			if currentTime > v.deadline {
-				delete(dt.m, k)
-			}
+	if len(dt.m) >= *maxDroppedTargets {
+		for k := range dt.m {
+			delete(dt.m, k)
+			if len(dt.m) < *maxDroppedTargets {
+				break
 			}
 		}
-		dt.lastCleanupTime = currentTime
 	}
 	dt.mu.Unlock()
 }
@@ -514,6 +521,7 @@ type targetLabels struct {
 	up             bool
 	originalLabels *promutils.Labels
 	labels         *promutils.Labels
+	dropReason     targetDropReason
 }

 type targetLabelsByJob struct {
 	jobName string
@@ -604,6 +612,7 @@ func (tsr *targetsStatusResult) getTargetLabelsByJob() []*targetLabelsByJob {
 		m.droppedTargets++
 		m.targets = append(m.targets, targetLabels{
 			originalLabels: dt.originalLabels,
+			dropReason:     dt.dropReason,
 		})
 	}
 	a := make([]*targetLabelsByJob, 0, len(byJob))
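The typed string used for drop reasons follows a common Go idiom: a named string type plus a fixed set of constants gives readable values that render directly on the /service-discovery page without a lookup table. A minimal sketch of the pattern (the names `dropReason`, `badge`, and the constants are illustrative, not the actual vmagent identifiers):

```go
package main

import "fmt"

// dropReason is a typed string; each reason is a short, human-readable constant
// that can be printed on a status page as-is.
type dropReason string

const (
	reasonRelabeling dropReason = "relabeling"
	reasonSharding   dropReason = "sharding"
)

// badge formats the status label the way the service-discovery template
// renders a dropped target.
func badge(r dropReason) string {
	return fmt.Sprintf("DROPPED (%s)", r)
}

func main() {
	fmt.Println(badge(reasonSharding)) // DROPPED (sharding)
}
```

Using a distinct type instead of a bare string lets the compiler reject accidental arguments (e.g. passing a label value where a reason is expected) while the zero value `""` still means "no reason recorded".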


@@ -336,7 +336,7 @@
 		{% elseif t.labels.Len() > 0 %}
 			<span class="badge bg-danger">DOWN</span>
 		{% else %}
-			<span class="badge bg-warning">DROPPED</span>
+			<span class="badge bg-warning">DROPPED ({%s string(t.dropReason) %})</span>
 		{% endif %}
 	</td>
 	<td class="labels">


@@ -875,7 +875,11 @@ func streamdiscoveredJobTargets(qw422016 *qt422016.Writer, num int, tlj *targetL
 //line lib/promscrape/targetstatus.qtpl:338
 	} else {
 //line lib/promscrape/targetstatus.qtpl:338
-		qw422016.N().S(`<span class="badge bg-warning">DROPPED</span>`)
+		qw422016.N().S(`<span class="badge bg-warning">DROPPED (`)
+//line lib/promscrape/targetstatus.qtpl:339
+		qw422016.E().S(string(t.dropReason))
+//line lib/promscrape/targetstatus.qtpl:339
+		qw422016.N().S(`)</span>`)
 //line lib/promscrape/targetstatus.qtpl:340
 	}
 //line lib/promscrape/targetstatus.qtpl:340