mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
lib/promscrape: show dropped targets because of sharding at /service-discovery page
Previously the /service-discovery page didn't show targets dropped because of sharding ( https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets ). Show also the reason why every target is dropped at /service-discovery page. This should improve debugging why particular targets are dropped. While at it, do not remove dropped targets from the list at /service-discovery page until the total number of targets exceeds the limit passed to -promscrape.maxDroppedTargets . Previously the list was cleaned up every 10 minutes from the entries, which weren't updated for the last minute. This could complicate debugging of dropped targets. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389
This commit is contained in:
parent
e1359c904c
commit
487f6380d0
6 changed files with 61 additions and 39 deletions
|
@ -31,6 +31,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.disableOnDiskQueue` command-line flag, which can be used for disabling data queueing to disk when the remote storage cannot keep up with the data ingestion rate. See [these docs](https://docs.victoriametrics.com/vmagent.html#disabling-on-disk-persistence) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2110).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for reading and writing samples via [Google PubSub](https://cloud.google.com/pubsub). See [these docs](https://docs.victoriametrics.com/vmagent.html#google-pubsub-integration).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for Datadog `/api/v2/series` and `/api/beta/sketches` ingestion protocols to vmagent/vminsert components. See this [doc](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) for examples. Thanks to @AndrewChubatiuk for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5094).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): show all the dropped targets together with the reason why they are dropped at `http://vmagent:8429/service-discovery` page. Previously targets, which were dropped because of [target sharding](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets) weren't displayed on this page. This could complicate service discovery debugging. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389).
|
||||
* FEATURE: reduce the default value for `-import.maxLineLen` command-line flag from 100MB to 10MB in order to prevent excessive memory usage during data import via [/api/v1/import](https://docs.victoriametrics.com/#how-to-import-data-in-json-line-format).
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
|
||||
* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/).
|
||||
|
|
|
@ -1049,14 +1049,18 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
|
|||
defer promutils.PutLabels(labels)
|
||||
|
||||
mergeLabels(labels, swc, target, extraLabels, metaLabels)
|
||||
var originalLabels *promutils.Labels
|
||||
if !*dropOriginalLabels {
|
||||
originalLabels = labels.Clone()
|
||||
}
|
||||
originalLabels := labels.Clone()
|
||||
labels.Labels = swc.relabelConfigs.Apply(labels.Labels, 0)
|
||||
// Remove labels starting from "__meta_" prefix according to https://www.robustperception.io/life-of-a-label/
|
||||
labels.RemoveMetaLabels()
|
||||
|
||||
if labels.Len() == 0 {
|
||||
// Drop target without labels.
|
||||
originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
|
||||
droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonRelabeling)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Verify whether the scrape work must be skipped because of `-promscrape.cluster.*` configs.
|
||||
// Perform the verification on labels after the relabeling in order to guarantee that targets with the same set of labels
|
||||
// go to the same vmagent shard.
|
||||
|
@ -1067,23 +1071,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
|
|||
needSkip := needSkipScrapeWork(bytesutil.ToUnsafeString(bb.B), *clusterMembersCount, *clusterReplicationFactor, clusterMemberID)
|
||||
scrapeWorkKeyBufPool.Put(bb)
|
||||
if needSkip {
|
||||
originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
|
||||
droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonSharding)
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
if !*dropOriginalLabels {
|
||||
originalLabels.Sort()
|
||||
// Reduce memory usage by interning all the strings in originalLabels.
|
||||
originalLabels.InternStrings()
|
||||
}
|
||||
if labels.Len() == 0 {
|
||||
// Drop target without labels.
|
||||
droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
|
||||
return nil, nil
|
||||
}
|
||||
scrapeURL, address := promrelabel.GetScrapeURL(labels, swc.params)
|
||||
if scrapeURL == "" {
|
||||
// Drop target without URL.
|
||||
droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
|
||||
originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
|
||||
droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonMissingScrapeURL)
|
||||
return nil, nil
|
||||
}
|
||||
if _, err := url.Parse(scrapeURL); err != nil {
|
||||
|
@ -1155,6 +1152,7 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
|
|||
// Reduce memory usage by interning all the strings in labels.
|
||||
labelsCopy.InternStrings()
|
||||
|
||||
originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
|
||||
sw := &ScrapeWork{
|
||||
ScrapeURL: scrapeURL,
|
||||
ScrapeInterval: scrapeInterval,
|
||||
|
@ -1185,6 +1183,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
|
|||
return sw, nil
|
||||
}
|
||||
|
||||
func sortOriginalLabelsIfNeeded(originalLabels *promutils.Labels) *promutils.Labels {
|
||||
if *dropOriginalLabels {
|
||||
return nil
|
||||
}
|
||||
originalLabels.Sort()
|
||||
// Reduce memory usage by interning all the strings in originalLabels.
|
||||
originalLabels.InternStrings()
|
||||
return originalLabels
|
||||
}
|
||||
|
||||
func mergeLabels(dst *promutils.Labels, swc *scrapeWorkConfig, target string, extraLabels, metaLabels *promutils.Labels) {
|
||||
if n := dst.Len(); n > 0 {
|
||||
logger.Panicf("BUG: len(dst.Labels) must be 0; got %d", n)
|
||||
|
|
|
@ -370,7 +370,7 @@ func (sg *scraperGroup) update(sws []*ScrapeWork) {
|
|||
"original labels for target1: %s; original labels for target2: %s",
|
||||
sw.ScrapeURL, sw.Labels.String(), originalLabels.String(), sw.OriginalLabels.String())
|
||||
}
|
||||
droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs)
|
||||
droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs, targetDropReasonDuplicate)
|
||||
continue
|
||||
}
|
||||
swsMap[key] = sw.OriginalLabels
|
||||
|
|
|
@ -13,7 +13,6 @@ import (
|
|||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/cespare/xxhash/v2"
|
||||
|
@ -248,17 +247,25 @@ func (ts *targetStatus) getDurationFromLastScrape() time.Duration {
|
|||
}
|
||||
|
||||
type droppedTargets struct {
|
||||
mu sync.Mutex
|
||||
m map[uint64]droppedTarget
|
||||
lastCleanupTime uint64
|
||||
mu sync.Mutex
|
||||
m map[uint64]droppedTarget
|
||||
}
|
||||
|
||||
type droppedTarget struct {
|
||||
originalLabels *promutils.Labels
|
||||
relabelConfigs *promrelabel.ParsedConfigs
|
||||
deadline uint64
|
||||
dropReason targetDropReason
|
||||
}
|
||||
|
||||
type targetDropReason string
|
||||
|
||||
const (
|
||||
targetDropReasonRelabeling = targetDropReason("relabeling") // target dropped because of relabeling
|
||||
targetDropReasonMissingScrapeURL = targetDropReason("missing scrape URL") // target dropped because of missing scrape URL
|
||||
targetDropReasonDuplicate = targetDropReason("duplicate") // target with the given set of labels already exists
|
||||
targetDropReasonSharding = targetDropReason("sharding") // target is dropped because of sharding https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets
|
||||
)
|
||||
|
||||
func (dt *droppedTargets) getTargetsList() []droppedTarget {
|
||||
dt.mu.Lock()
|
||||
dts := make([]droppedTarget, 0, len(dt.m))
|
||||
|
@ -275,30 +282,30 @@ func (dt *droppedTargets) getTargetsList() []droppedTarget {
|
|||
return dts
|
||||
}
|
||||
|
||||
func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs) {
|
||||
if *dropOriginalLabels {
|
||||
// The originalLabels must be dropped, so do not register it.
|
||||
// Register registers dropped target with the given originalLabels.
|
||||
//
|
||||
// The relabelConfigs must contain relabel configs, which were applied to originalLabels.
|
||||
// The reason must contain the reason why the target has been dropped.
|
||||
func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs, reason targetDropReason) {
|
||||
if originalLabels == nil {
|
||||
// Do not register target without originalLabels. This is the case when *dropOriginalLabels is set to true.
|
||||
return
|
||||
}
|
||||
// It is better to have hash collisions instead of spending additional CPU on originalLabels.String() call.
|
||||
key := labelsHash(originalLabels)
|
||||
currentTime := fasttime.UnixTimestamp()
|
||||
dt.mu.Lock()
|
||||
_, ok := dt.m[key]
|
||||
if ok || len(dt.m) < *maxDroppedTargets {
|
||||
dt.m[key] = droppedTarget{
|
||||
originalLabels: originalLabels,
|
||||
relabelConfigs: relabelConfigs,
|
||||
deadline: currentTime + 10*60,
|
||||
}
|
||||
dt.m[key] = droppedTarget{
|
||||
originalLabels: originalLabels,
|
||||
relabelConfigs: relabelConfigs,
|
||||
dropReason: reason,
|
||||
}
|
||||
if currentTime-dt.lastCleanupTime > 60 {
|
||||
for k, v := range dt.m {
|
||||
if currentTime > v.deadline {
|
||||
delete(dt.m, k)
|
||||
if len(dt.m) >= *maxDroppedTargets {
|
||||
for k := range dt.m {
|
||||
delete(dt.m, k)
|
||||
if len(dt.m) < *maxDroppedTargets {
|
||||
break
|
||||
}
|
||||
}
|
||||
dt.lastCleanupTime = currentTime
|
||||
}
|
||||
dt.mu.Unlock()
|
||||
}
|
||||
|
@ -514,6 +521,7 @@ type targetLabels struct {
|
|||
up bool
|
||||
originalLabels *promutils.Labels
|
||||
labels *promutils.Labels
|
||||
dropReason targetDropReason
|
||||
}
|
||||
type targetLabelsByJob struct {
|
||||
jobName string
|
||||
|
@ -604,6 +612,7 @@ func (tsr *targetsStatusResult) getTargetLabelsByJob() []*targetLabelsByJob {
|
|||
m.droppedTargets++
|
||||
m.targets = append(m.targets, targetLabels{
|
||||
originalLabels: dt.originalLabels,
|
||||
dropReason: dt.dropReason,
|
||||
})
|
||||
}
|
||||
a := make([]*targetLabelsByJob, 0, len(byJob))
|
||||
|
|
|
@ -336,7 +336,7 @@
|
|||
{% elseif t.labels.Len() > 0 %}
|
||||
<span class="badge bg-danger">DOWN</span>
|
||||
{% else %}
|
||||
<span class="badge bg-warning">DROPPED</span>
|
||||
<span class="badge bg-warning">DROPPED ({%s string(t.dropReason) %})</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="labels">
|
||||
|
|
|
@ -875,7 +875,11 @@ func streamdiscoveredJobTargets(qw422016 *qt422016.Writer, num int, tlj *targetL
|
|||
//line lib/promscrape/targetstatus.qtpl:338
|
||||
} else {
|
||||
//line lib/promscrape/targetstatus.qtpl:338
|
||||
qw422016.N().S(`<span class="badge bg-warning">DROPPED</span>`)
|
||||
qw422016.N().S(`<span class="badge bg-warning">DROPPED (`)
|
||||
//line lib/promscrape/targetstatus.qtpl:339
|
||||
qw422016.E().S(string(t.dropReason))
|
||||
//line lib/promscrape/targetstatus.qtpl:339
|
||||
qw422016.N().S(`)</span>`)
|
||||
//line lib/promscrape/targetstatus.qtpl:340
|
||||
}
|
||||
//line lib/promscrape/targetstatus.qtpl:340
|
||||
|
|
Loading…
Reference in a new issue