lib/promscrape: show targets dropped because of sharding at the /service-discovery page
Previously the /service-discovery page didn't show targets dropped because of sharding ( https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets ). Also show the reason why every target is dropped at the /service-discovery page. This should simplify debugging why particular targets are dropped. While at it, do not remove dropped targets from the list at the /service-discovery page until the total number of targets exceeds the limit passed to -promscrape.maxDroppedTargets . Previously the list was cleaned up every minute by removing entries which weren't re-registered during the last 10 minutes. This could complicate debugging of dropped targets. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389
Parent: e1359c904c
Commit: 487f6380d0
6 changed files with 61 additions and 39 deletions
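The crux of the change, distilled: dropped targets live in a map keyed by a hash of their original labels, each entry now records a drop reason, and entries are evicted only once the map outgrows `-promscrape.maxDroppedTargets` rather than on a timer. Below is a minimal, self-contained sketch of that policy; names such as `dropReason`, `register` and `maxSize` are illustrative stand-ins (the real code hashes a `*promutils.Labels` via `labelsHash`), not the exact identifiers from the diff that follows.

```go
package main

import (
	"fmt"
	"sync"

	"github.com/cespare/xxhash/v2"
)

// dropReason mirrors the targetDropReason values introduced by this commit.
type dropReason string

const (
	reasonRelabeling       = dropReason("relabeling")
	reasonMissingScrapeURL = dropReason("missing scrape URL")
	reasonDuplicate        = dropReason("duplicate")
	reasonSharding         = dropReason("sharding")
)

type droppedTarget struct {
	labels string
	reason dropReason
}

// droppedTargets keeps every dropped target until the map outgrows maxSize,
// instead of expiring entries that weren't re-registered within 10 minutes.
type droppedTargets struct {
	mu      sync.Mutex
	m       map[uint64]droppedTarget
	maxSize int // stands in for the -promscrape.maxDroppedTargets flag
}

func (dt *droppedTargets) register(labels string, reason dropReason) {
	key := xxhash.Sum64String(labels)
	dt.mu.Lock()
	defer dt.mu.Unlock()
	dt.m[key] = droppedTarget{labels: labels, reason: reason}
	// Evict entries only when the limit is exceeded; Go map iteration
	// order is randomized, so this drops an arbitrary subset.
	if len(dt.m) >= dt.maxSize {
		for k := range dt.m {
			delete(dt.m, k)
			if len(dt.m) < dt.maxSize {
				break
			}
		}
	}
}

func main() {
	dt := &droppedTargets{m: make(map[uint64]droppedTarget), maxSize: 1000}
	dt.register(`{__address__="10.0.0.1:9100"}`, reasonSharding)
	dt.register(`{__address__="10.0.0.2:9100"}`, reasonRelabeling)
	fmt.Println(len(dt.m)) // 2: both entries stay until the limit is hit
}
```

Random eviction via map iteration order is a deliberate trade-off here: it bounds memory without the per-entry deadline bookkeeping the old code needed.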
docs/CHANGELOG.md
@@ -31,6 +31,7 @@ The sandbox cluster installation is running under the constant load generated by
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.disableOnDiskQueue` command-line flag, which can be used for disabling data queueing to disk when the remote storage cannot keep up with the data ingestion rate. See [these docs](https://docs.victoriametrics.com/vmagent.html#disabling-on-disk-persistence) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2110).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for reading and writing samples via [Google PubSub](https://cloud.google.com/pubsub). See [these docs](https://docs.victoriametrics.com/vmagent.html#google-pubsub-integration).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add support for Datadog `/api/v2/series` and `/api/beta/sketches` ingestion protocols to vmagent/vminsert components. See this [doc](https://docs.victoriametrics.com/#how-to-send-data-from-datadog-agent) for examples. Thanks to @AndrewChubatiuk for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5094).
+* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): show all the dropped targets together with the reason why they are dropped at `http://vmagent:8429/service-discovery` page. Previously targets, which were dropped because of [target sharding](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets) weren't displayed on this page. This could complicate service discovery debugging. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389).
 * FEATURE: reduce the default value for `-import.maxLineLen` command-line flag from 100MB to 10MB in order to prevent excessive memory usage during data import via [/api/v1/import](https://docs.victoriametrics.com/#how-to-import-data-in-json-line-format).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `keep_if_contains` and `drop_if_contains` relabeling actions. See [these docs](https://docs.victoriametrics.com/vmagent.html#relabeling-enhancements) for details.
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [day_of_year()](https://docs.victoriametrics.com/MetricsQL.html#day_of_year) function, which returns the day of the year for each of the given unix timestamps. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5345) for details. Thanks to @luckyxiaoqiang for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5368/).
lib/promscrape/config.go
@@ -1049,14 +1049,18 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	defer promutils.PutLabels(labels)
 
 	mergeLabels(labels, swc, target, extraLabels, metaLabels)
-	var originalLabels *promutils.Labels
-	if !*dropOriginalLabels {
-		originalLabels = labels.Clone()
-	}
+	originalLabels := labels.Clone()
 	labels.Labels = swc.relabelConfigs.Apply(labels.Labels, 0)
 	// Remove labels starting from "__meta_" prefix according to https://www.robustperception.io/life-of-a-label/
 	labels.RemoveMetaLabels()
 
+	if labels.Len() == 0 {
+		// Drop target without labels.
+		originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonRelabeling)
+		return nil, nil
+	}
+
 	// Verify whether the scrape work must be skipped because of `-promscrape.cluster.*` configs.
 	// Perform the verification on labels after the relabeling in order to guarantee that targets with the same set of labels
 	// go to the same vmagent shard.
@@ -1067,23 +1071,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 		needSkip := needSkipScrapeWork(bytesutil.ToUnsafeString(bb.B), *clusterMembersCount, *clusterReplicationFactor, clusterMemberID)
 		scrapeWorkKeyBufPool.Put(bb)
 		if needSkip {
+			originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+			droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonSharding)
 			return nil, nil
 		}
 	}
-	if !*dropOriginalLabels {
-		originalLabels.Sort()
-		// Reduce memory usage by interning all the strings in originalLabels.
-		originalLabels.InternStrings()
-	}
-	if labels.Len() == 0 {
-		// Drop target without labels.
-		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
-		return nil, nil
-	}
 	scrapeURL, address := promrelabel.GetScrapeURL(labels, swc.params)
 	if scrapeURL == "" {
 		// Drop target without URL.
-		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs)
+		originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
+		droppedTargetsMap.Register(originalLabels, swc.relabelConfigs, targetDropReasonMissingScrapeURL)
 		return nil, nil
 	}
 	if _, err := url.Parse(scrapeURL); err != nil {
@@ -1155,6 +1152,7 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	// Reduce memory usage by interning all the strings in labels.
 	labelsCopy.InternStrings()
 
+	originalLabels = sortOriginalLabelsIfNeeded(originalLabels)
 	sw := &ScrapeWork{
 		ScrapeURL:      scrapeURL,
 		ScrapeInterval: scrapeInterval,
@@ -1185,6 +1183,16 @@ func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabel
 	return sw, nil
 }
 
+func sortOriginalLabelsIfNeeded(originalLabels *promutils.Labels) *promutils.Labels {
+	if *dropOriginalLabels {
+		return nil
+	}
+	originalLabels.Sort()
+	// Reduce memory usage by interning all the strings in originalLabels.
+	originalLabels.InternStrings()
+	return originalLabels
+}
+
 func mergeLabels(dst *promutils.Labels, swc *scrapeWorkConfig, target string, extraLabels, metaLabels *promutils.Labels) {
 	if n := dst.Len(); n > 0 {
 		logger.Panicf("BUG: len(dst.Labels) must be 0; got %d", n)
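For context on the new `targetDropReasonSharding` path above: `needSkipScrapeWork` (defined elsewhere in this file) decides whether the current vmagent shard owns a target. A hedged sketch of the underlying idea, assuming hash-based assignment of each target key across `-promscrape.cluster.membersCount` shards with `-promscrape.cluster.replicationFactor` replicas; the real implementation may differ in detail:

```go
package promsharding

import "github.com/cespare/xxhash/v2"

// needSkipTarget sketches the sharding decision: the target identified by key
// is scraped by replicationFactor consecutive shards starting from
// hash(key) % membersCount; every other shard drops it with reason "sharding".
func needSkipTarget(key string, membersCount, replicationFactor, memberID int) bool {
	if membersCount <= 1 {
		return false // a single shard scrapes everything
	}
	if replicationFactor < 1 {
		replicationFactor = 1
	}
	idx := int(xxhash.Sum64String(key) % uint64(membersCount))
	for i := 0; i < replicationFactor; i++ {
		if idx == memberID {
			return false // this shard owns (a replica of) the target
		}
		idx = (idx + 1) % membersCount
	}
	return true // some other shard scrapes this target
}
```

Because the key is built from the labels after relabeling, targets with identical final labels always hash to the same shards, which is exactly why the check runs where it does in `getScrapeWork`.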
lib/promscrape/scraper.go
@@ -370,7 +370,7 @@ func (sg *scraperGroup) update(sws []*ScrapeWork) {
 				"original labels for target1: %s; original labels for target2: %s",
 				sw.ScrapeURL, sw.Labels.String(), originalLabels.String(), sw.OriginalLabels.String())
 			}
-			droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs)
+			droppedTargetsMap.Register(sw.OriginalLabels, sw.RelabelConfigs, targetDropReasonDuplicate)
 			continue
 		}
 		swsMap[key] = sw.OriginalLabels
lib/promscrape/targetstatus.go
@@ -13,7 +13,6 @@ import (
 	"time"
 	"unsafe"
 
-	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
 	"github.com/cespare/xxhash/v2"
@@ -248,17 +247,25 @@ func (ts *targetStatus) getDurationFromLastScrape() time.Duration {
 }
 
 type droppedTargets struct {
 	mu sync.Mutex
 	m  map[uint64]droppedTarget
-	lastCleanupTime uint64
 }
 
 type droppedTarget struct {
 	originalLabels *promutils.Labels
 	relabelConfigs *promrelabel.ParsedConfigs
-	deadline       uint64
+	dropReason     targetDropReason
 }
 
+type targetDropReason string
+
+const (
+	targetDropReasonRelabeling       = targetDropReason("relabeling")         // target dropped because of relabeling
+	targetDropReasonMissingScrapeURL = targetDropReason("missing scrape URL") // target dropped because of missing scrape URL
+	targetDropReasonDuplicate        = targetDropReason("duplicate")          // target with the given set of labels already exists
+	targetDropReasonSharding         = targetDropReason("sharding")           // target is dropped because of sharding https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets
+)
+
 func (dt *droppedTargets) getTargetsList() []droppedTarget {
 	dt.mu.Lock()
 	dts := make([]droppedTarget, 0, len(dt.m))
@@ -275,30 +282,30 @@ func (dt *droppedTargets) getTargetsList() []droppedTarget {
 	return dts
 }
 
-func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs) {
-	if *dropOriginalLabels {
-		// The originalLabels must be dropped, so do not register it.
+// Register registers dropped target with the given originalLabels.
+//
+// The relabelConfigs must contain relabel configs, which were applied to originalLabels.
+// The reason must contain the reason why the target has been dropped.
+func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs, reason targetDropReason) {
+	if originalLabels == nil {
+		// Do not register target without originalLabels. This is the case when *dropOriginalLabels is set to true.
 		return
 	}
 	// It is better to have hash collisions instead of spending additional CPU on originalLabels.String() call.
 	key := labelsHash(originalLabels)
-	currentTime := fasttime.UnixTimestamp()
 	dt.mu.Lock()
-	_, ok := dt.m[key]
-	if ok || len(dt.m) < *maxDroppedTargets {
-		dt.m[key] = droppedTarget{
-			originalLabels: originalLabels,
-			relabelConfigs: relabelConfigs,
-			deadline:       currentTime + 10*60,
-		}
+	dt.m[key] = droppedTarget{
+		originalLabels: originalLabels,
+		relabelConfigs: relabelConfigs,
+		dropReason:     reason,
 	}
-	if currentTime-dt.lastCleanupTime > 60 {
-		for k, v := range dt.m {
-			if currentTime > v.deadline {
-				delete(dt.m, k)
+	if len(dt.m) >= *maxDroppedTargets {
+		for k := range dt.m {
+			delete(dt.m, k)
+			if len(dt.m) < *maxDroppedTargets {
+				break
 			}
 		}
-		dt.lastCleanupTime = currentTime
 	}
 	dt.mu.Unlock()
 }
@@ -514,6 +521,7 @@ type targetLabels struct {
 	up             bool
 	originalLabels *promutils.Labels
 	labels         *promutils.Labels
+	dropReason     targetDropReason
 }
 
 type targetLabelsByJob struct {
 	jobName string
@@ -604,6 +612,7 @@ func (tsr *targetsStatusResult) getTargetLabelsByJob() []*targetLabelsByJob {
 		m.droppedTargets++
 		m.targets = append(m.targets, targetLabels{
 			originalLabels: dt.originalLabels,
+			dropReason:     dt.dropReason,
 		})
 	}
 	a := make([]*targetLabelsByJob, 0, len(byJob))
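With the reason stored per target, it surfaces on the /service-discovery page via the template change below. For bulk inspection, vmagent also serves a Prometheus-compatible `/api/v1/targets` endpoint; the following sketch lists the discovered labels of dropped targets, assuming the response follows the Prometheus targets-API shape (the address is a placeholder for your vmagent instance):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// targetsResponse models the subset of the Prometheus-compatible
// /api/v1/targets response that describes dropped targets.
type targetsResponse struct {
	Data struct {
		DroppedTargets []struct {
			DiscoveredLabels map[string]string `json:"discoveredLabels"`
		} `json:"droppedTargets"`
	} `json:"data"`
}

func main() {
	// Placeholder address; point this at your vmagent instance.
	resp, err := http.Get("http://vmagent:8429/api/v1/targets")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var tr targetsResponse
	if err := json.NewDecoder(resp.Body).Decode(&tr); err != nil {
		panic(err)
	}
	for _, dt := range tr.Data.DroppedTargets {
		fmt.Println(dt.DiscoveredLabels["__address__"])
	}
}
```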
lib/promscrape/targetstatus.qtpl
@@ -336,7 +336,7 @@
 	{% elseif t.labels.Len() > 0 %}
 		<span class="badge bg-danger">DOWN</span>
 	{% else %}
-		<span class="badge bg-warning">DROPPED</span>
+		<span class="badge bg-warning">DROPPED ({%s string(t.dropReason) %})</span>
 	{% endif %}
 	</td>
 	<td class="labels">
lib/promscrape/targetstatus.qtpl.go
@@ -875,7 +875,11 @@ func streamdiscoveredJobTargets(qw422016 *qt422016.Writer, num int, tlj *targetL
 //line lib/promscrape/targetstatus.qtpl:338
 	} else {
 //line lib/promscrape/targetstatus.qtpl:338
-		qw422016.N().S(`<span class="badge bg-warning">DROPPED</span>`)
+		qw422016.N().S(`<span class="badge bg-warning">DROPPED (`)
+//line lib/promscrape/targetstatus.qtpl:339
+		qw422016.E().S(string(t.dropReason))
+//line lib/promscrape/targetstatus.qtpl:339
+		qw422016.N().S(`)</span>`)
 //line lib/promscrape/targetstatus.qtpl:340
 	}
 //line lib/promscrape/targetstatus.qtpl:340