lib/promscrape: add ability to spread scrape targets among multiple vmagent instances

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1084
This commit is contained in:
Aliaksandr Valialkin 2021-02-28 18:39:57 +02:00
parent a75ead3027
commit 9a2bf65134
5 changed files with 52 additions and 1 deletions

View file

@ -34,6 +34,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
* Scrape targets can be spread among multiple `vmagent` instances when big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets) for details.
## Quick Start
@ -227,6 +228,21 @@ Read more about relabeling in the following articles:
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
## Scraping big number of targets
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` clustering).
Each `vmagent` instance in the cluster must have identical configs, including identical `-promscrape.config` files, except of a single command-line flag:
`-promscrape.cluster.memberNum`. The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
spread scrape targets among a cluster of two `vmagent` instances:
```
/path/to/vmagent ... -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0
/path/to/vmagent ... -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1
```
## Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page

View file

@ -11,6 +11,7 @@
* `process_io_write_syscalls_total` - the number of write syscalls such as write and pwrite
* `process_io_storage_read_bytes_total` - the number of bytes read from storage layer
* `process_io_storage_written_bytes_total` - the number of bytes written to storage layer
* FEATURE: vmagent: add ability to spread scrape targets among multiple `vmagent` instances. See [these docs](https://victoriametrics.github.io/vmagent.html#scraping-big-number-of-targets) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1084) for details.
* FEATURE: vmagent: use watch API for Kuberntes service discovery. This should reduce load on Kuberntes API server when it tracks big number of objects (for example, 10K pods). This should also reduce the time needed for k8s targets discovery.
* FEATURE: vmagent: export `vm_promscrape_target_relabel_duration_seconds` metric, which can be used for monitoring the time spend on relabeling for discovered targets.
* FEATURE: vmagent: optimize [relabeling](https://victoriametrics.github.io/vmagent.html#relabeling) performance for common cases.

View file

@ -34,6 +34,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
* Scrape targets can be spread among multiple `vmagent` instances when big number of targets must be scraped. See [these docs](#scraping-big-number-of-targets) for details.
## Quick Start
@ -227,6 +228,21 @@ Read more about relabeling in the following articles:
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
## Scraping big number of targets
A single `vmagent` instance can scrape tens of thousands of scrape targets. Sometimes this isn't enough due to limitations on CPU, network, RAM, etc.
In this case scrape targets can be split among multiple `vmagent` instances (aka `vmagent` clustering).
Each `vmagent` instance in the cluster must have identical configs, including identical `-promscrape.config` files, except of a single command-line flag:
`-promscrape.cluster.memberNum`. The flag value must be in the range `0 ... N-1`, where `N` is the number of `vmagent` instances in the cluster.
The number of `vmagent` instances in the cluster must be passed to `-promscrape.cluster.membersCount` command-line flag. For example, the following commands
spread scrape targets among a cluster of two `vmagent` instances:
```
/path/to/vmagent ... -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=0
/path/to/vmagent ... -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1
```
## Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page

View file

@ -12,6 +12,7 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
@ -29,6 +30,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/openstack"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
"gopkg.in/yaml.v2"
)
@ -42,6 +44,11 @@ var (
dropOriginalLabels = flag.Bool("promscrape.dropOriginalLabels", false, "Whether to drop original labels for scrape targets at /targets and /api/v1/targets pages. "+
"This may be needed for reducing memory usage when original labels for big number of scrape targets occupy big amounts of memory. "+
"Note that this reduces debuggability for improper per-target relabeling configs")
clusterMembersCount = flag.Int("promscrape.cluster.membersCount", 0, "The number of members in a cluster of scrapers. "+
"Each member must have an unique -promscrape.cluster.memberNum in the range 0 ... promscrape.cluster.membersCount-1 . "+
"Each member then scrapes roughly 1/N of all the targets. By default cluster scraping is disabled, i.e. a single scraper scrapes all the targets")
clusterMemberNum = flag.Int("promscrape.cluster.memberNum", 0, "The number of number in the cluster of scrapers. "+
"It must be an unique value in the range 0 ... promscrape.cluster.membersCount-1 across scrapers in the cluster")
)
// Config represents essential parts from Prometheus config defined at https://prometheus.io/docs/prometheus/latest/configuration/configuration/
@ -714,6 +721,9 @@ func (stc *StaticConfig) appendScrapeWork(dst []*ScrapeWork, swc *scrapeWorkConf
func (swc *scrapeWorkConfig) getScrapeWork(target string, extraLabels, metaLabels map[string]string) (*ScrapeWork, error) {
key := getScrapeWorkKey(extraLabels, metaLabels)
if needSkipScrapeWork(key) {
return nil, nil
}
if sw := swc.cache.Get(key); sw != nil {
return sw, nil
}
@ -731,6 +741,14 @@ func getScrapeWorkKey(extraLabels, metaLabels map[string]string) string {
return string(b)
}
func needSkipScrapeWork(key string) bool {
if *clusterMembersCount <= 0 {
return false
}
h := int(xxhash.Sum64(bytesutil.ToUnsafeBytes(key)))
return (h % *clusterMembersCount) != *clusterMemberNum
}
func appendSortedKeyValuePairs(dst []byte, m map[string]string) []byte {
keys := make([]string, 0, len(m))
for k := range m {

View file

@ -183,7 +183,7 @@ func (sw *scrapeWork) run(stopCh <-chan struct{}) {
// This also makes consistent scrape times across restarts
// for a target with the same ScrapeURL and labels.
key := fmt.Sprintf("ScrapeURL=%s, Labels=%s", sw.Config.ScrapeURL, sw.Config.LabelsString())
h := uint32(xxhash.Sum64([]byte(key)))
h := uint32(xxhash.Sum64(bytesutil.ToUnsafeBytes(key)))
randSleep = uint64(float64(scrapeInterval) * (float64(h) / (1 << 32)))
sleepOffset := uint64(time.Now().UnixNano()) % uint64(scrapeInterval)
if randSleep < sleepOffset {