mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-10 15:14:09 +00:00
app/vmselect: add support for vmstorage groups with independent -replicationFactor per group
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5197 See https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmstorage-groups-at-vmselect Thanks to @zekker6 for the initial pull request at https://github.com/VictoriaMetrics/VictoriaMetrics-enterprise/pull/718
This commit is contained in:
parent
242472086b
commit
0f91f83639
4 changed files with 239 additions and 10 deletions
|
@ -34,6 +34,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
* SECURITY: upgrade Go builder from Go1.21.4 to Go1.21.5. See [the list of issues addressed in Go1.21.5](https://github.com/golang/go/issues?q=milestone%3AGo1.21.5+label%3ACherryPickApproved).
|
||||
|
||||
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to send requests to the first available backend and fall back to other `hot standby` backends when the first backend is unavailable. This allows building highly available setups as shown in [these docs](https://docs.victoriametrics.com/vmauth.html#high-availability). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792).
|
||||
* FEATURE: `vmselect`: allow specifying multiple groups of `vmstorage` nodes with independent `-replicationFactor` per each group. See [these docs](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmstorage-groups-at-vmselect) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5197) for details.
|
||||
* FEATURE: `vmselect`: allow opening [vmui](https://docs.victoriametrics.com/#vmui) and investigating [Top queries](https://docs.victoriametrics.com/#top-queries) and [Active queries](https://docs.victoriametrics.com/#active-queries) when the `vmselect` is overloaded with concurrent queries (e.g. when more than `-search.maxConcurrentRequests` concurrent queries are executed). Previously an attempt to open `Top queries` or `Active queries` at `vmui` could result in `couldn't start executing the request in ... seconds, since -search.maxConcurrentRequests=... concurrent requests are executed` error, which could complicate debugging of overloaded `vmselect` or single-node VictoriaMetrics.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-enableMultitenantHandlers` command-line flag, which allows receiving data via [VictoriaMetrics cluster urls](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format) at `vmagent` and converting [tenant ids](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) to (`vm_account_id`, `vm_project_id`) labels before sending the data to the configured `-remoteWrite.url`. See [these docs](https://docs.victoriametrics.com/vmagent.html#multitenancy) for details.
|
||||
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.disableOnDiskQueue` command-line flag, which can be used for disabling data queueing to disk when the remote storage cannot keep up with the data ingestion rate. See [these docs](https://docs.victoriametrics.com/vmagent.html#disabling-on-disk-persistence) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2110).
|
||||
|
|
|
@ -567,6 +567,8 @@ The cluster works in the following way when some of `vmstorage` nodes are unavai
|
|||
if less than `-replicationFactor` vmstorage nodes are unavailable during querying, since it assumes that the remaining
|
||||
`vmstorage` nodes contain the full data. See [these docs](#replication-and-data-safety) for details.
|
||||
|
||||
It is also possible to configure independent replication factor per distinct `vmstorage` groups - see [these docs](#vmstorage-groups-at-vmselect).
|
||||
|
||||
`vmselect` doesn't serve partial responses for API handlers returning raw datapoints - [`/api/v1/export*` endpoints](https://docs.victoriametrics.com/#how-to-export-time-series), since users usually expect this data is always complete.
|
||||
|
||||
Data replication can be used for increasing storage durability. See [these docs](#replication-and-data-safety) for details.
|
||||
|
@ -678,17 +680,61 @@ Then an additional `vmselect` nodes can be configured for reading the data from
|
|||
|
||||
## Multi-level cluster setup
|
||||
|
||||
`vmselect` nodes can be queried by other `vmselect` nodes if they run with `-clusternativeListenAddr` command-line flag. For example, if `vmselect` is started with `-clusternativeListenAddr=:8401`, then it can accept queries from another `vmselect` nodes at TCP port 8401 in the same way as `vmstorage` nodes do. This allows chaining `vmselect` nodes and building multi-level cluster topologies. For example, the top-level `vmselect` node can query second-level `vmselect` nodes in different availability zones (AZ), while the second-level `vmselect` nodes can query `vmstorage` nodes in local AZ.
|
||||
`vmselect` nodes can be queried by other `vmselect` nodes if they run with `-clusternativeListenAddr` command-line flag.
|
||||
For example, if `vmselect` is started with `-clusternativeListenAddr=:8401`, then it can accept queries from another `vmselect` nodes at TCP port 8401
|
||||
in the same way as `vmstorage` nodes do. This allows chaining `vmselect` nodes and building multi-level cluster topologies.
|
||||
For example, the top-level `vmselect` node can query second-level `vmselect` nodes in different availability zones (AZ),
|
||||
while the second-level `vmselect` nodes can query `vmstorage` nodes in local AZ. See also [vmstorage groups at vmselect](#vmstorage-groups-at-vmselect).
|
||||
|
||||
`vminsert` nodes can accept data from another `vminsert` nodes if they run with `-clusternativeListenAddr` command-line flag. For example, if `vminsert` is started with `-clusternativeListenAddr=:8400`, then it can accept data from another `vminsert` nodes at TCP port 8400 in the same way as `vmstorage` nodes do. This allows chaining `vminsert` nodes and building multi-level cluster topologies. For example, the top-level `vminsert` node can replicate data among the second level of `vminsert` nodes located in distinct availability zones (AZ), while the second-level `vminsert` nodes can spread the data among `vmstorage` nodes in local AZ.
|
||||
`vminsert` nodes can accept data from another `vminsert` nodes if they run with `-clusternativeListenAddr` command-line flag.
|
||||
For example, if `vminsert` is started with `-clusternativeListenAddr=:8400`, then it can accept data from another `vminsert` nodes at TCP port 8400
|
||||
in the same way as `vmstorage` nodes do. This allows chaining `vminsert` nodes and building multi-level cluster topologies.
|
||||
For example, the top-level `vminsert` node can replicate data among the second level of `vminsert` nodes located in distinct availability zones (AZ),
|
||||
while the second-level `vminsert` nodes can spread the data among `vmstorage` nodes in local AZ.
|
||||
|
||||
The multi-level cluster setup for `vminsert` nodes has the following shortcomings because of synchronous replication and data sharding:
|
||||
|
||||
* Data ingestion speed is limited by the slowest link to AZ.
|
||||
* `vminsert` nodes at top level re-route incoming data to the remaining AZs when some AZs are temporarily unavailable. This results in data gaps at AZs which were temporarily unavailable.
|
||||
|
||||
These issues are addressed by [vmagent](https://docs.victoriametrics.com/vmagent.html) when it runs in [multitenancy mode](https://docs.victoriametrics.com/vmagent.html#multitenancy). `vmagent` buffers data, which must be sent to a particular AZ, when this AZ is temporarily unavailable. The buffer is stored on disk. The buffered data is sent to AZ as soon as it becomes available.
|
||||
These issues are addressed by [vmagent](https://docs.victoriametrics.com/vmagent.html) when it runs in [multitenancy mode](https://docs.victoriametrics.com/vmagent.html#multitenancy).
|
||||
`vmagent` buffers data, which must be sent to a particular AZ, when this AZ is temporarily unavailable. The buffer is stored on disk. The buffered data is sent to AZ as soon as it becomes available.
|
||||
|
||||
## vmstorage groups at vmselect
|
||||
|
||||
`vmselect` can be configured to query multiple distinct groups of `vmstorage` nodes with individual `-replicationFactor` per each group.
|
||||
The following format for `-storageNode` command-line flag value should be used for assigning a particular `addr` of `vmstorage` to a particular `groupName` -
|
||||
`-storageNode=groupName/addr`. For example, the following command runs `vmselect`, which continues returning full responses if up to one node per each group is temporarily unavailable
|
||||
because the given `-replicationFactor=2` is applied individually per each group:
|
||||
|
||||
```
|
||||
/path/to/vmselect \
|
||||
-replicationFactor=2 \
|
||||
-storageNode=group1/host1 \
|
||||
-storageNode=group1/host2 \
|
||||
-storageNode=group1/host3 \
|
||||
-storageNode=group2/host4 \
|
||||
-storageNode=group2/host5 \
|
||||
-storageNode=group2/host6 \
|
||||
-storageNode=group3/host7 \
|
||||
-storageNode=group3/host8 \
|
||||
-storageNode=group3/host9
|
||||
```
|
||||
|
||||
It is possible to specify distinct `-replicationFactor` per each group via the following format - `-replicationFactor=groupName:rf`.
|
||||
For example, the following command runs `vmselect`, which uses `-replicationFactor=3` for the `group1`, while it uses `-replicationFactor=1` for the `group2`:
|
||||
|
||||
```
|
||||
/path/to/vmselect \
|
||||
-replicationFactor=group1:3 \
|
||||
-storageNode=group1/host1 \
|
||||
-storageNode=group1/host2 \
|
||||
-storageNode=group1/host3 \
|
||||
-replicationFactor=group2:1 \
|
||||
-storageNode=group2/host4 \
|
||||
-storageNode=group2/host5 \
|
||||
-storageNode=group2/host6
|
||||
```
|
||||
|
||||
## Helm
|
||||
|
||||
|
@ -701,19 +747,32 @@ It is available in the [helm-charts](https://github.com/VictoriaMetrics/helm-cha
|
|||
|
||||
## Replication and data safety
|
||||
|
||||
By default, VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath` such as [Google compute persistent disk](https://cloud.google.com/compute/docs/disks#pdspecs), which guarantees data durability. VictoriaMetrics supports application-level replication if replicated durable persistent disks cannot be used for some reason.
|
||||
By default, VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath` such as [Google compute persistent disk](https://cloud.google.com/compute/docs/disks#pdspecs),
|
||||
which guarantees data durability. VictoriaMetrics supports application-level replication if replicated durable persistent disks cannot be used for some reason.
|
||||
|
||||
The replication can be enabled by passing `-replicationFactor=N` command-line flag to `vminsert`. This instructs `vminsert` to store `N` copies for every ingested sample on `N` distinct `vmstorage` nodes. This guarantees that all the stored data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
|
||||
The replication can be enabled by passing `-replicationFactor=N` command-line flag to `vminsert`. This instructs `vminsert` to store `N` copies for every ingested sample
|
||||
on `N` distinct `vmstorage` nodes. This guarantees that all the stored data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
|
||||
|
||||
Passing `-replicationFactor=N` command-line flag to `vmselect` instructs it to not mark responses as `partial` if less than `-replicationFactor` vmstorage nodes are unavailable during the query. See [cluster availability docs](#cluster-availability) for details.
|
||||
Passing `-replicationFactor=N` command-line flag to `vmselect` instructs it to not mark responses as `partial` if less than `-replicationFactor` vmstorage nodes are unavailable during the query.
|
||||
See [cluster availability docs](#cluster-availability) for details.
|
||||
|
||||
The cluster must contain at least `2*N-1` `vmstorage` nodes, where `N` is replication factor, in order to maintain the given replication factor for newly ingested data when `N-1` of storage nodes are unavailable.
|
||||
The cluster must contain at least `2*N-1` `vmstorage` nodes, where `N` is replication factor, in order to maintain the given replication factor
|
||||
for newly ingested data when `N-1` of storage nodes are unavailable.
|
||||
|
||||
VictoriaMetrics stores timestamps with millisecond precision, so `-dedup.minScrapeInterval=1ms` command-line flag must be passed to `vmselect` nodes when the replication is enabled, so they could de-duplicate replicated samples obtained from distinct `vmstorage` nodes during querying. If duplicate data is pushed to VictoriaMetrics from identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) instances or Prometheus instances, then the `-dedup.minScrapeInterval` must be set to `scrape_interval` from scrape configs according to [deduplication docs](#deduplication).
|
||||
VictoriaMetrics stores timestamps with millisecond precision, so `-dedup.minScrapeInterval=1ms` command-line flag must be passed to `vmselect` nodes when the replication is enabled,
|
||||
so they could de-duplicate replicated samples obtained from distinct `vmstorage` nodes during querying. If duplicate data is pushed to VictoriaMetrics
|
||||
from identically configured [vmagent](https://docs.victoriametrics.com/vmagent.html) instances or Prometheus instances, then the `-dedup.minScrapeInterval` must be set
|
||||
to `scrape_interval` from scrape configs according to [deduplication docs](#deduplication).
|
||||
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883), so it is recommended performing regular backups. See [these docs](#backups) for details.
|
||||
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
|
||||
so it is recommended performing regular backups. See [these docs](#backups) for details.
|
||||
|
||||
Note that the replication increases resource usage - CPU, RAM, disk space, network bandwidth - by up to `-replicationFactor=N` times, because `vminsert` stores `N` copies of incoming data to distinct `vmstorage` nodes and `vmselect` needs to de-duplicate the replicated data obtained from `vmstorage` nodes during querying. So it is more cost-effective to offload the replication to underlying replicated durable storage pointed by `-storageDataPath` such as [Google Compute Engine persistent disk](https://cloud.google.com/compute/docs/disks/#pdspecs), which is protected from data loss and data corruption. It also provides consistently high performance and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime. HDD-based persistent disks should be enough for the majority of use cases. It is recommended using durable replicated persistent volumes in Kubernetes.
|
||||
Note that the replication increases resource usage - CPU, RAM, disk space, network bandwidth - by up to `-replicationFactor=N` times, because `vminsert` stores `N` copies
|
||||
of incoming data to distinct `vmstorage` nodes and `vmselect` needs to de-duplicate the replicated data obtained from `vmstorage` nodes during querying.
|
||||
So it is more cost-effective to offload the replication to underlying replicated durable storage pointed by `-storageDataPath`
|
||||
such as [Google Compute Engine persistent disk](https://cloud.google.com/compute/docs/disks/#pdspecs), which is protected from data loss and data corruption.
|
||||
It also provides consistently high performance and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
|
||||
HDD-based persistent disks should be enough for the majority of use cases. It is recommended using durable replicated persistent volumes in Kubernetes.
|
||||
|
||||
## Deduplication
|
||||
|
||||
|
|
100
lib/flagutil/dict.go
Normal file
100
lib/flagutil/dict.go
Normal file
|
@ -0,0 +1,100 @@
|
|||
package flagutil
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DictInt allows specifying a dictionary of named ints in the form `name1:value1,...,nameN:valueN`.
|
||||
type DictInt struct {
|
||||
defaultValue int
|
||||
kvs []kIntValue
|
||||
}
|
||||
|
||||
type kIntValue struct {
|
||||
k string
|
||||
v int
|
||||
}
|
||||
|
||||
// NewDictInt creates DictInt with the given name, defaultValue and description.
|
||||
func NewDictInt(name string, defaultValue int, description string) *DictInt {
|
||||
description += fmt.Sprintf(" (default %d)", defaultValue)
|
||||
description += "\nSupports an `array` of `key:value` entries separated by comma or specified via multiple flags."
|
||||
di := &DictInt{
|
||||
defaultValue: defaultValue,
|
||||
}
|
||||
flag.Var(di, name, description)
|
||||
return di
|
||||
}
|
||||
|
||||
// String implements flag.Value interface
|
||||
func (di *DictInt) String() string {
|
||||
kvs := di.kvs
|
||||
if len(kvs) == 1 && kvs[0].k == "" {
|
||||
// Short form - a single int value
|
||||
return strconv.Itoa(kvs[0].v)
|
||||
}
|
||||
|
||||
formattedResults := make([]string, len(kvs))
|
||||
for i, kv := range kvs {
|
||||
formattedResults[i] = fmt.Sprintf("%s:%d", kv.k, kv.v)
|
||||
}
|
||||
return strings.Join(formattedResults, ",")
|
||||
}
|
||||
|
||||
// Set implements flag.Value interface
|
||||
func (di *DictInt) Set(value string) error {
|
||||
values := parseArrayValues(value)
|
||||
if len(di.kvs) == 0 && len(values) == 1 && strings.IndexByte(values[0], ':') < 0 {
|
||||
v, err := strconv.Atoi(values[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
di.kvs = append(di.kvs, kIntValue{
|
||||
v: v,
|
||||
})
|
||||
return nil
|
||||
}
|
||||
for _, x := range values {
|
||||
n := strings.IndexByte(x, ':')
|
||||
if n < 0 {
|
||||
return fmt.Errorf("missing ':' in %q", x)
|
||||
}
|
||||
k := x[:n]
|
||||
v, err := strconv.Atoi(x[n+1:])
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot parse value for key=%q: %w", k, err)
|
||||
}
|
||||
if di.contains(k) {
|
||||
return fmt.Errorf("duplicate value for key=%q: %d", k, v)
|
||||
}
|
||||
di.kvs = append(di.kvs, kIntValue{
|
||||
k: k,
|
||||
v: v,
|
||||
})
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (di *DictInt) contains(key string) bool {
|
||||
for _, kv := range di.kvs {
|
||||
if kv.k == key {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Get returns value for the given key.
|
||||
//
|
||||
// Default value is returned if key isn't found in di.
|
||||
func (di *DictInt) Get(key string) int {
|
||||
for _, kv := range di.kvs {
|
||||
if kv.k == key {
|
||||
return kv.v
|
||||
}
|
||||
}
|
||||
return di.defaultValue
|
||||
}
|
69
lib/flagutil/dict_test.go
Normal file
69
lib/flagutil/dict_test.go
Normal file
|
@ -0,0 +1,69 @@
|
|||
package flagutil
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDictIntSetSuccess(t *testing.T) {
|
||||
f := func(s string) {
|
||||
t.Helper()
|
||||
var di DictInt
|
||||
if err := di.Set(s); err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
result := di.String()
|
||||
if result != s {
|
||||
t.Fatalf("unexpected DictInt.String(); got %q; want %q", result, s)
|
||||
}
|
||||
}
|
||||
|
||||
f("")
|
||||
f("123")
|
||||
f("-234")
|
||||
f("foo:123")
|
||||
f("foo:123,bar:-42,baz:0,aa:43")
|
||||
}
|
||||
|
||||
func TestDictIntFailure(t *testing.T) {
|
||||
f := func(s string) {
|
||||
t.Helper()
|
||||
var di DictInt
|
||||
if err := di.Set(s); err == nil {
|
||||
t.Fatalf("expecting non-nil error")
|
||||
}
|
||||
}
|
||||
|
||||
// missing values
|
||||
f("foo")
|
||||
f("foo:")
|
||||
|
||||
// non-integer values
|
||||
f("foo:bar")
|
||||
f("12.34")
|
||||
f("foo:123.34")
|
||||
|
||||
// duplicate keys
|
||||
f("a:234,k:123,k:432")
|
||||
}
|
||||
|
||||
func TestDictIntGet(t *testing.T) {
|
||||
f := func(s, key string, defaultValue, expectedValue int) {
|
||||
t.Helper()
|
||||
var di DictInt
|
||||
di.defaultValue = defaultValue
|
||||
if err := di.Set(s); err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
value := di.Get(key)
|
||||
if value != expectedValue {
|
||||
t.Fatalf("unexpected value; got %d; want %d", value, expectedValue)
|
||||
}
|
||||
}
|
||||
|
||||
f("", "", 123, 123)
|
||||
f("", "foo", 123, 123)
|
||||
f("foo:42", "", 123, 123)
|
||||
f("foo:42", "foo", 123, 42)
|
||||
f("532", "", 123, 532)
|
||||
f("532", "foo", 123, 123)
|
||||
}
|
Loading…
Reference in a new issue