From 5714a68ac638ebcbb588742f77f4712b53c6bf80 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Wed, 21 Sep 2022 10:48:38 +0200
Subject: [PATCH] deployment/docker: move cluster compose env to master branch
 (#3130)

* deployment/docker: move cluster compose env to master branch

The change is supposed to simplify maintenance of the single/cluster
docker-compose envs, alerts and dashboards. It should also reduce
confusion for users looking for cluster-related alerts/configs.

Signed-off-by: hagen1778

* deployment/docker: move cluster compose env to master branch

Review updates.

Signed-off-by: hagen1778

Signed-off-by: hagen1778
---
 deployment/docker/Makefile                    |  13 ++
 deployment/docker/README.md                   |  74 ++++++-
 deployment/docker/alerts-cluster.yml          | 199 ++++++++++++++++++
 deployment/docker/alerts-health.yml           |  54 +++++
 deployment/docker/alerts-vmagent.yml          | 122 +++++++++++
 deployment/docker/alerts.yml                  | 176 +---------------
 deployment/docker/docker-compose-cluster.yml  | 121 +++++++++++
 deployment/docker/docker-compose.yml          |  11 +-
 deployment/docker/prometheus-cluster.yml      |  19 ++
 .../provisioning/datasources/datasource.yml   |   6 +
 docs/Cluster-VictoriaMetrics.md               |   2 +-
 docs/Quick-Start.md                           |  13 +-
 12 files changed, 618 insertions(+), 192 deletions(-)
 create mode 100644 deployment/docker/alerts-cluster.yml
 create mode 100644 deployment/docker/alerts-health.yml
 create mode 100644 deployment/docker/alerts-vmagent.yml
 create mode 100644 deployment/docker/docker-compose-cluster.yml
 create mode 100644 deployment/docker/prometheus-cluster.yml

diff --git a/deployment/docker/Makefile b/deployment/docker/Makefile
index 0cf57ceb89..a38707444c 100644
--- a/deployment/docker/Makefile
+++ b/deployment/docker/Makefile
@@ -177,3 +177,16 @@ package-via-docker-386:
 
 remove-docker-images:
 	docker image ls --format '{{.Repository}}\t{{.ID}}' | awk '{print $$2}' | xargs docker image rm -f
+
+
+docker-single-up:
+	docker-compose -f deployment/docker/docker-compose.yml up
+
+docker-single-down:
+	docker-compose -f deployment/docker/docker-compose.yml down -v
+
+docker-cluster-up:
+	docker-compose -f deployment/docker/docker-compose-cluster.yml up
+
+docker-cluster-down:
+	docker-compose -f deployment/docker/docker-compose-cluster.yml down -v
\ No newline at end of file
diff --git a/deployment/docker/README.md b/deployment/docker/README.md
index 8090f2a9db..e1fa97d18c 100644
--- a/deployment/docker/README.md
+++ b/deployment/docker/README.md
@@ -1,12 +1,33 @@
 # Docker compose environment for VictoriaMetrics
 
-To spin-up VictoriaMetrics, vmagent, vmalert, Alertmanager and Grafana run the following command:
+The docker compose environment for VictoriaMetrics includes VictoriaMetrics components,
+[Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/)
+and [Grafana](https://grafana.com/).
 
-`docker-compose up`
+Before starting the docker-compose environment, ensure that Docker is installed and running
+and that you have access to the Internet.
+All commands should be executed from the root directory of this repo.
 
-For clustered version check [docker compose in cluster branch](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster/deployment/docker).
+To spin up the environment for single-server VictoriaMetrics, run the following command:
+```
+make docker-single-up
+```
 
-## VictoriaMetrics
+To shut down the docker-compose environment for single-server VictoriaMetrics, run the following command:
+```
+make docker-single-down
+```
+
+To spin up the environment for the cluster version, run the following command:
+```
+make docker-cluster-up
+```
+
+To shut down the docker-compose environment for the cluster version, run the following command:
+```
+make docker-cluster-down
+```
+
+## VictoriaMetrics single server
 
 VictoriaMetrics will be accessible on the following ports:
 
@@ -14,6 +35,40 @@ VictoriaMetrics will be accessible on the following ports:
 * `--opentsdbListenAddr=:4242`
 * `--httpListenAddr=:8428`
 
+The communication scheme between components is the following:
+* [vmagent](#vmagent) sends scraped metrics to VictoriaMetrics;
+* [grafana](#grafana) is configured with a datasource pointing to VictoriaMetrics;
+* [vmalert](#vmalert) is configured to query VictoriaMetrics and to send alerts state
+  and recording rules back to it;
+* [alertmanager](#alertmanager) is configured to receive notifications from vmalert.
+
+To access `vmalert` use the link [http://localhost:8428/vmalert](http://localhost:8428/vmalert/).
+
+To access [vmui](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#vmui)
+use the link [http://localhost:8428/vmui](http://localhost:8428/vmui).
+
+## VictoriaMetrics cluster
+
+The VictoriaMetrics cluster environment consists of vminsert, vmstorage and vmselect components.
+vmselect exposes port `:8481`, vminsert exposes port `:8480`, and the rest of the components
+are available only inside the environment.
+
+The communication scheme between components is the following:
+* [vmagent](#vmagent) sends scraped metrics to vminsert;
+* vminsert forwards the data to vmstorage;
+* vmselect queries vmstorage for the data;
+* [grafana](#grafana) is configured with a datasource pointing to vmselect;
+* [vmalert](#vmalert) is configured to query vmselect and to send alerts state
+  and recording rules to vminsert;
+* [alertmanager](#alertmanager) is configured to receive notifications from vmalert.
+
+To access `vmalert` via `vmselect`
+use the link [http://localhost:8481/select/0/prometheus/vmalert](http://localhost:8481/select/0/prometheus/vmalert/).
+
+To access [vmui](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#vmui)
+use the link [http://localhost:8481/select/0/prometheus/vmui](http://localhost:8481/select/0/prometheus/vmui).
+
 ## vmagent
 
 vmagent is used for scraping and pushing timeseries to
@@ -48,6 +103,11 @@ Default credential:
 
 Grafana is provisioned by default with following entities:
 
-* VictoriaMetrics datasource
-* Prometheus datasource
-* VictoriaMetrics overview dashboard
+* `VictoriaMetrics` datasource
+* `VictoriaMetrics - cluster` datasource
+* `VictoriaMetrics overview` dashboard
+* `VictoriaMetrics - cluster` dashboard
+* `VictoriaMetrics - vmagent` dashboard
+* `VictoriaMetrics - vmalert` dashboard
+
+Remember to pick the `VictoriaMetrics - cluster` datasource when viewing the `VictoriaMetrics - cluster` dashboard.
\ No newline at end of file
diff --git a/deployment/docker/alerts-cluster.yml b/deployment/docker/alerts-cluster.yml
new file mode 100644
index 0000000000..5f2587cf21
--- /dev/null
+++ b/deployment/docker/alerts-cluster.yml
@@ -0,0 +1,199 @@
+# File contains default list of alerts for VictoriaMetrics cluster.
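+# In this docker-compose environment the file is mounted into the vmalert container
+# and loaded via the `--rule=/etc/alerts/*.yml` command-line flag (see docker-compose-cluster.yml).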
+# The alerts below are just recommendations and may require some updates +# and threshold calibration according to every specific setup. +groups: + # Alerts group for VM cluster assumes that Grafana dashboard + # https://grafana.com/grafana/dashboards/11176 is installed. + # Please, update the `dashboard` annotation according to your setup. + - name: vmcluster + interval: 30s + concurrency: 2 + rules: + - alert: DiskRunsOutOfSpaceIn3Days + expr: | + vm_free_disk_space_bytes / ignoring(path) + ( + ( + rate(vm_rows_added_to_storage_total[1d]) - + ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d]) + ) + * scalar( + sum(vm_data_size_bytes{type!="indexdb"}) / + sum(vm_rows{type!="indexdb"}) + ) + ) < 3 * 24 * 3600 > 0 + for: 30m + labels: + severity: critical + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }}" + summary: "Instance {{ $labels.instance }} will run out of disk space in 3 days" + description: "Taking into account current ingestion rate, free disk space will be enough only + for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n + Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible." + + - alert: DiskRunsOutOfSpace + expr: | + sum(vm_data_size_bytes) by(instance) / + ( + sum(vm_free_disk_space_bytes) by(instance) + + sum(vm_data_size_bytes) by(instance) + ) > 0.8 + for: 30m + labels: + severity: critical + annotations: + dashboard: http://localhost:3000/d/oS7Bi_0Wz?viewPanel=110&var-instance={{ $labels.instance }}" + summary: "Instance {{ $labels.instance }} will run out of disk space soon" + description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n + Having less than 20% of free disk space could cripple merges processes and overall performance. + Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." + + - alert: RequestErrorsToAPI + expr: increase(vm_http_request_errors_total[5m]) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=52&var-instance={{ $labels.instance }}" + summary: "Too many errors served for {{ $labels.job }} path {{ $labels.path }} (instance {{ $labels.instance }})" + description: "Requests to path {{ $labels.path }} are receiving errors. + Please verify if clients are sending correct requests." + + - alert: RPCErrors + expr: | + ( + sum(increase(vm_rpc_connection_errors_total[5m])) by(job, instance) + + + sum(increase(vm_rpc_dial_errors_total[5m])) by(job, instance) + + + sum(increase(vm_rpc_handshake_errors_total[5m])) by(job, instance) + ) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=44&var-instance={{ $labels.instance }}" + summary: "Too many RPC errors for {{ $labels.job }} (instance {{ $labels.instance }})" + description: "RPC errors are interconnection errors between cluster components.\n + Possible reasons for errors are misconfiguration, overload, network blips or unreachable components." 
+ + - alert: ConcurrentFlushesHitTheLimit + expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}" + summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit" + description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n + When vmstorage constantly hits the limit it means that storage is overloaded and requires more CPU." + + - alert: TooManyLogs + expr: sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=104&var-instance={{ $labels.instance }}" + summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" + description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n + Worth to check logs for specific error messages." + + - alert: RowsRejectedOnIngestion + expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=135&var-instance={{ $labels.instance }}" + summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt" + description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the + following reason: \"{{ $labels.reason }}\"" + + - alert: TooHighChurnRate + expr: | + ( + sum(rate(vm_new_timeseries_created_total[5m])) + / + sum(rate(vm_rows_inserted_total[5m])) + ) > 0.1 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102" + summary: "Churn rate is more than 10% for the last 15m" + description: "VM constantly creates new time series.\n + This effect is known as Churn Rate.\n + High Churn Rate tightly connected with database performance and may + result in unexpected OOM's or slow queries." + + - alert: TooHighChurnRate24h + expr: | + sum(increase(vm_new_timeseries_created_total[24h])) + > + (sum(vm_cache_entries{type="storage/hour_metric_ids"})* 3) + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102" + summary: "Too high number of new series created over last 24h" + description: "The number of created new time series over last 24h is 3x times higher than + current number of active series.\n + This effect is known as Churn Rate.\n + High Churn Rate tightly connected with database performance and may + result in unexpected OOM's or slow queries." + + - alert: TooHighSlowInsertsRate + expr: | + ( + sum(rate(vm_slow_row_inserts_total[5m])) + / + sum(rate(vm_rows_inserted_total[5m])) + ) > 0.05 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=108" + summary: "Percentage of slow inserts is more than 5% for the last 15m" + description: "High rate of slow inserts may be a sign of resource exhaustion + for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series." 
+ + - alert: ProcessNearFDLimits + expr: (process_max_fds - process_open_fds) < 100 + for: 5m + labels: + severity: critical + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}" + summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" + description: "Exhausting OS file descriptors limit can cause severe degradation of the process. + Consider to increase the limit as fast as possible." + + - alert: LabelsLimitExceededOnIngestion + expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=116&var-instance={{ $labels.instance }}" + summary: "Metrics ingested to vminsert on {{ $labels.instance }} are exceeding labels limit" + description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n + This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured + correctly or that clients which send these metrics aren't misbehaving." + + - alert: VminsertVmstorageConnectionIsSaturated + expr: rate(vm_rpc_send_duration_seconds_total[5m]) > 0.9 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=139&var-instance={{ $labels.instance }}" + summary: "Connection between vminsert on {{ $labels.instance }} and vmstorage on {{ $labels.addr }} is saturated" + description: "The connection between vminsert (instance {{ $labels.instance }}) and vmstorage (instance {{ $labels.addr }}) + is saturated by more than 90% and vminsert won't be able to keep up.\n + This usually means that more vminsert or vmstorage nodes must be added to the cluster in order to increase + the total number of vminsert -> vmstorage links." diff --git a/deployment/docker/alerts-health.yml b/deployment/docker/alerts-health.yml new file mode 100644 index 0000000000..489a7035fe --- /dev/null +++ b/deployment/docker/alerts-health.yml @@ -0,0 +1,54 @@ +# File contains default list of alerts for VM components. +# The alerts below are just recommendations and may require some updates +# and threshold calibration according to every specific setup. +groups: + - name: vm-health + # note the `job` filter and update accordingly to your setup + rules: + - alert: TooManyRestarts + expr: changes(process_start_time_seconds{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2 + labels: + severity: critical + annotations: + summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" + description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. + It might be crashlooping." + + - alert: ServiceDown + expr: up{job=~"victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." 
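+
+      # The alerts below rely on `process_*` and `vm_*` self-metrics exposed by every
+      # VictoriaMetrics component on its /metrics page.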
+ + - alert: ProcessNearFDLimits + expr: (process_max_fds - process_open_fds) < 100 + for: 5m + labels: + severity: critical + annotations: + summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" + description: "Exhausting OS file descriptors limit can cause severe degradation of the process. + Consider to increase the limit as fast as possible." + + - alert: TooHighMemoryUsage + expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9 + for: 5m + labels: + severity: critical + annotations: + summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" + description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. + Consider to either increase available memory or decrease the load on the process." + + - alert: TooHighCPUUsage + expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 + for: 5m + labels: + severity: critical + annotations: + summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" + description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. + Consider to either increase available CPU resources or decrease the load on the process." \ No newline at end of file diff --git a/deployment/docker/alerts-vmagent.yml b/deployment/docker/alerts-vmagent.yml new file mode 100644 index 0000000000..0fd9b8d56d --- /dev/null +++ b/deployment/docker/alerts-vmagent.yml @@ -0,0 +1,122 @@ +# File contains default list of alerts for vmagent service. +# The alerts below are just recommendations and may require some updates +# and threshold calibration according to every specific setup. +groups: + # Alerts group for vmagent assumes that Grafana dashboard + # https://grafana.com/grafana/dashboards/12683 is installed. + # Pls update the `dashboard` annotation according to your setup. + - name: vmagent + interval: 30s + concurrency: 2 + rules: + - alert: PersistentQueueIsDroppingData + expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0 + for: 10m + labels: + severity: critical + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}" + summary: "Instance {{ $labels.instance }} is dropping data from persistent queue" + description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue + on instance {{ $labels.instance }} for the last 10m." + + - alert: RejectedRemoteWriteDataBlocksAreDropped + expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}" + summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by + remote-write server data blocks. Check the logs to find the reason for rejects." 
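+
+      # Hint: per-target scrape errors behind the TooManyScrapeErrors alert can be
+      # inspected on vmagent's /targets page (http://localhost:8429/targets in this environment).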
+ + - alert: TooManyScrapeErrors + expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}" + summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m" + + - alert: TooManyWriteErrors + expr: | + (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance) + + + sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}" + summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m." + + - alert: TooManyRemoteWriteErrors + expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}" + summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage" + description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n + Ensure that destination is up and reachable." + + - alert: RemoteWriteConnectionIsSaturated + expr: | + sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url) + > 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url) + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}" + summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated" + description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" + is saturated by more than 90% and vmagent won't be able to keep up.\n + This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase + the number of connections per each remote storage." + + - alert: PersistentQueueForWritesIsSaturated + expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}" + summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated" + description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) + are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk. + In this case, consider to decrease load on the vmagent or improve the disk throughput." + + - alert: PersistentQueueForReadsIsSaturated + expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}" + summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated" + description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) + are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk. 
+ In this case, consider to decrease load on the vmagent or improve the disk throughput." + + - alert: SeriesLimitHourReached + expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9 + labels: + severity: critical + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}" + summary: "Instance {{ $labels.instance }} reached 90% of the limit" + description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. + Then samples for new time series will be dropped instead of sending them to remote storage systems." + + - alert: SeriesLimitDayReached + expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9 + labels: + severity: critical + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}" + summary: "Instance {{ $labels.instance }} reached 90% of the limit" + description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. + Then samples for new time series will be dropped instead of sending them to remote storage systems." \ No newline at end of file diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 6f34571c32..7cf31ff8ae 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -1,60 +1,7 @@ -# File contains default list of alerts for vm-single and vmagent services. +# File contains default list of alerts for VictoriaMetrics single server. # The alerts below are just recommendations and may require some updates # and threshold calibration according to every specific setup. groups: - - name: vm-health - # note the `job` filter and update accordingly to your setup - rules: - # note the `job` filter and update accordingly to your setup - - alert: TooManyRestarts - expr: changes(process_start_time_seconds{job=~"victoriametrics|vmagent|vmalert"}[15m]) > 2 - labels: - severity: critical - annotations: - summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})" - description: "Job {{ $labels.job }} has restarted more than twice in the last 15 minutes. - It might be crashlooping." - - - alert: ServiceDown - expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0 - for: 2m - labels: - severity: critical - annotations: - summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}" - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." - - - alert: ProcessNearFDLimits - expr: (process_max_fds - process_open_fds) < 100 - for: 5m - labels: - severity: critical - annotations: - summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m" - description: "Exhausting OS file descriptors limit can cause severe degradation of the process. - Consider to increase the limit as fast as possible." - - - alert: TooHighMemoryUsage - expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9 - for: 5m - labels: - severity: critical - annotations: - summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" - description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance. - Consider to either increase available memory or decrease the load on the process." 
- - - alert: TooHighCPUUsage - expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 - for: 5m - labels: - severity: critical - annotations: - summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m" - description: "Too high CPU usage may be a sign of insufficient resources and make process unstable. - Consider to either increase available CPU resources or decrease the load on the process." - - # Alerts group for VM single assumes that Grafana dashboard # https://grafana.com/grafana/dashboards/10229 is installed. # Pls update the `dashboard` annotation according to your setup. @@ -207,123 +154,4 @@ groups: summary: "Metrics ingested in ({{ $labels.instance }}) are exceeding labels limit" description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured - correctly or that clients which send these metrics aren't misbehaving." - - # Alerts group for vmagent assumes that Grafana dashboard - # https://grafana.com/grafana/dashboards/12683 is installed. - # Pls update the `dashboard` annotation according to your setup. - - name: vmagent - interval: 30s - concurrency: 2 - rules: - - alert: PersistentQueueIsDroppingData - expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0 - for: 10m - labels: - severity: critical - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}" - summary: "Instance {{ $labels.instance }} is dropping data from persistent queue" - description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue - on instance {{ $labels.instance }} for the last 10m." - - - alert: RejectedRemoteWriteDataBlocksAreDropped - expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}" - summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by - remote-write server data blocks. Check the logs to find the reason for rejects." - - - alert: TooManyScrapeErrors - expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}" - summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m" - - - alert: TooManyWriteErrors - expr: | - (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance) - + - sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}" - summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m." 
- - - alert: TooManyRemoteWriteErrors - expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}" - summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage" - description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n - Ensure that destination is up and reachable." - - - alert: RemoteWriteConnectionIsSaturated - expr: | - sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url) - > 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url) - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}" - summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated" - description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" - is saturated by more than 90% and vmagent won't be able to keep up.\n - This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase - the number of connections per each remote storage." - - - alert: PersistentQueueForWritesIsSaturated - expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}" - summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated" - description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) - are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk. - In this case, consider to decrease load on the vmagent or improve the disk throughput." - - - alert: PersistentQueueForReadsIsSaturated - expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9 - for: 15m - labels: - severity: warning - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}" - summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated" - description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) - are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk. - In this case, consider to decrease load on the vmagent or improve the disk throughput." - - - alert: SeriesLimitHourReached - expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9 - labels: - severity: critical - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}" - summary: "Instance {{ $labels.instance }} reached 90% of the limit" - description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. - Then samples for new time series will be dropped instead of sending them to remote storage systems." 
- - - alert: SeriesLimitDayReached - expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9 - labels: - severity: critical - annotations: - dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}" - summary: "Instance {{ $labels.instance }} reached 90% of the limit" - description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. - Then samples for new time series will be dropped instead of sending them to remote storage systems." + correctly or that clients which send these metrics aren't misbehaving." \ No newline at end of file diff --git a/deployment/docker/docker-compose-cluster.yml b/deployment/docker/docker-compose-cluster.yml new file mode 100644 index 0000000000..6ead89e4af --- /dev/null +++ b/deployment/docker/docker-compose-cluster.yml @@ -0,0 +1,121 @@ +version: '3.5' +services: + vmagent: + container_name: vmagent + image: victoriametrics/vmagent:latest + depends_on: + - "vminsert" + ports: + - 8429:8429 + volumes: + - vmagentdata:/vmagentdata + - ./prometheus-cluster.yml:/etc/prometheus/prometheus.yml + command: + - '--promscrape.config=/etc/prometheus/prometheus.yml' + - '--remoteWrite.url=http://vminsert:8480/insert/0/prometheus/' + restart: always + + grafana: + container_name: grafana + image: grafana/grafana:9.1.0 + depends_on: + - "vmselect" + ports: + - 3000:3000 + restart: always + volumes: + - grafanadata:/var/lib/grafana + - ./provisioning/:/etc/grafana/provisioning/ + - ./../../dashboards/victoriametrics-cluster.json:/var/lib/grafana/dashboards/vm.json + - ./../../dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json + - ./../../dashboards/vmalert.json:/var/lib/grafana/dashboards/vmalert.json + + vmstorage-1: + container_name: vmstorage-1 + image: victoriametrics/vmstorage:latest + ports: + - 8482 + - 8400 + - 8401 + volumes: + - strgdata-1:/storage + command: + - '--storageDataPath=/storage' + restart: always + vmstorage-2: + container_name: vmstorage-2 + image: victoriametrics/vmstorage:latest + ports: + - 8482 + - 8400 + - 8401 + volumes: + - strgdata-2:/storage + command: + - '--storageDataPath=/storage' + restart: always + vminsert: + container_name: vminsert + image: victoriametrics/vminsert:latest + depends_on: + - "vmstorage-1" + - "vmstorage-2" + command: + - '--storageNode=vmstorage-1:8400' + - '--storageNode=vmstorage-2:8400' + ports: + - 8480:8480 + restart: always + vmselect: + container_name: vmselect + image: victoriametrics/vmselect:latest + depends_on: + - "vmstorage-1" + - "vmstorage-2" + command: + - '--storageNode=vmstorage-1:8401' + - '--storageNode=vmstorage-2:8401' + - '--vmalert.proxyURL=http://vmalert:8880' + ports: + - 8481:8481 + restart: always + + vmalert: + container_name: vmalert + image: victoriametrics/vmalert:latest + depends_on: + - "vmselect" + ports: + - 8880:8880 + volumes: + - ./alerts-cluster.yml:/etc/alerts/alerts.yml + - ./alerts-health.yml:/etc/alerts/alerts-health.yml + - ./alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml + command: + - '--datasource.url=http://vmselect:8481/select/0/prometheus' + - '--remoteRead.url=http://vmselect:8481/select/0/prometheus' + - '--remoteWrite.url=http://vminsert:8480/insert/0/prometheus' + - '--notifier.url=http://alertmanager:9093/' + - '--rule=/etc/alerts/*.yml' + # display source of alerts in grafana + - '-external.url=http://127.0.0.1:3000' #grafana outside container + # when copypaste the line below be aware of '$$' for escaping in '$expr' + - 
'--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":"{{$$expr|quotesEscape|crlfEscape|queryEscape}}"},{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' + restart: always + + alertmanager: + container_name: alertmanager + image: prom/alertmanager:v0.24.0 + volumes: + - ./alertmanager.yml:/config/alertmanager.yml + command: + - '--config.file=/config/alertmanager.yml' + ports: + - 9093:9093 + restart: always + +volumes: + vmagentdata: {} + strgdata-1: {} + strgdata-2: {} + grafanadata: {} \ No newline at end of file diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml index fdf31d1c62..159113d7de 100644 --- a/deployment/docker/docker-compose.yml +++ b/deployment/docker/docker-compose.yml @@ -2,7 +2,7 @@ version: "3.5" services: vmagent: container_name: vmagent - image: victoriametrics/vmagent:v1.80.0 + image: victoriametrics/vmagent:latest depends_on: - "victoriametrics" ports: @@ -18,7 +18,7 @@ services: restart: always victoriametrics: container_name: victoriametrics - image: victoriametrics/victoria-metrics:v1.80.0 + image: victoriametrics/victoria-metrics:latest ports: - 8428:8428 - 8089:8089 @@ -56,7 +56,7 @@ services: restart: always vmalert: container_name: vmalert - image: victoriametrics/vmalert:v1.80.0 + image: victoriametrics/vmalert:latest depends_on: - "victoriametrics" - "alertmanager" @@ -64,6 +64,8 @@ services: - 8880:8880 volumes: - ./alerts.yml:/etc/alerts/alerts.yml + - ./alerts-health.yml:/etc/alerts/alerts-health.yml + - ./alerts-vmagent.yml:/etc/alerts/alerts-vmagent.yml command: - "--datasource.url=http://victoriametrics:8428/" - "--remoteRead.url=http://victoriametrics:8428/" @@ -72,7 +74,8 @@ services: - "--rule=/etc/alerts/*.yml" # display source of alerts in grafana - "--external.url=http://127.0.0.1:3000" #grafana outside container - - '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":"{{$$expr|quotesEscape|crlfEscape|queryEscape}}"},{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' ## when copypaste the line be aware of '$$' for escaping in '$expr' + # when copypaste the line be aware of '$$' for escaping in '$expr' + - '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr":"{{$$expr|quotesEscape|crlfEscape|queryEscape}}"},{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]' networks: - vm_net restart: always diff --git a/deployment/docker/prometheus-cluster.yml b/deployment/docker/prometheus-cluster.yml new file mode 100644 index 0000000000..32336929be --- /dev/null +++ b/deployment/docker/prometheus-cluster.yml @@ -0,0 +1,19 @@ +global: + scrape_interval: 10s + +scrape_configs: + - job_name: 'vmagent' + static_configs: + - targets: ['vmagent:8429'] + - job_name: 'vmalert' + static_configs: + - targets: ['vmalert:8880'] + - job_name: 'vminsert' + static_configs: + - targets: ['vminsert:8480'] + - job_name: 'vmselect' + static_configs: + - targets: ['vmselect:8481'] + - job_name: 'vmstorage' + static_configs: + - targets: ['vmstorage-1:8482', 'vmstorage-2:8482'] \ No newline at end of file diff --git a/deployment/docker/provisioning/datasources/datasource.yml b/deployment/docker/provisioning/datasources/datasource.yml index d9d7b85d13..e16c273c4a 100644 --- a/deployment/docker/provisioning/datasources/datasource.yml +++ b/deployment/docker/provisioning/datasources/datasource.yml @@ -6,3 +6,9 @@ datasources: access: proxy url: http://victoriametrics:8428 isDefault: true + + - name: VictoriaMetrics - cluster + type: 
prometheus
+    access: proxy
+    url: http://vmselect:8481/select/0/prometheus
+    isDefault: false
\ No newline at end of file
diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md
index 79c666f95a..16cdcd83f0 100644
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@@ -144,7 +144,7 @@ Ports may be altered by setting `-httpListenAddr` on the corresponding nodes.
 It is recommended setting up [monitoring](#monitoring) for the cluster.
 
 The following tools can simplify cluster setup:
-- [An example docker-compose config for VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/docker-compose.yml)
+- [An example docker-compose config for VictoriaMetrics cluster](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose-cluster.yml)
 - [Helm charts for VictoriaMetrics](https://github.com/VictoriaMetrics/helm-charts)
 - [Kubernetes operator for VictoriaMetrics](https://github.com/VictoriaMetrics/operator)
 
diff --git a/docs/Quick-Start.md b/docs/Quick-Start.md
index 113bd0fbe2..ddf56efabd 100644
--- a/docs/Quick-Start.md
+++ b/docs/Quick-Start.md
@@ -58,21 +58,22 @@ There is also [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster
 
 ### Starting VM-Cluster via Docker
 
 The following commands clone the latest available
-[VictoriaMetrics cluster repository](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster)
-and start the docker container via 'docker-compose'. Further customization is possible by editing
-the [docker-compose.yaml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/deployment/docker/docker-compose.yml)
+[VictoriaMetrics repository](https://github.com/VictoriaMetrics/VictoriaMetrics)
+and start the containers via 'make docker-cluster-up'. Further customization is possible by editing
+the [docker-compose-cluster.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/docker-compose-cluster.yml)
 file.
 ```console
-git clone https://github.com/VictoriaMetrics/VictoriaMetrics --branch cluster &&
-cd VictoriaMetrics/deployment/docker &&
-docker-compose up
+git clone https://github.com/VictoriaMetrics/VictoriaMetrics &&
+cd VictoriaMetrics &&
+make docker-cluster-up
 ```
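+
+To verify that the cluster has started, query the health endpoints of vminsert and vmselect
+(a quick smoke test which assumes the default port mappings from `docker-compose-cluster.yml`):
+
+```console
+curl http://localhost:8480/health
+curl http://localhost:8481/health
+```
+
+Both endpoints should respond with `OK`.
+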
+See more details [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#readme). + * [Cluster setup](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-setup) ## Write data