Merge branch 'public-single-node' into pmm-6401-read-prometheus-data-files

2024-12-11 14:53:49 +00:00 · 2023-02-11 12:09:55 -08:00 · 2023-02-11 12:09:55 -08:00 · a38bf70679
commit a38bf70679
parent 7b41c9ac72 3ec8a4dc80
5 changed files with 54 additions and 17 deletions
--- a/dashboards/victoriametrics-cluster.json
+++ b/dashboards/victoriametrics-cluster.json
@ -4642,12 +4642,12 @@
              },
              "editorMode": "code",
              "exemplar": false,
-              "expr": "avg(\n    rate(process_cpu_seconds_total{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_storage\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    rate(process_cpu_seconds_total{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_storage\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -4788,12 +4788,12 @@
              },
              "editorMode": "code",
              "exemplar": true,
-              "expr": "avg(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -5381,11 +5381,11 @@
                "uid": "$ds"
              },
              "editorMode": "code",
-              "expr": "avg(\n    sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n    (\n        sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n        sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n    ) \n)",
+              "expr": "median(\n    sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n    (\n        sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n        sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n    ) \n)",
              "format": "time_series",
              "hide": false,
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -6255,12 +6255,12 @@
              },
              "editorMode": "code",
              "exemplar": true,
-              "expr": "avg(\n    rate(process_cpu_seconds_total{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_select\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    rate(process_cpu_seconds_total{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_select\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -6399,12 +6399,12 @@
              },
              "editorMode": "code",
              "exemplar": true,
-              "expr": "avg(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -7364,12 +7364,12 @@
              },
              "editorMode": "code",
              "exemplar": true,
-              "expr": "avg(\n    rate(process_cpu_seconds_total{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_insert\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    rate(process_cpu_seconds_total{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    process_cpu_cores_available{job=~\"$job_insert\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
@ -7508,12 +7508,12 @@
              },
              "editorMode": "code",
              "exemplar": true,
-              "expr": "avg(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}\n)",
+              "expr": "median(\n    max_over_time(process_resident_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n    /\n    vm_available_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}\n)",
              "format": "time_series",
              "hide": false,
              "interval": "",
              "intervalFactor": 1,
-              "legendFormat": "avg",
+              "legendFormat": "median",
              "range": true,
              "refId": "C"
            }
--- a/docs/Articles.md
+++ b/docs/Articles.md
@ -64,6 +64,7 @@ See also [case studies](https://docs.victoriametrics.com/CaseStudies.html).
 * [Brewblox: InfluxDB to Victoria Metrics](https://www.brewblox.com/dev/decisions/20210718_victoria_metrics.html)
 * [VictoriaMetrics static scraper](https://blog.differentpla.net/blog/2022/10/16/victoria-metrics-static-scraper/)
 * [VictoriaMetrics and Open Cosmos boldly takes edge computing to the edge of space](https://www.iot-now.com/2022/07/19/122423-victoriametrics-and-open-cosmos-boldly-takes-edge-computing-to-the-edge-of-space/)
+* [Evaluating Backend Options For Prometheus Metrics](https://www.techetio.com/2022/08/21/evaluating-backend-options-for-prometheus-metrics/)

 ## Our articles

--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -23,6 +23,8 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * FEATURE: [vmalert enterprise](https://docs.victoriametrics.com/vmalert.html): add ability to read alerting and recording rules from S3, GCS or S3-compatible object storage. See [these docs](https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage).
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `mad_over_time(m[d])` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over raw samples on the lookbehind window `d`. See [this feature request](https://github.com/prometheus/prometheus/issues/5514).

+* BUGFIX: prevent possible data ingestion and query performance slowdowns during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with a small number of CPU cores (1 or 2 CPU cores). The issue was introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).
+
 ## [v1.87.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.1)

 Released at 2023-02-09
--- a/lib/mergeset/table.go
+++ b/lib/mergeset/table.go
@ -788,7 +788,18 @@ func (tb *Table) notifyBackgroundMergers() bool {
 	}
 }

-var flushConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
+var flushConcurrencyLimit = func() int {
+	n := cgroup.AvailableCPUs()
+	if n < 2 {
+		// Allow at least 2 concurrent flushers on systems with a single CPU core
+		// in order to guarantee that in-memory data flushes and background merges can continue
+		// while one flusher is busy with a long merge.
+		n = 2
+	}
+	return n
+}()
+
+var flushConcurrencyCh = make(chan struct{}, flushConcurrencyLimit)

 func needAssistedMerge(pws []*partWrapper, maxParts int) bool {
 	if len(pws) < maxParts {
--- a/lib/storage/partition.go
+++ b/lib/storage/partition.go
@ -615,7 +615,19 @@ func (pt *partition) notifyBackgroundMergers() bool {
 	}
 }

-var flushConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
+var flushConcurrencyLimit = func() int {
+	n := cgroup.AvailableCPUs()
+	if n < 3 {
+		// Allow at least 3 concurrent flushers on systems with fewer than 3 CPU cores
+		// in order to guarantee that in-memory data flushes and background merges can continue
+		// when one flusher is busy with a long merge of big parts
+		// while another flusher is busy with a long merge of small parts.
+		n = 3
+	}
+	return n
+}()
+
+var flushConcurrencyCh = make(chan struct{}, flushConcurrencyLimit)

 func needAssistedMerge(pws []*partWrapper, maxParts int) bool {
 	if len(pws) < maxParts {
@ -1007,7 +1019,7 @@ func hasActiveMerges(pws []*partWrapper) bool {
 	return false
 }

-var mergeWorkersLimitCh = make(chan struct{}, getDefaultMergeConcurrency(16))
+var mergeWorkersLimitCh = make(chan struct{}, adjustMergeWorkersLimit(getDefaultMergeConcurrency(16)))

 var bigMergeWorkersLimitCh = make(chan struct{}, getDefaultMergeConcurrency(4))

@ -1038,9 +1050,20 @@ func SetMergeWorkersCount(n int) {
 		// Do nothing
 		return
 	}
+	n = adjustMergeWorkersLimit(n)
 	mergeWorkersLimitCh = make(chan struct{}, n)
 }

+func adjustMergeWorkersLimit(n int) int {
+	if n < 2 {
+		// Allow at least 2 merge workers even when a lower limit is requested
+		// (e.g. on systems with a single CPU core) in order to guarantee that
+		// background merges can continue while one worker is busy with a long merge of big parts.
+		return 2
+	}
+	return n
+}
+
 func (pt *partition) startMergeWorkers() {
 	// Start a merge worker per available CPU core.
 	// The actual number of concurrent merges is limited inside mergeWorker() below.