diff --git a/dashboards/victoriametrics-cluster.json b/dashboards/victoriametrics-cluster.json index cd3ba2d23..5e05b3c8e 100644 --- a/dashboards/victoriametrics-cluster.json +++ b/dashboards/victoriametrics-cluster.json @@ -4642,12 +4642,12 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_storage\", instance=~\"$instance\"}\n)", + "expr": "median(\n rate(process_cpu_seconds_total{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_storage\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -4788,12 +4788,12 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}\n)", + "expr": "median(\n max_over_time(process_resident_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_storage\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -5381,11 +5381,11 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "avg(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)", + "expr": "median(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n 
sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -6255,12 +6255,12 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_select\", instance=~\"$instance\"}\n)", + "expr": "median(\n rate(process_cpu_seconds_total{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_select\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -6399,12 +6399,12 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}\n)", + "expr": "median(\n max_over_time(process_resident_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_select\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -7364,12 +7364,12 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_insert\", instance=~\"$instance\"}\n)", + "expr": "median(\n rate(process_cpu_seconds_total{job=~\"$job_insert\", 
instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job_insert\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } @@ -7508,12 +7508,12 @@ }, "editorMode": "code", "exemplar": true, - "expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}\n)", + "expr": "median(\n max_over_time(process_resident_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job_insert\", instance=~\"$instance\"}\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", + "legendFormat": "median", "range": true, "refId": "C" } diff --git a/docs/Articles.md b/docs/Articles.md index 028ef3710..88ab8083d 100644 --- a/docs/Articles.md +++ b/docs/Articles.md @@ -64,6 +64,7 @@ See also [case studies](https://docs.victoriametrics.com/CaseStudies.html). 
* [Brewblox: InfluxDB to Victoria Metrics](https://www.brewblox.com/dev/decisions/20210718_victoria_metrics.html) * [VictoriaMetrics static scraper](https://blog.differentpla.net/blog/2022/10/16/victoria-metrics-static-scraper/) * [VictoriaMetrics and Open Cosmos boldly takes edge computing to the edge of space](https://www.iot-now.com/2022/07/19/122423-victoriametrics-and-open-cosmos-boldly-takes-edge-computing-to-the-edge-of-space/) +* [Evaluating Backend Options For Prometheus Metrics](https://www.techetio.com/2022/08/21/evaluating-backend-options-for-prometheus-metrics/) ## Our articles diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 9b08c5b55..1d259ede2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -23,6 +23,8 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: [vmalert enterprise](https://docs.victoriametrics.com/vmalert.html): add ability to read alerting and recording rules from S3, GCS or S3-compatible object storage. See [these docs](https://docs.victoriametrics.com/vmalert.html#reading-rules-from-object-storage). * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `mad_over_time(m[d])` function for calculating the [median absolute deviation](https://en.wikipedia.org/wiki/Median_absolute_deviation) over raw samples on the lookbehind window `d`. See [this feature request](https://github.com/prometheus/prometheus/issues/5514). +* BUGFIX: prevent possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with a small number of CPU cores (1 or 2 CPU cores). The issue was introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).
+ ## [v1.87.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.1) Released at 2023-02-09 diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index ef403174e..9ca34650b 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -788,7 +788,18 @@ func (tb *Table) notifyBackgroundMergers() bool { } } -var flushConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs()) +var flushConcurrencyLimit = func() int { + n := cgroup.AvailableCPUs() + if n < 2 { + // Allow at least 2 concurrent flushers on systems with a single CPU core + // in order to guarantee that in-memory data flushes and background merges can be continued + // when a single flusher is busy with the long merge. + n = 2 + } + return n +}() + +var flushConcurrencyCh = make(chan struct{}, flushConcurrencyLimit) func needAssistedMerge(pws []*partWrapper, maxParts int) bool { if len(pws) < maxParts { diff --git a/lib/storage/partition.go b/lib/storage/partition.go index 305e59fd1..fafd110ee 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -615,7 +615,19 @@ func (pt *partition) notifyBackgroundMergers() bool { } } -var flushConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs()) +var flushConcurrencyLimit = func() int { + n := cgroup.AvailableCPUs() + if n < 3 { + // Allow at least 3 concurrent flushers on systems with 1 or 2 CPU cores + // in order to guarantee that in-memory data flushes and background merges can be continued + // when a single flusher is busy with the long merge of big parts, + // while another flusher is busy with the long merge of small parts.
+ n = 3 + } + return n +}() + +var flushConcurrencyCh = make(chan struct{}, flushConcurrencyLimit) func needAssistedMerge(pws []*partWrapper, maxParts int) bool { if len(pws) < maxParts { @@ -1007,7 +1019,7 @@ func hasActiveMerges(pws []*partWrapper) bool { return false } -var mergeWorkersLimitCh = make(chan struct{}, getDefaultMergeConcurrency(16)) +var mergeWorkersLimitCh = make(chan struct{}, adjustMergeWorkersLimit(getDefaultMergeConcurrency(16))) var bigMergeWorkersLimitCh = make(chan struct{}, getDefaultMergeConcurrency(4)) @@ -1038,9 +1050,20 @@ func SetMergeWorkersCount(n int) { // Do nothing return } + n = adjustMergeWorkersLimit(n) mergeWorkersLimitCh = make(chan struct{}, n) } +func adjustMergeWorkersLimit(n int) int { + if n < 2 { + // Allow at least 2 merge workers on systems with a single CPU core + // in order to guarantee that background merges can be continued + // when a single worker is busy with the long merge of big parts. + return 2 + } + return n +} + func (pt *partition) startMergeWorkers() { // Start a merge worker per available CPU core. // The actual number of concurrent merges is limited inside mergeWorker() below.