lib/cgroup: expose process_cpu_cores_available metric

This metric shows the number of CPU cores available to the process. This allows creating alerting rules on CPU saturation with the following query: `rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9`. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2107
2024-11-21 14:44:00 +00:00 · 2022-01-31 20:07:50 +02:00 · 2022-01-31 20:07:50 +02:00 · ead66155ef
commit ead66155ef
parent e7f1ceeb84
2 changed files with 16 additions and 8 deletions
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -12,6 +12,7 @@ sort: 15
  * Multi-level binary operations. For example, `foo{a="b"} + bar{x="y"} + baz{z="q"}` is now optimized to `foo{a="b",x="y",z="q"} + bar{a="b",x="y",z="q"} + baz{a="b",x="y",z="q"}`
  * Aggregate functions. For example, `sum(foo{a="b"}) by (c) + bar{c="d"}` is now optimized to `sum(foo{a="b",c="d"}) by (c) + bar{c="d"}`
 * FEATURE [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): optimize joining with `*_info` labels. For example: `kube_pod_created{namespace="prod"} * on (uid) group_left(node) kube_pod_info` now automatically adds the needed filters on `uid` label to `kube_pod_info` before selecting series for the right side of `*` operation. This may save CPU, RAM and disk IO resources. See [this article](https://www.robustperception.io/exposing-the-software-version-to-prometheus) for details on `*_info` labels.
 * FEATURE: all: expose `process_cpu_cores_available` metric, which shows the number of CPU cores available to the app. The number can be fractional if the corresponding cgroup limit is set to a fractional value. This metric is useful for alerting on CPU saturation. For example, the following query alerts when the app uses more than 90% of CPU during the last 5 minutes: `rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2107).
 * BUGFIX: return proper results from `highestMax()` function at [Graphite render API](https://docs.victoriametrics.com/#graphite-render-api-usage). Previously it was incorrectly returning timeseries with min peaks instead of max peaks.
 * BUGFIX: properly limit indexdb cache sizes. Previously they could exceed values set via `-memory.allowedPercent` and/or `-memory.allowedBytes` when `indexdb` contained many data parts. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2007).
--- a/lib/cgroup/cpu.go
+++ b/lib/cgroup/cpu.go
@ -7,33 +7,40 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
 	"github.com/VictoriaMetrics/metrics"
 )
 // AvailableCPUs returns the number of available CPU cores for the app.
 //
 // The number is rounded to the next integer value if fractional number of CPU cores are available.
 func AvailableCPUs() int {
 	return runtime.GOMAXPROCS(-1)
 }
 func init() {
-	updateGOMAXPROCSToCPUQuota()
+	cpuCoresAvailable := getCPUQuota()
 	updateGOMAXPROCSToCPUQuota(cpuCoresAvailable)
 	metrics.NewGauge(`process_cpu_cores_available`, func() float64 {
 		return cpuCoresAvailable
 	})
 }
-// updateGOMAXPROCSToCPUQuota updates GOMAXPROCS to cgroup CPU quota if GOMAXPROCS isn't set in environment var.
+// updateGOMAXPROCSToCPUQuota updates GOMAXPROCS to cpuCoresAvailable if GOMAXPROCS isn't set in environment var.
-func updateGOMAXPROCSToCPUQuota() {
+func updateGOMAXPROCSToCPUQuota(cpuCoresAvailable float64) {
 	if v := os.Getenv("GOMAXPROCS"); v != "" {
 		// Do not override explicitly set GOMAXPROCS.
 		return
 	}
-	q := getCPUQuota()
+	if cpuCoresAvailable <= 0 {
-	if q <= 0 {
+		// Do not change GOMAXPROCS if cpuCoresAvailable is incorrectly set.
 		// Do not change GOMAXPROCS
 		return
 	}
-	gomaxprocs := int(q + 0.5)
+	gomaxprocs := int(cpuCoresAvailable + 0.5)
 	numCPU := runtime.NumCPU()
 	if gomaxprocs > numCPU {
 		// There is no sense in setting more GOMAXPROCS than the number of available CPU cores.
-		return
+		gomaxprocs = numCPU
 	}
 	if gomaxprocs <= 0 {
 		gomaxprocs = 1