From c0ec5415595890a17c3bb3c01ef67b7ee16575ce Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 13 May 2021 09:26:20 +0300 Subject: [PATCH] lib/cgroup: document the ability to detect cgroup v2 memory and cpu limits. This is a follow-up for b50024812e496d6b4f2e640b12e2c49cdd2a6d31 --- docs/CHANGELOG.md | 1 + lib/cgroup/cpu.go | 25 ++++++++++++++----------- lib/cgroup/cpu_test.go | 6 +++--- lib/cgroup/mem.go | 9 ++++----- lib/cgroup/util.go | 6 +++--- 5 files changed, 25 insertions(+), 22 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index f21ccd808..f53a7c8d4 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -10,6 +10,7 @@ sort: 15 * FEATURE: return `X-Server-Hostname` header in http responses of all the VictoriaMetrics components. This should simplify tracing the origin server behind a load balancer or behind auth proxy during troubleshooting. * FEATURE: vmselect: allow to use 2x more memory for query processing at `vmselect` nodes in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html). This should allow processing heavy queries without the need to increase RAM size at `vmselect` nodes. * FEATURE: add ability to filter `/api/v1/status/tsdb` output with arbitrary [time series selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors) passed via `match[]` query args. See [these docs](https://docs.victoriametrics.com/#tsdb-stats) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1168) for details. +* FEATURE: automatically detect memory and cpu limits for VictoriaMetrics components running under [cgroup v2](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html) environments such as [HashiCorp Nomad](https://www.nomadproject.io/). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1269). 
* BUGFIX: vmagent: fix possible race when refreshing `role: endpoints` and `role: endpointslices` scrape targets in `kubernetes_sd_config`. Prevoiusly `pod` objects could be updated after the related `endpoints` object update. This could lead to missing scrape targets. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240). * BUGFIX: properly remove stale parts outside the configured retention if `-retentionPeriod` is smaller than one month. Previously stale parts could remain active for up to a month after they go outside the retention. diff --git a/lib/cgroup/cpu.go b/lib/cgroup/cpu.go index 8c3972d50..b151aa925 100644 --- a/lib/cgroup/cpu.go +++ b/lib/cgroup/cpu.go @@ -42,11 +42,10 @@ func updateGOMAXPROCSToCPUQuota() { } func getCPUQuota() float64 { - cpuQuota, err := getCPUStatGeneric() + cpuQuota, err := getCPUQuotaGeneric() if err != nil { return 0 } - if cpuQuota <= 0 { // The quota isn't set. This may be the case in multilevel containers. // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685#issuecomment-674423728 @@ -55,7 +54,7 @@ func getCPUQuota() float64 { return cpuQuota } -func getCPUStatGeneric() (float64, error) { +func getCPUQuotaGeneric() (float64, error) { quotaUS, err := getCPUStat("cpu.cfs_quota_us") if err == nil { periodUS, err := getCPUStat("cpu.cfs_period_us") @@ -63,7 +62,7 @@ func getCPUStatGeneric() (float64, error) { return float64(quotaUS) / float64(periodUS), nil } } - return getCPUStatV2("/sys/fs/cgroup", "/proc/self/cgroup") + return getCPUQuotaV2("/sys/fs/cgroup", "/proc/self/cgroup") } func getCPUStat(statName string) (int64, error) { @@ -83,31 +82,35 @@ func getOnlineCPUCount() float64 { return n } -func getCPUStatV2(sysPrefix, cgroupPath string) (float64, error) { +func getCPUQuotaV2(sysPrefix, cgroupPath string) (float64, error) { data, err := getFileContents("cpu.max", sysPrefix, cgroupPath, "") if err != nil { return 0, err } - return parseCPUMax(data) + data = strings.TrimSpace(data) 
+ n, err := parseCPUMax(data) + if err != nil { + return 0, fmt.Errorf("cannot parse cpu.max file contents: %w", err) + } + return n, nil } -// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#cpu +// See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#cpu func parseCPUMax(data string) (float64, error) { - data = strings.TrimRight(data, "\r\n") bounds := strings.Split(data, " ") if len(bounds) != 2 { - return 0, fmt.Errorf("unexpected count: %d, want quota and period, got: %s", len(bounds), data) + return 0, fmt.Errorf("unexpected line format: want 'quota period'; got: %s", data) } if bounds[0] == "max" { return -1, nil } quota, err := strconv.ParseUint(bounds[0], 10, 64) if err != nil { - return 0, err + return 0, fmt.Errorf("cannot parse quota: %w", err) } period, err := strconv.ParseUint(bounds[1], 10, 64) if err != nil { - return 0, err + return 0, fmt.Errorf("cannot parse period: %w", err) } return float64(quota) / float64(period), nil } diff --git a/lib/cgroup/cpu_test.go b/lib/cgroup/cpu_test.go index 20084b305..a413e3187 100644 --- a/lib/cgroup/cpu_test.go +++ b/lib/cgroup/cpu_test.go @@ -23,15 +23,15 @@ func TestCountCPUs(t *testing.T) { f("0-6", 7) } -func TestGetCPUStatV2(t *testing.T) { +func TestGetCPUQuotaV2(t *testing.T) { f := func(sysPrefix, cgroupPath string, expectedCPU float64) { t.Helper() - got, err := getCPUStatV2(sysPrefix, cgroupPath) + got, err := getCPUQuotaV2(sysPrefix, cgroupPath) if err != nil { t.Fatalf("unexpected error: %s, sysPrefix: %s, cgroupPath: %s", err, sysPrefix, cgroupPath) } if got != expectedCPU { - t.Fatalf("unexpected result from getCPUStatV2(%s, %s), got %f, want %f", sysPrefix, cgroupPath, got, expectedCPU) + t.Fatalf("unexpected result from getCPUQuotaV2(%s, %s), got %f, want %f", sysPrefix, cgroupPath, got, expectedCPU) } } f("testdata/cgroup", "testdata/self/cgroupv2", 2) diff --git a/lib/cgroup/mem.go b/lib/cgroup/mem.go index e91f8145b..c4641a4b8 100644 --- a/lib/cgroup/mem.go 
+++ b/lib/cgroup/mem.go @@ -16,17 +16,16 @@ func GetMemoryLimit() int64 { if err == nil { return n } - // https: //www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files - n, err = getMemStatV2() + n, err = getMemStatV2("memory.max") if err != nil { return 0 } - return n } -func getMemStatV2() (int64, error) { - return getStatGeneric("memory.max", "/sys/fs/cgroup", "/proc/self/cgroup", "") +func getMemStatV2(statName string) (int64, error) { + // See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files + return getStatGeneric(statName, "/sys/fs/cgroup", "/proc/self/cgroup", "") } func getMemStat(statName string) (int64, error) { diff --git a/lib/cgroup/util.go b/lib/cgroup/util.go index 66960adb7..422204bd2 100644 --- a/lib/cgroup/util.go +++ b/lib/cgroup/util.go @@ -13,10 +13,10 @@ func getStatGeneric(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (i if err != nil { return 0, err } - data = strings.TrimRight(data, "\r\n") + data = strings.TrimSpace(data) n, err := strconv.ParseInt(data, 10, 64) if err != nil { - return 0, err + return 0, fmt.Errorf("cannot parse %q: %w", cgroupPath, err) } return n, nil } @@ -33,7 +33,7 @@ func getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) ( } subPath, err := grepFirstMatch(string(cgroupData), cgroupGrepLine, 2, ":") if err != nil { - return "", err + return "", fmt.Errorf("cannot find cgroup path for %q in %q: %w", cgroupGrepLine, cgroupPath, err) } filepath = path.Join(sysfsPrefix, subPath, statName) data, err = ioutil.ReadFile(filepath)