From 4be4645142e277015f73ed8434a26859eb820426 Mon Sep 17 00:00:00 2001 From: Zakhar Bessarab Date: Wed, 21 Dec 2022 02:12:04 +0400 Subject: [PATCH] app/vmbackupmanager: add metrics for better observability (#488) * app/vmbackupmanager: add metrics for better observability, include more information to `/api/v1/backups` API call response * app/vmbackupmanager: drop old metrics before creating new ones * app/vmbackupmanager: use `_total` postfix for counter metrics * app/vmbackupmanager: remove `_total` postfix for gauge-like metrics * app/vmbackupmanager: add `_last_run_failed` metrics for backups and retention * app/vmbackupmanager: address review feedback * app/vmbackupmanager: fix metric name * app/vmbackupmanager: address review feedback, remove background updates of metrics, add restoring state of `_last_run_failed` metric from remote storage * app/vmbackupmanager: improve performance for backup size calculation * app/vmbackupmanager: refactor backup and retention runs to deduplicate each run logic * {app/vmbackupmanager,lib/formatutil}: move HumanizeBytes into lib package * app/vmbackupmanager: fix creating new metrics instead of reusing existing ones * lib/formatutil: add comment to make linter happy * app/vmbackupmanager: address review feedback --- app/vmalert/templates/template.go | 14 ++------- app/vmalert/templates/template_test.go | 26 ++++++++++++++++ app/vmbackupmanager/README.md | 4 +-- docs/vmbackupmanager.md | 41 ++++++++++++++++++++++++-- lib/backup/actions/backup.go | 8 +++++ lib/formatutil/human.go | 19 ++++++++++++ 6 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 lib/formatutil/human.go diff --git a/app/vmalert/templates/template.go b/app/vmalert/templates/template.go index faab29a94..c932b5777 100644 --- a/app/vmalert/templates/template.go +++ b/app/vmalert/templates/template.go @@ -27,11 +27,11 @@ import ( "strconv" "strings" "sync" + textTpl "text/template" "time" - textTpl "text/template" - 
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/formatutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" ) @@ -350,15 +350,7 @@ func templateFuncs() textTpl.FuncMap { if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) { return fmt.Sprintf("%.4g", v), nil } - prefix := "" - for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} { - if math.Abs(v) < 1024 { - break - } - prefix = p - v /= 1024 - } - return fmt.Sprintf("%.4g%s", v, prefix), nil + return formatutil.HumanizeBytes(v), nil }, // humanizeDuration converts given seconds to a human-readable duration diff --git a/app/vmalert/templates/template_test.go b/app/vmalert/templates/template_test.go index 1dd3adfbd..d5378a434 100644 --- a/app/vmalert/templates/template_test.go +++ b/app/vmalert/templates/template_test.go @@ -1,6 +1,7 @@ package templates import ( + "math" "strings" "testing" textTpl "text/template" @@ -50,6 +51,31 @@ func TestTemplateFuncs(t *testing.T) { if !ok { t.Fatalf("unexpected mismatch") } + + formatting := func(funcName string, p interface{}, resultExpected string) { + t.Helper() + v := funcs[funcName] + fLocal := v.(func(s interface{}) (string, error)) + result, err := fLocal(p) + if err != nil { + t.Fatalf("unexpected error for %s(%f): %s", funcName, p, err) + } + if result != resultExpected { + t.Fatalf("unexpected result for %s(%f); got\n%s\nwant\n%s", funcName, p, result, resultExpected) + } + } + formatting("humanize1024", float64(0), "0") + formatting("humanize1024", math.Inf(0), "+Inf") + formatting("humanize1024", math.NaN(), "NaN") + formatting("humanize1024", float64(127087), "124.1ki") + formatting("humanize1024", float64(130137088), "124.1Mi") + formatting("humanize1024", float64(133260378112), "124.1Gi") + formatting("humanize1024", float64(136458627186688), "124.1Ti") + formatting("humanize1024", float64(139733634239168512), "124.1Pi") + formatting("humanize1024", 
float64(143087241460908556288), "124.1Ei") + formatting("humanize1024", float64(146521335255970361638912), "124.1Zi") + formatting("humanize1024", float64(150037847302113650318245888), "124.1Yi") + formatting("humanize1024", float64(153638755637364377925883789312), "1.271e+05Yi") } func mkTemplate(current, replacement interface{}) textTemplate { diff --git a/app/vmbackupmanager/README.md b/app/vmbackupmanager/README.md index 6297e1a26..499c06397 100644 --- a/app/vmbackupmanager/README.md +++ b/app/vmbackupmanager/README.md @@ -158,7 +158,7 @@ The result on the GCS bucket. We see only 3 daily backups: * GET `/api/v1/backups` - returns list of backups in remote storage. Example output: ```json - ["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"] + [{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}] ``` * POST `/api/v1/restore` - saves backup name to restore when [performing restore](#restore-commands). 
@@ -211,7 +211,7 @@ It can be changed by using flag: `vmbackupmanager backup list` lists backups in remote storage: ```console $ ./vmbackupmanager backup list -["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"] +[{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}] ``` ### Restore commands diff --git a/docs/vmbackupmanager.md b/docs/vmbackupmanager.md index 57b39258b..faa601627 100644 --- a/docs/vmbackupmanager.md +++ b/docs/vmbackupmanager.md @@ -162,7 +162,7 @@ The result on the GCS bucket. We see only 3 daily backups: * GET `/api/v1/backups` - returns list of backups in remote storage. Example output: ```json - ["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"] + [{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}] ``` * POST `/api/v1/restore` - saves backup name to restore when [performing restore](#restore-commands). 
@@ -215,7 +215,7 @@ It can be changed by using flag: `vmbackupmanager backup list` lists backups in remote storage: ```console $ ./vmbackupmanager backup list -["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"] +[{"name":"daily/2022-11-30","size_bytes":26664689,"size":"25.429Mi"},{"name":"daily/2022-12-01","size_bytes":40160965,"size":"38.300Mi"},{"name":"hourly/2022-11-30:12","size_bytes":5846529,"size":"5.576Mi"},{"name":"hourly/2022-11-30:13","size_bytes":17651847,"size":"16.834Mi"},{"name":"hourly/2022-11-30:13:22","size_bytes":8797831,"size":"8.390Mi"},{"name":"hourly/2022-11-30:14","size_bytes":10680454,"size":"10.186Mi"}] ``` ### Restore commands @@ -274,7 +274,15 @@ If restore mark doesn't exist at `storageDataPath`(restore wasn't requested) `vm ### How to restore in Kubernetes -1. Enter container running `vmbackupmanager` +1. Ensure there is an init container with `vmbackupmanager restore` in `vmstorage` or `vmsingle` pod. + For [VictoriaMetrics operator](https://docs.victoriametrics.com/operator/VictoriaMetrics-Operator.html) deployments it is required to add: + ```yaml + vmbackup: + restore: + onStart: "true" + ``` + See operator `VMStorage` schema [here](https://docs.victoriametrics.com/operator/api.html#vmstorage) and `VMSingle` [here](https://docs.victoriametrics.com/operator/api.html#vmsinglespec). +2. Enter container running `vmbackupmanager` 2. Use `vmbackupmanager backup list` to get list of available backups: ```console $ /vmbackupmanager-prod backup list @@ -291,6 +299,33 @@ If restore mark doesn't exist at `storageDataPath`(restore wasn't requested) `vm ``` 4. Restart pod +#### Restore cluster into another cluster + +These steps are assuming that [VictoriaMetrics operator](https://docs.victoriametrics.com/operator/VictoriaMetrics-Operator.html) is used to manage `VMCluster`. 
+Clusters here are referred to as `source` and `destination`. + +1. Create a new cluster with access to *source* cluster `vmbackupmanager` storage and the same number of storage nodes. + Add the following section in order to enable restore on start (operator `VMStorage` schema can be found [here](https://docs.victoriametrics.com/operator/api.html#vmstorage)): + ```yaml + vmbackup: + restore: + onStart: "true" + ``` + Note: it is safe to leave this section in the cluster configuration, since it will be ignored if restore mark doesn't exist. + > Important! Use a different `-dst` for *destination* cluster to avoid overwriting backup data of the *source* cluster. +2. Enter container running `vmbackupmanager` in *source* cluster +3. Use `vmbackupmanager backup list` to get list of available backups: + ```console + $ /vmbackupmanager-prod backup list + ["daily/2022-10-06","daily/2022-10-10","hourly/2022-10-04:13","hourly/2022-10-06:12","hourly/2022-10-06:13","hourly/2022-10-10:14","hourly/2022-10-10:16","monthly/2022-10","weekly/2022-40","weekly/2022-41"] + ``` +4. Use `vmbackupmanager restore create` to create restore mark at each pod of the *destination* cluster. + Each pod in *destination* cluster should be restored from backup of respective pod in *source* cluster. + For example: `vmstorage-destination-0` in *destination* cluster should be restored from `vmstorage-source-0` in *source* cluster. 
+ ```console + $ /vmbackupmanager-prod restore create s3://source_cluster/vmstorage-source-0/daily/2022-10-06 + ``` + ## Configuration ### Flags diff --git a/lib/backup/actions/backup.go b/lib/backup/actions/backup.go index 30db9f73b..984d68f5f 100644 --- a/lib/backup/actions/backup.go +++ b/lib/backup/actions/backup.go @@ -11,6 +11,12 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fsnil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/metrics" +) + +var ( + bytesUploadedTotal = uint64(0) + bytesUploadedTotalMetric = metrics.NewCounter(`vm_backups_uploaded_bytes_total`) ) // Backup performs backup according to the provided settings. @@ -163,6 +169,8 @@ func runBackup(src *fslocal.FS, dst common.RemoteFS, origin common.OriginFS, con n := atomic.LoadUint64(&bytesUploaded) logger.Infof("uploaded %d out of %d bytes from src %s to dst %s in %s", n, uploadSize, src, dst, elapsed) }) + atomic.AddUint64(&bytesUploadedTotal, bytesUploaded) + bytesUploadedTotalMetric.Set(bytesUploadedTotal) if err != nil { return err } diff --git a/lib/formatutil/human.go b/lib/formatutil/human.go new file mode 100644 index 000000000..1de82a7e9 --- /dev/null +++ b/lib/formatutil/human.go @@ -0,0 +1,19 @@ +package formatutil + +import ( + "fmt" + "math" +) + +// HumanizeBytes returns human-readable representation of size in bytes with 1024 base. +func HumanizeBytes(size float64) string { + prefix := "" + for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} { + if math.Abs(size) < 1024 { + break + } + prefix = p + size /= 1024 + } + return fmt.Sprintf("%.4g%s", size, prefix) +}