diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 777d16a07..945b46ae0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: make ${{ matrix.scenario}} - name: Publish coverage - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v3 with: file: ./coverage.txt diff --git a/app/vmauth/auth_config.go b/app/vmauth/auth_config.go index 43b40bc94..fbb9bb414 100644 --- a/app/vmauth/auth_config.go +++ b/app/vmauth/auth_config.go @@ -1,6 +1,7 @@ package main import ( + "bytes" "encoding/base64" "flag" "fmt" @@ -290,6 +291,13 @@ func (sp *SrcPath) MarshalYAML() (interface{}, error) { return sp.sOriginal, nil } +var ( + configReloads = metrics.NewCounter(`vmauth_config_last_reload_total`) + configReloadErrors = metrics.NewCounter(`vmauth_config_last_reload_errors_total`) + configSuccess = metrics.NewCounter(`vmauth_config_last_reload_successful`) + configTimestamp = metrics.NewCounter(`vmauth_config_last_reload_success_timestamp_seconds`) +) + func initAuthConfig() { if len(*authConfigPath) == 0 { logger.Fatalf("missing required `-auth.config` command-line flag") @@ -300,11 +308,14 @@ func initAuthConfig() { // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240 sighupCh := procutil.NewSighupChan() - err := loadAuthConfig() + _, err := loadAuthConfig() if err != nil { logger.Fatalf("cannot load auth config: %s", err) } + configSuccess.Set(1) + configTimestamp.Set(fasttime.UnixTimestamp()) + stopCh = make(chan struct{}) authConfigWG.Add(1) go func() { @@ -327,52 +338,75 @@ func authConfigReloader(sighupCh <-chan os.Signal) { refreshCh = ticker.C } + updateFn := func() { + configReloads.Inc() + updated, err := loadAuthConfig() + if err != nil { + logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err) + configSuccess.Set(0) + configReloadErrors.Inc() + return + } + configSuccess.Set(1) + if updated { + 
configTimestamp.Set(fasttime.UnixTimestamp()) + } + } + for { select { case <-stopCh: return case <-refreshCh: - procutil.SelfSIGHUP() + updateFn() case <-sighupCh: logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath) - err := loadAuthConfig() - if err != nil { - logger.Errorf("failed to load auth config; using the last successfully loaded config; error: %s", err) - continue - } + updateFn() } } } +// authConfigData stores the yaml definition for this config. +// authConfigData needs to be updated each time authConfig is updated. +var authConfigData atomic.Pointer[[]byte] + var authConfig atomic.Pointer[AuthConfig] var authUsers atomic.Pointer[map[string]*UserInfo] var authConfigWG sync.WaitGroup var stopCh chan struct{} -func loadAuthConfig() error { - ac, err := readAuthConfig(*authConfigPath) +// loadAuthConfig loads and applies the config from *authConfigPath. +// It returns bool value to identify if new config was applied. +// The config can be not applied if there is a parsing error +// or if there are no changes to the current authConfig. +func loadAuthConfig() (bool, error) { + data, err := fs.ReadFileOrHTTP(*authConfigPath) if err != nil { - return fmt.Errorf("failed to load -auth.config=%q: %s", *authConfigPath, err) + return false, fmt.Errorf("failed to read -auth.config=%q: %w", *authConfigPath, err) + } + + oldData := authConfigData.Load() + if oldData != nil && bytes.Equal(data, *oldData) { + // there are no updates in the config - skip reloading. 
+ return false, nil + } + + ac, err := parseAuthConfig(data) + if err != nil { + return false, fmt.Errorf("failed to parse -auth.config=%q: %w", *authConfigPath, err) } m, err := parseAuthConfigUsers(ac) if err != nil { - return fmt.Errorf("failed to parse users from -auth.config=%q: %s", *authConfigPath, err) + return false, fmt.Errorf("failed to parse users from -auth.config=%q: %w", *authConfigPath, err) } logger.Infof("loaded information about %d users from -auth.config=%q", len(m), *authConfigPath) authConfig.Store(ac) + authConfigData.Store(&data) authUsers.Store(&m) - return nil -} - -func readAuthConfig(path string) (*AuthConfig, error) { - data, err := fs.ReadFileOrHTTP(path) - if err != nil { - return nil, err - } - return parseAuthConfig(data) + return true, nil } func parseAuthConfig(data []byte) (*AuthConfig, error) { diff --git a/app/vmbackup/README.md b/app/vmbackup/README.md index 9daad41f1..340c99d6b 100644 --- a/app/vmbackup/README.md +++ b/app/vmbackup/README.md @@ -143,20 +143,23 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time- ## Advanced usage -* Obtaining credentials from a file. - Add flag `-credsFilePath=/etc/credentials` with the following content: +### Providing credentials as a file - for s3 (aws, minio or other s3 compatible storages): +Obtaining credentials from a file. +Add flag `-credsFilePath=/etc/credentials` with the following content: + +- for S3 (AWS, MinIO or other S3 compatible storages): + ```console [default] aws_access_key_id=theaccesskey aws_secret_access_key=thesecretaccesskeyvalue ``` - for gce cloud storage: - +- for GCP cloud storage: + ```json { "type": "service_account", @@ -171,24 +174,99 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time- "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/service-account-email" } ``` -* Obtaining credentials from env variables. 
- - For AWS S3 compatible storages set env variable `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. - Also you can set env variable `AWS_SHARED_CREDENTIALS_FILE` with path to credentials file. - - For GCE cloud storage set env variable `GOOGLE_APPLICATION_CREDENTIALS` with path to credentials file. - - For Azure storage either set env variables `AZURE_STORAGE_ACCOUNT_NAME` and `AZURE_STORAGE_ACCOUNT_KEY`, or `AZURE_STORAGE_ACCOUNT_CONNECTION_STRING`. -* Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc. - You have to add a custom url endpoint via flag: +### Providing credentials via env variables -```console - # for minio - -customS3Endpoint=http://localhost:9000 +Obtaining credentials from env variables. +- For AWS S3 compatible storages set env variable `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + Also you can set env variable `AWS_SHARED_CREDENTIALS_FILE` with path to credentials file. +- For GCE cloud storage set env variable `GOOGLE_APPLICATION_CREDENTIALS` with path to credentials file. +- For Azure storage either set env variables `AZURE_STORAGE_ACCOUNT_NAME` and `AZURE_STORAGE_ACCOUNT_KEY`, or `AZURE_STORAGE_ACCOUNT_CONNECTION_STRING`. - # for aws gov region - -customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com +Please, note that `vmbackup` will use credentials provided by cloud providers metadata service [when applicable](https://docs.victoriametrics.com/vmbackup.html#using-cloud-providers-metadata-service). + +### Using cloud providers metadata service + +`vmbackup` and `vmbackupmanager` will automatically use cloud providers metadata service in order to obtain credentials if they are running in cloud environment +and credentials are not explicitly provided via flags or env variables. 
+ +### Providing credentials in Kubernetes + +The simplest way to provide credentials in Kubernetes is to use [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) +and inject them into the pod as environment variables. For example, the following secret can be used for AWS S3 credentials: +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: vmbackup-credentials +stringData: + access_key: key + secret_key: secret +``` +And then it can be injected into the pod as environment variables: +```yaml +... +env: +- name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: access_key + name: vmbackup-credentials +- name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: secret_key + name: vmbackup-credentials +... ``` -* Run `vmbackup -help` in order to see all the available options: +A more secure way is to use IAM roles to provide tokens for pods instead of managing credentials manually. + +For AWS deployments it will be required to configure [IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). +In order to use IAM roles for service accounts with `vmbackup` or `vmbackupmanager` it is required to create ServiceAccount with IAM role mapping: +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: monitoring-backups + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::{ACCOUNT_ID}:role/{ROLE_NAME} +``` +And [configure pod to use service account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). +After this `vmbackup` and `vmbackupmanager` will automatically use IAM role for service account in order to obtain credentials. + +For GCP deployments it will be required to configure [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity). 
+In order to use Workload Identity with `vmbackup` or `vmbackupmanager` it is required to create ServiceAccount with Workload Identity annotation: +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: monitoring-backups + annotations: + iam.gke.io/gcp-service-account: {sa_name}@{project_name}.iam.gserviceaccount.com +``` +And [configure pod to use service account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). +After this `vmbackup` and `vmbackupmanager` will automatically use Workload Identity for service account in order to obtain credentials. + +### Using custom S3 endpoint + +Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc. +You have to add a custom url endpoint via flag: + +- for MinIO + ```console + -customS3Endpoint=http://localhost:9000 + ``` + +- for aws gov region + ```console + -customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com + ``` + +### Command-line flags + +Run `vmbackup -help` in order to see all the available options: ```console -concurrency int diff --git a/app/vmbackupmanager/README.md b/app/vmbackupmanager/README.md index 4d0c1937a..f56cac238 100644 --- a/app/vmbackupmanager/README.md +++ b/app/vmbackupmanager/README.md @@ -110,6 +110,9 @@ The result on the GCS bucket latest folder +Please, see [vmbackup docs](https://docs.victoriametrics.com/vmbackup.html#advanced-usage) for more examples of authentication with different +storage types. 
+ ## Backup Retention Policy Backup retention policy is controlled by: diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 69a0d93b5..2d6313523 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -76,7 +76,7 @@ "uid": "$ds" }, "enable": true, - "expr": "sum(vm_app_version{job=~\"$job\"}) by(short_version) unless (sum(vm_app_version{job=~\"$job\"} offset 20m) by(short_version))", + "expr": "sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"}) by(short_version) unless (sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"} offset 20m) by(short_version))", "hide": true, "iconColor": "dark-blue", "name": "version", diff --git a/dashboards/vm/victoriametrics.json b/dashboards/vm/victoriametrics.json index 52c04c949..161308ade 100644 --- a/dashboards/vm/victoriametrics.json +++ b/dashboards/vm/victoriametrics.json @@ -77,7 +77,7 @@ "uid": "$ds" }, "enable": true, - "expr": "sum(vm_app_version{job=~\"$job\"}) by(short_version) unless (sum(vm_app_version{job=~\"$job\"} offset 20m) by(short_version))", + "expr": "sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"}) by(short_version) unless (sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"} offset 20m) by(short_version))", "hide": true, "iconColor": "dark-blue", "name": "version", diff --git a/docs/Articles.md b/docs/Articles.md index 9e7e679ea..8cb903c6e 100644 --- a/docs/Articles.md +++ b/docs/Articles.md @@ -78,6 +78,7 @@ See also [case studies](https://docs.victoriametrics.com/CaseStudies.html). 
* [VictoriaMetrics: an overview and its use instead of Prometheus](https://rtfm.co.ua/en/victoriametrics-an-overview-and-its-use-instead-of-prometheus/) * [VictoriaMetrics: deploying a Kubernetes monitoring stack](https://rtfm.co.ua/en/victoriametrics-deploying-a-kubernetes-monitoring-stack/) * [Better, Faster, Cheaper: How Grammarly Improved Monitoring by Over 10x with VictoriaMetrics](https://www.grammarly.com/blog/engineering/monitoring-with-victoriametrics/) +* [VictoriaMetrics, a stress-free Prometheus Remote Storage for 1 Billion metrics](https://medium.com/criteo-engineering/victoriametrics-a-prometheus-remote-storage-solution-57081a3d8e61) ## Our articles diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 1f0e5b6d8..8895b361d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -38,20 +38,31 @@ The following `tip` changes can be tested by building VictoriaMetrics components * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve accessibility score to 100 according to [Google's Lighthouse](https://developer.chrome.com/docs/lighthouse/accessibility/) tests. * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): organize `min`, `max`, `median` values on the chart legend and tooltips for better visibility. * FEATURE: dashboards: provide copies of Grafana dashboards alternated with VictoriaMetrics datasource at [dashboards/vm](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/dashboards/vm). -* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): added ability to set, override and clear request and response headers on a per-user and per-path basis. See [this i -ssue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4825) and [these docs](https://docs.victoriametrics.com/vmauth.html#auth-config) for details. +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): added ability to set, override and clear request and response headers on a per-user and per-path basis. 
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4825) and [these docs](https://docs.victoriametrics.com/vmauth.html#auth-config) for details. * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): add ability to retry requests to the [remaining backends](https://docs.victoriametrics.com/vmauth.html#load-balancing) if they return response status codes specified in the `retry_status_codes` list. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4893). +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): expose metrics `vmauth_config_last_reload_*` for tracking the state of config reloads, similarly to vmagent/vmalert components. +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth.html): do not print logs like `SIGHUP received...` once per configured `-configCheckInterval` cmd-line flag. This log will be printed only if config reload was invoked manually. * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `eval_offset` attribute for [Groups](https://docs.victoriametrics.com/vmalert.html#groups). If specified, Group will be evaluated at the exact time offset on the range of [0...evaluationInterval]. The setting might be useful for cron-like rules which must be evaluated at specific moments of time. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3409) for details. * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): validate [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) function names in alerting and recording rules when `vmalert` runs with `-dryRun` command-line flag. Previously it was allowed to use unknown (aka invalid) MetricsQL function names there. For example, `foo()` was counted as a valid query. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4933). * FEATURE: limit the length of string params in log messages to 500 chars. 
Longer string params are replaced with the `first_250_chars..last_250_chars`. This prevents from too long log lines, which can be emitted by VictoriaMetrics components. -* BUGFIX: [storage](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html): prevent from livelock when [forced merge](https://docs.victoriametrics.com/#forced-merge) is called under high data ingestion. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4987). -* BUGFIX: [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage): correctly return `null` instead of `Inf` in JSON query responses. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3783). * BUGFIX: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): fix display of ingested rows rate for `Samples ingested/s` and `Samples rate` panels for vmagent's dasbhoard. Previously, not all ingested protocols were accounted in these panels. An extra panel `Rows rate` was added to `Ingestion` section to display the split for rows ingested rate by protocol. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix the bug causing render looping when switching to heatmap. -* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): properly copy `parts.json` files inside `<-storageDataPath>/{data,indexdb}` folders during [incremental backups](https://docs.victoriametrics.com/vmbackup.html#incremental-backups). Previously the new `parts.json` could be skipped during incremental backups, which could lead to inability to restore from the backup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005). This issue has been introduced in [v1.90.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.90.0). 
* BUGFIX: [VictoriaMetrics enterprise](https://docs.victoriametrics.com/enterprise.html) validate `-dedup.minScrapeInterval` value and `-downsampling.period` intervals are multiples of each other. See [these docs](https://docs.victoriametrics.com/#downsampling). +* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): properly copy `appliedRetention.txt` files inside `<-storageDataPath>/{data}` folders during [incremental backups](https://docs.victoriametrics.com/vmbackup.html#incremental-backups). Previously the new `appliedRetention.txt` could be skipped during incremental backups, which could lead to increased load on storage after restoring from backup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005). + +## [v1.93.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.5) + +Released at 2023-09-19 + +**v1.93.x is a line of LTS releases (i.e. long-term support). It contains important up-to-date bugfixes. +The v1.93.x line will be supported for at least 12 months since [v1.93.0](https://docs.victoriametrics.com/CHANGELOG.html#v1930) release** + +* BUGFIX: [storage](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html): prevent from livelock when [forced merge](https://docs.victoriametrics.com/#forced-merge) is called under high data ingestion. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4987). +* BUGFIX: [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage): correctly return `null` instead of `Inf` in JSON query responses. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3783). +* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup.html): properly copy `parts.json` files inside `<-storageDataPath>/{data,indexdb}` folders during [incremental backups](https://docs.victoriametrics.com/vmbackup.html#incremental-backups). 
Previously the new `parts.json` could be skipped during incremental backups, which could lead to inability to restore from the backup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005). This issue has been introduced in [v1.90.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.90.0). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly close connections to Kubernetes API server after the change in `selectors` or `namespaces` sections of [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs). Previously `vmagent` could continue polling Kubernetes API server with the old `selectors` or `namespaces` configs additionally to polling new configs. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4850). +* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth.html): prevent configuration reloading if there were no changes in config. This improves memory usage when `-configCheckInterval` cmd-line flag is configured and config has extensive list of regexp expressions requiring additional memory on parsing. 
## [v1.93.4](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.4) diff --git a/docs/operator/vars.md b/docs/operator/vars.md index dad003d18..edf6939e4 100644 --- a/docs/operator/vars.md +++ b/docs/operator/vars.md @@ -10,7 +10,7 @@ aliases: - /operator/vars.html --- # Auto Generated vars for package config - updated at Wed Sep 13 14:05:24 UTC 2023 + updated at Thu Sep 21 10:01:40 UTC 2023 | varible name | variable default value | variable required | variable description | @@ -20,7 +20,7 @@ aliases: | VM_CUSTOMCONFIGRELOADERIMAGE | victoriametrics/operator:config-reloader-v0.32.0 | false | - | | VM_PSPAUTOCREATEENABLED | false | false | - | | VM_VMALERTDEFAULT_IMAGE | victoriametrics/vmalert | false | - | -| VM_VMALERTDEFAULT_VERSION | v1.93.4 | false | - | +| VM_VMALERTDEFAULT_VERSION | v1.93.5 | false | - | | VM_VMALERTDEFAULT_PORT | 8080 | false | - | | VM_VMALERTDEFAULT_USEDEFAULTRESOURCES | true | false | - | | VM_VMALERTDEFAULT_RESOURCE_LIMIT_MEM | 500Mi | false | - | @@ -31,7 +31,7 @@ aliases: | VM_VMALERTDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - | | VM_VMALERTDEFAULT_CONFIGRELOADIMAGE | jimmidyson/configmap-reload:v0.3.0 | false | - | | VM_VMAGENTDEFAULT_IMAGE | victoriametrics/vmagent | false | - | -| VM_VMAGENTDEFAULT_VERSION | v1.93.4 | false | - | +| VM_VMAGENTDEFAULT_VERSION | v1.93.5 | false | - | | VM_VMAGENTDEFAULT_CONFIGRELOADIMAGE | quay.io/prometheus-operator/prometheus-config-reloader:v0.68.0 | false | - | | VM_VMAGENTDEFAULT_PORT | 8429 | false | - | | VM_VMAGENTDEFAULT_USEDEFAULTRESOURCES | true | false | - | @@ -42,7 +42,7 @@ aliases: | VM_VMAGENTDEFAULT_CONFIGRELOADERCPU | 100m | false | - | | VM_VMAGENTDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - | | VM_VMSINGLEDEFAULT_IMAGE | victoriametrics/victoria-metrics | false | - | -| VM_VMSINGLEDEFAULT_VERSION | v1.93.4 | false | - | +| VM_VMSINGLEDEFAULT_VERSION | v1.93.5 | false | - | | VM_VMSINGLEDEFAULT_PORT | 8429 | false | - | | 
VM_VMSINGLEDEFAULT_USEDEFAULTRESOURCES | true | false | - | | VM_VMSINGLEDEFAULT_RESOURCE_LIMIT_MEM | 1500Mi | false | - | @@ -53,14 +53,14 @@ aliases: | VM_VMSINGLEDEFAULT_CONFIGRELOADERMEMORY | 25Mi | false | - | | VM_VMCLUSTERDEFAULT_USEDEFAULTRESOURCES | true | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_IMAGE | victoriametrics/vmselect | false | - | -| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_VERSION | v1.93.4-cluster | false | - | +| VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_VERSION | v1.93.5-cluster | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_PORT | 8481 | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_LIMIT_MEM | 1000Mi | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_LIMIT_CPU | 500m | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_REQUEST_MEM | 500Mi | false | - | | VM_VMCLUSTERDEFAULT_VMSELECTDEFAULT_RESOURCE_REQUEST_CPU | 100m | false | - | | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_IMAGE | victoriametrics/vmstorage | false | - | -| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VERSION | v1.93.4-cluster | false | - | +| VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VERSION | v1.93.5-cluster | false | - | | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VMINSERTPORT | 8400 | false | - | | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_VMSELECTPORT | 8401 | false | - | | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_PORT | 8482 | false | - | @@ -69,7 +69,7 @@ aliases: | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_RESOURCE_REQUEST_MEM | 500Mi | false | - | | VM_VMCLUSTERDEFAULT_VMSTORAGEDEFAULT_RESOURCE_REQUEST_CPU | 250m | false | - | | VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_IMAGE | victoriametrics/vminsert | false | - | -| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_VERSION | v1.93.4-cluster | false | - | +| VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_VERSION | v1.93.5-cluster | false | - | | VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_PORT | 8480 | false | - | | VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_RESOURCE_LIMIT_MEM | 500Mi | false | - | | VM_VMCLUSTERDEFAULT_VMINSERTDEFAULT_RESOURCE_LIMIT_CPU | 
500m | false | - | @@ -88,7 +88,7 @@ aliases: | VM_VMALERTMANAGER_RESOURCE_REQUEST_CPU | 30m | false | - | | VM_DISABLESELFSERVICESCRAPECREATION | false | false | - | | VM_VMBACKUP_IMAGE | victoriametrics/vmbackupmanager | false | - | -| VM_VMBACKUP_VERSION | v1.93.4-enterprise | false | - | +| VM_VMBACKUP_VERSION | v1.93.5-enterprise | false | - | | VM_VMBACKUP_PORT | 8300 | false | - | | VM_VMBACKUP_USEDEFAULTRESOURCES | true | false | - | | VM_VMBACKUP_RESOURCE_LIMIT_MEM | 500Mi | false | - | @@ -97,7 +97,7 @@ aliases: | VM_VMBACKUP_RESOURCE_REQUEST_CPU | 150m | false | - | | VM_VMBACKUP_LOGLEVEL | INFO | false | - | | VM_VMAUTHDEFAULT_IMAGE | victoriametrics/vmauth | false | - | -| VM_VMAUTHDEFAULT_VERSION | v1.93.4 | false | - | +| VM_VMAUTHDEFAULT_VERSION | v1.93.5 | false | - | | VM_VMAUTHDEFAULT_CONFIGRELOADIMAGE | quay.io/prometheus-operator/prometheus-config-reloader:v0.68.0 | false | - | | VM_VMAUTHDEFAULT_PORT | 8427 | false | - | | VM_VMAUTHDEFAULT_USEDEFAULTRESOURCES | true | false | - | diff --git a/docs/vmbackup.md b/docs/vmbackup.md index 39c4b3e95..bff38459a 100644 --- a/docs/vmbackup.md +++ b/docs/vmbackup.md @@ -154,20 +154,23 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time- ## Advanced usage -* Obtaining credentials from a file. - Add flag `-credsFilePath=/etc/credentials` with the following content: +### Providing credentials as a file - for s3 (aws, minio or other s3 compatible storages): +Obtaining credentials from a file. 
+Add flag `-credsFilePath=/etc/credentials` with the following content: + +- for S3 (AWS, MinIO or other S3 compatible storages): + ```console [default] aws_access_key_id=theaccesskey aws_secret_access_key=thesecretaccesskeyvalue ``` - for gce cloud storage: - +- for GCP cloud storage: + ```json { "type": "service_account", @@ -182,24 +185,99 @@ See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time- "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/service-account-email" } ``` -* Obtaining credentials from env variables. - - For AWS S3 compatible storages set env variable `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. - Also you can set env variable `AWS_SHARED_CREDENTIALS_FILE` with path to credentials file. - - For GCE cloud storage set env variable `GOOGLE_APPLICATION_CREDENTIALS` with path to credentials file. - - For Azure storage either set env variables `AZURE_STORAGE_ACCOUNT_NAME` and `AZURE_STORAGE_ACCOUNT_KEY`, or `AZURE_STORAGE_ACCOUNT_CONNECTION_STRING`. -* Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc. - You have to add a custom url endpoint via flag: +### Providing credentials via env variables -```console - # for minio - -customS3Endpoint=http://localhost:9000 +Obtaining credentials from env variables. +- For AWS S3 compatible storages set env variable `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + Also you can set env variable `AWS_SHARED_CREDENTIALS_FILE` with path to credentials file. +- For GCE cloud storage set env variable `GOOGLE_APPLICATION_CREDENTIALS` with path to credentials file. +- For Azure storage either set env variables `AZURE_STORAGE_ACCOUNT_NAME` and `AZURE_STORAGE_ACCOUNT_KEY`, or `AZURE_STORAGE_ACCOUNT_CONNECTION_STRING`. 
- # for aws gov region - -customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com +Please, note that `vmbackup` will use credentials provided by cloud providers metadata service [when applicable](https://docs.victoriametrics.com/vmbackup.html#using-cloud-providers-metadata-service). + +### Using cloud providers metadata service + +`vmbackup` and `vmbackupmanager` will automatically use cloud providers metadata service in order to obtain credentials if they are running in cloud environment +and credentials are not explicitly provided via flags or env variables. + +### Providing credentials in Kubernetes + +The simplest way to provide credentials in Kubernetes is to use [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) +and inject them into the pod as environment variables. For example, the following secret can be used for AWS S3 credentials: +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: vmbackup-credentials +stringData: + access_key: key + secret_key: secret +``` +And then it can be injected into the pod as environment variables: +```yaml +... +env: +- name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: access_key + name: vmbackup-credentials +- name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: secret_key + name: vmbackup-credentials +... ``` -* Run `vmbackup -help` in order to see all the available options: +A more secure way is to use IAM roles to provide tokens for pods instead of managing credentials manually. + +For AWS deployments it will be required to configure [IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). 
+In order to use IAM roles for service accounts with `vmbackup` or `vmbackupmanager` it is required to create ServiceAccount with IAM role mapping: +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: monitoring-backups + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::{ACCOUNT_ID}:role/{ROLE_NAME} +``` +And [configure pod to use service account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). +After this `vmbackup` and `vmbackupmanager` will automatically use IAM role for service account in order to obtain credentials. + +For GCP deployments it will be required to configure [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity). +In order to use Workload Identity with `vmbackup` or `vmbackupmanager` it is required to create ServiceAccount with Workload Identity annotation: +```yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: monitoring-backups + annotations: + iam.gke.io/gcp-service-account: {sa_name}@{project_name}.iam.gserviceaccount.com +``` +And [configure pod to use service account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). +After this `vmbackup` and `vmbackupmanager` will automatically use Workload Identity for service account in order to obtain credentials. + +### Using custom S3 endpoint + +Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc. 
+You have to add a custom url endpoint via flag: + +- for MinIO + ```console + -customS3Endpoint=http://localhost:9000 + ``` + +- for aws gov region + ```console + -customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com + ``` + +### Command-line flags + +Run `vmbackup -help` in order to see all the available options: ```console -concurrency int diff --git a/docs/vmbackupmanager.md b/docs/vmbackupmanager.md index 7f912ff9e..051c87656 100644 --- a/docs/vmbackupmanager.md +++ b/docs/vmbackupmanager.md @@ -121,6 +121,9 @@ The result on the GCS bucket latest folder +Please, see [vmbackup docs](https://docs.victoriametrics.com/vmbackup.html#advanced-usage) for more examples of authentication with different +storage types. + ## Backup Retention Policy Backup retention policy is controlled by: diff --git a/lib/backup/common/part.go b/lib/backup/common/part.go index d7f9aa239..8b9c68e31 100644 --- a/lib/backup/common/part.go +++ b/lib/backup/common/part.go @@ -40,9 +40,11 @@ type Part struct { // key returns a string, which uniquely identifies p. func (p *Part) key() string { - if strings.HasSuffix(p.Path, "/parts.json") { - // parts.json file contents changes over time, so it must have an unique key in order - // to always copy it during backup, restore and server-side copy. + if strings.HasSuffix(p.Path, "/parts.json") || + strings.HasSuffix(p.Path, "/appliedRetention.txt") { + // parts.json and appliedRetention.txt files contents changes over time, + // so it must have an unique key in order to always copy it during + // backup, restore and server-side copy. 
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5005 id := atomic.AddUint64(&uniqueKeyID, 1) return fmt.Sprintf("unique-%016X", id) diff --git a/lib/logstorage/datadb.go b/lib/logstorage/datadb.go index d23c05eed..41f9c31a6 100644 --- a/lib/logstorage/datadb.go +++ b/lib/logstorage/datadb.go @@ -149,7 +149,18 @@ func mustOpenDatadb(pt *partition, path string, flushInterval time.Duration) *da pws := make([]*partWrapper, len(partNames)) for i, partName := range partNames { + // Make sure the partName exists on disk. + // If it is missing, then manual action from the user is needed, + // since this is unexpected state, which cannot occur under normal operation, + // including unclean shutdown. partPath := filepath.Join(path, partName) + if !fs.IsPathExist(partPath) { + partsFile := filepath.Join(path, partsFilename) + logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+ + "ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts", + partPath, partsFile, partsFile, partsFile) + } + p := mustOpenFilePart(pt, partPath) pws[i] = newPartWrapper(p, nil, time.Time{}) } diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index 8b916770e..bf7274a8e 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -1349,6 +1349,18 @@ func mustOpenParts(path string) []*partWrapper { des := fs.MustReadDir(path) m := make(map[string]struct{}, len(partNames)) for _, partName := range partNames { + // Make sure the partName exists on disk. + // If it is missing, then manual action from the user is needed, + // since this is unexpected state, which cannot occur under normal operation, + // including unclean shutdown. 
+ partPath := filepath.Join(path, partName) + if !fs.IsPathExist(partPath) { + partsFile := filepath.Join(path, partsFilename) + logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+ + "ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts", + partPath, partsFile, partsFile, partsFile) + } + m[partName] = struct{}{} } for _, de := range des { diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index dd4c54b75..62777d7fa 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -668,7 +668,8 @@ func (is *indexSearch) searchLabelNamesWithFiltersOnDate(qt *querytracer.Tracer, // This would help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 metricIDs := filter.AppendTo(nil) qt.Printf("sort %d metricIDs", len(metricIDs)) - return is.getLabelNamesForMetricIDs(qt, metricIDs, lns, maxLabelNames) + is.getLabelNamesForMetricIDs(qt, metricIDs, lns, maxLabelNames) + return nil } var prevLabelName []byte ts := &is.ts @@ -732,39 +733,34 @@ func (is *indexSearch) searchLabelNamesWithFiltersOnDate(qt *querytracer.Tracer, return nil } -func (is *indexSearch) getLabelNamesForMetricIDs(qt *querytracer.Tracer, metricIDs []uint64, lns map[string]struct{}, maxLabelNames int) error { +func (is *indexSearch) getLabelNamesForMetricIDs(qt *querytracer.Tracer, metricIDs []uint64, lns map[string]struct{}, maxLabelNames int) { lns["__name__"] = struct{}{} var mn MetricName foundLabelNames := 0 var buf []byte for _, metricID := range metricIDs { - var err error - buf, err = is.searchMetricNameWithCache(buf[:0], metricID) - if err != nil { - if err == io.EOF { - // It is likely the metricID->metricName entry didn't propagate to inverted index yet. - // Skip this metricID for now. 
- continue - } - return fmt.Errorf("cannot find metricName by metricID %d: %w", metricID, err) + var ok bool + buf, ok = is.searchMetricNameWithCache(buf[:0], metricID) + if !ok { + // It is likely the metricID->metricName entry didn't propagate to inverted index yet. + // Skip this metricID for now. + continue } if err := mn.Unmarshal(buf); err != nil { - return fmt.Errorf("cannot unmarshal metricName %q: %w", buf, err) + logger.Panicf("FATAL: cannot unmarshal metricName %q: %w", buf, err) } for _, tag := range mn.Tags { - _, ok := lns[string(tag.Key)] - if !ok { + if _, ok := lns[string(tag.Key)]; !ok { foundLabelNames++ lns[string(tag.Key)] = struct{}{} if len(lns) >= maxLabelNames { qt.Printf("hit the limit on the number of unique label names: %d", maxLabelNames) - return nil + return } } } } qt.Printf("get %d distinct label names from %d metricIDs", foundLabelNames, len(metricIDs)) - return nil } // SearchLabelValuesWithFiltersOnTimeRange returns label values for the given labelName, tfss and tr. @@ -868,7 +864,8 @@ func (is *indexSearch) searchLabelValuesWithFiltersOnDate(qt *querytracer.Tracer // This would help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 metricIDs := filter.AppendTo(nil) qt.Printf("sort %d metricIDs", len(metricIDs)) - return is.getLabelValuesForMetricIDs(qt, lvs, labelName, metricIDs, maxLabelValues) + is.getLabelValuesForMetricIDs(qt, lvs, labelName, metricIDs, maxLabelValues) + return nil } if labelName == "__name__" { // __name__ label is encoded as empty string in indexdb. 
@@ -927,7 +924,7 @@ func (is *indexSearch) searchLabelValuesWithFiltersOnDate(qt *querytracer.Tracer return nil } -func (is *indexSearch) getLabelValuesForMetricIDs(qt *querytracer.Tracer, lvs map[string]struct{}, labelName string, metricIDs []uint64, maxLabelValues int) error { +func (is *indexSearch) getLabelValuesForMetricIDs(qt *querytracer.Tracer, lvs map[string]struct{}, labelName string, metricIDs []uint64, maxLabelValues int) { if labelName == "" { labelName = "__name__" } @@ -935,32 +932,27 @@ func (is *indexSearch) getLabelValuesForMetricIDs(qt *querytracer.Tracer, lvs ma foundLabelValues := 0 var buf []byte for _, metricID := range metricIDs { - var err error - buf, err = is.searchMetricNameWithCache(buf[:0], metricID) - if err != nil { - if err == io.EOF { - // It is likely the metricID->metricName entry didn't propagate to inverted index yet. - // Skip this metricID for now. - continue - } - return fmt.Errorf("cannot find metricName by metricID %d: %w", metricID, err) + var ok bool + buf, ok = is.searchMetricNameWithCache(buf[:0], metricID) + if !ok { + // It is likely the metricID->metricName entry didn't propagate to inverted index yet. + // Skip this metricID for now. + continue } if err := mn.Unmarshal(buf); err != nil { - return fmt.Errorf("cannot unmarshal metricName %q: %w", buf, err) + logger.Panicf("FATAL: cannot unmarshal metricName %q: %s", buf, err) } tagValue := mn.GetTagValue(labelName) - _, ok := lvs[string(tagValue)] - if !ok { + if _, ok := lvs[string(tagValue)]; !ok { foundLabelValues++ lvs[string(tagValue)] = struct{}{} if len(lvs) >= maxLabelValues { qt.Printf("hit the limit on the number of unique label values for label %q: %d", labelName, maxLabelValues) - return nil + return } } } qt.Printf("get %d distinct values for label %q from %d metricIDs", foundLabelValues, labelName, len(metricIDs)) - return nil } // SearchTagValueSuffixes returns all the tag value suffixes for the given tagKey and tagValuePrefix on the given tr. 
@@ -1442,38 +1434,35 @@ func (th *topHeap) Pop() interface{} { // searchMetricNameWithCache appends metric name for the given metricID to dst // and returns the result. -func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byte, error) { +func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byte, bool) { metricName := db.getMetricNameFromCache(dst, metricID) if len(metricName) > len(dst) { - return metricName, nil + return metricName, true } is := db.getIndexSearch(noDeadline) - var err error - dst, err = is.searchMetricName(dst, metricID) + var ok bool + dst, ok = is.searchMetricName(dst, metricID) db.putIndexSearch(is) - if err == nil { + if ok { // There is no need in verifying whether the given metricID is deleted, // since the filtering must be performed before calling this func. db.putMetricNameToCache(metricID, dst) - return dst, nil - } - if err != io.EOF { - return dst, err + return dst, true } // Try searching in the external indexDB. if db.doExtDB(func(extDB *indexDB) { is := extDB.getIndexSearch(noDeadline) - dst, err = is.searchMetricName(dst, metricID) + dst, ok = is.searchMetricName(dst, metricID) extDB.putIndexSearch(is) - if err == nil { + if ok { // There is no need in verifying whether the given metricID is deleted, // since the filtering must be performed before calling this func. extDB.putMetricNameToCache(metricID, dst) } - }) { - return dst, err + }) && ok { + return dst, true } // Cannot find MetricName for the given metricID. This may be the case @@ -1484,7 +1473,7 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt // Mark the metricID as deleted, so it will be created again when new data point // for the given time series will arrive. db.deleteMetricIDs([]uint64{metricID}) - return dst, io.EOF + return dst, false } // DeleteTSIDs marks as deleted all the TSIDs matching the given tfss. 
@@ -1820,36 +1809,36 @@ func (is *indexSearch) getTSIDByMetricNameNoExtDB(dst *TSID, metricName []byte, return false } -func (is *indexSearch) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byte, error) { +func (is *indexSearch) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byte, bool) { metricName := is.db.getMetricNameFromCache(dst, metricID) if len(metricName) > len(dst) { - return metricName, nil + return metricName, true } - var err error - dst, err = is.searchMetricName(dst, metricID) - if err == nil { + var ok bool + dst, ok = is.searchMetricName(dst, metricID) + if ok { // There is no need in verifying whether the given metricID is deleted, // since the filtering must be performed before calling this func. is.db.putMetricNameToCache(metricID, dst) - return dst, nil + return dst, true } - return dst, err + return dst, false } -func (is *indexSearch) searchMetricName(dst []byte, metricID uint64) ([]byte, error) { +func (is *indexSearch) searchMetricName(dst []byte, metricID uint64) ([]byte, bool) { ts := &is.ts kb := &is.kb kb.B = is.marshalCommonPrefix(kb.B[:0], nsPrefixMetricIDToMetricName) kb.B = encoding.MarshalUint64(kb.B, metricID) if err := ts.FirstItemWithPrefix(kb.B); err != nil { if err == io.EOF { - return dst, err + return dst, false } - return dst, fmt.Errorf("error when searching metricName by metricID; searchPrefix %q: %w", kb.B, err) + logger.Panicf("FATAL: error when searching metricName by metricID; searchPrefix %q: %w", kb.B, err) } v := ts.Item[len(kb.B):] dst = append(dst, v...) 
- return dst, nil + return dst, true } func (is *indexSearch) containsTimeRange(tr TimeRange) (bool, error) { @@ -1928,18 +1917,15 @@ func (is *indexSearch) updateMetricIDsByMetricNameMatch(qt *querytracer.Tracer, return err } } - var err error - metricName.B, err = is.searchMetricNameWithCache(metricName.B[:0], metricID) - if err != nil { - if err == io.EOF { - // It is likely the metricID->metricName entry didn't propagate to inverted index yet. - // Skip this metricID for now. - continue - } - return fmt.Errorf("cannot find metricName by metricID %d: %w", metricID, err) + var ok bool + metricName.B, ok = is.searchMetricNameWithCache(metricName.B[:0], metricID) + if !ok { + // It is likely the metricID->metricName entry didn't propagate to inverted index yet. + // Skip this metricID for now. + continue } if err := mn.Unmarshal(metricName.B); err != nil { - return fmt.Errorf("cannot unmarshal metricName %q: %w", metricName.B, err) + logger.Panicf("FATAL: cannot unmarshal metricName %q: %s", metricName.B, err) } // Match the mn against tfs. diff --git a/lib/storage/index_db_test.go b/lib/storage/index_db_test.go index 28c01b72e..16f5f2dec 100644 --- a/lib/storage/index_db_test.go +++ b/lib/storage/index_db_test.go @@ -3,7 +3,6 @@ package storage import ( "bytes" "fmt" - "io" "math/rand" "os" "reflect" @@ -655,19 +654,19 @@ func testIndexDBCheckTSIDByName(db *indexDB, mns []MetricName, tsids []TSID, isC } // Search for metric name for the given metricID. 
- var err error - metricNameCopy, err = db.searchMetricNameWithCache(metricNameCopy[:0], genTSID.TSID.MetricID) - if err != nil { - return fmt.Errorf("error in searchMetricNameWithCache for metricID=%d; i=%d: %w", genTSID.TSID.MetricID, i, err) + var ok bool + metricNameCopy, ok = db.searchMetricNameWithCache(metricNameCopy[:0], genTSID.TSID.MetricID) + if !ok { + return fmt.Errorf("cannot find metricName for metricID=%d; i=%d", genTSID.TSID.MetricID, i) } if !bytes.Equal(metricName, metricNameCopy) { return fmt.Errorf("unexpected mn for metricID=%d;\ngot\n%q\nwant\n%q", genTSID.TSID.MetricID, metricNameCopy, metricName) } // Try searching metric name for non-existent MetricID. - buf, err := db.searchMetricNameWithCache(nil, 1) - if err != io.EOF { - return fmt.Errorf("expecting io.EOF error when searching for non-existing metricID; got %v", err) + buf, found := db.searchMetricNameWithCache(nil, 1) + if found { + return fmt.Errorf("unexpected metricName found for non-existing metricID; got %X", buf) } if len(buf) > 0 { return fmt.Errorf("expecting empty buf when searching for non-existent metricID; got %X", buf) diff --git a/lib/storage/partition.go b/lib/storage/partition.go index c81b05a77..92073f26b 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -1779,6 +1779,18 @@ func mustOpenParts(path string, partNames []string) []*partWrapper { des := fs.MustReadDir(path) m := make(map[string]struct{}, len(partNames)) for _, partName := range partNames { + // Make sure the partName exists on disk. + // If it is missing, then manual action from the user is needed, + // since this is unexpected state, which cannot occur under normal operation, + // including unclean shutdown. 
+ partPath := filepath.Join(path, partName) + if !fs.IsPathExist(partPath) { + partsFile := filepath.Join(path, partsFilename) + logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+ + "ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts", + partPath, partsFile, partsFile, partsFile) + } + m[partName] = struct{}{} } for _, de := range des { diff --git a/lib/storage/search.go b/lib/storage/search.go index 1873939f5..f5b4ab7f8 100644 --- a/lib/storage/search.go +++ b/lib/storage/search.go @@ -211,16 +211,12 @@ func (s *Search) NextMetricBlock() bool { // Skip the block, since it contains only data outside the configured retention. continue } - var err error - s.MetricBlockRef.MetricName, err = s.idb.searchMetricNameWithCache(s.MetricBlockRef.MetricName[:0], tsid.MetricID) - if err != nil { - if err == io.EOF { - // Skip missing metricName for tsid.MetricID. - // It should be automatically fixed. See indexDB.searchMetricNameWithCache for details. - continue - } - s.err = err - return false + var ok bool + s.MetricBlockRef.MetricName, ok = s.idb.searchMetricNameWithCache(s.MetricBlockRef.MetricName[:0], tsid.MetricID) + if !ok { + // Skip missing metricName for tsid.MetricID. + // It should be automatically fixed. See indexDB.searchMetricNameWithCache for details. + continue } s.prevMetricID = tsid.MetricID } diff --git a/lib/storage/storage.go b/lib/storage/storage.go index fff06b435..7d88d36c5 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -1114,15 +1114,12 @@ func (s *Storage) SearchMetricNames(qt *querytracer.Tracer, tfss []*TagFilters, return nil, err } } - var err error - metricName, err = idb.searchMetricNameWithCache(metricName[:0], metricID) - if err != nil { - if err == io.EOF { - // Skip missing metricName for metricID. - // It should be automatically fixed. See indexDB.searchMetricName for details. 
- continue - } - return nil, fmt.Errorf("error when searching metricName for metricID=%d: %w", metricID, err) + var ok bool + metricName, ok = idb.searchMetricNameWithCache(metricName[:0], metricID) + if !ok { + // Skip missing metricName for metricID. + // It should be automatically fixed. See indexDB.searchMetricName for details. + continue } if _, ok := metricNamesSeen[string(metricName)]; ok { // The given metric name was already seen; skip it @@ -1175,13 +1172,11 @@ func (s *Storage) prefetchMetricNames(qt *querytracer.Tracer, srcMetricIDs []uin return err } } - metricName, err = is.searchMetricNameWithCache(metricName[:0], metricID) - if err != nil { - if err == io.EOF { - missingMetricIDs = append(missingMetricIDs, metricID) - continue - } - return fmt.Errorf("error in pre-fetching metricName for metricID=%d: %w", metricID, err) + var ok bool + metricName, ok = is.searchMetricNameWithCache(metricName[:0], metricID) + if !ok { + missingMetricIDs = append(missingMetricIDs, metricID) + continue } } idb.doExtDB(func(extDB *indexDB) { @@ -1193,11 +1188,7 @@ func (s *Storage) prefetchMetricNames(qt *querytracer.Tracer, srcMetricIDs []uin return } } - metricName, err = is.searchMetricNameWithCache(metricName[:0], metricID) - if err != nil && err != io.EOF { - err = fmt.Errorf("error in pre-fetching metricName for metricID=%d in extDB: %w", metricID, err) - return - } + metricName, _ = is.searchMetricNameWithCache(metricName[:0], metricID) } }) if err != nil && err != io.EOF {