diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index eb6d89d52..000000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: benchmark -on: - push: - branches: - - master - - cluster - paths-ignore: - - "docs/**" - - "**.md" - - "dashboards/**" - - "deployment/**.yml" - pull_request: - types: - - opened - - synchronize - - reopened - - labeled - branches: - - master - - cluster - paths-ignore: - - "docs/**" - - "**.md" - - "dashboards/**" - - "deployment/**.yml" -permissions: - contents: read - packages: write - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-streamaggr-benchmark-image: - name: build - runs-on: ubuntu-latest - if: contains(github.event.pull_request.labels.*.name, 'streamaggr-benchmark') - steps: - - name: Code checkout - uses: actions/checkout@v4 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Package VMAgent Docker image for benchmark - run: | - SKIP_SCRATCH_BUILD=true \ - DOCKER_BUILD_OPTS='--cache-to type=gha,mode=max --cache-from type=gha' \ - PKG_TAG=${{ github.event.pull_request.head.sha }} \ - DOCKER_REGISTRY=ghcr.io \ - TARGET_PLATFORM=linux/amd64 make publish-vmagent diff --git a/CODE_OF_CONDUCT_RU.md b/CODE_OF_CONDUCT_RU.md deleted file mode 100644 index 312f2011d..000000000 --- a/CODE_OF_CONDUCT_RU.md +++ /dev/null @@ -1,120 +0,0 @@ - -# Кодекс Поведения участника - -## Наши обязательства - -Мы, как участники, авторы и лидеры обязуемся сделать участие в сообществе -свободным от притеснений для всех, независимо от возраста, телосложения, -видимых или невидимых ограничений способности, этнической принадлежности, -половых признаков, гендерной идентичности и выражения, уровня опыта, -образования, социо-экономического статуса, 
национальности, внешности, -расы, религии, или сексуальной идентичности и ориентации. - -Мы обещаем действовать и взаимодействовать таким образом, чтобы вносить вклад в открытое, -дружелюбное, многообразное, инклюзивное и здоровое сообщество. - -## Наши стандарты - -Примеры поведения, создающие условия для благоприятных взаимоотношений включают в себя: - -* Проявление доброты и эмпатии к другим участникам проекта -* Уважение к чужой точке зрения и опыту -* Конструктивная критика и принятие конструктивной критики -* Принятие ответственности, принесение извинений тем, кто пострадал от наших ошибок - и извлечение уроков из опыта -* Ориентирование на то, что лучше подходит для сообщества, а не только для нас лично - -Примеры неприемлемого поведения участников включают в себя: - -* Использование выражений или изображений сексуального характера и нежелательное сексуальное внимание или домогательство в любой форме -* Троллинг, оскорбительные или уничижительные комментарии, переход на личности или затрагивание политических убеждений -* Публичное или приватное домогательство -* Публикация личной информации других лиц, например, физического или электронного адреса, без явного разрешения -* Иное поведение, которое обоснованно считать неуместным в профессиональной обстановке - -## Обязанности - -Лидеры сообщества отвечают за разъяснение и применение наших стандартов приемлемого -поведения и будут предпринимать соответствующие и честные меры по исправлению положения -в ответ на любое поведение, которое они сочтут неприемлемым, угрожающим, оскорбительным или вредным. - -Лидеры сообщества обладают правом и обязанностью удалять, редактировать или отклонять -комментарии, коммиты, код, изменения в вики, вопросы и другой вклад, который не совпадает -с Кодексом Поведения, и предоставят причины принятого решения, когда сочтут нужным. 
- -## Область применения - -Данный Кодекс Поведения применим во всех во всех публичных физических и цифровых пространства сообщества, -а также когда человек официально представляет сообщество в публичных местах. -Примеры представления проекта или сообщества включают использование официальной электронной почты, -публикации в официальном аккаунте в социальных сетях, -или упоминания как представителя в онлайн или оффлайн мероприятии. - -## Приведение в исполнение - -О случаях домогательства, а так же оскорбительного или иного другого неприемлемого -поведения можно сообщить ответственным лидерам сообщества с помощью письма на info@victoriametrics.com -Все жалобы будут рассмотрены и расследованы оперативно и беспристрастно. - -Все лидеры сообщества обязаны уважать неприкосновенность частной жизни и личную -неприкосновенность автора сообщения. - -## Руководство по исполнению - -Лидеры сообщества будут следовать следующим Принципам Воздействия в Сообществе, -чтобы определить последствия для тех, кого они считают виновными в нарушении данного Кодекса Поведения: - -### 1. Исправление - -**Общественное влияние**: Использование недопустимой лексики или другое поведение, -считающиеся непрофессиональным или нежелательным в сообществе. - -**Последствия**: Личное, письменное предупреждение от лидеров сообщества, -объясняющее суть нарушения и почему такое поведение -было неуместно. Лидеры сообщества могут попросить принести публичное извинение. - -### 2. Предупреждение - -**Общественное влияние**: Нарушение в результате одного инцидента или серии действий. - -**Последствия**: Предупреждение о последствиях в случае продолжающегося неуместного поведения. -На определенное время не допускается взаимодействие с людьми, вовлеченными в инцидент, -включая незапрошенное взаимодействие -с теми, кто обеспечивает соблюдение Кодекса. Это включает в себя избегание взаимодействия -в публичных пространствах, а так же во внешних каналах, -таких как социальные сети. 
Нарушение этих правил влечет за собой временный или вечный бан. - -### 3. Временный бан - -**Общественное влияние**: Серьёзное нарушение стандартов сообщества, -включая продолжительное неуместное поведение. - -**Последствия**: Временный запрет (бан) на любое взаимодействие -или публичное общение с сообществом на определенный период времени. -На этот период не допускается публичное или личное взаимодействие с людьми, -вовлеченными в инцидент, включая незапрошенное взаимодействие -с теми, кто обеспечивает соблюдение Кодекса. -Нарушение этих правил влечет за собой вечный бан. - -### 4. Вечный бан - -**Общественное влияние**: Демонстрация систематических нарушений стандартов сообщества, -включая продолжающееся неуместное поведение, домогательство до отдельных лиц, -или проявление агрессии либо пренебрежительного отношения к категориям лиц. - -**Последствия**: Вечный запрет на любое публичное взаимодействие с сообществом. - -## Атрибуция - -Данный Кодекс Поведения основан на [Кодекс Поведения участника][homepage], -версии 2.0, доступной по адресу -. - -Принципы Воздействия в Сообществе были вдохновлены [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -[homepage]: https://www.contributor-covenant.org - -Ответы на общие вопросы о данном кодексе поведения ищите на странице FAQ: -. Переводы доступны по адресу -. 
diff --git a/README.md b/README.md index 95e7f7d58..4404ffad7 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ [![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) [![Docker Pulls](https://img.shields.io/docker/pulls/victoriametrics/victoria-metrics.svg?maxAge=604800)](https://hub.docker.com/r/victoriametrics/victoria-metrics) -[![victoriametrics](https://snapcraft.io/victoriametrics/badge.svg)](https://snapcraft.io/victoriametrics) [![Slack](https://img.shields.io/badge/join%20slack-%23victoriametrics-brightgreen.svg)](https://slack.victoriametrics.com/) [![GitHub license](https://img.shields.io/github/license/VictoriaMetrics/VictoriaMetrics.svg)](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE) [![Go Report](https://goreportcard.com/badge/github.com/VictoriaMetrics/VictoriaMetrics)](https://goreportcard.com/report/github.com/VictoriaMetrics/VictoriaMetrics) @@ -181,10 +180,6 @@ Additionally, all the VictoriaMetrics components allow setting flag values via e * For repeating flags an alternative syntax can be used by joining the different values into one using `,` char as separator (for example `-storageNode -storageNode ` will translate to `storageNode=,`). * Environment var prefix can be set via `-envflag.prefix` flag. For instance, if `-envflag.prefix=VM_`, then env vars must be prepended with `VM_`. -### Configuration with snap package - -Snap packages for VictoriaMetrics are supported by community and are available at [https://snapcraft.io/victoriametrics](https://snapcraft.io/victoriametrics). 
- ### Running as Windows service In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw) @@ -2728,7 +2723,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -cacheExpireDuration duration Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s) -configAuthKey value - Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -configAuthKey=file:///abs/path/to/file or -configAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -configAuthKey=http://host/path or -configAuthKey=https://host/path -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) @@ -2765,7 +2760,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -finalMergeDelay duration Deprecated: this flag does nothing -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -forceFlushAuthKey value authKey, which must be passed in query string to /internal/force_flush pages @@ -2880,7 +2875,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum size in bytes of a single Prometheus remote_write API request Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432) -maxLabelValueLen int - The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 1024) + The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 4096) -maxLabelsPerTimeseries int The maximum number of labels accepted per time series. Superfluous labels are dropped. In this case the vm_metrics_with_dropped_labels_total metric at /metrics page is incremented (default 30) -memory.allowedBytes size @@ -2891,7 +2886,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -2922,7 +2917,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -opentsdbhttpTrimTimestamp duration Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -precisionBits int The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64) @@ -3044,7 +3039,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -relabelConfig string Optional path to a file with relabeling rules, which are applied to all the ingested metrics. The path can point either to local file or to http url. See https://docs.victoriametrics.com/#relabeling for details. 
The config is reloaded on SIGHUP signal -reloadAuthKey value - Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -retentionFilter array Retention filter in the format 'filter:retention'. For example, '{env="dev"}:3d' configures the retention for time series with env="dev" label to 3 days. See https://docs.victoriametrics.com/#retention-filters for details. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise/ diff --git a/app/victoria-logs/Makefile b/app/victoria-logs/Makefile index 7c8054418..2af385130 100644 --- a/app/victoria-logs/Makefile +++ b/app/victoria-logs/Makefile @@ -81,6 +81,9 @@ victoria-logs-linux-ppc64le: victoria-logs-linux-s390x: APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch +victoria-logs-linux-loong64: + APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=loong64 $(MAKE) app-local-goos-goarch + victoria-logs-linux-386: APP_NAME=victoria-logs CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch diff --git a/app/vlstorage/main.go b/app/vlstorage/main.go index 3ea3db668..628919ce5 100644 --- a/app/vlstorage/main.go +++ b/app/vlstorage/main.go @@ -80,7 +80,7 @@ func Init() { // Stop stops vlstorage. 
func Stop() { - metrics.UnregisterSet(storageMetrics) + metrics.UnregisterSet(storageMetrics, true) storageMetrics = nil strg.MustClose() diff --git a/app/vmagent/main.go b/app/vmagent/main.go index f67cdffd6..26c77dbf3 100644 --- a/app/vmagent/main.go +++ b/app/vmagent/main.go @@ -70,8 +70,8 @@ var ( "See also -opentsdbHTTPListenAddr.useProxyProtocol") opentsdbHTTPUseProxyProtocol = flag.Bool("opentsdbHTTPListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted "+ "at -opentsdbHTTPListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt") - configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings.") - reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings.") + configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.*") + reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.*") dryRun = flag.Bool("dryRun", false, "Whether to check config files without running vmagent. The following files are checked: "+ "-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig, -remoteWrite.streamAggr.config . "+ "Unknown config entries aren't allowed in -promscrape.config by default. 
This can be changed by passing -promscrape.config.strictParse=false command-line flag") @@ -114,7 +114,7 @@ func main() { logger.Fatalf("error when checking relabel configs: %s", err) } if err := remotewrite.CheckStreamAggrConfigs(); err != nil { - logger.Fatalf("error when checking -remoteWrite.streamAggr.config: %s", err) + logger.Fatalf("error when checking -streamAggr.config and -remoteWrite.streamAggr.config: %s", err) } logger.Infof("all the configs are ok; exiting with 0 status code") return @@ -434,7 +434,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { } return true case "/prometheus/config", "/config": - if !httpserver.CheckAuthFlag(w, r, configAuthKey.Get(), "configAuthKey") { + if !httpserver.CheckAuthFlag(w, r, configAuthKey) { return true } promscrapeConfigRequests.Inc() @@ -443,7 +443,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { return true case "/prometheus/api/v1/status/config", "/api/v1/status/config": // See https://prometheus.io/docs/prometheus/latest/querying/api/#config - if !httpserver.CheckAuthFlag(w, r, configAuthKey.Get(), "configAuthKey") { + if !httpserver.CheckAuthFlag(w, r, configAuthKey) { return true } promscrapeStatusConfigRequests.Inc() @@ -453,7 +453,7 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { fmt.Fprintf(w, `{"status":"success","data":{"yaml":%q}}`, bb.B) return true case "/prometheus/-/reload", "/-/reload": - if !httpserver.CheckAuthFlag(w, r, reloadAuthKey.Get(), "reloadAuthKey") { + if !httpserver.CheckAuthFlag(w, r, reloadAuthKey) { return true } promscrapeConfigReloadRequests.Inc() diff --git a/app/vmagent/remotewrite/client.go b/app/vmagent/remotewrite/client.go index aea3b498b..7ffb50a2f 100644 --- a/app/vmagent/remotewrite/client.go +++ b/app/vmagent/remotewrite/client.go @@ -15,8 +15,8 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/awsapi" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" - 
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common" @@ -120,7 +120,7 @@ func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persiste logger.Fatalf("cannot initialize AWS Config for -remoteWrite.url=%q: %s", remoteWriteURL, err) } tr := &http.Transport{ - DialContext: httputils.GetStatDialFunc("vmagent_remotewrite"), + DialContext: netutil.NewStatDialFunc("vmagent_remotewrite"), TLSHandshakeTimeout: tlsHandshakeTimeout.GetOptionalArg(argIdx), MaxConnsPerHost: 2 * concurrency, MaxIdleConnsPerHost: 2 * concurrency, diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index a7cdcf32b..bc6159b43 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -6,6 +6,7 @@ import ( "net/http" "net/url" "path/filepath" + "slices" "strconv" "sync" "sync/atomic" @@ -88,15 +89,15 @@ var ( "By default there are no limits on samples ingestion rate. See also -remoteWrite.rateLimit") disableOnDiskQueue = flagutil.NewArrayBool("remoteWrite.disableOnDiskQueue", "Whether to disable storing pending data to -remoteWrite.tmpDataPath "+ - "when the configured remote storage systems cannot keep up with the data ingestion rate. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence ."+ - "See also -remoteWrite.dropSamplesOnOverload") - dropSamplesOnOverload = flagutil.NewArrayBool("remoteWrite.dropSamplesOnOverload", "Whether to drop samples when -remoteWrite.disableOnDiskQueue is set and if the samples "+ - "cannot be pushed into the configured remote storage systems in a timely manner. 
See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence") + "when the remote storage system at the corresponding -remoteWrite.url cannot keep up with the data ingestion rate. "+ + "See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence . See also -remoteWrite.dropSamplesOnOverload") + dropSamplesOnOverload = flag.Bool("remoteWrite.dropSamplesOnOverload", false, "Whether to drop samples when -remoteWrite.disableOnDiskQueue is set and if the samples "+ + "cannot be pushed into the configured -remoteWrite.url systems in a timely manner. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence") ) var ( - // rwctxs contains statically populated entries when -remoteWrite.url is specified. - rwctxs []*remoteWriteCtx + // rwctxsGlobal contains statically populated entries when -remoteWrite.url is specified. + rwctxsGlobal []*remoteWriteCtx // Data without tenant id is written to defaultAuthToken if -enableMultitenantHandlers is specified. defaultAuthToken = &auth.Token{} @@ -109,8 +110,11 @@ var ( StatusCode: http.StatusTooManyRequests, } - // disableOnDiskQueueAll is set to true if all remoteWrite.urls were configured to disable persistent queue via disableOnDiskQueue - disableOnDiskQueueAll bool + // disableOnDiskQueueAny is set to true if at least a single -remoteWrite.url is configured with -remoteWrite.disableOnDiskQueue + disableOnDiskQueueAny bool + + // dropSamplesOnFailureGlobal is set to true if -remoteWrite.dropSamplesOnOverload is set or if multiple -remoteWrite.disableOnDiskQueue options are set. + dropSamplesOnFailureGlobal bool ) // MultitenancyEnabled returns true if -enableMultitenantHandlers is specified. 
@@ -203,28 +207,18 @@ func Init() { relabelConfigSuccess.Set(1) relabelConfigTimestamp.Set(fasttime.UnixTimestamp()) - sasFile, sasOpts := getStreamAggrOpts(-1) - if sasFile != "" { - sas, err := newStreamAggrConfig(-1, pushToRemoteStoragesDropFailed) - if err != nil { - logger.Fatalf("cannot initialize stream aggregators from -streamAggr.config=%q: %s", sasFile, err) - } - sasGlobal.Store(sas) - } else if sasOpts.DedupInterval > 0 { - deduplicatorGlobal = streamaggr.NewDeduplicator(pushToRemoteStoragesDropFailed, sasOpts.DedupInterval, sasOpts.DropInputLabels, sasOpts.Alias) - } + initStreamAggrConfigGlobal() - if len(*remoteWriteURLs) > 0 { - rwctxs = newRemoteWriteCtxs(nil, *remoteWriteURLs) - } + rwctxsGlobal = newRemoteWriteCtxs(nil, *remoteWriteURLs) - disableOnDiskQueueAll = true - for _, v := range *disableOnDiskQueue { - if !v { - disableOnDiskQueueAll = false - break - } - } + disableOnDiskQueues := []bool(*disableOnDiskQueue) + disableOnDiskQueueAny = slices.Contains(disableOnDiskQueues, true) + + // Samples must be dropped if multiple -remoteWrite.disableOnDiskQueue options are configured and at least a single is set to true. + // In this case it is impossible to prevent from sending many duplicates of samples passed to TryPush() to all the configured -remoteWrite.url + // if these samples couldn't be sent to the -remoteWrite.url with the disabled persistent queue. So it is better sending samples + // to the remaining -remoteWrite.url and dropping them on the blocked queue. 
+ dropSamplesOnFailureGlobal = *dropSamplesOnOverload || disableOnDiskQueueAny && len(disableOnDiskQueues) > 1 dropDanglingQueues() @@ -234,9 +228,9 @@ func Init() { defer configReloaderWG.Done() for { select { - case <-sighupCh: case <-configReloaderStopCh: return + case <-sighupCh: } reloadRelabelConfigs() reloadStreamAggrConfigs() @@ -255,8 +249,8 @@ func dropDanglingQueues() { // In case if there were many persistent queues with identical *remoteWriteURLs // the queue with the last index will be dropped. // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6140 - existingQueues := make(map[string]struct{}, len(rwctxs)) - for _, rwctx := range rwctxs { + existingQueues := make(map[string]struct{}, len(rwctxsGlobal)) + for _, rwctx := range rwctxsGlobal { existingQueues[rwctx.fq.Dirname()] = struct{}{} } @@ -273,7 +267,7 @@ func dropDanglingQueues() { } } if removed > 0 { - logger.Infof("removed %d dangling queues from %q, active queues: %d", removed, *tmpDataPath, len(rwctxs)) + logger.Infof("removed %d dangling queues from %q, active queues: %d", removed, *tmpDataPath, len(rwctxsGlobal)) } } @@ -382,10 +376,10 @@ func Stop() { deduplicatorGlobal = nil } - for _, rwctx := range rwctxs { + for _, rwctx := range rwctxsGlobal { rwctx.MustStop() } - rwctxs = nil + rwctxsGlobal = nil if sl := hourlySeriesLimiter; sl != nil { sl.MustStop() @@ -397,6 +391,8 @@ func Stop() { // PushDropSamplesOnFailure pushes wr to the configured remote storage systems set via -remoteWrite.url // +// PushDropSamplesOnFailure drops wr samples if they cannot be sent to -remoteWrite.url by any reason. +// // PushDropSamplesOnFailure can modify wr contents. func PushDropSamplesOnFailure(at *auth.Token, wr *prompbmarshal.WriteRequest) { _ = tryPush(at, wr, true) @@ -409,7 +405,7 @@ func PushDropSamplesOnFailure(at *auth.Token, wr *prompbmarshal.WriteRequest) { // // The caller must return ErrQueueFullHTTPRetry to the client, which sends wr, if TryPush returns false. 
func TryPush(at *auth.Token, wr *prompbmarshal.WriteRequest) bool { - return tryPush(at, wr, false) + return tryPush(at, wr, dropSamplesOnFailureGlobal) } func tryPush(at *auth.Token, wr *prompbmarshal.WriteRequest, forceDropSamplesOnFailure bool) bool { @@ -426,24 +422,20 @@ func tryPush(at *auth.Token, wr *prompbmarshal.WriteRequest, forceDropSamplesOnF tenantRctx = getRelabelCtx() defer putRelabelCtx(tenantRctx) } - rowsCount := getRowsCount(tss) // Quick check whether writes to configured remote storage systems are blocked. // This allows saving CPU time spent on relabeling and block compression // if some of remote storage systems cannot keep up with the data ingestion rate. - // this shortcut is only applicable if all remote writes have disableOnDiskQueue = true - if disableOnDiskQueueAll { - for _, rwctx := range rwctxs { - if rwctx.fq.IsWriteBlocked() { - rwctx.pushFailures.Inc() - if forceDropSamplesOnFailure || rwctx.dropSamplesOnOverload { - // Just drop samples - rwctx.rowsDroppedOnPushFailure.Add(rowsCount) - continue - } - return false - } - } + rwctxs, ok := getEligibleRemoteWriteCtxs(tss, forceDropSamplesOnFailure) + if !ok { + // At least a single remote write queue is blocked and dropSamplesOnFailure isn't set. + // Return false to the caller, so it could re-send samples again. + return false + } + if len(rwctxs) == 0 { + // All the remote write queues are skipped because they are blocked and dropSamplesOnFailure is set to true. + // Return true to the caller, so it doesn't re-send the samples again. + return true } var rctx *relabelCtx @@ -453,6 +445,7 @@ func tryPush(at *auth.Token, wr *prompbmarshal.WriteRequest, forceDropSamplesOnF rctx = getRelabelCtx() defer putRelabelCtx(rctx) } + rowsCount := getRowsCount(tss) globalRowsPushedBeforeRelabel.Add(rowsCount) maxSamplesPerBlock := *maxRowsPerBlock // Allow up to 10x of labels per each block on average. 
@@ -505,20 +498,46 @@ func tryPush(at *auth.Token, wr *prompbmarshal.WriteRequest, forceDropSamplesOnF deduplicatorGlobal.Push(tssBlock) tssBlock = tssBlock[:0] } - if !tryPushBlockToRemoteStorages(tssBlock, forceDropSamplesOnFailure) { + if !tryPushBlockToRemoteStorages(rwctxs, tssBlock, forceDropSamplesOnFailure) { return false } } return true } -func pushToRemoteStoragesDropFailed(tss []prompbmarshal.TimeSeries) { - if tryPushBlockToRemoteStorages(tss, true) { +func getEligibleRemoteWriteCtxs(tss []prompbmarshal.TimeSeries, forceDropSamplesOnFailure bool) ([]*remoteWriteCtx, bool) { + if !disableOnDiskQueueAny { + return rwctxsGlobal, true + } + + // This code is applicable if at least a single remote storage has -disableOnDiskQueue + rwctxs := make([]*remoteWriteCtx, 0, len(rwctxsGlobal)) + for _, rwctx := range rwctxsGlobal { + if !rwctx.fq.IsWriteBlocked() { + rwctxs = append(rwctxs, rwctx) + } else { + rwctx.pushFailures.Inc() + if !forceDropSamplesOnFailure { + return nil, false + } + rowsCount := getRowsCount(tss) + rwctx.rowsDroppedOnPushFailure.Add(rowsCount) + } + } + return rwctxs, true +} + +func pushToRemoteStoragesTrackDropped(tss []prompbmarshal.TimeSeries) { + rwctxs, _ := getEligibleRemoteWriteCtxs(tss, true) + if len(rwctxs) == 0 { return } + if !tryPushBlockToRemoteStorages(rwctxs, tss, true) { + logger.Panicf("BUG: tryPushBlockToRemoteStorages() must return true when forceDropSamplesOnFailure=true") + } } -func tryPushBlockToRemoteStorages(tssBlock []prompbmarshal.TimeSeries, forceDropSamplesOnFailure bool) bool { +func tryPushBlockToRemoteStorages(rwctxs []*remoteWriteCtx, tssBlock []prompbmarshal.TimeSeries, forceDropSamplesOnFailure bool) bool { if len(tssBlock) == 0 { // Nothing to push return true @@ -537,7 +556,7 @@ func tryPushBlockToRemoteStorages(tssBlock []prompbmarshal.TimeSeries, forceDrop if replicas <= 0 { replicas = 1 } - return tryShardingBlockAmongRemoteStorages(tssBlock, replicas, forceDropSamplesOnFailure) + return 
tryShardingBlockAmongRemoteStorages(rwctxs, tssBlock, replicas, forceDropSamplesOnFailure) } // Replicate tssBlock samples among rwctxs. @@ -558,7 +577,7 @@ func tryPushBlockToRemoteStorages(tssBlock []prompbmarshal.TimeSeries, forceDrop return !anyPushFailed.Load() } -func tryShardingBlockAmongRemoteStorages(tssBlock []prompbmarshal.TimeSeries, replicas int, forceDropSamplesOnFailure bool) bool { +func tryShardingBlockAmongRemoteStorages(rwctxs []*remoteWriteCtx, tssBlock []prompbmarshal.TimeSeries, replicas int, forceDropSamplesOnFailure bool) bool { x := getTSSShards(len(rwctxs)) defer putTSSShards(x) @@ -745,10 +764,8 @@ type remoteWriteCtx struct { sas atomic.Pointer[streamaggr.Aggregators] deduplicator *streamaggr.Deduplicator - streamAggrKeepInput bool - streamAggrDropInput bool - disableOnDiskQueue bool - dropSamplesOnOverload bool + streamAggrKeepInput bool + streamAggrDropInput bool pss []*pendingSeries pssNextIdx atomic.Uint64 @@ -773,6 +790,7 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL *url.URL, maxInmemoryBlocks in logger.Warnf("rounding the -remoteWrite.maxDiskUsagePerURL=%d to the minimum supported value: %d", maxPendingBytes, persistentqueue.DefaultChunkFileSize) maxPendingBytes = persistentqueue.DefaultChunkFileSize } + isPQDisabled := disableOnDiskQueue.GetOptionalArg(argIdx) fq := persistentqueue.MustOpenFastQueue(queuePath, sanitizedURL, maxInmemoryBlocks, maxPendingBytes, isPQDisabled) _ = metrics.GetOrCreateGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, queuePath, sanitizedURL), func() float64 { @@ -817,31 +835,13 @@ func newRemoteWriteCtx(argIdx int, remoteWriteURL *url.URL, maxInmemoryBlocks in c: c, pss: pss, - dropSamplesOnOverload: dropSamplesOnOverload.GetOptionalArg(argIdx), - disableOnDiskQueue: isPQDisabled, + rowsPushedAfterRelabel: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_rows_pushed_after_relabel_total{path=%q,url=%q}`, queuePath, sanitizedURL)), + rowsDroppedByRelabel: 
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q,url=%q}`, queuePath, sanitizedURL)), - rowsPushedAfterRelabel: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_rows_pushed_after_relabel_total{path=%q, url=%q}`, queuePath, sanitizedURL)), - rowsDroppedByRelabel: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, queuePath, sanitizedURL)), - - pushFailures: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_push_failures_total{path=%q, url=%q}`, queuePath, sanitizedURL)), - rowsDroppedOnPushFailure: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_samples_dropped_total{path=%q, url=%q}`, queuePath, sanitizedURL)), - } - - // Initialize sas - sasFile, sasOpts := getStreamAggrOpts(argIdx) - if sasFile != "" { - sas, err := newStreamAggrConfig(argIdx, rwctx.pushInternalTrackDropped) - if err != nil { - logger.Fatalf("cannot initialize stream aggregators from -remoteWrite.streamAggr.config=%q: %s", sasFile, err) - } - rwctx.sas.Store(sas) - rwctx.streamAggrKeepInput = streamAggrKeepInput.GetOptionalArg(argIdx) - rwctx.streamAggrDropInput = streamAggrDropInput.GetOptionalArg(argIdx) - metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, sasFile)).Set(1) - metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, sasFile)).Set(fasttime.UnixTimestamp()) - } else if sasOpts.DedupInterval > 0 { - rwctx.deduplicator = streamaggr.NewDeduplicator(rwctx.pushInternalTrackDropped, sasOpts.DedupInterval, sasOpts.DropInputLabels, sasOpts.Alias) + pushFailures: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_push_failures_total{path=%q,url=%q}`, queuePath, sanitizedURL)), + rowsDroppedOnPushFailure: metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_samples_dropped_total{path=%q,url=%q}`, queuePath, sanitizedURL)), } + 
rwctx.initStreamAggrConfig() return rwctx } @@ -934,8 +934,9 @@ func (rwctx *remoteWriteCtx) TryPush(tss []prompbmarshal.TimeSeries, forceDropSa // Couldn't push tss to remote storage rwctx.pushFailures.Inc() - if forceDropSamplesOnFailure || rwctx.dropSamplesOnOverload { - rwctx.rowsDroppedOnPushFailure.Add(len(tss)) + if forceDropSamplesOnFailure { + rowsCount := getRowsCount(tss) + rwctx.rowsDroppedOnPushFailure.Add(rowsCount) return true } return false @@ -962,14 +963,12 @@ func (rwctx *remoteWriteCtx) pushInternalTrackDropped(tss []prompbmarshal.TimeSe if rwctx.tryPushInternal(tss) { return } - if !rwctx.disableOnDiskQueue { + if !rwctx.fq.IsPersistentQueueDisabled() { logger.Panicf("BUG: tryPushInternal must return true if -remoteWrite.disableOnDiskQueue isn't set") } rwctx.pushFailures.Inc() - if dropSamplesOnOverload.GetOptionalArg(rwctx.idx) { - rowsCount := getRowsCount(tss) - rwctx.rowsDroppedOnPushFailure.Add(rowsCount) - } + rowsCount := getRowsCount(tss) + rwctx.rowsDroppedOnPushFailure.Add(rowsCount) } func (rwctx *remoteWriteCtx) tryPushInternal(tss []prompbmarshal.TimeSeries) bool { diff --git a/app/vmagent/remotewrite/remotewrite_test.go b/app/vmagent/remotewrite/remotewrite_test.go index d86b20eda..01a9fc1f2 100644 --- a/app/vmagent/remotewrite/remotewrite_test.go +++ b/app/vmagent/remotewrite/remotewrite_test.go @@ -77,14 +77,16 @@ func TestRemoteWriteContext_TryPush_ImmutableTimeseries(t *testing.T) { rowsDroppedByRelabel: metrics.GetOrCreateCounter(`bar`), } if dedupInterval > 0 { - rwctx.deduplicator = streamaggr.NewDeduplicator(nil, dedupInterval, nil, "global") + rwctx.deduplicator = streamaggr.NewDeduplicator(nil, dedupInterval, nil, "dedup-global") } - if len(streamAggrConfig) > 0 { - sas, err := streamaggr.LoadFromData([]byte(streamAggrConfig), nil, streamaggr.Options{}) + if streamAggrConfig != "" { + pushNoop := func(_ []prompbmarshal.TimeSeries) {} + sas, err := streamaggr.LoadFromData([]byte(streamAggrConfig), pushNoop, nil, 
"global") if err != nil { t.Fatalf("cannot load streamaggr configs: %s", err) } + defer sas.MustStop() rwctx.sas.Store(sas) } @@ -94,7 +96,9 @@ func TestRemoteWriteContext_TryPush_ImmutableTimeseries(t *testing.T) { // copy inputTss to make sure it is not mutated during TryPush call copy(expectedTss, inputTss) - rwctx.TryPush(inputTss, false) + if !rwctx.TryPush(inputTss, false) { + t.Fatalf("cannot push samples to rwctx") + } if !reflect.DeepEqual(expectedTss, inputTss) { t.Fatalf("unexpected samples;\ngot\n%v\nwant\n%v", inputTss, expectedTss) diff --git a/app/vmagent/remotewrite/streamaggr.go b/app/vmagent/remotewrite/streamaggr.go index c1cba412e..0ebc2e624 100644 --- a/app/vmagent/remotewrite/streamaggr.go +++ b/app/vmagent/remotewrite/streamaggr.go @@ -61,104 +61,180 @@ var ( // CheckStreamAggrConfigs checks -remoteWrite.streamAggr.config and -streamAggr.config. func CheckStreamAggrConfigs() error { - pushNoop := func(_ []prompbmarshal.TimeSeries) {} + // Check global config + sas, err := newStreamAggrConfigGlobal() + if err != nil { + return err + } + sas.MustStop() - if _, err := newStreamAggrConfig(-1, pushNoop); err != nil { - return fmt.Errorf("could not load -streamAggr.config stream aggregation config: %w", err) - } if len(*streamAggrConfig) > len(*remoteWriteURLs) { - return fmt.Errorf("too many -remoteWrite.streamAggr.config args: %d; it mustn't exceed the number of -remoteWrite.url args: %d", - len(*streamAggrConfig), len(*remoteWriteURLs)) + return fmt.Errorf("too many -remoteWrite.streamAggr.config args: %d; it mustn't exceed the number of -remoteWrite.url args: %d", len(*streamAggrConfig), len(*remoteWriteURLs)) } + + pushNoop := func(_ []prompbmarshal.TimeSeries) {} for idx := range *streamAggrConfig { - if _, err := newStreamAggrConfig(idx, pushNoop); err != nil { + sas, err := newStreamAggrConfigPerURL(idx, pushNoop) + if err != nil { return err } + sas.MustStop() } return nil } func reloadStreamAggrConfigs() { - reloadStreamAggrConfig(-1, 
pushToRemoteStoragesDropFailed) - for idx, rwctx := range rwctxs { - reloadStreamAggrConfig(idx, rwctx.pushInternalTrackDropped) + reloadStreamAggrConfigGlobal() + for _, rwctx := range rwctxsGlobal { + rwctx.reloadStreamAggrConfig() } } -func reloadStreamAggrConfig(idx int, pushFunc streamaggr.PushFunc) { - path, opts := getStreamAggrOpts(idx) - logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", path) - metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, path)).Inc() - - sasNew, err := newStreamAggrConfigWithOpts(pushFunc, path, opts) - if err != nil { - metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, path)).Inc() - metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, path)).Set(0) - logger.Errorf("cannot reload stream aggregation config at %q; continue using the previously loaded config; error: %s", path, err) +func reloadStreamAggrConfigGlobal() { + path := *streamAggrGlobalConfig + if path == "" { return } - var sas *streamaggr.Aggregators - if idx < 0 { - sas = sasGlobal.Load() - } else { - sas = rwctxs[idx].sas.Load() + logger.Infof("reloading stream aggregation configs pointed by -streamAggr.config=%q", path) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, path)).Inc() + + sasNew, err := newStreamAggrConfigGlobal() + if err != nil { + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, path)).Inc() + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, path)).Set(0) + logger.Errorf("cannot reload -streamAggr.config=%q; continue using the previously loaded config; error: %s", path, err) + return } + sas := sasGlobal.Load() if !sasNew.Equal(sas) { - var sasOld *streamaggr.Aggregators - if idx < 0 { - sasOld = sasGlobal.Swap(sasNew) - } else { - sasOld = 
rwctxs[idx].sas.Swap(sasNew) - } + sasOld := sasGlobal.Swap(sasNew) sasOld.MustStop() - logger.Infof("successfully reloaded stream aggregation configs at %q", path) + logger.Infof("successfully reloaded -streamAggr.config=%q", path) } else { sasNew.MustStop() - logger.Infof("successfully reloaded stream aggregation configs at %q", path) + logger.Infof("-streamAggr.config=%q wasn't changed since the last reload", path) } metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, path)).Set(1) metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, path)).Set(fasttime.UnixTimestamp()) } -func getStreamAggrOpts(idx int) (string, streamaggr.Options) { - if idx < 0 { - return *streamAggrGlobalConfig, streamaggr.Options{ - DedupInterval: streamAggrGlobalDedupInterval.Duration(), - DropInputLabels: *streamAggrGlobalDropInputLabels, - IgnoreOldSamples: *streamAggrGlobalIgnoreOldSamples, - IgnoreFirstIntervals: *streamAggrGlobalIgnoreFirstIntervals, - Alias: "global", +func initStreamAggrConfigGlobal() { + sas, err := newStreamAggrConfigGlobal() + if err != nil { + logger.Fatalf("cannot initialize gloabl stream aggregators: %s", err) + } + if sas != nil { + filePath := sas.FilePath() + sasGlobal.Store(sas) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, filePath)).Set(1) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, filePath)).Set(fasttime.UnixTimestamp()) + } else { + dedupInterval := streamAggrGlobalDedupInterval.Duration() + if dedupInterval > 0 { + deduplicatorGlobal = streamaggr.NewDeduplicator(pushToRemoteStoragesTrackDropped, dedupInterval, *streamAggrDropInputLabels, "dedup-global") } } - url := fmt.Sprintf("%d:secret-url", idx+1) - if *showRemoteWriteURL { - url = fmt.Sprintf("%d:%s", idx+1, remoteWriteURLs.GetOptionalArg(idx)) +} + +func (rwctx *remoteWriteCtx) 
initStreamAggrConfig() { + idx := rwctx.idx + + sas, err := rwctx.newStreamAggrConfig() + if err != nil { + logger.Fatalf("cannot initialize stream aggregators: %s", err) } - opts := streamaggr.Options{ + if sas != nil { + filePath := sas.FilePath() + rwctx.sas.Store(sas) + rwctx.streamAggrKeepInput = streamAggrKeepInput.GetOptionalArg(idx) + rwctx.streamAggrDropInput = streamAggrDropInput.GetOptionalArg(idx) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, filePath)).Set(1) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, filePath)).Set(fasttime.UnixTimestamp()) + } else { + dedupInterval := streamAggrDedupInterval.GetOptionalArg(idx) + if dedupInterval > 0 { + alias := fmt.Sprintf("dedup-%d", idx+1) + rwctx.deduplicator = streamaggr.NewDeduplicator(rwctx.pushInternalTrackDropped, dedupInterval, *streamAggrDropInputLabels, alias) + } + } +} + +func (rwctx *remoteWriteCtx) reloadStreamAggrConfig() { + path := streamAggrConfig.GetOptionalArg(rwctx.idx) + if path == "" { + return + } + + logger.Infof("reloading stream aggregation configs pointed by -remoteWrite.streamAggr.config=%q", path) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_total{path=%q}`, path)).Inc() + + sasNew, err := rwctx.newStreamAggrConfig() + if err != nil { + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reloads_errors_total{path=%q}`, path)).Inc() + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, path)).Set(0) + logger.Errorf("cannot reload -remoteWrite.streamAggr.config=%q; continue using the previously loaded config; error: %s", path, err) + return + } + + sas := rwctx.sas.Load() + if !sasNew.Equal(sas) { + sasOld := rwctx.sas.Swap(sasNew) + sasOld.MustStop() + logger.Infof("successfully reloaded -remoteWrite.streamAggr.config=%q", path) + } else { + sasNew.MustStop() + 
logger.Infof("-remoteWrite.streamAggr.config=%q wasn't changed since the last reload", path) + } + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_successful{path=%q}`, path)).Set(1) + metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_streamaggr_config_reload_success_timestamp_seconds{path=%q}`, path)).Set(fasttime.UnixTimestamp()) +} + +func newStreamAggrConfigGlobal() (*streamaggr.Aggregators, error) { + path := *streamAggrGlobalConfig + if path == "" { + return nil, nil + } + + opts := &streamaggr.Options{ + DedupInterval: streamAggrGlobalDedupInterval.Duration(), + DropInputLabels: *streamAggrGlobalDropInputLabels, + IgnoreOldSamples: *streamAggrGlobalIgnoreOldSamples, + IgnoreFirstIntervals: *streamAggrGlobalIgnoreFirstIntervals, + } + + sas, err := streamaggr.LoadFromFile(path, pushToRemoteStoragesTrackDropped, opts, "global") + if err != nil { + return nil, fmt.Errorf("cannot load -streamAggr.config=%q: %w", *streamAggrGlobalConfig, err) + } + return sas, nil +} + +func (rwctx *remoteWriteCtx) newStreamAggrConfig() (*streamaggr.Aggregators, error) { + return newStreamAggrConfigPerURL(rwctx.idx, rwctx.pushInternalTrackDropped) +} + +func newStreamAggrConfigPerURL(idx int, pushFunc streamaggr.PushFunc) (*streamaggr.Aggregators, error) { + path := streamAggrConfig.GetOptionalArg(idx) + if path == "" { + return nil, nil + } + + alias := fmt.Sprintf("%d:secret-url", idx+1) + if *showRemoteWriteURL { + alias = fmt.Sprintf("%d:%s", idx+1, remoteWriteURLs.GetOptionalArg(idx)) + } + opts := &streamaggr.Options{ DedupInterval: streamAggrDedupInterval.GetOptionalArg(idx), DropInputLabels: *streamAggrDropInputLabels, IgnoreOldSamples: streamAggrIgnoreOldSamples.GetOptionalArg(idx), IgnoreFirstIntervals: *streamAggrIgnoreFirstIntervals, - Alias: url, } - if len(*streamAggrConfig) == 0 { - return "", opts + sas, err := streamaggr.LoadFromFile(path, pushFunc, opts, alias) + if err != nil { + return nil, fmt.Errorf("cannot load 
-remoteWrite.streamAggr.config=%q: %w", path, err) } - return streamAggrConfig.GetOptionalArg(idx), opts -} - -func newStreamAggrConfigWithOpts(pushFunc streamaggr.PushFunc, path string, opts streamaggr.Options) (*streamaggr.Aggregators, error) { - if len(path) == 0 { - // Skip empty stream aggregation config. - return nil, nil - } - return streamaggr.LoadFromFile(path, pushFunc, opts) -} - -func newStreamAggrConfig(idx int, pushFunc streamaggr.PushFunc) (*streamaggr.Aggregators, error) { - path, opts := getStreamAggrOpts(idx) - return newStreamAggrConfigWithOpts(pushFunc, path, opts) + return sas, nil } diff --git a/app/vmalert-tool/Makefile b/app/vmalert-tool/Makefile index dbb6b373a..d2a5e32b0 100644 --- a/app/vmalert-tool/Makefile +++ b/app/vmalert-tool/Makefile @@ -81,6 +81,9 @@ vmalert-tool-linux-ppc64le: vmalert-tool-linux-s390x: APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch +vmalert-tool-linux-loong64: + APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=loong64 $(MAKE) app-local-goos-goarch + vmalert-tool-linux-386: APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch diff --git a/app/vmalert-tool/unittest/input_test.go b/app/vmalert-tool/unittest/input_test.go index 8ce90782d..6d6ccda3e 100644 --- a/app/vmalert-tool/unittest/input_test.go +++ b/app/vmalert-tool/unittest/input_test.go @@ -6,88 +6,61 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" ) -func TestParseInputValue(t *testing.T) { - testCases := []struct { - input string - exp []sequenceValue - failed bool - }{ - { - "", - nil, - true, - }, - { - "testfailed", - nil, - true, - }, - // stale doesn't support operations - { - "stalex3", - nil, - true, - }, - { - "-4", - []sequenceValue{{Value: -4}}, - false, - }, - { - "_", - []sequenceValue{{Omitted: true}}, - false, - }, - { - "stale", - []sequenceValue{{Value: decimal.StaleNaN}}, - false, - }, - { - "-4x1", - []sequenceValue{{Value: -4}, {Value: 
-4}}, - false, - }, - { - "_x1", - []sequenceValue{{Omitted: true}}, - false, - }, - { - "1+1x2 0.1 0.1+0.3x2 3.14", - []sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 0.1}, {Value: 0.1}, {Value: 0.4}, {Value: 0.7}, {Value: 3.14}}, - false, - }, - { - "2-1x4", - []sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}}, - false, - }, - { - "1+1x1 _ -4 stale 3+20x1", - []sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}}, - false, - }, +func TestParseInputValue_Failure(t *testing.T) { + f := func(input string) { + t.Helper() + + _, err := parseInputValue(input, true) + if err == nil { + t.Fatalf("expecting non-nil error") + } } - for _, tc := range testCases { - output, err := parseInputValue(tc.input, true) - if err != nil != tc.failed { - t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil) + f("") + f("testfailed") + + // stale doesn't support operations + f("stalex3") +} + +func TestParseInputValue_Success(t *testing.T) { + f := func(input string, outputExpected []sequenceValue) { + t.Helper() + + output, err := parseInputValue(input, true) + if err != nil { + t.Fatalf("unexpected error in parseInputValue: %s", err) } - if len(tc.exp) != len(output) { - t.Fatalf("expect %v, got %v", tc.exp, output) + + if len(outputExpected) != len(output) { + t.Fatalf("unexpected output length; got %d; want %d", len(outputExpected), len(output)) } - for i := 0; i < len(tc.exp); i++ { - if tc.exp[i].Omitted != output[i].Omitted { - t.Fatalf("expect %v, got %v", tc.exp, output) + for i := 0; i < len(outputExpected); i++ { + if outputExpected[i].Omitted != output[i].Omitted { + t.Fatalf("unexpected Omitted field in the output\ngot\n%v\nwant\n%v", output, outputExpected) } - if tc.exp[i].Value != output[i].Value { - if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) { + if outputExpected[i].Value != output[i].Value { + if 
decimal.IsStaleNaN(outputExpected[i].Value) && decimal.IsStaleNaN(output[i].Value) { continue } - t.Fatalf("expect %v, got %v", tc.exp, output) + t.Fatalf("unexpeccted Value field in the output\ngot\n%v\nwant\n%v", output, outputExpected) } } } + + f("-4", []sequenceValue{{Value: -4}}) + + f("_", []sequenceValue{{Omitted: true}}) + + f("stale", []sequenceValue{{Value: decimal.StaleNaN}}) + + f("-4x1", []sequenceValue{{Value: -4}, {Value: -4}}) + + f("_x1", []sequenceValue{{Omitted: true}}) + + f("1+1x2 0.1 0.1+0.3x2 3.14", []sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 0.1}, {Value: 0.1}, {Value: 0.4}, {Value: 0.7}, {Value: 3.14}}) + + f("2-1x4", []sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}}) + + f("1+1x1 _ -4 stale 3+20x1", []sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}}) } diff --git a/app/vmalert-tool/unittest/unittest_test.go b/app/vmalert-tool/unittest/unittest_test.go index c2c014d6e..6fcec96cf 100644 --- a/app/vmalert-tool/unittest/unittest_test.go +++ b/app/vmalert-tool/unittest/unittest_test.go @@ -14,34 +14,33 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } -func TestUnitRule(t *testing.T) { - testCases := []struct { - name string - disableGroupLabel bool - files []string - failed bool - }{ - { - name: "run multi files", - files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"}, - failed: false, - }, - { - name: "disable group label", - disableGroupLabel: true, - files: []string{"./testdata/disable-group-label.yaml"}, - failed: false, - }, - { - name: "failing test", - files: []string{"./testdata/failed-test.yaml"}, - failed: true, - }, - } - for _, tc := range testCases { - fail := UnitTest(tc.files, tc.disableGroupLabel) - if fail != tc.failed { - t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail) +func TestUnitTest_Failure(t *testing.T) { + f := func(files []string) { + t.Helper() + + failed := 
UnitTest(files, false) + if !failed { + t.Fatalf("expecting failed test") } } + + // failing test + f([]string{"./testdata/failed-test.yaml"}) +} + +func TestUnitTest_Success(t *testing.T) { + f := func(disableGroupLabel bool, files []string) { + t.Helper() + + failed := UnitTest(files, disableGroupLabel) + if failed { + t.Fatalf("unexpected failed test") + } + } + + // run multi files + f(false, []string{"./testdata/test1.yaml", "./testdata/test2.yaml"}) + + // disable group label + f(true, []string{"./testdata/disable-group-label.yaml"}) } diff --git a/app/vmalert/config/config_test.go b/app/vmalert/config/config_test.go index c9896be1c..68184002c 100644 --- a/app/vmalert/config/config_test.go +++ b/app/vmalert/config/config_test.go @@ -23,12 +23,6 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } -func TestParseGood(t *testing.T) { - if _, err := Parse([]string{"testdata/rules/*good.rules", "testdata/dir/*good.*"}, notifier.ValidateTemplates, true); err != nil { - t.Errorf("error parsing files %s", err) - } -} - func TestParseFromURL(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/bad", func(w http.ResponseWriter, _ *http.Request) { @@ -55,438 +49,353 @@ groups: defer srv.Close() if _, err := Parse([]string{srv.URL + "/good-alert", srv.URL + "/good-rr"}, notifier.ValidateTemplates, true); err != nil { - t.Errorf("error parsing URLs %s", err) + t.Fatalf("error parsing URLs %s", err) } if _, err := Parse([]string{srv.URL + "/bad"}, notifier.ValidateTemplates, true); err == nil { - t.Errorf("expected parsing error: %s", err) + t.Fatalf("expected parsing error: %s", err) } } -func TestParseBad(t *testing.T) { - testCases := []struct { - path []string - expErr string - }{ - { - []string{"testdata/rules/rules_interval_bad.rules"}, - "eval_offset should be smaller than interval", - }, - { - []string{"testdata/rules/rules0-bad.rules"}, - "unexpected token", - }, - { - []string{"testdata/dir/rules0-bad.rules"}, - "error parsing annotation", - }, - { - 
[]string{"testdata/dir/rules1-bad.rules"}, - "duplicate in file", - }, - { - []string{"testdata/dir/rules2-bad.rules"}, - "function \"unknown\" not defined", - }, - { - []string{"testdata/dir/rules3-bad.rules"}, - "either `record` or `alert` must be set", - }, - { - []string{"testdata/dir/rules4-bad.rules"}, - "either `record` or `alert` must be set", - }, - { - []string{"testdata/rules/rules1-bad.rules"}, - "bad graphite expr", - }, - { - []string{"testdata/dir/rules6-bad.rules"}, - "missing ':' in header", - }, - { - []string{"http://unreachable-url"}, - "failed to", - }, +func TestParse_Success(t *testing.T) { + _, err := Parse([]string{"testdata/rules/*good.rules", "testdata/dir/*good.*"}, notifier.ValidateTemplates, true) + if err != nil { + t.Fatalf("error parsing files %s", err) } - for _, tc := range testCases { - _, err := Parse(tc.path, notifier.ValidateTemplates, true) +} + +func TestParse_Failure(t *testing.T) { + f := func(paths []string, errStrExpected string) { + t.Helper() + + _, err := Parse(paths, notifier.ValidateTemplates, true) if err == nil { - t.Errorf("expected to get error") - return + t.Fatalf("expected to get error") } - if !strings.Contains(err.Error(), tc.expErr) { - t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err) + if !strings.Contains(err.Error(), errStrExpected) { + t.Fatalf("expected err to contain %q; got %q instead", errStrExpected, err) } } + + f([]string{"testdata/rules/rules_interval_bad.rules"}, "eval_offset should be smaller than interval") + f([]string{"testdata/rules/rules0-bad.rules"}, "unexpected token") + f([]string{"testdata/dir/rules0-bad.rules"}, "error parsing annotation") + f([]string{"testdata/dir/rules1-bad.rules"}, "duplicate in file") + f([]string{"testdata/dir/rules2-bad.rules"}, "function \"unknown\" not defined") + f([]string{"testdata/dir/rules3-bad.rules"}, "either `record` or `alert` must be set") + f([]string{"testdata/dir/rules4-bad.rules"}, "either `record` or `alert` must be 
set") + f([]string{"testdata/rules/rules1-bad.rules"}, "bad graphite expr") + f([]string{"testdata/dir/rules6-bad.rules"}, "missing ':' in header") + f([]string{"http://unreachable-url"}, "failed to") } -func TestRule_Validate(t *testing.T) { +func TestRuleValidate(t *testing.T) { if err := (&Rule{}).Validate(); err == nil { - t.Errorf("expected empty name error") + t.Fatalf("expected empty name error") } if err := (&Rule{Alert: "alert"}).Validate(); err == nil { - t.Errorf("expected empty expr error") + t.Fatalf("expected empty expr error") } if err := (&Rule{Alert: "alert", Expr: "test>0"}).Validate(); err != nil { - t.Errorf("expected valid rule; got %s", err) + t.Fatalf("expected valid rule; got %s", err) } } -func TestGroup_Validate(t *testing.T) { - testCases := []struct { - group *Group - rules []Rule - validateAnnotations bool - validateExpressions bool - expErr string - }{ - { - group: &Group{}, - expErr: "group name must be set", - }, - { - group: &Group{ - Name: "negative interval", - Interval: promutils.NewDuration(-1), +func TestGroupValidate_Failure(t *testing.T) { + f := func(group *Group, validateExpressions bool, errStrExpected string) { + t.Helper() + + err := group.Validate(nil, validateExpressions) + if err == nil { + t.Fatalf("expecting non-nil error") + } + errStr := err.Error() + if !strings.Contains(errStr, errStrExpected) { + t.Fatalf("missing %q in the returned error %q", errStrExpected, errStr) + } + } + + f(&Group{}, false, "group name must be set") + + f(&Group{ + Name: "negative interval", + Interval: promutils.NewDuration(-1), + }, false, "interval shouldn't be lower than 0") + + f(&Group{ + Name: "wrong eval_offset", + Interval: promutils.NewDuration(time.Minute), + EvalOffset: promutils.NewDuration(2 * time.Minute), + }, false, "eval_offset should be smaller than interval") + + f(&Group{ + Name: "wrong limit", + Limit: -1, + }, false, "invalid limit") + + f(&Group{ + Name: "wrong concurrency", + Concurrency: -1, + }, false, "invalid 
concurrency") + + f(&Group{ + Name: "test", + Rules: []Rule{ + { + Alert: "alert", + Expr: "up == 1", }, - expErr: "interval shouldn't be lower than 0", - }, - { - group: &Group{ - Name: "wrong eval_offset", - Interval: promutils.NewDuration(time.Minute), - EvalOffset: promutils.NewDuration(2 * time.Minute), + { + Alert: "alert", + Expr: "up == 1", }, - expErr: "eval_offset should be smaller than interval", }, - { - group: &Group{ - Name: "wrong limit", - Limit: -1, + }, false, "duplicate") + + f(&Group{ + Name: "test", + Rules: []Rule{ + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + }, + }, false, "duplicate") + + f(&Group{ + Name: "test", + Rules: []Rule{ + {Record: "record", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + {Record: "record", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + }, + }, false, "duplicate") + + f(&Group{ + Name: "test", + Rules: []Rule{ + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "description": "{{ value|query }}", + }}, + }, + }, false, "duplicate") + + f(&Group{ + Name: "test", + Rules: []Rule{ + {Record: "alert", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "summary": "{{ value|query }}", + }}, + }, + }, false, "duplicate") + + f(&Group{ + Name: "test graphite prometheus bad expr", + Type: NewGraphiteType(), + Rules: []Rule{ + { + Expr: "sum(up == 0 ) by (host)", + For: promutils.NewDuration(10 * time.Millisecond), }, - expErr: "invalid limit", - }, - { - group: &Group{ - Name: "wrong concurrency", - Concurrency: -1, + { + Expr: "sumSeries(time('foo.bar',10))", }, - expErr: 
"invalid concurrency", }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - { - Record: "record", - Expr: "up | 0", - }, + }, false, "invalid rule") + + f(&Group{ + Name: "test graphite inherit", + Type: NewGraphiteType(), + Rules: []Rule{ + { + Expr: "sumSeries(time('foo.bar',10))", + For: promutils.NewDuration(10 * time.Millisecond), + }, + { + Expr: "sum(up == 0 ) by (host)", + }, + }, + }, false, "either `record` or `alert` must be set") + + // validate expressions + f(&Group{ + Name: "test", + Rules: []Rule{ + { + Record: "record", + Expr: "up | 0", + }, + }, + }, true, "invalid expression") + + f(&Group{ + Name: "test thanos", + Type: NewRawType("thanos"), + Rules: []Rule{ + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "description": "{{ value|query }}", + }}, + }, + }, true, "unknown datasource type") + + f(&Group{ + Name: "test graphite", + Type: NewGraphiteType(), + Rules: []Rule{ + {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "description": "some-description", + }}, + }, + }, true, "bad graphite expr") +} + +func TestGroupValidate_Success(t *testing.T) { + f := func(group *Group, validateAnnotations, validateExpressions bool) { + t.Helper() + + var validateTplFn ValidateTplFn + if validateAnnotations { + validateTplFn = notifier.ValidateTemplates + } + err := group.Validate(validateTplFn, validateExpressions) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + } + + f(&Group{ + Name: "test", + Rules: []Rule{ + { + Record: "record", + Expr: "up | 0", + }, + }, + }, false, false) + + f(&Group{ + Name: "test", + Rules: []Rule{ + { + Alert: "alert", + Expr: "up == 1", + Labels: map[string]string{ + "summary": "{{ value|query }}", }, }, - expErr: "", }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - { - Record: "record", - Expr: "up | 0", - }, - }, - }, - expErr: "invalid expression", - validateExpressions: true, - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - { - Alert: "alert", - 
Expr: "up == 1", - Labels: map[string]string{ - "summary": "{{ value|query }}", - }, - }, - }, - }, - expErr: "", - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - { - Alert: "alert", - Expr: "up == 1", - Labels: map[string]string{ - "summary": ` + }, false, false) + + // validate annotiations + f(&Group{ + Name: "test", + Rules: []Rule{ + { + Alert: "alert", + Expr: "up == 1", + Labels: map[string]string{ + "summary": ` {{ with printf "node_memory_MemTotal{job='node',instance='%s'}" "localhost" | query }} {{ . | first | value | humanize1024 }}B {{ end }}`, - }, - }, - }, - }, - validateAnnotations: true, - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - { - Alert: "alert", - Expr: "up == 1", - }, - { - Alert: "alert", - Expr: "up == 1", - }, - }, - }, - expErr: "duplicate", - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - }, - }, - expErr: "duplicate", - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - {Record: "record", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - {Record: "record", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - }, - }, - expErr: "duplicate", - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "description": "{{ value|query }}", - }}, - }, - }, - expErr: "", - }, - { - group: &Group{ - Name: "test", - Rules: []Rule{ - {Record: "alert", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "summary": "{{ value|query }}", - }}, - }, - }, - expErr: "", - }, - { 
- group: &Group{ - Name: "test thanos", - Type: NewRawType("thanos"), - Rules: []Rule{ - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "description": "{{ value|query }}", - }}, - }, - }, - validateExpressions: true, - expErr: "unknown datasource type", - }, - { - group: &Group{ - Name: "test graphite", - Type: NewGraphiteType(), - Rules: []Rule{ - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "description": "some-description", - }}, - }, - }, - validateExpressions: true, - expErr: "", - }, - { - group: &Group{ - Name: "test prometheus", - Type: NewPrometheusType(), - Rules: []Rule{ - {Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "description": "{{ value|query }}", - }}, - }, - }, - validateExpressions: true, - expErr: "", - }, - { - group: &Group{ - Name: "test graphite inherit", - Type: NewGraphiteType(), - Rules: []Rule{ - { - Expr: "sumSeries(time('foo.bar',10))", - For: promutils.NewDuration(10 * time.Millisecond), - }, - { - Expr: "sum(up == 0 ) by (host)", - }, }, }, }, - { - group: &Group{ - Name: "test graphite prometheus bad expr", - Type: NewGraphiteType(), - Rules: []Rule{ - { - Expr: "sum(up == 0 ) by (host)", - For: promutils.NewDuration(10 * time.Millisecond), - }, - { - Expr: "sumSeries(time('foo.bar',10))", - }, - }, - }, - expErr: "invalid rule", - }, - } + }, true, false) - for _, tc := range testCases { - var validateTplFn ValidateTplFn - if tc.validateAnnotations { - validateTplFn = notifier.ValidateTemplates - } - err := tc.group.Validate(validateTplFn, tc.validateExpressions) - if err == nil { - if tc.expErr != "" { - t.Errorf("expected to get err %q; got nil insted", tc.expErr) - } - continue - } - if !strings.Contains(err.Error(), tc.expErr) { - t.Errorf("expected err to contain %q; got %q instead", tc.expErr, err) - } - } + // validate expressions + f(&Group{ + Name: "test prometheus", + Type: NewPrometheusType(), + Rules: []Rule{ + {Alert: "alert", Expr: "up == 1", Labels: 
map[string]string{ + "description": "{{ value|query }}", + }}, + }, + }, false, true) } -func TestHashRule(t *testing.T) { - testCases := []struct { - a, b Rule - equal bool - }{ - { - Rule{Record: "record", Expr: "up == 1"}, - Rule{Record: "record", Expr: "up == 1"}, - true, - }, - { - Rule{Alert: "alert", Expr: "up == 1"}, - Rule{Alert: "alert", Expr: "up == 1"}, - true, - }, - { - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - true, - }, - { - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "baz": "foo", - "foo": "bar", - }}, - true, - }, - { - Rule{Alert: "record", Expr: "up == 1"}, - Rule{Alert: "record", Expr: "up == 1"}, - true, - }, - { - Rule{Alert: "alert", Expr: "up == 1", For: promutils.NewDuration(time.Minute), KeepFiringFor: promutils.NewDuration(time.Minute)}, - Rule{Alert: "alert", Expr: "up == 1"}, - true, - }, - { - Rule{Alert: "record", Expr: "up == 1"}, - Rule{Record: "record", Expr: "up == 1"}, - false, - }, - { - Rule{Record: "record", Expr: "up == 1"}, - Rule{Record: "record", Expr: "up == 2"}, - false, - }, - { - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "baz": "foo", - "foo": "baz", - }}, - false, - }, - { - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "baz": "foo", - }}, - false, - }, - { - Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ - "foo": "bar", - "baz": "foo", - }}, - Rule{Alert: "alert", Expr: "up == 1"}, - false, - }, - } - for i, tc := range testCases { - aID, bID := 
HashRule(tc.a), HashRule(tc.b) - if tc.equal != (aID == bID) { - t.Fatalf("missmatch for rule %d", i) +func TestHashRule_NotEqual(t *testing.T) { + f := func(a, b Rule) { + t.Helper() + + aID, bID := HashRule(a), HashRule(b) + if aID == bID { + t.Fatalf("rule hashes mustn't be equal; got %d", aID) } } + + f(Rule{Alert: "record", Expr: "up == 1"}, Rule{Record: "record", Expr: "up == 1"}) + + f(Rule{Record: "record", Expr: "up == 1"}, Rule{Record: "record", Expr: "up == 2"}) + + f(Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}, Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "baz": "foo", + "foo": "baz", + }}) + + f(Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}, Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "baz": "foo", + }}) + + f(Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}, Rule{Alert: "alert", Expr: "up == 1"}) +} + +func TestHashRule_Equal(t *testing.T) { + f := func(a, b Rule) { + t.Helper() + + aID, bID := HashRule(a), HashRule(b) + if aID != bID { + t.Fatalf("rule hashes must be equal; got %d and %d", aID, bID) + } + } + + f(Rule{Record: "record", Expr: "up == 1"}, Rule{Record: "record", Expr: "up == 1"}) + + f(Rule{Alert: "alert", Expr: "up == 1"}, Rule{Alert: "alert", Expr: "up == 1"}) + + f(Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}, Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}) + + f(Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "foo": "bar", + "baz": "foo", + }}, Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{ + "baz": "foo", + "foo": "bar", + }}) + + f(Rule{Alert: "record", Expr: "up == 1"}, Rule{Alert: "record", Expr: "up == 1"}) + + f(Rule{ + Alert: "alert", Expr: "up == 1", For: 
promutils.NewDuration(time.Minute), KeepFiringFor: promutils.NewDuration(time.Minute), + }, Rule{Alert: "alert", Expr: "up == 1"}) } func TestGroupChecksum(t *testing.T) { diff --git a/app/vmalert/config/log/logger_test.go b/app/vmalert/config/log/logger_test.go index ea0f4fe6f..9135489cc 100644 --- a/app/vmalert/config/log/logger_test.go +++ b/app/vmalert/config/log/logger_test.go @@ -18,14 +18,14 @@ func TestOutput(t *testing.T) { mustMatch := func(exp string) { t.Helper() + if exp == "" { if testOutput.String() != "" { - t.Errorf("expected output to be empty; got %q", testOutput.String()) - return + t.Fatalf("expected output to be empty; got %q", testOutput.String()) } } if !strings.Contains(testOutput.String(), exp) { - t.Errorf("output %q should contain %q", testOutput.String(), exp) + t.Fatalf("output %q should contain %q", testOutput.String(), exp) } fmt.Println(testOutput.String()) testOutput.Reset() diff --git a/app/vmalert/datasource/init.go b/app/vmalert/datasource/init.go index a4184a23e..74fb77935 100644 --- a/app/vmalert/datasource/init.go +++ b/app/vmalert/datasource/init.go @@ -12,6 +12,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" ) var ( @@ -100,7 +101,7 @@ func Init(extraParams url.Values) (QuerierBuilder, error) { if err != nil { return nil, fmt.Errorf("failed to create transport: %w", err) } - tr.DialContext = httputils.GetStatDialFunc("vmalert_datasource") + tr.DialContext = netutil.NewStatDialFunc("vmalert_datasource") tr.DisableKeepAlives = *disableKeepAlive tr.MaxIdleConnsPerHost = *maxIdleConnections if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost { diff --git a/app/vmalert/datasource/vm_test.go b/app/vmalert/datasource/vm_test.go index d2caaf137..9c3519a24 100644 --- a/app/vmalert/datasource/vm_test.go +++ 
b/app/vmalert/datasource/vm_test.go @@ -31,26 +31,26 @@ var ( func TestVMInstantQuery(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) { - t.Errorf("should not be called") + t.Fatalf("should not be called") }) c := -1 mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) { c++ if r.Method != http.MethodPost { - t.Errorf("expected POST method got %s", r.Method) + t.Fatalf("expected POST method got %s", r.Method) } if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass { - t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass) + t.Fatalf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass) } if r.URL.Query().Get("query") != query { - t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) + t.Fatalf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) } timeParam := r.URL.Query().Get("time") if timeParam == "" { - t.Errorf("expected 'time' in query param, got nil instead") + t.Fatalf("expected 'time' in query param, got nil instead") } if _, err := time.Parse(time.RFC3339, timeParam); err != nil { - t.Errorf("failed to parse 'time' query param %q: %s", timeParam, err) + t.Fatalf("failed to parse 'time' query param %q: %s", timeParam, err) } switch c { case 0: @@ -197,13 +197,13 @@ func TestVMInstantQuery(t *testing.T) { func TestVMInstantQueryWithRetry(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) { - t.Errorf("should not be called") + t.Fatalf("should not be called") }) c := -1 mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) { c++ if r.URL.Query().Get("query") != query { - t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) + t.Fatalf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) } switch c { case 0: @@ 
-289,37 +289,37 @@ func metricsEqual(t *testing.T, gotM, expectedM []Metric) { func TestVMRangeQuery(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) { - t.Errorf("should not be called") + t.Fatalf("should not be called") }) c := -1 mux.HandleFunc("/api/v1/query_range", func(w http.ResponseWriter, r *http.Request) { c++ if r.Method != http.MethodPost { - t.Errorf("expected POST method got %s", r.Method) + t.Fatalf("expected POST method got %s", r.Method) } if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass { - t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass) + t.Fatalf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass) } if r.URL.Query().Get("query") != query { - t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) + t.Fatalf("expected %s in query param, got %s", query, r.URL.Query().Get("query")) } startTS := r.URL.Query().Get("start") if startTS == "" { - t.Errorf("expected 'start' in query param, got nil instead") + t.Fatalf("expected 'start' in query param, got nil instead") } if _, err := time.Parse(time.RFC3339, startTS); err != nil { - t.Errorf("failed to parse 'start' query param: %s", err) + t.Fatalf("failed to parse 'start' query param: %s", err) } endTS := r.URL.Query().Get("end") if endTS == "" { - t.Errorf("expected 'end' in query param, got nil instead") + t.Fatalf("expected 'end' in query param, got nil instead") } if _, err := time.Parse(time.RFC3339, endTS); err != nil { - t.Errorf("failed to parse 'end' query param: %s", err) + t.Fatalf("failed to parse 'end' query param: %s", err) } step := r.URL.Query().Get("step") if step != "15s" { - t.Errorf("expected 'step' query param to be 15s; got %q instead", step) + t.Fatalf("expected 'step' query param to be 15s; got %q instead", step) } switch c { case 0: @@ -370,368 +370,299 @@ func TestVMRangeQuery(t 
*testing.T) { } func TestRequestParams(t *testing.T) { - authCfg, err := baCfg.NewConfig(".") - if err != nil { - t.Fatalf("unexpected: %s", err) - } query := "up" timestamp := time.Date(2001, 2, 3, 4, 5, 6, 0, time.UTC) + + f := func(isQueryRange bool, vm *VMStorage, checkFn func(t *testing.T, r *http.Request)) { + t.Helper() + + req, err := vm.newRequest(ctx) + if err != nil { + t.Fatalf("error in newRequest: %s", err) + } + + switch vm.dataSourceType { + case "", datasourcePrometheus: + if isQueryRange { + vm.setPrometheusRangeReqParams(req, query, timestamp, timestamp) + } else { + vm.setPrometheusInstantReqParams(req, query, timestamp) + } + case datasourceGraphite: + vm.setGraphiteReqParams(req, query) + } + + checkFn(t, req) + } + + authCfg, err := baCfg.NewConfig(".") + if err != nil { + t.Fatalf("unexpected error: %s", err) + } storage := VMStorage{ extraParams: url.Values{"round_digits": {"10"}}, } - testCases := []struct { - name string - queryRange bool - vm *VMStorage - checkFn func(t *testing.T, r *http.Request) - }{ - { - "prometheus path", - false, - &VMStorage{ - dataSourceType: datasourcePrometheus, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, "/api/v1/query", r.URL.Path) - }, - }, - { - "prometheus prefix", - false, - &VMStorage{ - dataSourceType: datasourcePrometheus, - appendTypePrefix: true, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, "/prometheus/api/v1/query", r.URL.Path) - }, - }, - { - "prometheus range path", - true, - &VMStorage{ - dataSourceType: datasourcePrometheus, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, "/api/v1/query_range", r.URL.Path) - }, - }, - { - "prometheus range prefix", - true, - &VMStorage{ - dataSourceType: datasourcePrometheus, - appendTypePrefix: true, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, "/prometheus/api/v1/query_range", r.URL.Path) - }, - }, - { - "graphite path", - false, - &VMStorage{ - dataSourceType: 
datasourceGraphite, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, graphitePath, r.URL.Path) - }, - }, - { - "graphite prefix", - false, - &VMStorage{ - dataSourceType: datasourceGraphite, - appendTypePrefix: true, - }, - func(t *testing.T, r *http.Request) { - checkEqualString(t, graphitePrefix+graphitePath, r.URL.Path) - }, - }, - { - "default params", - false, - &VMStorage{}, - func(t *testing.T, r *http.Request) { - exp := url.Values{"query": {query}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "default range params", - true, - &VMStorage{}, - func(t *testing.T, r *http.Request) { - ts := timestamp.Format(time.RFC3339) - exp := url.Values{"query": {query}, "start": {ts}, "end": {ts}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "basic auth", - false, - &VMStorage{authCfg: authCfg}, - func(t *testing.T, r *http.Request) { - u, p, _ := r.BasicAuth() - checkEqualString(t, "foo", u) - checkEqualString(t, "bar", p) - }, - }, - { - "basic auth range", - true, - &VMStorage{authCfg: authCfg}, - func(t *testing.T, r *http.Request) { - u, p, _ := r.BasicAuth() - checkEqualString(t, "foo", u) - checkEqualString(t, "bar", p) - }, - }, - { - "evaluation interval", - false, - &VMStorage{ - evaluationInterval: 15 * time.Second, - }, - func(t *testing.T, r *http.Request) { - evalInterval := 15 * time.Second - exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "step override", - false, - &VMStorage{ - queryStep: time.Minute, - }, - func(t *testing.T, r *http.Request) { - exp := url.Values{ - "query": {query}, - "step": {fmt.Sprintf("%ds", int(time.Minute.Seconds()))}, - "time": {timestamp.Format(time.RFC3339)}, - } - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "step to seconds", - false, - &VMStorage{ - evaluationInterval: 
3 * time.Hour, - }, - func(t *testing.T, r *http.Request) { - evalInterval := 3 * time.Hour - exp := url.Values{"query": {query}, "step": {fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "prometheus extra params", - false, - &VMStorage{ - extraParams: url.Values{"round_digits": {"10"}}, - }, - func(t *testing.T, r *http.Request) { - exp := url.Values{"query": {query}, "round_digits": {"10"}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "prometheus extra params range", - true, - &VMStorage{ - extraParams: url.Values{ - "nocache": {"1"}, - "max_lookback": {"1h"}, - }, - }, - func(t *testing.T, r *http.Request) { - exp := url.Values{ - "query": {query}, - "end": {timestamp.Format(time.RFC3339)}, - "start": {timestamp.Format(time.RFC3339)}, - "nocache": {"1"}, - "max_lookback": {"1h"}, - } - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "custom params overrides the original params", - false, - storage.Clone().ApplyParams(QuerierParams{ - QueryParams: url.Values{"round_digits": {"2"}}, - }), - func(t *testing.T, r *http.Request) { - exp := url.Values{"query": {query}, "round_digits": {"2"}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "allow duplicates in query params", - false, - storage.Clone().ApplyParams(QuerierParams{ - QueryParams: url.Values{"extra_labels": {"env=dev", "foo=bar"}}, - }), - func(t *testing.T, r *http.Request) { - exp := url.Values{"query": {query}, "round_digits": {"10"}, "extra_labels": {"env=dev", "foo=bar"}, "time": {timestamp.Format(time.RFC3339)}} - checkEqualString(t, exp.Encode(), r.URL.RawQuery) - }, - }, - { - "graphite extra params", - false, - &VMStorage{ - dataSourceType: datasourceGraphite, - extraParams: url.Values{ - "nocache": {"1"}, - "max_lookback": {"1h"}, - }, - }, - 
func(t *testing.T, r *http.Request) { - exp := fmt.Sprintf("format=json&from=-5min&max_lookback=1h&nocache=1&target=%s&until=now", query) - checkEqualString(t, exp, r.URL.RawQuery) - }, - }, - { - "graphite extra params allows to override from", - false, - &VMStorage{ - dataSourceType: datasourceGraphite, - extraParams: url.Values{ - "from": {"-10m"}, - }, - }, - func(t *testing.T, r *http.Request) { - exp := fmt.Sprintf("format=json&from=-10m&target=%s&until=now", query) - checkEqualString(t, exp, r.URL.RawQuery) - }, - }, - } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - req, err := tc.vm.newRequest(ctx) - if err != nil { - t.Fatal(err) - } - switch tc.vm.dataSourceType { - case "", datasourcePrometheus: - if tc.queryRange { - tc.vm.setPrometheusRangeReqParams(req, query, timestamp, timestamp) - } else { - tc.vm.setPrometheusInstantReqParams(req, query, timestamp) - } - case datasourceGraphite: - tc.vm.setGraphiteReqParams(req, query) - } - tc.checkFn(t, req) - }) - } + // prometheus path + f(false, &VMStorage{ + dataSourceType: datasourcePrometheus, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, "/api/v1/query", r.URL.Path) + }) + + // prometheus prefix + f(false, &VMStorage{ + dataSourceType: datasourcePrometheus, + appendTypePrefix: true, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, "/prometheus/api/v1/query", r.URL.Path) + }) + + // prometheus range path + f(true, &VMStorage{ + dataSourceType: datasourcePrometheus, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, "/api/v1/query_range", r.URL.Path) + }) + + // prometheus range prefix + f(true, &VMStorage{ + dataSourceType: datasourcePrometheus, + appendTypePrefix: true, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, "/prometheus/api/v1/query_range", r.URL.Path) + }) + + // graphite path + f(false, &VMStorage{ + dataSourceType: datasourceGraphite, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, 
graphitePath, r.URL.Path) + }) + + // graphite prefix + f(false, &VMStorage{ + dataSourceType: datasourceGraphite, + appendTypePrefix: true, + }, func(t *testing.T, r *http.Request) { + checkEqualString(t, graphitePrefix+graphitePath, r.URL.Path) + }) + + // default params + f(false, &VMStorage{}, func(t *testing.T, r *http.Request) { + exp := url.Values{"query": {query}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // default range params + f(true, &VMStorage{}, func(t *testing.T, r *http.Request) { + ts := timestamp.Format(time.RFC3339) + exp := url.Values{"query": {query}, "start": {ts}, "end": {ts}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // basic auth + f(false, &VMStorage{ + authCfg: authCfg, + }, func(t *testing.T, r *http.Request) { + u, p, _ := r.BasicAuth() + checkEqualString(t, "foo", u) + checkEqualString(t, "bar", p) + }) + + // basic auth range + f(true, &VMStorage{ + authCfg: authCfg, + }, func(t *testing.T, r *http.Request) { + u, p, _ := r.BasicAuth() + checkEqualString(t, "foo", u) + checkEqualString(t, "bar", p) + }) + + // evaluation interval + f(false, &VMStorage{ + evaluationInterval: 15 * time.Second, + }, func(t *testing.T, r *http.Request) { + evalInterval := 15 * time.Second + exp := url.Values{"query": {query}, "step": {evalInterval.String()}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // step override + f(false, &VMStorage{ + queryStep: time.Minute, + }, func(t *testing.T, r *http.Request) { + exp := url.Values{ + "query": {query}, + "step": {fmt.Sprintf("%ds", int(time.Minute.Seconds()))}, + "time": {timestamp.Format(time.RFC3339)}, + } + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // step to seconds + f(false, &VMStorage{ + evaluationInterval: 3 * time.Hour, + }, func(t *testing.T, r *http.Request) { + evalInterval := 3 * time.Hour + exp := url.Values{"query": {query}, "step": 
{fmt.Sprintf("%ds", int(evalInterval.Seconds()))}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // prometheus extra params + f(false, &VMStorage{ + extraParams: url.Values{"round_digits": {"10"}}, + }, func(t *testing.T, r *http.Request) { + exp := url.Values{"query": {query}, "round_digits": {"10"}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // prometheus extra params range + f(true, &VMStorage{ + extraParams: url.Values{ + "nocache": {"1"}, + "max_lookback": {"1h"}, + }, + }, func(t *testing.T, r *http.Request) { + exp := url.Values{ + "query": {query}, + "end": {timestamp.Format(time.RFC3339)}, + "start": {timestamp.Format(time.RFC3339)}, + "nocache": {"1"}, + "max_lookback": {"1h"}, + } + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // custom params overrides the original params + f(false, storage.Clone().ApplyParams(QuerierParams{ + QueryParams: url.Values{"round_digits": {"2"}}, + }), func(t *testing.T, r *http.Request) { + exp := url.Values{"query": {query}, "round_digits": {"2"}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // allow duplicates in query params + f(false, storage.Clone().ApplyParams(QuerierParams{ + QueryParams: url.Values{"extra_labels": {"env=dev", "foo=bar"}}, + }), func(t *testing.T, r *http.Request) { + exp := url.Values{"query": {query}, "round_digits": {"10"}, "extra_labels": {"env=dev", "foo=bar"}, "time": {timestamp.Format(time.RFC3339)}} + checkEqualString(t, exp.Encode(), r.URL.RawQuery) + }) + + // graphite extra params + f(false, &VMStorage{ + dataSourceType: datasourceGraphite, + extraParams: url.Values{ + "nocache": {"1"}, + "max_lookback": {"1h"}, + }, + }, func(t *testing.T, r *http.Request) { + exp := fmt.Sprintf("format=json&from=-5min&max_lookback=1h&nocache=1&target=%s&until=now", query) + checkEqualString(t, exp, r.URL.RawQuery) + }) 
+ + // graphite extra params allows to override from + f(false, &VMStorage{ + dataSourceType: datasourceGraphite, + extraParams: url.Values{ + "from": {"-10m"}, + }, + }, func(t *testing.T, r *http.Request) { + exp := fmt.Sprintf("format=json&from=-10m&target=%s&until=now", query) + checkEqualString(t, exp, r.URL.RawQuery) + }) } func TestHeaders(t *testing.T) { - testCases := []struct { - name string - vmFn func() *VMStorage - checkFn func(t *testing.T, r *http.Request) - }{ - { - name: "basic auth", - vmFn: func() *VMStorage { - cfg, err := utils.AuthConfig(utils.WithBasicAuth("foo", "bar", "")) - if err != nil { - t.Errorf("Error get auth config: %s", err) - } - return &VMStorage{authCfg: cfg} - }, - checkFn: func(t *testing.T, r *http.Request) { - u, p, _ := r.BasicAuth() - checkEqualString(t, "foo", u) - checkEqualString(t, "bar", p) - }, - }, - { - name: "bearer auth", - vmFn: func() *VMStorage { - cfg, err := utils.AuthConfig(utils.WithBearer("foo", "")) - if err != nil { - t.Errorf("Error get auth config: %s", err) - } - return &VMStorage{authCfg: cfg} - }, - checkFn: func(t *testing.T, r *http.Request) { - reqToken := r.Header.Get("Authorization") - splitToken := strings.Split(reqToken, "Bearer ") - if len(splitToken) != 2 { - t.Errorf("expected two items got %d", len(splitToken)) - } - token := splitToken[1] - checkEqualString(t, "foo", token) - }, - }, - { - name: "custom extraHeaders", - vmFn: func() *VMStorage { - return &VMStorage{extraHeaders: []keyValue{ - {key: "Foo", value: "bar"}, - {key: "Baz", value: "qux"}, - }} - }, - checkFn: func(t *testing.T, r *http.Request) { - h1 := r.Header.Get("Foo") - checkEqualString(t, "bar", h1) - h2 := r.Header.Get("Baz") - checkEqualString(t, "qux", h2) - }, - }, - { - name: "custom header overrides basic auth", - vmFn: func() *VMStorage { - cfg, err := utils.AuthConfig(utils.WithBasicAuth("foo", "bar", "")) - if err != nil { - t.Errorf("Error get auth config: %s", err) - } - return &VMStorage{ - authCfg: cfg, - 
extraHeaders: []keyValue{ - {key: "Authorization", value: "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="}, - }, - } - }, - checkFn: func(t *testing.T, r *http.Request) { - u, p, _ := r.BasicAuth() - checkEqualString(t, "Aladdin", u) - checkEqualString(t, "open sesame", p) - }, - }, - } - for _, tt := range testCases { - t.Run(tt.name, func(t *testing.T) { - vm := tt.vmFn() - req, err := vm.newQueryRequest(ctx, "foo", time.Now()) - if err != nil { - t.Fatal(err) - } - tt.checkFn(t, req) - }) + f := func(vmFn func() *VMStorage, checkFn func(t *testing.T, r *http.Request)) { + t.Helper() + + vm := vmFn() + req, err := vm.newQueryRequest(ctx, "foo", time.Now()) + if err != nil { + t.Fatalf("error in newQueryRequest: %s", err) + } + checkFn(t, req) } + + // basic auth + f(func() *VMStorage { + cfg, err := utils.AuthConfig(utils.WithBasicAuth("foo", "bar", "")) + if err != nil { + t.Fatalf("Error get auth config: %s", err) + } + return &VMStorage{authCfg: cfg} + }, func(t *testing.T, r *http.Request) { + u, p, _ := r.BasicAuth() + checkEqualString(t, "foo", u) + checkEqualString(t, "bar", p) + }) + + // bearer auth + f(func() *VMStorage { + cfg, err := utils.AuthConfig(utils.WithBearer("foo", "")) + if err != nil { + t.Fatalf("Error get auth config: %s", err) + } + return &VMStorage{authCfg: cfg} + }, func(t *testing.T, r *http.Request) { + reqToken := r.Header.Get("Authorization") + splitToken := strings.Split(reqToken, "Bearer ") + if len(splitToken) != 2 { + t.Fatalf("expected two items got %d", len(splitToken)) + } + token := splitToken[1] + checkEqualString(t, "foo", token) + }) + + // custom extraHeaders + f(func() *VMStorage { + return &VMStorage{extraHeaders: []keyValue{ + {key: "Foo", value: "bar"}, + {key: "Baz", value: "qux"}, + }} + }, func(t *testing.T, r *http.Request) { + h1 := r.Header.Get("Foo") + checkEqualString(t, "bar", h1) + h2 := r.Header.Get("Baz") + checkEqualString(t, "qux", h2) + }) + + // custom header overrides basic auth + f(func() *VMStorage { + cfg, 
err := utils.AuthConfig(utils.WithBasicAuth("foo", "bar", "")) + if err != nil { + t.Fatalf("Error get auth config: %s", err) + } + return &VMStorage{ + authCfg: cfg, + extraHeaders: []keyValue{ + {key: "Authorization", value: "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ=="}, + }, + } + }, func(t *testing.T, r *http.Request) { + u, p, _ := r.BasicAuth() + checkEqualString(t, "Aladdin", u) + checkEqualString(t, "open sesame", p) + }) } func checkEqualString(t *testing.T, exp, got string) { t.Helper() + if got != exp { - t.Errorf("expected to get: \n%q; \ngot: \n%q", exp, got) + t.Fatalf("expected to get: \n%q; \ngot: \n%q", exp, got) } } func expectError(t *testing.T, err error, exp string) { t.Helper() + if err == nil { - t.Errorf("expected non-nil error") + t.Fatalf("expected non-nil error") } if !strings.Contains(err.Error(), exp) { - t.Errorf("expected error %q to contain %q", err, exp) + t.Fatalf("expected error %q to contain %q", err, exp) } } diff --git a/app/vmalert/main_test.go b/app/vmalert/main_test.go index d6a289285..9a5ff12ee 100644 --- a/app/vmalert/main_test.go +++ b/app/vmalert/main_test.go @@ -25,26 +25,26 @@ func TestGetExternalURL(t *testing.T) { invalidURL := "victoriametrics.com/path" _, err := getExternalURL(invalidURL) if err == nil { - t.Errorf("expected error, got nil") + t.Fatalf("expected error, got nil") } expURL := "https://victoriametrics.com/path" u, err := getExternalURL(expURL) if err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } if u.String() != expURL { - t.Errorf("unexpected url: want %q, got %s", expURL, u.String()) + t.Fatalf("unexpected url: want %q, got %s", expURL, u.String()) } h, _ := os.Hostname() expURL = fmt.Sprintf("http://%s:8880", h) u, err = getExternalURL("") if err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } if u.String() != expURL { - t.Errorf("unexpected url: want %s, got %s", expURL, u.String()) + t.Fatalf("unexpected url: want %s, got 
%s", expURL, u.String()) } } @@ -53,22 +53,22 @@ func TestGetAlertURLGenerator(t *testing.T) { u, _ := url.Parse("https://victoriametrics.com/path") fn, err := getAlertURLGenerator(u, "", false) if err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } exp := fmt.Sprintf("https://victoriametrics.com/path/vmalert/alert?%s=42&%s=2", paramGroupID, paramAlertID) if exp != fn(testAlert) { - t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert)) + t.Fatalf("unexpected url want %s, got %s", exp, fn(testAlert)) } _, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true) if err == nil { - t.Errorf("expected template validation error got nil") + t.Fatalf("expected template validation error got nil") } fn, err = getAlertURLGenerator(u, "foo?query={{$value}}&ds={{ $labels.tenant }}", true) if err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } if exp := "https://victoriametrics.com/path/foo?query=4&ds=baz"; exp != fn(testAlert) { - t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert)) + t.Fatalf("unexpected url want %s, got %s", exp, fn(testAlert)) } } diff --git a/app/vmalert/manager_test.go b/app/vmalert/manager_test.go index e5f879796..a4582a2fb 100644 --- a/app/vmalert/manager_test.go +++ b/app/vmalert/manager_test.go @@ -82,9 +82,8 @@ func TestManagerUpdateConcurrent(t *testing.T) { wg.Wait() } -// TestManagerUpdate tests sequential configuration -// updates. -func TestManagerUpdate(t *testing.T) { +// TestManagerUpdate tests sequential configuration updates. 
+func TestManagerUpdate_Success(t *testing.T) { const defaultEvalInterval = time.Second * 30 currentEvalInterval := *evaluationInterval *evaluationInterval = defaultEvalInterval @@ -120,145 +119,127 @@ func TestManagerUpdate(t *testing.T) { } ) - testCases := []struct { - name string - initPath string - updatePath string - want []*rule.Group - }{ - { - name: "update good rules", - initPath: "config/testdata/rules/rules0-good.rules", - updatePath: "config/testdata/dir/rules1-good.rules", - want: []*rule.Group{ - { - File: "config/testdata/dir/rules1-good.rules", - Name: "duplicatedGroupDiffFiles", - Type: config.NewPrometheusType(), - Interval: defaultEvalInterval, - Rules: []rule.Rule{ - &rule.AlertingRule{ - Name: "VMRows", - Expr: "vm_rows > 0", - For: 5 * time.Minute, - Labels: map[string]string{"dc": "gcp", "label": "bar"}, - Annotations: map[string]string{ - "summary": "{{ $value }}", - "description": "{{$labels}}", - }, - }, - }, - }, - }, - }, - { - name: "update good rules from 1 to 2 groups", - initPath: "config/testdata/dir/rules/rules1-good.rules", - updatePath: "config/testdata/rules/rules0-good.rules", - want: []*rule.Group{ - { - File: "config/testdata/rules/rules0-good.rules", - Name: "groupGorSingleAlert", - Type: config.NewPrometheusType(), - Interval: defaultEvalInterval, - Rules: []rule.Rule{VMRows}, - }, - { - File: "config/testdata/rules/rules0-good.rules", - Interval: defaultEvalInterval, - Type: config.NewPrometheusType(), - Name: "TestGroup", - Rules: []rule.Rule{ - Conns, - ExampleAlertAlwaysFiring, - }, - }, - }, - }, - { - name: "update with one bad rule file", - initPath: "config/testdata/rules/rules0-good.rules", - updatePath: "config/testdata/dir/rules2-bad.rules", - want: []*rule.Group{ - { - File: "config/testdata/rules/rules0-good.rules", - Name: "groupGorSingleAlert", - Type: config.NewPrometheusType(), - Interval: defaultEvalInterval, - Rules: []rule.Rule{VMRows}, - }, - { - File: "config/testdata/rules/rules0-good.rules", - 
Interval: defaultEvalInterval, - Name: "TestGroup", - Type: config.NewPrometheusType(), - Rules: []rule.Rule{ - Conns, - ExampleAlertAlwaysFiring, - }, - }, - }, - }, - { - name: "update empty dir rules from 0 to 2 groups", - initPath: "config/testdata/empty/*", - updatePath: "config/testdata/rules/rules0-good.rules", - want: []*rule.Group{ - { - File: "config/testdata/rules/rules0-good.rules", - Name: "groupGorSingleAlert", - Type: config.NewPrometheusType(), - Interval: defaultEvalInterval, - Rules: []rule.Rule{VMRows}, - }, - { - File: "config/testdata/rules/rules0-good.rules", - Interval: defaultEvalInterval, - Type: config.NewPrometheusType(), - Name: "TestGroup", - Rules: []rule.Rule{ - Conns, - ExampleAlertAlwaysFiring, - }, - }, - }, - }, + f := func(initPath, updatePath string, groupsExpected []*rule.Group) { + t.Helper() + + ctx, cancel := context.WithCancel(context.TODO()) + m := &manager{ + groups: make(map[uint64]*rule.Group), + querierBuilder: &datasource.FakeQuerier{}, + notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} }, + } + + cfgInit := loadCfg(t, []string{initPath}, true, true) + if err := m.update(ctx, cfgInit, false); err != nil { + t.Fatalf("failed to complete initial rules update: %s", err) + } + + cfgUpdate, err := config.Parse([]string{updatePath}, notifier.ValidateTemplates, true) + if err == nil { // update can fail and that's expected + _ = m.update(ctx, cfgUpdate, false) + } + if len(groupsExpected) != len(m.groups) { + t.Fatalf("unexpected number of groups; got %d; want %d", len(m.groups), len(groupsExpected)) + } + + for _, wantG := range groupsExpected { + gotG, ok := m.groups[wantG.ID()] + if !ok { + t.Fatalf("expected to have group %q", wantG.Name) + } + compareGroups(t, wantG, gotG) + } + + cancel() + m.close() } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - ctx, cancel := context.WithCancel(context.TODO()) - m := &manager{ - groups: make(map[uint64]*rule.Group), 
- querierBuilder: &datasource.FakeQuerier{}, - notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} }, - } - cfgInit := loadCfg(t, []string{tc.initPath}, true, true) - if err := m.update(ctx, cfgInit, false); err != nil { - t.Fatalf("failed to complete initial rules update: %s", err) - } + // update good rules + f("config/testdata/rules/rules0-good.rules", "config/testdata/dir/rules1-good.rules", []*rule.Group{ + { + File: "config/testdata/dir/rules1-good.rules", + Name: "duplicatedGroupDiffFiles", + Type: config.NewPrometheusType(), + Interval: defaultEvalInterval, + Rules: []rule.Rule{ + &rule.AlertingRule{ + Name: "VMRows", + Expr: "vm_rows > 0", + For: 5 * time.Minute, + Labels: map[string]string{"dc": "gcp", "label": "bar"}, + Annotations: map[string]string{ + "summary": "{{ $value }}", + "description": "{{$labels}}", + }, + }, + }, + }, + }) - cfgUpdate, err := config.Parse([]string{tc.updatePath}, notifier.ValidateTemplates, true) - if err == nil { // update can fail and that's expected - _ = m.update(ctx, cfgUpdate, false) - } - if len(tc.want) != len(m.groups) { - t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups)) - } + // update good rules from 1 to 2 groups + f("config/testdata/dir/rules/rules1-good.rules", "config/testdata/rules/rules0-good.rules", []*rule.Group{ + { + File: "config/testdata/rules/rules0-good.rules", + Name: "groupGorSingleAlert", + Type: config.NewPrometheusType(), + Interval: defaultEvalInterval, + Rules: []rule.Rule{VMRows}, + }, + { + File: "config/testdata/rules/rules0-good.rules", + Interval: defaultEvalInterval, + Type: config.NewPrometheusType(), + Name: "TestGroup", + Rules: []rule.Rule{ + Conns, + ExampleAlertAlwaysFiring, + }, + }, + }) - for _, wantG := range tc.want { - gotG, ok := m.groups[wantG.ID()] - if !ok { - t.Fatalf("expected to have group %q", wantG.Name) - } - compareGroups(t, wantG, gotG) - } + // update with one bad rule file + 
f("config/testdata/rules/rules0-good.rules", "config/testdata/dir/rules2-bad.rules", []*rule.Group{ + { + File: "config/testdata/rules/rules0-good.rules", + Name: "groupGorSingleAlert", + Type: config.NewPrometheusType(), + Interval: defaultEvalInterval, + Rules: []rule.Rule{VMRows}, + }, + { + File: "config/testdata/rules/rules0-good.rules", + Interval: defaultEvalInterval, + Name: "TestGroup", + Type: config.NewPrometheusType(), + Rules: []rule.Rule{ + Conns, + ExampleAlertAlwaysFiring, + }, + }, + }) - cancel() - m.close() - }) - } + // update empty dir rules from 0 to 2 groups + f("config/testdata/empty/*", "config/testdata/rules/rules0-good.rules", []*rule.Group{ + { + File: "config/testdata/rules/rules0-good.rules", + Name: "groupGorSingleAlert", + Type: config.NewPrometheusType(), + Interval: defaultEvalInterval, + Rules: []rule.Rule{VMRows}, + }, + { + File: "config/testdata/rules/rules0-good.rules", + Interval: defaultEvalInterval, + Type: config.NewPrometheusType(), + Name: "TestGroup", + Rules: []rule.Rule{ + Conns, + ExampleAlertAlwaysFiring, + }, + }, + }) } + func compareGroups(t *testing.T, a, b *rule.Group) { t.Helper() if a.Name != b.Name { @@ -285,82 +266,59 @@ func compareGroups(t *testing.T, a, b *rule.Group) { } } -func TestManagerUpdateNegative(t *testing.T) { - testCases := []struct { - notifiers []notifier.Notifier - rw remotewrite.RWClient - cfg config.Group - expErr string - }{ - { - nil, - nil, - config.Group{ - Name: "Recording rule only", - Rules: []config.Rule{ - {Record: "record", Expr: "max(up)"}, - }, - }, - "contains recording rules", - }, - { - nil, - nil, - config.Group{ - Name: "Alerting rule only", - Rules: []config.Rule{ - {Alert: "alert", Expr: "up > 0"}, - }, - }, - "contains alerting rules", - }, - { - []notifier.Notifier{¬ifier.FakeNotifier{}}, - nil, - config.Group{ - Name: "Recording and alerting rules", - Rules: []config.Rule{ - {Alert: "alert1", Expr: "up > 0"}, - {Alert: "alert2", Expr: "up > 0"}, - {Record: "record", 
Expr: "max(up)"}, - }, - }, - "contains recording rules", - }, - { - nil, - &remotewrite.Client{}, - config.Group{ - Name: "Recording and alerting rules", - Rules: []config.Rule{ - {Record: "record1", Expr: "max(up)"}, - {Record: "record2", Expr: "max(up)"}, - {Alert: "alert", Expr: "up > 0"}, - }, - }, - "contains alerting rules", - }, +func TestManagerUpdate_Failure(t *testing.T) { + f := func(notifiers []notifier.Notifier, rw remotewrite.RWClient, cfg config.Group, errStrExpected string) { + t.Helper() + + m := &manager{ + groups: make(map[uint64]*rule.Group), + querierBuilder: &datasource.FakeQuerier{}, + rw: rw, + } + if notifiers != nil { + m.notifiers = func() []notifier.Notifier { return notifiers } + } + err := m.update(context.Background(), []config.Group{cfg}, false) + if err == nil { + t.Fatalf("expected to get error; got nil") + } + errStr := err.Error() + if !strings.Contains(errStr, errStrExpected) { + t.Fatalf("missing %q in the error %q", errStrExpected, errStr) + } } - for _, tc := range testCases { - t.Run(tc.cfg.Name, func(t *testing.T) { - m := &manager{ - groups: make(map[uint64]*rule.Group), - querierBuilder: &datasource.FakeQuerier{}, - rw: tc.rw, - } - if tc.notifiers != nil { - m.notifiers = func() []notifier.Notifier { return tc.notifiers } - } - err := m.update(context.Background(), []config.Group{tc.cfg}, false) - if err == nil { - t.Fatalf("expected to get error; got nil") - } - if !strings.Contains(err.Error(), tc.expErr) { - t.Fatalf("expected err to contain %q; got %q", tc.expErr, err) - } - }) - } + f(nil, nil, config.Group{ + Name: "Recording rule only", + Rules: []config.Rule{ + {Record: "record", Expr: "max(up)"}, + }, + }, "contains recording rules") + + f(nil, nil, config.Group{ + Name: "Alerting rule only", + Rules: []config.Rule{ + {Alert: "alert", Expr: "up > 0"}, + }, + }, "contains alerting rules") + + f([]notifier.Notifier{¬ifier.FakeNotifier{}}, nil, config.Group{ + Name: "Recording and alerting rules", + Rules: 
[]config.Rule{ + {Alert: "alert1", Expr: "up > 0"}, + {Alert: "alert2", Expr: "up > 0"}, + {Record: "record", Expr: "max(up)"}, + }, + }, "contains recording rules") + + f(nil, &remotewrite.Client{}, config.Group{ + Name: "Recording and alerting rules", + Rules: []config.Rule{ + {Record: "record1", Expr: "max(up)"}, + {Record: "record2", Expr: "max(up)"}, + {Alert: "alert", Expr: "up > 0"}, + }, + }, "contains alerting rules") } func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressions bool) []config.Group { diff --git a/app/vmalert/notifier/alert_test.go b/app/vmalert/notifier/alert_test.go index b02678dce..30c2db57b 100644 --- a/app/vmalert/notifier/alert_test.go +++ b/app/vmalert/notifier/alert_test.go @@ -11,7 +11,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel" ) -func TestAlert_ExecTemplate(t *testing.T) { +func TestAlertExecTemplate(t *testing.T) { extLabels := make(map[string]string) const ( extCluster = "prod" @@ -23,201 +23,164 @@ func TestAlert_ExecTemplate(t *testing.T) { _, err := Init(nil, extLabels, extURL) checkErr(t, err) - testCases := []struct { - name string - alert *Alert - annotations map[string]string - expTpl map[string]string - }{ - { - name: "empty-alert", - alert: &Alert{}, - annotations: map[string]string{}, - expTpl: map[string]string{}, - }, - { - name: "no-template", - alert: &Alert{ - Value: 1e4, - Labels: map[string]string{ - "instance": "localhost", + f := func(alert *Alert, annotations map[string]string, tplExpected map[string]string) { + t.Helper() + + if err := ValidateTemplates(annotations); err != nil { + t.Fatalf("cannot validate annotations: %s", err) + } + + qFn := func(_ string) ([]datasource.Metric, error) { + return []datasource.Metric{ + { + Labels: []datasource.Label{ + {Name: "foo", Value: "bar"}, + {Name: "baz", Value: "qux"}, + }, + Values: []float64{1}, + Timestamps: []int64{1}, }, - }, - annotations: map[string]string{}, - expTpl: map[string]string{}, - }, - { - 
name: "label-template", - alert: &Alert{ - Value: 1e4, - Labels: map[string]string{ - "job": "staging", - "instance": "localhost", + { + Labels: []datasource.Label{ + {Name: "foo", Value: "garply"}, + {Name: "baz", Value: "fred"}, + }, + Values: []float64{2}, + Timestamps: []int64{1}, }, - For: 5 * time.Minute, - }, - annotations: map[string]string{ - "summary": "Too high connection number for {{$labels.instance}} for job {{$labels.job}}", - "description": "It is {{ $value }} connections for {{$labels.instance}} for more than {{ .For }}", - }, - expTpl: map[string]string{ - "summary": "Too high connection number for localhost for job staging", - "description": "It is 10000 connections for localhost for more than 5m0s", - }, - }, - { - name: "expression-template", - alert: &Alert{ - Expr: `vm_rows{"label"="bar"}<0`, - }, - annotations: map[string]string{ - "exprEscapedQuery": "{{ $expr|queryEscape }}", - "exprEscapedPath": "{{ $expr|pathEscape }}", - "exprEscapedJSON": "{{ $expr|jsonEscape }}", - "exprEscapedQuotes": "{{ $expr|quotesEscape }}", - "exprEscapedHTML": "{{ $expr|htmlEscape }}", - }, - expTpl: map[string]string{ - "exprEscapedQuery": "vm_rows%7B%22label%22%3D%22bar%22%7D%3C0", - "exprEscapedPath": "vm_rows%7B%22label%22=%22bar%22%7D%3C0", - "exprEscapedJSON": `"vm_rows{\"label\"=\"bar\"}\u003c0"`, - "exprEscapedQuotes": `vm_rows{\"label\"=\"bar\"}\u003c0`, - "exprEscapedHTML": "vm_rows{"label"="bar"}<0", - }, - }, - { - name: "query", - alert: &Alert{Expr: `vm_rows{"label"="bar"}>0`}, - annotations: map[string]string{ - "summary": `{{ query "foo" | first | value }}`, - "desc": `{{ range query "bar" }}{{ . | label "foo" }} {{ . 
| value }};{{ end }}`, - }, - expTpl: map[string]string{ - "summary": "1", - "desc": "bar 1;garply 2;", - }, - }, - { - name: "external", - alert: &Alert{ - Value: 1e4, - Labels: map[string]string{ - "job": "staging", - "instance": "localhost", - }, - }, - annotations: map[string]string{ - "url": "{{ $externalURL }}", - "summary": "Issues with {{$labels.instance}} (dc-{{$externalLabels.dc}}) for job {{$labels.job}}", - "description": "It is {{ $value }} connections for {{$labels.instance}} (cluster-{{$externalLabels.cluster}})", - }, - expTpl: map[string]string{ - "url": extURL, - "summary": fmt.Sprintf("Issues with localhost (dc-%s) for job staging", extDC), - "description": fmt.Sprintf("It is 10000 connections for localhost (cluster-%s)", extCluster), - }, - }, - { - name: "alert and group IDs", - alert: &Alert{ - ID: 42, - GroupID: 24, - }, - annotations: map[string]string{ - "url": "/api/v1/alert?alertID={{$alertID}}&groupID={{$groupID}}", - }, - expTpl: map[string]string{ - "url": "/api/v1/alert?alertID=42&groupID=24", - }, - }, - { - name: "ActiveAt time", - alert: &Alert{ - ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 651387237, time.UTC), - }, - annotations: map[string]string{ - "diagram": "![](http://example.com?render={{$activeAt.Unix}}", - }, - expTpl: map[string]string{ - "diagram": "![](http://example.com?render=1660941298", - }, - }, - { - name: "ActiveAt time is nil", - alert: &Alert{}, - annotations: map[string]string{ - "default_time": "{{$activeAt}}", - }, - expTpl: map[string]string{ - "default_time": "0001-01-01 00:00:00 +0000 UTC", - }, - }, - { - name: "ActiveAt custom format", - alert: &Alert{ - ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 651387237, time.UTC), - }, - annotations: map[string]string{ - "fire_time": `{{$activeAt.Format "2006/01/02 15:04:05"}}`, - }, - expTpl: map[string]string{ - "fire_time": "2022/08/19 20:34:58", - }, - }, - { - name: "ActiveAt query range", - alert: &Alert{ - ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 
651387237, time.UTC), - }, - annotations: map[string]string{ - "grafana_url": `vm-grafana.com?from={{($activeAt.Add (parseDurationTime "1h")).Unix}}&to={{($activeAt.Add (parseDurationTime "-1h")).Unix}}`, - }, - expTpl: map[string]string{ - "grafana_url": "vm-grafana.com?from=1660944898&to=1660937698", - }, - }, + }, nil + } + + tpl, err := alert.ExecTemplate(qFn, alert.Labels, annotations) + if err != nil { + t.Fatalf("cannot execute template: %s", err) + } + if len(tpl) != len(tplExpected) { + t.Fatalf("unexpected number of elements; got %d; want %d", len(tpl), len(tplExpected)) + } + for k := range tplExpected { + got, exp := tpl[k], tplExpected[k] + if got != exp { + t.Fatalf("unexpected template for key=%q; got %q; want %q", k, got, exp) + } + } } - qFn := func(_ string) ([]datasource.Metric, error) { - return []datasource.Metric{ - { - Labels: []datasource.Label{ - {Name: "foo", Value: "bar"}, - {Name: "baz", Value: "qux"}, - }, - Values: []float64{1}, - Timestamps: []int64{1}, - }, - { - Labels: []datasource.Label{ - {Name: "foo", Value: "garply"}, - {Name: "baz", Value: "fred"}, - }, - Values: []float64{2}, - Timestamps: []int64{1}, - }, - }, nil - } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if err := ValidateTemplates(tc.annotations); err != nil { - t.Fatal(err) - } - tpl, err := tc.alert.ExecTemplate(qFn, tc.alert.Labels, tc.annotations) - if err != nil { - t.Fatal(err) - } - if len(tpl) != len(tc.expTpl) { - t.Fatalf("expected %d elements; got %d", len(tc.expTpl), len(tpl)) - } - for k := range tc.expTpl { - got, exp := tpl[k], tc.expTpl[k] - if got != exp { - t.Fatalf("expected %q=%q; got %q=%q", k, exp, k, got) - } - } - }) - } + // empty-alert + f(&Alert{}, map[string]string{}, map[string]string{}) + + // no-template + f(&Alert{ + Value: 1e4, + Labels: map[string]string{ + "instance": "localhost", + }, + }, map[string]string{}, map[string]string{}) + + // label-template + f(&Alert{ + Value: 1e4, + Labels: 
map[string]string{ + "job": "staging", + "instance": "localhost", + }, + For: 5 * time.Minute, + }, map[string]string{ + "summary": "Too high connection number for {{$labels.instance}} for job {{$labels.job}}", + "description": "It is {{ $value }} connections for {{$labels.instance}} for more than {{ .For }}", + }, map[string]string{ + "summary": "Too high connection number for localhost for job staging", + "description": "It is 10000 connections for localhost for more than 5m0s", + }) + + // expression-template + f(&Alert{ + Expr: `vm_rows{"label"="bar"}<0`, + }, map[string]string{ + "exprEscapedQuery": "{{ $expr|queryEscape }}", + "exprEscapedPath": "{{ $expr|pathEscape }}", + "exprEscapedJSON": "{{ $expr|jsonEscape }}", + "exprEscapedQuotes": "{{ $expr|quotesEscape }}", + "exprEscapedHTML": "{{ $expr|htmlEscape }}", + }, map[string]string{ + "exprEscapedQuery": "vm_rows%7B%22label%22%3D%22bar%22%7D%3C0", + "exprEscapedPath": "vm_rows%7B%22label%22=%22bar%22%7D%3C0", + "exprEscapedJSON": `"vm_rows{\"label\"=\"bar\"}\u003c0"`, + "exprEscapedQuotes": `vm_rows{\"label\"=\"bar\"}\u003c0`, + "exprEscapedHTML": "vm_rows{"label"="bar"}<0", + }) + + // query + f(&Alert{ + Expr: `vm_rows{"label"="bar"}>0`, + }, map[string]string{ + "summary": `{{ query "foo" | first | value }}`, + "desc": `{{ range query "bar" }}{{ . | label "foo" }} {{ . 
| value }};{{ end }}`, + }, map[string]string{ + "summary": "1", + "desc": "bar 1;garply 2;", + }) + + // external + f(&Alert{ + Value: 1e4, + Labels: map[string]string{ + "job": "staging", + "instance": "localhost", + }, + }, map[string]string{ + "url": "{{ $externalURL }}", + "summary": "Issues with {{$labels.instance}} (dc-{{$externalLabels.dc}}) for job {{$labels.job}}", + "description": "It is {{ $value }} connections for {{$labels.instance}} (cluster-{{$externalLabels.cluster}})", + }, map[string]string{ + "url": extURL, + "summary": fmt.Sprintf("Issues with localhost (dc-%s) for job staging", extDC), + "description": fmt.Sprintf("It is 10000 connections for localhost (cluster-%s)", extCluster), + }) + + // alert and group IDs + f(&Alert{ + ID: 42, + GroupID: 24, + }, map[string]string{ + "url": "/api/v1/alert?alertID={{$alertID}}&groupID={{$groupID}}", + }, map[string]string{ + "url": "/api/v1/alert?alertID=42&groupID=24", + }) + + // ActiveAt time + f(&Alert{ + ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 651387237, time.UTC), + }, map[string]string{ + "diagram": "![](http://example.com?render={{$activeAt.Unix}}", + }, map[string]string{ + "diagram": "![](http://example.com?render=1660941298", + }) + + // ActiveAt time is nil + f(&Alert{}, map[string]string{ + "default_time": "{{$activeAt}}", + }, map[string]string{ + "default_time": "0001-01-01 00:00:00 +0000 UTC", + }) + + // ActiveAt custom format + f(&Alert{ + ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 651387237, time.UTC), + }, map[string]string{ + "fire_time": `{{$activeAt.Format "2006/01/02 15:04:05"}}`, + }, map[string]string{ + "fire_time": "2022/08/19 20:34:58", + }) + + // ActiveAt query range + f(&Alert{ + ActiveAt: time.Date(2022, 8, 19, 20, 34, 58, 651387237, time.UTC), + }, map[string]string{ + "grafana_url": `vm-grafana.com?from={{($activeAt.Add (parseDurationTime "1h")).Unix}}&to={{($activeAt.Add (parseDurationTime "-1h")).Unix}}`, + }, map[string]string{ + "grafana_url": 
"vm-grafana.com?from=1660944898&to=1660937698", + }) } func TestAlert_toPromLabels(t *testing.T) { diff --git a/app/vmalert/notifier/alertmanager_test.go b/app/vmalert/notifier/alertmanager_test.go index 5f3ed8c6e..94868b8bf 100644 --- a/app/vmalert/notifier/alertmanager_test.go +++ b/app/vmalert/notifier/alertmanager_test.go @@ -16,10 +16,10 @@ func TestAlertManager_Addr(t *testing.T) { const addr = "http://localhost" am, err := NewAlertManager(addr, nil, promauth.HTTPClientConfig{}, nil, 0) if err != nil { - t.Errorf("unexpected error: %s", err) + t.Fatalf("unexpected error: %s", err) } if am.Addr() != addr { - t.Errorf("expected to have %q; got %q", addr, am.Addr()) + t.Fatalf("expected to have %q; got %q", addr, am.Addr()) } } @@ -28,21 +28,20 @@ func TestAlertManager_Send(t *testing.T) { const headerKey, headerValue = "TenantID", "foo" mux := http.NewServeMux() mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) { - t.Errorf("should not be called") + t.Fatalf("should not be called") }) c := -1 mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) { user, pass, ok := r.BasicAuth() if !ok { - t.Errorf("unauthorized request") + t.Fatalf("unauthorized request") } if user != baUser || pass != baPass { - t.Errorf("wrong creds %q:%q; expected %q:%q", - user, pass, baUser, baPass) + t.Fatalf("wrong creds %q:%q; expected %q:%q", user, pass, baUser, baPass) } c++ if r.Method != http.MethodPost { - t.Errorf("expected POST method got %s", r.Method) + t.Fatalf("expected POST method got %s", r.Method) } switch c { case 0: @@ -59,25 +58,23 @@ func TestAlertManager_Send(t *testing.T) { GeneratorURL string `json:"generatorURL"` } if err := json.NewDecoder(r.Body).Decode(&a); err != nil { - t.Errorf("can not unmarshal data into alert %s", err) - t.FailNow() + t.Fatalf("can not unmarshal data into alert %s", err) } if len(a) != 1 { - t.Errorf("expected 1 alert in array got %d", len(a)) + t.Fatalf("expected 1 alert in array got %d", len(a)) } 
if a[0].GeneratorURL != "0/0" { - t.Errorf("expected 0/0 as generatorURL got %s", a[0].GeneratorURL) + t.Fatalf("expected 0/0 as generatorURL got %s", a[0].GeneratorURL) } if a[0].StartsAt.IsZero() { - t.Errorf("expected non-zero start time") + t.Fatalf("expected non-zero start time") } if a[0].EndAt.IsZero() { - t.Errorf("expected non-zero end time") + t.Fatalf("expected non-zero end time") } case 3: if r.Header.Get(headerKey) != headerValue { - t.Errorf("expected header %q to be set to %q; got %q instead", - headerKey, headerValue, r.Header.Get(headerKey)) + t.Fatalf("expected header %q to be set to %q; got %q instead", headerKey, headerValue, r.Header.Get(headerKey)) } } }) @@ -94,13 +91,13 @@ func TestAlertManager_Send(t *testing.T) { return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10) }, aCfg, nil, 0) if err != nil { - t.Errorf("unexpected error: %s", err) + t.Fatalf("unexpected error: %s", err) } if err := am.Send(context.Background(), []Alert{{}, {}}, nil); err == nil { - t.Error("expected connection error got nil") + t.Fatalf("expected connection error got nil") } if err := am.Send(context.Background(), []Alert{}, nil); err == nil { - t.Error("expected wrong http code error got nil") + t.Fatalf("expected wrong http code error got nil") } if err := am.Send(context.Background(), []Alert{{ GroupID: 0, @@ -109,12 +106,12 @@ func TestAlertManager_Send(t *testing.T) { End: time.Now().UTC(), Annotations: map[string]string{"a": "b", "c": "d", "e": "f"}, }}, nil); err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } if c != 2 { - t.Errorf("expected 2 calls(count from zero) to server got %d", c) + t.Fatalf("expected 2 calls(count from zero) to server got %d", c) } if err := am.Send(context.Background(), nil, map[string]string{headerKey: headerValue}); err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } } diff --git a/app/vmalert/notifier/config_test.go 
b/app/vmalert/notifier/config_test.go index 442b2f3a7..86e577cba 100644 --- a/app/vmalert/notifier/config_test.go +++ b/app/vmalert/notifier/config_test.go @@ -5,10 +5,14 @@ import ( "testing" ) -func TestConfigParseGood(t *testing.T) { +func TestParseConfig_Success(t *testing.T) { f := func(path string) { + t.Helper() + _, err := parseConfig(path) - checkErr(t, err) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } } f("testdata/mixed.good.yaml") f("testdata/consul.good.yaml") @@ -16,14 +20,16 @@ func TestConfigParseGood(t *testing.T) { f("testdata/static.good.yaml") } -func TestConfigParseBad(t *testing.T) { +func TestParseConfig_Failure(t *testing.T) { f := func(path, expErr string) { + t.Helper() + _, err := parseConfig(path) if err == nil { t.Fatalf("expected to get non-nil err for config %q", path) } if !strings.Contains(err.Error(), expErr) { - t.Errorf("expected err to contain %q; got %q instead", expErr, err) + t.Fatalf("expected err to contain %q; got %q instead", expErr, err) } } diff --git a/app/vmalert/notifier/config_watcher_test.go b/app/vmalert/notifier/config_watcher_test.go index 2f00d7289..627b7bc23 100644 --- a/app/vmalert/notifier/config_watcher_test.go +++ b/app/vmalert/notifier/config_watcher_test.go @@ -319,46 +319,41 @@ func TestMergeHTTPClientConfigs(t *testing.T) { } } -func TestParseLabels(t *testing.T) { - testCases := []struct { - name string - target string - cfg *Config - expectedAddress string - expectedErr bool - }{ - { - "invalid address", - "invalid:*//url", - &Config{}, - "", - true, - }, - { - "use some default params", - "alertmanager:9093", - &Config{PathPrefix: "test"}, - "http://alertmanager:9093/test/api/v2/alerts", - false, - }, - { - "use target address", - "https://alertmanager:9093/api/v1/alerts", - &Config{Scheme: "http", PathPrefix: "test"}, - "https://alertmanager:9093/api/v1/alerts", - false, - }, +func TestParseLabels_Failure(t *testing.T) { + f := func(target string, cfg *Config) { + t.Helper() + + _, 
_, err := parseLabels(target, nil, cfg) + if err == nil { + t.Fatalf("expecting non-nil error") + } } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - address, _, err := parseLabels(tc.target, nil, tc.cfg) - if err == nil == tc.expectedErr { - t.Fatalf("unexpected error; got %t; want %t", err != nil, tc.expectedErr) - } - if address != tc.expectedAddress { - t.Fatalf("unexpected address; got %q; want %q", address, tc.expectedAddress) - } - }) - } + // invalid address + f("invalid:*//url", &Config{}) +} + +func TestParseLabels_Success(t *testing.T) { + f := func(target string, cfg *Config, expectedAddress string) { + t.Helper() + + address, _, err := parseLabels(target, nil, cfg) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + if address != expectedAddress { + t.Fatalf("unexpected address; got %q; want %q", address, expectedAddress) + } + } + + // use some default params + f("alertmanager:9093", &Config{ + PathPrefix: "test", + }, "http://alertmanager:9093/test/api/v2/alerts") + + // use target address + f("https://alertmanager:9093/api/v1/alerts", &Config{ + Scheme: "http", + PathPrefix: "test", + }, "https://alertmanager:9093/api/v1/alerts") } diff --git a/app/vmalert/notifier/notifier_blackhole_test.go b/app/vmalert/notifier/notifier_blackhole_test.go index 236f42275..c9615b0b1 100644 --- a/app/vmalert/notifier/notifier_blackhole_test.go +++ b/app/vmalert/notifier/notifier_blackhole_test.go @@ -17,12 +17,12 @@ func TestBlackHoleNotifier_Send(t *testing.T) { End: time.Now().UTC(), Annotations: map[string]string{"a": "b", "c": "d", "e": "f"}, }}, nil); err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } alertCount := bh.metrics.alertsSent.Get() if alertCount != 1 { - t.Errorf("expect value 1; instead got %d", alertCount) + t.Fatalf("expect value 1; instead got %d", alertCount) } } @@ -35,7 +35,7 @@ func TestBlackHoleNotifier_Close(t *testing.T) { End: time.Now().UTC(), Annotations: 
map[string]string{"a": "b", "c": "d", "e": "f"}, }}, nil); err != nil { - t.Errorf("unexpected error %s", err) + t.Fatalf("unexpected error %s", err) } bh.Close() @@ -44,7 +44,7 @@ func TestBlackHoleNotifier_Close(t *testing.T) { alertMetricName := "vmalert_alerts_sent_total{addr=\"blackhole\"}" for _, name := range defaultMetrics.ListMetricNames() { if name == alertMetricName { - t.Errorf("Metric name should have unregistered.But still present") + t.Fatalf("Metric name should have unregistered.But still present") } } } diff --git a/app/vmalert/remoteread/init.go b/app/vmalert/remoteread/init.go index e243857cb..c7ce0ea1d 100644 --- a/app/vmalert/remoteread/init.go +++ b/app/vmalert/remoteread/init.go @@ -10,6 +10,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" ) var ( @@ -70,7 +71,7 @@ func Init() (datasource.QuerierBuilder, error) { return nil, fmt.Errorf("failed to create transport: %w", err) } tr.IdleConnTimeout = *idleConnectionTimeout - tr.DialContext = httputils.GetStatDialFunc("vmalert_remoteread") + tr.DialContext = netutil.NewStatDialFunc("vmalert_remoteread") endpointParams, err := flagutil.ParseJSONMap(*oauth2EndpointParams) if err != nil { diff --git a/app/vmalert/remotewrite/client_test.go b/app/vmalert/remotewrite/client_test.go index c88dcb5ce..b16f0827c 100644 --- a/app/vmalert/remotewrite/client_test.go +++ b/app/vmalert/remotewrite/client_test.go @@ -44,8 +44,7 @@ func TestClient_Push(t *testing.T) { } r := rand.New(rand.NewSource(1)) - const rowsN = 1e4 - var sent int + const rowsN = int(1e4) for i := 0; i < rowsN; i++ { s := prompbmarshal.TimeSeries{ Samples: []prompbmarshal.Sample{{ @@ -57,17 +56,11 @@ func TestClient_Push(t *testing.T) { if err != nil { t.Fatalf("unexpected err: %s", err) } - if err == nil { - sent++ - } err = 
faultyClient.Push(s) if err != nil { t.Fatalf("unexpected err: %s", err) } } - if sent == 0 { - t.Fatalf("0 series sent") - } if err := client.Close(); err != nil { t.Fatalf("failed to close client: %s", err) } @@ -75,77 +68,66 @@ func TestClient_Push(t *testing.T) { t.Fatalf("failed to close faulty client: %s", err) } got := testSrv.accepted() - if got != sent { - t.Fatalf("expected to have %d series; got %d", sent, got) + if got != rowsN { + t.Fatalf("expected to have %d series; got %d", rowsN, got) } got = faultySrv.accepted() - if got != sent { - t.Fatalf("expected to have %d series for faulty client; got %d", sent, got) + if got != rowsN { + t.Fatalf("expected to have %d series for faulty client; got %d", rowsN, got) } } func TestClient_run_maxBatchSizeDuringShutdown(t *testing.T) { - batchSize := 20 + const batchSize = 20 - testTable := []struct { - name string // name of the test case - pushCnt int // how many time series is pushed to the client - batchCnt int // the expected batch count sent by the client - }{ - { - name: "pushCnt % batchSize == 0", - pushCnt: batchSize * 40, - batchCnt: 40, - }, - { - name: "pushCnt % batchSize != 0", - pushCnt: batchSize*40 + 1, - batchCnt: 40 + 1, - }, - } + f := func(pushCnt, batchCntExpected int) { + t.Helper() - for _, tt := range testTable { - t.Run(tt.name, func(t *testing.T) { - // run new server - bcServer := newBatchCntRWServer() + // run new server + bcServer := newBatchCntRWServer() - // run new client - rwClient, err := NewClient(context.Background(), Config{ - MaxBatchSize: batchSize, + // run new client + rwClient, err := NewClient(context.Background(), Config{ + MaxBatchSize: batchSize, - // Set everything to 1 to simplify the calculation. - Concurrency: 1, - MaxQueueSize: 1000, - FlushInterval: time.Minute, + // Set everything to 1 to simplify the calculation. 
+ Concurrency: 1, + MaxQueueSize: 1000, + FlushInterval: time.Minute, - // batch count server - Addr: bcServer.URL, - }) - if err != nil { - t.Fatalf("new remote write client failed, err: %v", err) - } - - // push time series to the client. - for i := 0; i < tt.pushCnt; i++ { - if err = rwClient.Push(prompbmarshal.TimeSeries{}); err != nil { - t.Fatalf("push time series to the client failed, err: %v", err) - } - } - - // close the client so the rest ts will be flushed in `shutdown` - if err = rwClient.Close(); err != nil { - t.Fatalf("shutdown client failed, err: %v", err) - } - - // finally check how many batches is sent. - if tt.batchCnt != bcServer.acceptedBatches() { - t.Errorf("client sent batch count incorrect, want: %d, get: %d", tt.batchCnt, bcServer.acceptedBatches()) - } - if tt.pushCnt != bcServer.accepted() { - t.Errorf("client sent time series count incorrect, want: %d, get: %d", tt.pushCnt, bcServer.accepted()) - } + // batch count server + Addr: bcServer.URL, }) + if err != nil { + t.Fatalf("cannot create remote write client: %s", err) + } + + // push time series to the client. + for i := 0; i < pushCnt; i++ { + if err = rwClient.Push(prompbmarshal.TimeSeries{}); err != nil { + t.Fatalf("cannot time series to the client: %s", err) + } + } + + // close the client so the rest ts will be flushed in `shutdown` + if err = rwClient.Close(); err != nil { + t.Fatalf("cannot shutdown client: %s", err) + } + + // finally check how many batches is sent. 
+ if bcServer.acceptedBatches() != batchCntExpected { + t.Fatalf("client sent batch count incorrect; got %d; want %d", bcServer.acceptedBatches(), batchCntExpected) + } + if pushCnt != bcServer.accepted() { + t.Fatalf("client sent time series count incorrect; got %d; want %d", bcServer.accepted(), pushCnt) + } } + + // pushCnt % batchSize == 0 + f(batchSize*40, 40) + + //pushCnt % batchSize != 0 + f(batchSize*40+1, 40+1) } func newRWServer() *rwServer { diff --git a/app/vmalert/remotewrite/init.go b/app/vmalert/remotewrite/init.go index 05309da8f..bd8352d91 100644 --- a/app/vmalert/remotewrite/init.go +++ b/app/vmalert/remotewrite/init.go @@ -9,6 +9,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" ) var ( @@ -74,7 +75,7 @@ func Init(ctx context.Context) (*Client, error) { return nil, fmt.Errorf("failed to create transport: %w", err) } t.IdleConnTimeout = *idleConnectionTimeout - t.DialContext = httputils.GetStatDialFunc("vmalert_remotewrite") + t.DialContext = netutil.NewStatDialFunc("vmalert_remotewrite") endpointParams, err := flagutil.ParseJSONMap(*oauth2EndpointParams) if err != nil { diff --git a/app/vmalert/replay_test.go b/app/vmalert/replay_test.go index f4b0e504f..ff43f4819 100644 --- a/app/vmalert/replay_test.go +++ b/app/vmalert/replay_test.go @@ -39,135 +39,102 @@ func (fr *fakeReplayQuerier) QueryRange(_ context.Context, q string, from, to ti } func TestReplay(t *testing.T) { - testCases := []struct { - name string - from, to string - maxDP int - cfg []config.Group - qb *fakeReplayQuerier - }{ - { - name: "one rule + one response", - from: "2021-01-01T12:00:00.000Z", - to: "2021-01-01T12:02:00.000Z", - maxDP: 10, - cfg: []config.Group{ - {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, - }, - qb: &fakeReplayQuerier{ - registry: 
map[string]map[string]struct{}{ - "sum(up)": {"12:00:00+12:02:00": {}}, - }, - }, - }, - { - name: "one rule + multiple responses", - from: "2021-01-01T12:00:00.000Z", - to: "2021-01-01T12:02:30.000Z", - maxDP: 1, - cfg: []config.Group{ - {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, - }, - qb: &fakeReplayQuerier{ - registry: map[string]map[string]struct{}{ - "sum(up)": { - "12:00:00+12:01:00": {}, - "12:01:00+12:02:00": {}, - "12:02:00+12:02:30": {}, - }, - }, - }, - }, - { - name: "datapoints per step", - from: "2021-01-01T12:00:00.000Z", - to: "2021-01-01T15:02:30.000Z", - maxDP: 60, - cfg: []config.Group{ - {Interval: promutils.NewDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, - }, - qb: &fakeReplayQuerier{ - registry: map[string]map[string]struct{}{ - "sum(up)": { - "12:00:00+13:00:00": {}, - "13:00:00+14:00:00": {}, - "14:00:00+15:00:00": {}, - "15:00:00+15:02:30": {}, - }, - }, - }, - }, - { - name: "multiple recording rules + multiple responses", - from: "2021-01-01T12:00:00.000Z", - to: "2021-01-01T12:02:30.000Z", - maxDP: 1, - cfg: []config.Group{ - {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, - {Rules: []config.Rule{{Record: "bar", Expr: "max(up)"}}}, - }, - qb: &fakeReplayQuerier{ - registry: map[string]map[string]struct{}{ - "sum(up)": { - "12:00:00+12:01:00": {}, - "12:01:00+12:02:00": {}, - "12:02:00+12:02:30": {}, - }, - "max(up)": { - "12:00:00+12:01:00": {}, - "12:01:00+12:02:00": {}, - "12:02:00+12:02:30": {}, - }, - }, - }, - }, - { - name: "multiple alerting rules + multiple responses", - from: "2021-01-01T12:00:00.000Z", - to: "2021-01-01T12:02:30.000Z", - maxDP: 1, - cfg: []config.Group{ - {Rules: []config.Rule{{Alert: "foo", Expr: "sum(up) > 1"}}}, - {Rules: []config.Rule{{Alert: "bar", Expr: "max(up) < 1"}}}, - }, - qb: &fakeReplayQuerier{ - registry: map[string]map[string]struct{}{ - "sum(up) > 1": { - "12:00:00+12:01:00": {}, - "12:01:00+12:02:00": {}, - "12:02:00+12:02:30": {}, - 
}, - "max(up) < 1": { - "12:00:00+12:01:00": {}, - "12:01:00+12:02:00": {}, - "12:02:00+12:02:30": {}, - }, - }, - }, - }, + f := func(from, to string, maxDP int, cfg []config.Group, qb *fakeReplayQuerier) { + t.Helper() + + fromOrig, toOrig, maxDatapointsOrig := *replayFrom, *replayTo, *replayMaxDatapoints + retriesOrig, delayOrig := *replayRuleRetryAttempts, *replayRulesDelay + defer func() { + *replayFrom, *replayTo = fromOrig, toOrig + *replayMaxDatapoints, *replayRuleRetryAttempts = maxDatapointsOrig, retriesOrig + *replayRulesDelay = delayOrig + }() + + *replayRuleRetryAttempts = 1 + *replayRulesDelay = time.Millisecond + rwb := &remotewrite.DebugClient{} + *replayFrom = from + *replayTo = to + *replayMaxDatapoints = maxDP + if err := replay(cfg, qb, rwb); err != nil { + t.Fatalf("replay failed: %s", err) + } + if len(qb.registry) > 0 { + t.Fatalf("not all requests were sent: %#v", qb.registry) + } } - from, to, maxDP := *replayFrom, *replayTo, *replayMaxDatapoints - retries, delay := *replayRuleRetryAttempts, *replayRulesDelay - defer func() { - *replayFrom, *replayTo = from, to - *replayMaxDatapoints, *replayRuleRetryAttempts = maxDP, retries - *replayRulesDelay = delay - }() + // one rule + one response + f("2021-01-01T12:00:00.000Z", "2021-01-01T12:02:00.000Z", 10, []config.Group{ + {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, + }, &fakeReplayQuerier{ + registry: map[string]map[string]struct{}{ + "sum(up)": {"12:00:00+12:02:00": {}}, + }, + }) - *replayRuleRetryAttempts = 1 - *replayRulesDelay = time.Millisecond - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - *replayFrom = tc.from - *replayTo = tc.to - *replayMaxDatapoints = tc.maxDP - if err := replay(tc.cfg, tc.qb, &remotewrite.DebugClient{}); err != nil { - t.Fatalf("replay failed: %s", err) - } - if len(tc.qb.registry) > 0 { - t.Fatalf("not all requests were sent: %#v", tc.qb.registry) - } - }) - } + // one rule + multiple responses + f("2021-01-01T12:00:00.000Z", 
"2021-01-01T12:02:30.000Z", 1, []config.Group{ + {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, + }, &fakeReplayQuerier{ + registry: map[string]map[string]struct{}{ + "sum(up)": { + "12:00:00+12:01:00": {}, + "12:01:00+12:02:00": {}, + "12:02:00+12:02:30": {}, + }, + }, + }) + + // datapoints per step + f("2021-01-01T12:00:00.000Z", "2021-01-01T15:02:30.000Z", 60, []config.Group{ + {Interval: promutils.NewDuration(time.Minute), Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, + }, &fakeReplayQuerier{ + registry: map[string]map[string]struct{}{ + "sum(up)": { + "12:00:00+13:00:00": {}, + "13:00:00+14:00:00": {}, + "14:00:00+15:00:00": {}, + "15:00:00+15:02:30": {}, + }, + }, + }) + + // multiple recording rules + multiple responses + f("2021-01-01T12:00:00.000Z", "2021-01-01T12:02:30.000Z", 1, []config.Group{ + {Rules: []config.Rule{{Record: "foo", Expr: "sum(up)"}}}, + {Rules: []config.Rule{{Record: "bar", Expr: "max(up)"}}}, + }, &fakeReplayQuerier{ + registry: map[string]map[string]struct{}{ + "sum(up)": { + "12:00:00+12:01:00": {}, + "12:01:00+12:02:00": {}, + "12:02:00+12:02:30": {}, + }, + "max(up)": { + "12:00:00+12:01:00": {}, + "12:01:00+12:02:00": {}, + "12:02:00+12:02:30": {}, + }, + }, + }) + + // multiple alerting rules + multiple responses + f("2021-01-01T12:00:00.000Z", "2021-01-01T12:02:30.000Z", 1, []config.Group{ + {Rules: []config.Rule{{Alert: "foo", Expr: "sum(up) > 1"}}}, + {Rules: []config.Rule{{Alert: "bar", Expr: "max(up) < 1"}}}, + }, &fakeReplayQuerier{ + registry: map[string]map[string]struct{}{ + "sum(up) > 1": { + "12:00:00+12:01:00": {}, + "12:01:00+12:02:00": {}, + "12:02:00+12:02:30": {}, + }, + "max(up) < 1": { + "12:00:00+12:01:00": {}, + "12:01:00+12:02:00": {}, + "12:02:00+12:02:30": {}, + }, + }, + }) } diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go index 07dda4273..9558e9db9 100644 --- a/app/vmalert/rule/alerting_test.go +++ b/app/vmalert/rule/alerting_test.go @@ -19,115 
+19,102 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" ) -func TestAlertingRule_ToTimeSeries(t *testing.T) { +func TestAlertingRuleToTimeSeries(t *testing.T) { timestamp := time.Now() - testCases := []struct { - rule *AlertingRule - alert *notifier.Alert - expTS []prompbmarshal.TimeSeries - }{ - { - newTestAlertingRule("instant", 0), - ¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)}, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertMetricName, - alertStateLabel: notifier.StateFiring.String(), - }), - newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, - []int64{timestamp.UnixNano()}, - map[string]string{ - "__name__": alertForStateMetricName, - }), - }, - }, - { - newTestAlertingRule("instant extra labels", 0), - ¬ifier.Alert{ - State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second), - Labels: map[string]string{ - "job": "foo", - "instance": "bar", - }, - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertMetricName, - alertStateLabel: notifier.StateFiring.String(), - "job": "foo", - "instance": "bar", - }), - newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, - []int64{timestamp.UnixNano()}, - map[string]string{ - "__name__": alertForStateMetricName, - "job": "foo", - "instance": "bar", - }), - }, - }, - { - newTestAlertingRule("instant labels override", 0), - ¬ifier.Alert{ - State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second), - Labels: map[string]string{ - alertStateLabel: "foo", - "__name__": "bar", - }, - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertMetricName, - alertStateLabel: notifier.StateFiring.String(), - }), - newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, - 
[]int64{timestamp.UnixNano()}, - map[string]string{ - "__name__": alertForStateMetricName, - alertStateLabel: "foo", - }), - }, - }, - { - newTestAlertingRule("for", time.Second), - ¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)}, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertMetricName, - alertStateLabel: notifier.StateFiring.String(), - }), - newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, - []int64{timestamp.UnixNano()}, - map[string]string{ - "__name__": alertForStateMetricName, - }), - }, - }, - { - newTestAlertingRule("for pending", 10*time.Second), - ¬ifier.Alert{State: notifier.StatePending, ActiveAt: timestamp.Add(time.Second)}, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertMetricName, - alertStateLabel: notifier.StatePending.String(), - }), - newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, - []int64{timestamp.UnixNano()}, - map[string]string{ - "__name__": alertForStateMetricName, - }), - }, - }, - } - for _, tc := range testCases { - t.Run(tc.rule.Name, func(t *testing.T) { - tc.rule.alerts[tc.alert.ID] = tc.alert - tss := tc.rule.toTimeSeries(timestamp.Unix()) - if err := compareTimeSeries(t, tc.expTS, tss); err != nil { - t.Fatalf("timeseries missmatch: %s", err) - } - }) + + f := func(rule *AlertingRule, alert *notifier.Alert, tssExpected []prompbmarshal.TimeSeries) { + t.Helper() + + rule.alerts[alert.ID] = alert + tss := rule.toTimeSeries(timestamp.Unix()) + if err := compareTimeSeries(t, tssExpected, tss); err != nil { + t.Fatalf("timeseries mismatch: %s", err) + } } + + f(newTestAlertingRule("instant", 0), ¬ifier.Alert{ + State: notifier.StateFiring, + ActiveAt: timestamp.Add(time.Second), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + 
"__name__": alertMetricName, + alertStateLabel: notifier.StateFiring.String(), + }), + newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, + []int64{timestamp.UnixNano()}, + map[string]string{ + "__name__": alertForStateMetricName, + }), + }) + + f(newTestAlertingRule("instant extra labels", 0), ¬ifier.Alert{ + State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second), + Labels: map[string]string{ + "job": "foo", + "instance": "bar", + }, + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": alertMetricName, + alertStateLabel: notifier.StateFiring.String(), + "job": "foo", + "instance": "bar", + }), + newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, + []int64{timestamp.UnixNano()}, + map[string]string{ + "__name__": alertForStateMetricName, + "job": "foo", + "instance": "bar", + }), + }) + + f(newTestAlertingRule("instant labels override", 0), ¬ifier.Alert{ + State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second), + Labels: map[string]string{ + alertStateLabel: "foo", + "__name__": "bar", + }, + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": alertMetricName, + alertStateLabel: notifier.StateFiring.String(), + }), + newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, + []int64{timestamp.UnixNano()}, + map[string]string{ + "__name__": alertForStateMetricName, + alertStateLabel: "foo", + }), + }) + + f(newTestAlertingRule("for", time.Second), ¬ifier.Alert{ + State: notifier.StateFiring, + ActiveAt: timestamp.Add(time.Second), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": alertMetricName, + alertStateLabel: notifier.StateFiring.String(), + }), + newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, + []int64{timestamp.UnixNano()}, + map[string]string{ + 
"__name__": alertForStateMetricName, + }), + }) + + f(newTestAlertingRule("for pending", 10*time.Second), ¬ifier.Alert{ + State: notifier.StatePending, + ActiveAt: timestamp.Add(time.Second), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": alertMetricName, + alertStateLabel: notifier.StatePending.String(), + }), + newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": alertForStateMetricName, + }), + }) } func TestAlertingRule_Exec(t *testing.T) { @@ -136,516 +123,437 @@ func TestAlertingRule_Exec(t *testing.T) { labels []string alert *notifier.Alert } - testCases := []struct { - rule *AlertingRule - steps [][]datasource.Metric - expAlerts map[int][]testAlert - }{ - { - newTestAlertingRule("empty", 0), - [][]datasource.Metric{}, - nil, - }, - { - newTestAlertingRule("empty labels", 0), - [][]datasource.Metric{ - {datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}}, - }, - map[int][]testAlert{ - 0: {{alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - { - newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive", 0), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {}, - {metricWithLabels(t, "name", "foo")}, - {}, - {}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - }, - }, - { - newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive=>firing", 0), - [][]datasource.Metric{ - {metricWithLabels(t, 
"name", "foo")}, - {}, - {metricWithLabels(t, "name", "foo")}, - {}, - {}, - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 5: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - { - newTestAlertingRule("multiple-firing", 0), - [][]datasource.Metric{ - { - metricWithLabels(t, "name", "foo"), - metricWithLabels(t, "name", "foo1"), - metricWithLabels(t, "name", "foo2"), - }, - }, - map[int][]testAlert{ - 0: { - {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - {labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - {labels: []string{"name", "foo2"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - }, - }, - }, - { - newTestAlertingRule("multiple-steps-firing", 0), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo1")}, - {metricWithLabels(t, "name", "foo2")}, - }, - // 1: fire first alert - // 2: fire second alert, set first inactive - // 3: fire third alert, set second inactive - map[int][]testAlert{ - 0: { - {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - }, - 1: { - {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}, - {labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - }, - 2: { - {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}, - {labels: []string{"name", "foo1"}, alert: 
¬ifier.Alert{State: notifier.StateInactive}}, - {labels: []string{"name", "foo2"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, - }, - }, - }, - { - newTestAlertingRule("for-pending", time.Minute), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - }, - }, - { - newTestAlertingRule("for-fired", defaultStep), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - { - newTestAlertingRule("for-pending=>empty", time.Second), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - // empty step to delete pending alerts - {}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 2: {}, - }, - }, - { - newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - // empty step to set alert inactive - {}, - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 4: {{labels: []string{"name", "foo"}, alert: 
¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - { - newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>firing", defaultStep, defaultStep), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - // empty step to keep firing - {}, - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - { - newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>keepfiring=>inactive=>pending=>firing", defaultStep, 2*defaultStep), - [][]datasource.Metric{ - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - // empty step to keep firing - {}, - // another empty step to keep firing - {}, - // empty step to set alert inactive - {}, - {metricWithLabels(t, "name", "foo")}, - {metricWithLabels(t, "name", "foo")}, - }, - map[int][]testAlert{ - 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, - 5: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, - 6: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, - }, - }, - } - fakeGroup := Group{Name: "TestRule_Exec"} - for _, tc := range testCases { - t.Run(tc.rule.Name, 
func(t *testing.T) { - fq := &datasource.FakeQuerier{} - tc.rule.q = fq - tc.rule.GroupID = fakeGroup.ID() - ts := time.Now() - for i, step := range tc.steps { - fq.Reset() - fq.Add(step...) - if _, err := tc.rule.exec(context.TODO(), ts, 0); err != nil { - t.Fatalf("unexpected err: %s", err) - } - // shift the execution timestamp before the next iteration - ts = ts.Add(defaultStep) + f := func(rule *AlertingRule, steps [][]datasource.Metric, alertsExpected map[int][]testAlert) { + t.Helper() - if _, ok := tc.expAlerts[i]; !ok { - continue + fq := &datasource.FakeQuerier{} + rule.q = fq + + fakeGroup := Group{ + Name: "TestRule_Exec", + } + rule.GroupID = fakeGroup.ID() + ts := time.Now() + for i, step := range steps { + fq.Reset() + fq.Add(step...) + if _, err := rule.exec(context.TODO(), ts, 0); err != nil { + t.Fatalf("unexpected error: %s", err) + } + + // shift the execution timestamp before the next iteration + ts = ts.Add(defaultStep) + + if _, ok := alertsExpected[i]; !ok { + continue + } + if len(rule.alerts) != len(alertsExpected[i]) { + t.Fatalf("evalIndex %d: expected %d alerts; got %d", i, len(alertsExpected[i]), len(rule.alerts)) + } + expAlerts := make(map[uint64]*notifier.Alert) + for _, ta := range alertsExpected[i] { + labels := make(map[string]string) + for i := 0; i < len(ta.labels); i += 2 { + k, v := ta.labels[i], ta.labels[i+1] + labels[k] = v } - if len(tc.rule.alerts) != len(tc.expAlerts[i]) { - t.Fatalf("evalIndex %d: expected %d alerts; got %d", i, len(tc.expAlerts[i]), len(tc.rule.alerts)) + labels[alertNameLabel] = rule.Name + h := hash(labels) + expAlerts[h] = ta.alert + } + for key, exp := range expAlerts { + got, ok := rule.alerts[key] + if !ok { + t.Fatalf("evalIndex %d: expected to have key %d", i, key) } - expAlerts := make(map[uint64]*notifier.Alert) - for _, ta := range tc.expAlerts[i] { - labels := make(map[string]string) - for i := 0; i < len(ta.labels); i += 2 { - k, v := ta.labels[i], ta.labels[i+1] - labels[k] = v - } - 
labels[alertNameLabel] = tc.rule.Name - h := hash(labels) - expAlerts[h] = ta.alert - } - for key, exp := range expAlerts { - got, ok := tc.rule.alerts[key] - if !ok { - t.Fatalf("evalIndex %d: expected to have key %d", i, key) - } - if got.State != exp.State { - t.Fatalf("evalIndex %d: expected state %d; got %d", i, exp.State, got.State) - } + if got.State != exp.State { + t.Fatalf("evalIndex %d: expected state %d; got %d", i, exp.State, got.State) } } - }) + } } + + f(newTestAlertingRule("empty", 0), [][]datasource.Metric{}, nil) + + f(newTestAlertingRule("empty labels", 0), [][]datasource.Metric{ + {datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}}, + }, map[int][]testAlert{ + 0: {{alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) + + f(newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive", 0), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {}, + {metricWithLabels(t, "name", "foo")}, + {}, + {}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + }) + + f(newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive=>firing", 0), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {}, + {metricWithLabels(t, "name", "foo")}, + {}, + {}, + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 2: {{labels: []string{"name", "foo"}, alert: 
¬ifier.Alert{State: notifier.StateFiring}}}, + 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 5: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) + + f(newTestAlertingRule("multiple-firing", 0), [][]datasource.Metric{ + { + metricWithLabels(t, "name", "foo"), + metricWithLabels(t, "name", "foo1"), + metricWithLabels(t, "name", "foo2"), + }, + }, map[int][]testAlert{ + 0: { + {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + {labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + {labels: []string{"name", "foo2"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + }, + }) + + // 1: fire first alert + // 2: fire second alert, set first inactive + // 3: fire third alert, set second inactive + f(newTestAlertingRule("multiple-steps-firing", 0), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo1")}, + {metricWithLabels(t, "name", "foo2")}, + }, + map[int][]testAlert{ + 0: { + {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + }, + 1: { + {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}, + {labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + }, + 2: { + {labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}, + {labels: []string{"name", "foo1"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}, + {labels: []string{"name", "foo2"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}, + }, + }) + + f(newTestAlertingRule("for-pending", time.Minute), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + }) + + 
f(newTestAlertingRule("for-fired", defaultStep), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) + + f(newTestAlertingRule("for-pending=>empty", time.Second), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + // empty step to delete pending alerts + {}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 2: {}, + }) + + f(newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + // empty step to set alert inactive + {}, + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) + + f(newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>firing", defaultStep, defaultStep), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + // empty step to keep firing + {}, + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: 
¬ifier.Alert{State: notifier.StatePending}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) + + f(newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>keepfiring=>inactive=>pending=>firing", defaultStep, 2*defaultStep), [][]datasource.Metric{ + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + // empty step to keep firing + {}, + // another empty step to keep firing + {}, + // empty step to set alert inactive + {}, + {metricWithLabels(t, "name", "foo")}, + {metricWithLabels(t, "name", "foo")}, + }, map[int][]testAlert{ + 0: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 1: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 2: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 3: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + 4: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateInactive}}}, + 5: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StatePending}}}, + 6: {{labels: []string{"name", "foo"}, alert: ¬ifier.Alert{State: notifier.StateFiring}}}, + }) } -func TestAlertingRule_ExecRange(t *testing.T) { - fakeGroup := Group{Name: "TestRule_ExecRange"} - testCases := []struct { - rule *AlertingRule - data []datasource.Metric - expAlerts []*notifier.Alert - expHoldAlertStateAlerts map[uint64]*notifier.Alert - }{ - { - newTestAlertingRule("empty", 0), - []datasource.Metric{}, - nil, - nil, - }, - { - newTestAlertingRule("empty labels", 0), - []datasource.Metric{ - {Values: []float64{1}, Timestamps: []int64{1}}, - }, - []*notifier.Alert{ - {State: notifier.StateFiring, ActiveAt: 
time.Unix(1, 0)}, - }, - nil, - }, - { - newTestAlertingRule("single-firing", 0), - []datasource.Metric{ - metricWithLabels(t, "name", "foo"), - }, - []*notifier.Alert{ - { - Labels: map[string]string{"name": "foo"}, - State: notifier.StateFiring, - ActiveAt: time.Unix(1, 0), - }, - }, - nil, - }, - { - newTestAlertingRule("single-firing-on-range", 0), - []datasource.Metric{ - {Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}}, - }, - []*notifier.Alert{ - {State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)}, - }, - nil, - }, - { - newTestAlertingRule("for-pending", time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}}, - }, - []*notifier.Alert{ - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(3, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, - }, - map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): { - GroupID: fakeGroup.ID(), - Name: "for-pending", - Labels: map[string]string{"alertname": "for-pending"}, - Annotations: map[string]string{}, - State: notifier.StatePending, - ActiveAt: time.Unix(5, 0), - Value: 1, - For: time.Second, - }}, - }, - { - newTestAlertingRule("for-firing", 3*time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}}, - }, - []*notifier.Alert{ - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, - }, - map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): { - GroupID: fakeGroup.ID(), - Name: "for-firing", - Labels: map[string]string{"alertname": "for-firing"}, - Annotations: map[string]string{}, - State: notifier.StateFiring, - ActiveAt: time.Unix(1, 0), - 
Start: time.Unix(5, 0), - Value: 1, - For: 3 * time.Second, - }}, - }, - { - newTestAlertingRule("for-hold-pending", time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}}, - }, - []*notifier.Alert{ - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, - }, - map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): { - GroupID: fakeGroup.ID(), - Name: "for-hold-pending", - Labels: map[string]string{"alertname": "for-hold-pending"}, - Annotations: map[string]string{}, - State: notifier.StatePending, - ActiveAt: time.Unix(5, 0), - Value: 1, - For: time.Second, - }}, - }, - { - newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}}, - }, - []*notifier.Alert{ - {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, - // It is expected for ActiveAT to remain the same while rule continues to fire in each iteration - {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, - }, - nil, - }, - { - newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}}, - }, - []*notifier.Alert{ - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(20, 0)}, - }, - nil, - }, - { - newTestAlertingRule("multi-series", 3*time.Second), - []datasource.Metric{ - {Values: []float64{1, 1, 1}, 
Timestamps: []int64{1, 3, 5}}, - { - Values: []float64{1, 1}, Timestamps: []int64{1, 5}, - Labels: []datasource.Label{{Name: "foo", Value: "bar"}}, - }, - }, - []*notifier.Alert{ - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, - {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, - { - State: notifier.StatePending, ActiveAt: time.Unix(1, 0), - Labels: map[string]string{ - "foo": "bar", - }, - }, - { - State: notifier.StatePending, ActiveAt: time.Unix(5, 0), - Labels: map[string]string{ - "foo": "bar", - }, - }, - }, - map[uint64]*notifier.Alert{ - hash(map[string]string{"alertname": "multi-series"}): { - GroupID: fakeGroup.ID(), - Name: "multi-series", - Labels: map[string]string{"alertname": "multi-series"}, - Annotations: map[string]string{}, - State: notifier.StateFiring, - ActiveAt: time.Unix(1, 0), - Start: time.Unix(5, 0), - Value: 1, - For: 3 * time.Second, - }, - hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): { - GroupID: fakeGroup.ID(), - Name: "multi-series", - Labels: map[string]string{"alertname": "multi-series", "foo": "bar"}, - Annotations: map[string]string{}, - State: notifier.StatePending, - ActiveAt: time.Unix(5, 0), - Value: 1, - For: 3 * time.Second, - }, - }, - }, - { - newTestRuleWithLabels("multi-series-firing", "source", "vm"), - []datasource.Metric{ - {Values: []float64{1, 1}, Timestamps: []int64{1, 100}}, - { - Values: []float64{1, 1}, Timestamps: []int64{1, 5}, - Labels: []datasource.Label{{Name: "foo", Value: "bar"}}, - }, - }, - []*notifier.Alert{ - { - State: notifier.StateFiring, ActiveAt: time.Unix(1, 0), - Labels: map[string]string{ - "source": "vm", - }, - }, - { - State: notifier.StateFiring, ActiveAt: time.Unix(100, 0), - Labels: map[string]string{ - "source": "vm", - }, - }, - // - { - State: notifier.StateFiring, ActiveAt: time.Unix(1, 0), - Labels: map[string]string{ - "foo": "bar", - "source": "vm", - }, - }, - { - State: 
notifier.StateFiring, ActiveAt: time.Unix(5, 0), - Labels: map[string]string{ - "foo": "bar", - "source": "vm", - }, - }, - }, - nil, - }, +func TestAlertingRuleExecRange(t *testing.T) { + fakeGroup := Group{ + Name: "TestRule_ExecRange", } - for _, tc := range testCases { - t.Run(tc.rule.Name, func(t *testing.T) { - fq := &datasource.FakeQuerier{} - tc.rule.q = fq - tc.rule.GroupID = fakeGroup.ID() - fq.Add(tc.data...) - gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0)) - if err != nil { - t.Fatalf("unexpected err: %s", err) - } - var expTS []prompbmarshal.TimeSeries - var j int - for _, series := range tc.data { - for _, timestamp := range series.Timestamps { - a := tc.expAlerts[j] - if a.Labels == nil { - a.Labels = make(map[string]string) - } - a.Labels[alertNameLabel] = tc.rule.Name - expTS = append(expTS, tc.rule.alertToTimeSeries(a, timestamp)...) - j++ + + f := func(rule *AlertingRule, data []datasource.Metric, alertsExpected []*notifier.Alert, holdAlertStateAlertsExpected map[uint64]*notifier.Alert) { + t.Helper() + + fq := &datasource.FakeQuerier{} + rule.q = fq + rule.GroupID = fakeGroup.ID() + fq.Add(data...) + gotTS, err := rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0)) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + var expTS []prompbmarshal.TimeSeries + var j int + for _, series := range data { + for _, timestamp := range series.Timestamps { + a := alertsExpected[j] + if a.Labels == nil { + a.Labels = make(map[string]string) } + a.Labels[alertNameLabel] = rule.Name + expTS = append(expTS, rule.alertToTimeSeries(a, timestamp)...) 
+ j++ } - if len(gotTS) != len(expTS) { - t.Fatalf("expected %d time series; got %d", len(expTS), len(gotTS)) + } + if len(gotTS) != len(expTS) { + t.Fatalf("expected %d time series; got %d", len(expTS), len(gotTS)) + } + for i := range expTS { + got, exp := gotTS[i], expTS[i] + if !reflect.DeepEqual(got, exp) { + t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got) } - for i := range expTS { - got, exp := gotTS[i], expTS[i] - if !reflect.DeepEqual(got, exp) { - t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got) - } + } + if holdAlertStateAlertsExpected != nil { + if !reflect.DeepEqual(holdAlertStateAlertsExpected, rule.alerts) { + t.Fatalf("expected hold alerts state: \n%v but got \n%v", holdAlertStateAlertsExpected, rule.alerts) } - if tc.expHoldAlertStateAlerts != nil { - if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) { - t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts) - } - } - }) + } } + + f(newTestAlertingRule("empty", 0), []datasource.Metric{}, nil, nil) + + f(newTestAlertingRule("empty labels", 0), []datasource.Metric{ + {Values: []float64{1}, Timestamps: []int64{1}}, + }, []*notifier.Alert{ + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + }, nil) + + f(newTestAlertingRule("single-firing", 0), []datasource.Metric{ + metricWithLabels(t, "name", "foo"), + }, []*notifier.Alert{ + { + Labels: map[string]string{"name": "foo"}, + State: notifier.StateFiring, + ActiveAt: time.Unix(1, 0), + }, + }, nil) + + f(newTestAlertingRule("single-firing-on-range", 0), []datasource.Metric{ + {Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}}, + }, []*notifier.Alert{ + {State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)}, + }, nil) + + f(newTestAlertingRule("for-pending", time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1}, 
Timestamps: []int64{1, 3, 5}}, + }, []*notifier.Alert{ + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(3, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, + }, map[uint64]*notifier.Alert{ + hash(map[string]string{"alertname": "for-pending"}): { + GroupID: fakeGroup.ID(), + Name: "for-pending", + Labels: map[string]string{"alertname": "for-pending"}, + Annotations: map[string]string{}, + State: notifier.StatePending, + ActiveAt: time.Unix(5, 0), + Value: 1, + For: time.Second, + }, + }) + + f(newTestAlertingRule("for-firing", 3*time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}}, + }, []*notifier.Alert{ + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + }, map[uint64]*notifier.Alert{ + hash(map[string]string{"alertname": "for-firing"}): { + GroupID: fakeGroup.ID(), + Name: "for-firing", + Labels: map[string]string{"alertname": "for-firing"}, + Annotations: map[string]string{}, + State: notifier.StateFiring, + ActiveAt: time.Unix(1, 0), + Start: time.Unix(5, 0), + Value: 1, + For: 3 * time.Second, + }, + }) + + f(newTestAlertingRule("for-hold-pending", time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}}, + }, []*notifier.Alert{ + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, + }, map[uint64]*notifier.Alert{ + hash(map[string]string{"alertname": "for-hold-pending"}): { + GroupID: fakeGroup.ID(), + Name: "for-hold-pending", + Labels: map[string]string{"alertname": "for-hold-pending"}, + Annotations: map[string]string{}, + State: notifier.StatePending, + ActiveAt: time.Unix(5, 0), + Value: 1, + For: time.Second, + }, + }) + + 
f(newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}}, + }, []*notifier.Alert{ + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + // It is expected for ActiveAT to remain the same while rule continues to fire in each iteration + {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)}, + }, nil) + + f(newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}}, + }, []*notifier.Alert{ + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(20, 0)}, + }, nil) + + f(newTestAlertingRule("multi-series", 3*time.Second), []datasource.Metric{ + {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}}, + { + Values: []float64{1, 1}, Timestamps: []int64{1, 5}, + Labels: []datasource.Label{{Name: "foo", Value: "bar"}}, + }, + }, []*notifier.Alert{ + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)}, + {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)}, + { + State: notifier.StatePending, ActiveAt: time.Unix(1, 0), + Labels: map[string]string{ + "foo": "bar", + }, + }, + { + State: notifier.StatePending, ActiveAt: time.Unix(5, 0), + Labels: map[string]string{ + "foo": "bar", + }, + }, + }, map[uint64]*notifier.Alert{ + hash(map[string]string{"alertname": "multi-series"}): { + GroupID: fakeGroup.ID(), + Name: "multi-series", + Labels: map[string]string{"alertname": "multi-series"}, + 
Annotations: map[string]string{}, + State: notifier.StateFiring, + ActiveAt: time.Unix(1, 0), + Start: time.Unix(5, 0), + Value: 1, + For: 3 * time.Second, + }, + hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): { + GroupID: fakeGroup.ID(), + Name: "multi-series", + Labels: map[string]string{"alertname": "multi-series", "foo": "bar"}, + Annotations: map[string]string{}, + State: notifier.StatePending, + ActiveAt: time.Unix(5, 0), + Value: 1, + For: 3 * time.Second, + }, + }) + + f(newTestRuleWithLabels("multi-series-firing", "source", "vm"), []datasource.Metric{ + {Values: []float64{1, 1}, Timestamps: []int64{1, 100}}, + { + Values: []float64{1, 1}, Timestamps: []int64{1, 5}, + Labels: []datasource.Label{{Name: "foo", Value: "bar"}}, + }, + }, []*notifier.Alert{ + { + State: notifier.StateFiring, ActiveAt: time.Unix(1, 0), + Labels: map[string]string{ + "source": "vm", + }, + }, + { + State: notifier.StateFiring, ActiveAt: time.Unix(100, 0), + Labels: map[string]string{ + "source": "vm", + }, + }, + // + { + State: notifier.StateFiring, ActiveAt: time.Unix(1, 0), + Labels: map[string]string{ + "foo": "bar", + "source": "vm", + }, + }, + { + State: notifier.StateFiring, ActiveAt: time.Unix(5, 0), + Labels: map[string]string{ + "foo": "bar", + "source": "vm", + }, + }, + }, nil) } func TestGroup_Restore(t *testing.T) { @@ -850,206 +758,206 @@ func TestAlertingRule_Exec_Negative(t *testing.T) { } } -func TestAlertingRuleLimit(t *testing.T) { - fq := &datasource.FakeQuerier{} - ar := newTestAlertingRule("test", 0) - ar.Labels = map[string]string{"job": "test"} - ar.q = fq - ar.For = time.Minute - testCases := []struct { - limit int - err string - tssNum int - }{ - { - limit: 0, - tssNum: 4, - }, - { - limit: -1, - tssNum: 4, - }, - { - limit: 1, - err: "exec exceeded limit of 1 with 2 alerts", - tssNum: 0, - }, - { - limit: 4, - tssNum: 4, - }, - } - var ( - err error - timestamp = time.Now() - ) - fq.Add(metricWithValueAndLabels(t, 1, "__name__", 
"foo", "job", "bar")) - fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job")) - for _, testCase := range testCases { - _, err = ar.exec(context.TODO(), timestamp, testCase.limit) - if err != nil && !strings.EqualFold(err.Error(), testCase.err) { - t.Fatal(err) +func TestAlertingRuleLimit_Failure(t *testing.T) { + f := func(limit int, errStrExpected string) { + t.Helper() + + fq := &datasource.FakeQuerier{} + ar := newTestAlertingRule("test", 0) + ar.Labels = map[string]string{"job": "test"} + ar.q = fq + ar.For = time.Minute + + fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) + fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job")) + + timestamp := time.Now() + _, err := ar.exec(context.TODO(), timestamp, limit) + if err == nil { + t.Fatalf("expecting non-nil error") } + errStr := err.Error() + if !strings.Contains(errStr, errStrExpected) { + t.Fatalf("missing %q in error %q", errStrExpected, errStr) + } + fq.Reset() } - fq.Reset() + + f(1, "exec exceeded limit of 1 with 2 alerts") +} + +func TestAlertingRuleLimit_Success(t *testing.T) { + f := func(limit int) { + t.Helper() + + fq := &datasource.FakeQuerier{} + ar := newTestAlertingRule("test", 0) + ar.Labels = map[string]string{"job": "test"} + ar.q = fq + ar.For = time.Minute + + fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) + fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job")) + + timestamp := time.Now() + _, err := ar.exec(context.TODO(), timestamp, limit) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + fq.Reset() + } + + f(0) + f(-1) + f(4) } func TestAlertingRule_Template(t *testing.T) { - testCases := []struct { - rule *AlertingRule - metrics []datasource.Metric - expAlerts map[uint64]*notifier.Alert - }{ - { - &AlertingRule{ - Name: "common", - Labels: map[string]string{ - "region": "east", - }, - Annotations: map[string]string{ - "summary": `{{ $labels.alertname }}: Too high connection 
number for "{{ $labels.instance }}"`, - }, - alerts: make(map[uint64]*notifier.Alert), - }, - []datasource.Metric{ - metricWithValueAndLabels(t, 1, "instance", "foo"), - metricWithValueAndLabels(t, 1, "instance", "bar"), - }, - map[uint64]*notifier.Alert{ - hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): { - Annotations: map[string]string{ - "summary": `common: Too high connection number for "foo"`, - }, - Labels: map[string]string{ - alertNameLabel: "common", - "region": "east", - "instance": "foo", - }, - }, - hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): { - Annotations: map[string]string{ - "summary": `common: Too high connection number for "bar"`, - }, - Labels: map[string]string{ - alertNameLabel: "common", - "region": "east", - "instance": "bar", - }, - }, - }, - }, - { - &AlertingRule{ - Name: "override label", - Labels: map[string]string{ - "instance": "{{ $labels.instance }}", - }, - Annotations: map[string]string{ - "summary": `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}"`, - "description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`, - }, - alerts: make(map[uint64]*notifier.Alert), - }, - []datasource.Metric{ - metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"), - metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"), - }, - map[uint64]*notifier.Alert{ - hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): { - Labels: map[string]string{ - alertNameLabel: "override label", - "exported_alertname": "override", - "instance": "foo", - }, - Annotations: map[string]string{ - "summary": `first: Too high connection number for "foo"`, - "description": `override: It is 2 connections for "foo"`, - }, - }, - hash(map[string]string{alertNameLabel: "override 
label", "exported_alertname": "override", "instance": "bar"}): { - Labels: map[string]string{ - alertNameLabel: "override label", - "exported_alertname": "override", - "instance": "bar", - }, - Annotations: map[string]string{ - "summary": `second: Too high connection number for "bar"`, - "description": `override: It is 10 connections for "bar"`, - }, - }, - }, - }, - { - &AlertingRule{ - Name: "OriginLabels", - GroupName: "Testing", - Labels: map[string]string{ - "instance": "{{ $labels.instance }}", - }, - Annotations: map[string]string{ - "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, - }, - alerts: make(map[uint64]*notifier.Alert), - }, - []datasource.Metric{ - metricWithValueAndLabels(t, 1, - alertNameLabel, "originAlertname", - alertGroupNameLabel, "originGroupname", - "instance", "foo"), - }, - map[uint64]*notifier.Alert{ - hash(map[string]string{ - alertNameLabel: "OriginLabels", - "exported_alertname": "originAlertname", - alertGroupNameLabel: "Testing", - "exported_alertgroup": "originGroupname", - "instance": "foo", - }): { - Labels: map[string]string{ - alertNameLabel: "OriginLabels", - "exported_alertname": "originAlertname", - alertGroupNameLabel: "Testing", - "exported_alertgroup": "originGroupname", - "instance": "foo", - }, - Annotations: map[string]string{ - "summary": `Alert "originAlertname(originGroupname)" for instance foo`, - }, - }, - }, - }, - } - fakeGroup := Group{Name: "TestRule_Exec"} - for _, tc := range testCases { - t.Run(tc.rule.Name, func(t *testing.T) { - fq := &datasource.FakeQuerier{} - tc.rule.GroupID = fakeGroup.ID() - tc.rule.q = fq - tc.rule.state = &ruleState{entries: make([]StateEntry, 10)} - fq.Add(tc.metrics...) 
- if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil { - t.Fatalf("unexpected err: %s", err) + f := func(rule *AlertingRule, metrics []datasource.Metric, alertsExpected map[uint64]*notifier.Alert) { + t.Helper() + + fakeGroup := Group{ + Name: "TestRule_Exec", + } + fq := &datasource.FakeQuerier{} + rule.GroupID = fakeGroup.ID() + rule.q = fq + rule.state = &ruleState{ + entries: make([]StateEntry, 10), + } + fq.Add(metrics...) + + if _, err := rule.exec(context.TODO(), time.Now(), 0); err != nil { + t.Fatalf("unexpected error: %s", err) + } + for hash, expAlert := range alertsExpected { + gotAlert := rule.alerts[hash] + if gotAlert == nil { + t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations) } - for hash, expAlert := range tc.expAlerts { - gotAlert := tc.rule.alerts[hash] - if gotAlert == nil { - t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations) - } - if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) { - t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations) - } - if !reflect.DeepEqual(expAlert.Labels, gotAlert.Labels) { - t.Fatalf("expected to have labels %#v; got %#v", expAlert.Labels, gotAlert.Labels) - } + if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) { + t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations) } - }) + if !reflect.DeepEqual(expAlert.Labels, gotAlert.Labels) { + t.Fatalf("expected to have labels %#v; got %#v", expAlert.Labels, gotAlert.Labels) + } + } } + + f(&AlertingRule{ + Name: "common", + Labels: map[string]string{ + "region": "east", + }, + Annotations: map[string]string{ + "summary": `{{ $labels.alertname }}: Too high connection number for "{{ $labels.instance }}"`, + }, + alerts: make(map[uint64]*notifier.Alert), + }, []datasource.Metric{ + metricWithValueAndLabels(t, 1, "instance", "foo"), 
+ metricWithValueAndLabels(t, 1, "instance", "bar"), + }, map[uint64]*notifier.Alert{ + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): { + Annotations: map[string]string{ + "summary": `common: Too high connection number for "foo"`, + }, + Labels: map[string]string{ + alertNameLabel: "common", + "region": "east", + "instance": "foo", + }, + }, + hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): { + Annotations: map[string]string{ + "summary": `common: Too high connection number for "bar"`, + }, + Labels: map[string]string{ + alertNameLabel: "common", + "region": "east", + "instance": "bar", + }, + }, + }) + + f(&AlertingRule{ + Name: "override label", + Labels: map[string]string{ + "instance": "{{ $labels.instance }}", + }, + Annotations: map[string]string{ + "summary": `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}"`, + "description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`, + }, + alerts: make(map[uint64]*notifier.Alert), + }, []datasource.Metric{ + metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"), + metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"), + }, map[uint64]*notifier.Alert{ + hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): { + Labels: map[string]string{ + alertNameLabel: "override label", + "exported_alertname": "override", + "instance": "foo", + }, + Annotations: map[string]string{ + "summary": `first: Too high connection number for "foo"`, + "description": `override: It is 2 connections for "foo"`, + }, + }, + hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "bar"}): { + Labels: map[string]string{ + alertNameLabel: "override label", + "exported_alertname": "override", + "instance": 
"bar", + }, + Annotations: map[string]string{ + "summary": `second: Too high connection number for "bar"`, + "description": `override: It is 10 connections for "bar"`, + }, + }, + }) + + f(&AlertingRule{ + Name: "OriginLabels", + GroupName: "Testing", + Labels: map[string]string{ + "instance": "{{ $labels.instance }}", + }, + Annotations: map[string]string{ + "summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`, + }, + alerts: make(map[uint64]*notifier.Alert), + }, []datasource.Metric{ + metricWithValueAndLabels(t, 1, + alertNameLabel, "originAlertname", + alertGroupNameLabel, "originGroupname", + "instance", "foo"), + }, map[uint64]*notifier.Alert{ + hash(map[string]string{ + alertNameLabel: "OriginLabels", + "exported_alertname": "originAlertname", + alertGroupNameLabel: "Testing", + "exported_alertgroup": "originGroupname", + "instance": "foo", + }): { + Labels: map[string]string{ + alertNameLabel: "OriginLabels", + "exported_alertname": "originAlertname", + alertGroupNameLabel: "Testing", + "exported_alertgroup": "originGroupname", + "instance": "foo", + }, + Annotations: map[string]string{ + "summary": `Alert "originAlertname(originGroupname)" for instance foo`, + }, + }, + }) } func TestAlertsToSend(t *testing.T) { - ts := time.Now() f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) { t.Helper() + ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)} for i, a := range alerts { ar.alerts[uint64(i)] = a @@ -1076,27 +984,30 @@ func TestAlertsToSend(t *testing.T) { } } - f( // check if firing alerts need to be sent with non-zero resendDelay - []*notifier.Alert{ - {Name: "a", State: notifier.StateFiring, Start: ts}, - // no need to resend firing - {Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)}, - // last message is for resolved, send firing message this time - {Name: "c", State: 
notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)}, - // resend firing - {Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)}, - }, + ts := time.Now() + + // check if firing alerts need to be sent with non-zero resendDelay + f([]*notifier.Alert{ + {Name: "a", State: notifier.StateFiring, Start: ts}, + // no need to resend firing + {Name: "b", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(5 * time.Minute)}, + // last message is for resolved, send firing message this time + {Name: "c", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-30 * time.Second), End: ts.Add(-1 * time.Minute)}, + // resend firing + {Name: "d", State: notifier.StateFiring, Start: ts, LastSent: ts.Add(-1 * time.Minute)}, + }, []*notifier.Alert{{Name: "a"}, {Name: "c"}, {Name: "d"}}, 5*time.Minute, time.Minute, ) - f( // check if resolved alerts need to be sent with non-zero resendDelay - []*notifier.Alert{ - {Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)}, - // no need to resend resolved - {Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts}, - // resend resolved - {Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)}, - }, + + // check if resolved alerts need to be sent with non-zero resendDelay + f([]*notifier.Alert{ + {Name: "a", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts.Add(-30 * time.Second)}, + // no need to resend resolved + {Name: "b", State: notifier.StateInactive, ResolvedAt: ts, LastSent: ts}, + // resend resolved + {Name: "c", State: notifier.StateInactive, ResolvedAt: ts.Add(-1 * time.Minute), LastSent: ts.Add(-1 * time.Minute)}, + }, []*notifier.Alert{{Name: "a"}, {Name: "c"}}, 5*time.Minute, time.Minute, ) @@ -1180,10 +1091,10 @@ func TestAlertingRule_ToLabels(t *testing.T) { } if 
!reflect.DeepEqual(ls.origin, expectedOriginLabels) { - t.Errorf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels) + t.Fatalf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels) } if !reflect.DeepEqual(ls.processed, expectedProcessedLabels) { - t.Errorf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels) + t.Fatalf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels) } } diff --git a/app/vmalert/rule/group_test.go b/app/vmalert/rule/group_test.go index 0ff6adec0..711bb277d 100644 --- a/app/vmalert/rule/group_test.go +++ b/app/vmalert/rule/group_test.go @@ -37,158 +37,142 @@ func TestMain(m *testing.M) { } func TestUpdateWith(t *testing.T) { - testCases := []struct { - name string - currentRules []config.Rule - newRules []config.Rule - }{ - { - "new rule", - nil, - []config.Rule{{Alert: "bar"}}, - }, - { - "update alerting rule", - []config.Rule{ - { - Alert: "foo", - Expr: "up > 0", - For: promutils.NewDuration(time.Second), - Labels: map[string]string{ - "bar": "baz", - }, - Annotations: map[string]string{ - "summary": "{{ $value|humanize }}", - "description": "{{$labels}}", - }, - }, - { - Alert: "bar", - Expr: "up > 0", - For: promutils.NewDuration(time.Second), - Labels: map[string]string{ - "bar": "baz", - }, - }, - }, - []config.Rule{ - { - Alert: "foo", - Expr: "up > 10", - For: promutils.NewDuration(time.Second), - Labels: map[string]string{ - "baz": "bar", - }, - Annotations: map[string]string{ - "summary": "none", - }, - }, - { - Alert: "bar", - Expr: "up > 0", - For: promutils.NewDuration(2 * time.Second), - KeepFiringFor: promutils.NewDuration(time.Minute), - Labels: map[string]string{ - "bar": "baz", - }, - }, - }, - }, - { - "update recording rule", - []config.Rule{{ - Record: "foo", - Expr: "max(up)", - Labels: map[string]string{ - "bar": "baz", - }, - }}, - []config.Rule{{ - Record: "foo", - Expr: "min(up)", - Labels: 
map[string]string{ - "baz": "bar", - }, - }}, - }, - { - "empty rule", - []config.Rule{{Alert: "foo"}, {Record: "bar"}}, - nil, - }, - { - "multiple rules", - []config.Rule{ - {Alert: "bar"}, - {Alert: "baz"}, - {Alert: "foo"}, - }, - []config.Rule{ - {Alert: "baz"}, - {Record: "foo"}, - }, - }, - { - "replace rule", - []config.Rule{{Alert: "foo1"}}, - []config.Rule{{Alert: "foo2"}}, - }, - { - "replace multiple rules", - []config.Rule{ - {Alert: "foo1"}, - {Record: "foo2"}, - {Alert: "foo3"}, - }, - []config.Rule{ - {Alert: "foo3"}, - {Alert: "foo4"}, - {Record: "foo5"}, - }, - }, - } + f := func(currentRules, newRules []config.Rule) { + t.Helper() - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - g := &Group{Name: "test"} - qb := &datasource.FakeQuerier{} - for _, r := range tc.currentRules { - r.ID = config.HashRule(r) - g.Rules = append(g.Rules, g.newRule(qb, r)) - } + g := &Group{ + Name: "test", + } + qb := &datasource.FakeQuerier{} + for _, r := range currentRules { + r.ID = config.HashRule(r) + g.Rules = append(g.Rules, g.newRule(qb, r)) + } - ng := &Group{Name: "test"} - for _, r := range tc.newRules { - r.ID = config.HashRule(r) - ng.Rules = append(ng.Rules, ng.newRule(qb, r)) - } + ng := &Group{ + Name: "test", + } + for _, r := range newRules { + r.ID = config.HashRule(r) + ng.Rules = append(ng.Rules, ng.newRule(qb, r)) + } - err := g.updateWith(ng) - if err != nil { - t.Fatal(err) - } + err := g.updateWith(ng) + if err != nil { + t.Fatalf("cannot update rule: %s", err) + } - if len(g.Rules) != len(tc.newRules) { - t.Fatalf("expected to have %d rules; got: %d", - len(g.Rules), len(tc.newRules)) - } - sort.Slice(g.Rules, func(i, j int) bool { - return g.Rules[i].ID() < g.Rules[j].ID() - }) - sort.Slice(ng.Rules, func(i, j int) bool { - return ng.Rules[i].ID() < ng.Rules[j].ID() - }) - for i, r := range g.Rules { - got, want := r, ng.Rules[i] - if got.ID() != want.ID() { - t.Fatalf("expected to have rule %q; got %q", want, got) - } 
- if err := CompareRules(t, got, want); err != nil { - t.Fatalf("comparison error: %s", err) - } - } + if len(g.Rules) != len(newRules) { + t.Fatalf("expected to have %d rules; got: %d", len(g.Rules), len(newRules)) + } + sort.Slice(g.Rules, func(i, j int) bool { + return g.Rules[i].ID() < g.Rules[j].ID() }) + sort.Slice(ng.Rules, func(i, j int) bool { + return ng.Rules[i].ID() < ng.Rules[j].ID() + }) + for i, r := range g.Rules { + got, want := r, ng.Rules[i] + if got.ID() != want.ID() { + t.Fatalf("expected to have rule %q; got %q", want, got) + } + if err := CompareRules(t, got, want); err != nil { + t.Fatalf("comparison error: %s", err) + } + } } + + // new rule + f(nil, []config.Rule{ + {Alert: "bar"}, + }) + + // update alerting rule + f([]config.Rule{ + { + Alert: "foo", + Expr: "up > 0", + For: promutils.NewDuration(time.Second), + Labels: map[string]string{ + "bar": "baz", + }, + Annotations: map[string]string{ + "summary": "{{ $value|humanize }}", + "description": "{{$labels}}", + }, + }, + { + Alert: "bar", + Expr: "up > 0", + For: promutils.NewDuration(time.Second), + Labels: map[string]string{ + "bar": "baz", + }, + }, + }, []config.Rule{ + { + Alert: "foo", + Expr: "up > 10", + For: promutils.NewDuration(time.Second), + Labels: map[string]string{ + "baz": "bar", + }, + Annotations: map[string]string{ + "summary": "none", + }, + }, + { + Alert: "bar", + Expr: "up > 0", + For: promutils.NewDuration(2 * time.Second), + KeepFiringFor: promutils.NewDuration(time.Minute), + Labels: map[string]string{ + "bar": "baz", + }, + }, + }) + + // update recording rule + f([]config.Rule{{ + Record: "foo", + Expr: "max(up)", + Labels: map[string]string{ + "bar": "baz", + }, + }}, []config.Rule{{ + Record: "foo", + Expr: "min(up)", + Labels: map[string]string{ + "baz": "bar", + }, + }}) + + // empty rule + f([]config.Rule{{Alert: "foo"}, {Record: "bar"}}, nil) + + // multiple rules + f([]config.Rule{ + {Alert: "bar"}, + {Alert: "baz"}, + {Alert: "foo"}, + }, 
[]config.Rule{ + {Alert: "baz"}, + {Record: "foo"}, + }) + + // replace rule + f([]config.Rule{{Alert: "foo1"}}, []config.Rule{{Alert: "foo2"}}) + + // replace multiple rules + f([]config.Rule{ + {Alert: "foo1"}, + {Record: "foo2"}, + {Alert: "foo3"}, + }, []config.Rule{ + {Alert: "foo3"}, + {Alert: "foo4"}, + {Record: "foo5"}, + }) } func TestGroupStart(t *testing.T) { @@ -312,30 +296,23 @@ func TestGroupStart(t *testing.T) { <-finished } -func TestResolveDuration(t *testing.T) { - testCases := []struct { - groupInterval time.Duration - maxDuration time.Duration - resendDelay time.Duration - expected time.Duration - }{ - {time.Minute, 0, 0, 4 * time.Minute}, - {time.Minute, 0, 2 * time.Minute, 8 * time.Minute}, - {time.Minute, 4 * time.Minute, 4 * time.Minute, 4 * time.Minute}, - {2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute}, - {time.Minute, 2 * time.Minute, 1 * time.Minute, 2 * time.Minute}, - {2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute}, - {0, 0, 0, 0}, +func TestGetResolveDuration(t *testing.T) { + f := func(groupInterval, maxDuration, resendDelay, resultExpected time.Duration) { + t.Helper() + + result := getResolveDuration(groupInterval, resendDelay, maxDuration) + if result != resultExpected { + t.Fatalf("unexpected result; got %s; want %s", result, resultExpected) + } } - for _, tc := range testCases { - t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) { - got := getResolveDuration(tc.groupInterval, tc.resendDelay, tc.maxDuration) - if got != tc.expected { - t.Errorf("expected to have %v; got %v", tc.expected, got) - } - }) - } + f(0, 0, 0, 0) + f(time.Minute, 0, 0, 4*time.Minute) + f(time.Minute, 0, 2*time.Minute, 8*time.Minute) + f(time.Minute, 4*time.Minute, 4*time.Minute, 4*time.Minute) + f(2*time.Minute, time.Minute, 2*time.Minute, time.Minute) + f(time.Minute, 2*time.Minute, 1*time.Minute, 2*time.Minute) + f(2*time.Minute, 0, 1*time.Minute, 8*time.Minute) } func 
TestGetStaleSeries(t *testing.T) { @@ -345,6 +322,7 @@ func TestGetStaleSeries(t *testing.T) { } f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) { t.Helper() + var tss []prompbmarshal.TimeSeries for _, l := range labels { tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l)) @@ -606,7 +584,7 @@ func TestGroupStartDelay(t *testing.T) { delay := delayBeforeStart(at, key, g.Interval, g.EvalOffset) gotStart := at.Add(delay) if expTS != gotStart { - t.Errorf("expected to get %v; got %v instead", expTS, gotStart) + t.Fatalf("expected to get %v; got %v instead", expTS, gotStart) } } @@ -647,157 +625,118 @@ func TestGroupStartDelay(t *testing.T) { } func TestGetPrometheusReqTimestamp(t *testing.T) { + f := func(g *Group, tsOrigin, tsExpected string) { + t.Helper() + + originT, _ := time.Parse(time.RFC3339, tsOrigin) + expT, _ := time.Parse(time.RFC3339, tsExpected) + gotTS := g.adjustReqTimestamp(originT) + if !gotTS.Equal(expT) { + t.Fatalf("get wrong prometheus request timestamp: %s; want %s", gotTS, expT) + } + } + offset := 30 * time.Minute evalDelay := 1 * time.Minute disableAlign := false - testCases := []struct { - name string - g *Group - originTS, expTS string - }{ - { - "with query align + default evalDelay", - &Group{ - Interval: time.Hour, - }, - "2023-08-28T11:11:00+00:00", - "2023-08-28T11:00:00+00:00", - }, - { - "without query align + default evalDelay", - &Group{ - Interval: time.Hour, - evalAlignment: &disableAlign, - }, - "2023-08-28T11:11:00+00:00", - "2023-08-28T11:10:30+00:00", - }, - { - "with eval_offset, find previous offset point + default evalDelay", - &Group{ - EvalOffset: &offset, - Interval: time.Hour, - }, - "2023-08-28T11:11:00+00:00", - "2023-08-28T10:30:00+00:00", - }, - { - "with eval_offset + default evalDelay", - &Group{ - EvalOffset: &offset, - Interval: time.Hour, - }, - "2023-08-28T11:41:00+00:00", - "2023-08-28T11:30:00+00:00", - }, - { - "1h interval with eval_delay", - &Group{ - EvalDelay: 
&evalDelay, - Interval: time.Hour, - }, - "2023-08-28T11:41:00+00:00", - "2023-08-28T11:00:00+00:00", - }, - { - "1m interval with eval_delay", - &Group{ - EvalDelay: &evalDelay, - Interval: time.Minute, - }, - "2023-08-28T11:41:13+00:00", - "2023-08-28T11:40:00+00:00", - }, - { - "disable alignment with eval_delay", - &Group{ - EvalDelay: &evalDelay, - Interval: time.Hour, - evalAlignment: &disableAlign, - }, - "2023-08-28T11:41:00+00:00", - "2023-08-28T11:40:00+00:00", - }, - } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - originT, _ := time.Parse(time.RFC3339, tc.originTS) - expT, _ := time.Parse(time.RFC3339, tc.expTS) - gotTS := tc.g.adjustReqTimestamp(originT) - if !gotTS.Equal(expT) { - t.Fatalf("get wrong prometheus request timestamp, expect %s, got %s", expT, gotTS) - } - }) - } + + // with query align + default evalDelay + f(&Group{ + Interval: time.Hour, + }, "2023-08-28T11:11:00+00:00", "2023-08-28T11:00:00+00:00") + + // without query align + default evalDelay + f(&Group{ + Interval: time.Hour, + evalAlignment: &disableAlign, + }, "2023-08-28T11:11:00+00:00", "2023-08-28T11:10:30+00:00") + + // with eval_offset, find previous offset point + default evalDelay + f(&Group{ + EvalOffset: &offset, + Interval: time.Hour, + }, "2023-08-28T11:11:00+00:00", "2023-08-28T10:30:00+00:00") + + // with eval_offset + default evalDelay + f(&Group{ + EvalOffset: &offset, + Interval: time.Hour, + }, "2023-08-28T11:41:00+00:00", "2023-08-28T11:30:00+00:00") + + // 1h interval with eval_delay + f(&Group{ + EvalDelay: &evalDelay, + Interval: time.Hour, + }, "2023-08-28T11:41:00+00:00", "2023-08-28T11:00:00+00:00") + + // 1m interval with eval_delay + f(&Group{ + EvalDelay: &evalDelay, + Interval: time.Minute, + }, "2023-08-28T11:41:13+00:00", "2023-08-28T11:40:00+00:00") + + // disable alignment with eval_delay + f(&Group{ + EvalDelay: &evalDelay, + Interval: time.Hour, + evalAlignment: &disableAlign, + }, "2023-08-28T11:41:00+00:00", 
"2023-08-28T11:40:00+00:00") } func TestRangeIterator(t *testing.T) { - testCases := []struct { - ri rangeIterator - result [][2]time.Time - }{ - { - ri: rangeIterator{ - start: parseTime(t, "2021-01-01T12:00:00.000Z"), - end: parseTime(t, "2021-01-01T12:30:00.000Z"), - step: 5 * time.Minute, - }, - result: [][2]time.Time{ - {parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")}, - {parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")}, - {parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")}, - {parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")}, - {parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")}, - {parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, - }, - }, - { - ri: rangeIterator{ - start: parseTime(t, "2021-01-01T12:00:00.000Z"), - end: parseTime(t, "2021-01-01T12:30:00.000Z"), - step: 45 * time.Minute, - }, - result: [][2]time.Time{ - {parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, - {parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, - }, - }, - { - ri: rangeIterator{ - start: parseTime(t, "2021-01-01T12:00:12.000Z"), - end: parseTime(t, "2021-01-01T12:00:17.000Z"), - step: time.Second, - }, - result: [][2]time.Time{ - {parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")}, - {parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")}, - {parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")}, - {parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")}, - {parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")}, - }, - }, + f := func(ri rangeIterator, resultExpected [][2]time.Time) { + t.Helper() + + var j int + for ri.next() { + if len(resultExpected) < j+1 { 
+ t.Fatalf("unexpected result for iterator on step %d: %v - %v", j, ri.s, ri.e) + } + s, e := ri.s, ri.e + expS, expE := resultExpected[j][0], resultExpected[j][1] + if s != expS { + t.Fatalf("expected to get start=%v; got %v", expS, s) + } + if e != expE { + t.Fatalf("expected to get end=%v; got %v", expE, e) + } + j++ + } } - for i, tc := range testCases { - t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) { - var j int - for tc.ri.next() { - if len(tc.result) < j+1 { - t.Fatalf("unexpected result for iterator on step %d: %v - %v", - j, tc.ri.s, tc.ri.e) - } - s, e := tc.ri.s, tc.ri.e - expS, expE := tc.result[j][0], tc.result[j][1] - if s != expS { - t.Fatalf("expected to get start=%v; got %v", expS, s) - } - if e != expE { - t.Fatalf("expected to get end=%v; got %v", expE, e) - } - j++ - } - }) - } + f(rangeIterator{ + start: parseTime(t, "2021-01-01T12:00:00.000Z"), + end: parseTime(t, "2021-01-01T12:30:00.000Z"), + step: 5 * time.Minute, + }, [][2]time.Time{ + {parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")}, + {parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")}, + {parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")}, + {parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")}, + {parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")}, + {parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, + }) + + f(rangeIterator{ + start: parseTime(t, "2021-01-01T12:00:00.000Z"), + end: parseTime(t, "2021-01-01T12:30:00.000Z"), + step: 45 * time.Minute, + }, [][2]time.Time{ + {parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, + {parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")}, + }) + + f(rangeIterator{ + start: parseTime(t, "2021-01-01T12:00:12.000Z"), + end: parseTime(t, "2021-01-01T12:00:17.000Z"), + step: 
time.Second, + }, [][2]time.Time{ + {parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")}, + {parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")}, + {parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")}, + {parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")}, + {parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")}, + }) } func parseTime(t *testing.T, s string) time.Time { diff --git a/app/vmalert/rule/recording_test.go b/app/vmalert/rule/recording_test.go index 019d50fc0..62b9e7265 100644 --- a/app/vmalert/rule/recording_test.go +++ b/app/vmalert/rule/recording_test.go @@ -13,218 +13,225 @@ import ( ) func TestRecordingRule_Exec(t *testing.T) { + f := func(rule *RecordingRule, metrics []datasource.Metric, tssExpected []prompbmarshal.TimeSeries) { + t.Helper() + + fq := &datasource.FakeQuerier{} + fq.Add(metrics...) + rule.q = fq + rule.state = &ruleState{ + entries: make([]StateEntry, 10), + } + tss, err := rule.exec(context.TODO(), time.Now(), 0) + if err != nil { + t.Fatalf("unexpected RecordingRule.exec error: %s", err) + } + if err := compareTimeSeries(t, tssExpected, tss); err != nil { + t.Fatalf("timeseries missmatch: %s", err) + } + } + timestamp := time.Now() - testCases := []struct { - rule *RecordingRule - metrics []datasource.Metric - expTS []prompbmarshal.TimeSeries - }{ - { - &RecordingRule{Name: "foo"}, - []datasource.Metric{metricWithValueAndLabels(t, 10, - "__name__", "bar", - )}, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{10}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "foo", - }), - }, + + f(&RecordingRule{ + Name: "foo", + }, []datasource.Metric{ + metricWithValueAndLabels(t, 10, "__name__", "bar"), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{10}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "foo", + }), + }) + + 
f(&RecordingRule{ + Name: "foobarbaz", + }, []datasource.Metric{ + metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"), + metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"), + metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + "job": "foo", + }), + newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + "job": "bar", + }), + newTimeSeries([]float64{3}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + "job": "baz", + }), + }) + + f(&RecordingRule{ + Name: "job:foo", + Labels: map[string]string{ + "source": "test", }, - { - &RecordingRule{Name: "foobarbaz"}, - []datasource.Metric{ - metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"), - metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"), - metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"), - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "foobarbaz", - "job": "foo", - }), - newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "foobarbaz", - "job": "bar", - }), - newTimeSeries([]float64{3}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "foobarbaz", - "job": "baz", - }), - }, - }, - { - &RecordingRule{ - Name: "job:foo", - Labels: map[string]string{ - "source": "test", - }, - }, - []datasource.Metric{ - metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), - metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar", "source", "origin"), - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "job:foo", - "job": "foo", - "source": "test", - }), - newTimeSeries([]float64{1}, 
[]int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "job:foo", - "job": "bar", - "source": "test", - "exported_source": "origin", - }), - }, - }, - } - for _, tc := range testCases { - t.Run(tc.rule.Name, func(t *testing.T) { - fq := &datasource.FakeQuerier{} - fq.Add(tc.metrics...) - tc.rule.q = fq - tc.rule.state = &ruleState{entries: make([]StateEntry, 10)} - tss, err := tc.rule.exec(context.TODO(), time.Now(), 0) - if err != nil { - t.Fatalf("unexpected Exec err: %s", err) - } - if err := compareTimeSeries(t, tc.expTS, tss); err != nil { - t.Fatalf("timeseries missmatch: %s", err) - } - }) - } + }, []datasource.Metric{ + metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), + metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar", "source", "origin"), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "job:foo", + "job": "foo", + "source": "test", + }), + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "job:foo", + "job": "bar", + "source": "test", + "exported_source": "origin", + }), + }) } func TestRecordingRule_ExecRange(t *testing.T) { - timestamp := time.Now() - testCases := []struct { - rule *RecordingRule - metrics []datasource.Metric - expTS []prompbmarshal.TimeSeries - }{ - { - &RecordingRule{Name: "foo"}, - []datasource.Metric{metricWithValuesAndLabels(t, []float64{10, 20, 30}, - "__name__", "bar", - )}, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{10, 20, 30}, - []int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()}, - map[string]string{ - "__name__": "foo", - }), - }, - }, - { - &RecordingRule{Name: "foobarbaz"}, - []datasource.Metric{ - metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"), - metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), - metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", 
"baz"), - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "foobarbaz", - "job": "foo", - }), - newTimeSeries([]float64{2, 3}, []int64{timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{ - "__name__": "foobarbaz", - "job": "bar", - }), - newTimeSeries([]float64{4, 5, 6}, - []int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()}, - map[string]string{ - "__name__": "foobarbaz", - "job": "baz", - }), - }, - }, - { - &RecordingRule{Name: "job:foo", Labels: map[string]string{ - "source": "test", - }}, - []datasource.Metric{ - metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), - metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"), - }, - []prompbmarshal.TimeSeries{ - newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "job:foo", - "job": "foo", - "source": "test", - }), - newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": "job:foo", - "job": "bar", - "source": "test", - }), - }, - }, - } - for _, tc := range testCases { - t.Run(tc.rule.Name, func(t *testing.T) { - fq := &datasource.FakeQuerier{} - fq.Add(tc.metrics...) 
- tc.rule.q = fq - tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now()) - if err != nil { - t.Fatalf("unexpected Exec err: %s", err) - } - if err := compareTimeSeries(t, tc.expTS, tss); err != nil { - t.Fatalf("timeseries missmatch: %s", err) - } - }) - } -} + f := func(rule *RecordingRule, metrics []datasource.Metric, tssExpected []prompbmarshal.TimeSeries) { + t.Helper() -func TestRecordingRuleLimit(t *testing.T) { - timestamp := time.Now() - testCases := []struct { - limit int - err string - }{ - { - limit: 0, - }, - { - limit: -1, - }, - { - limit: 1, - err: "exec exceeded limit of 1 with 3 series", - }, - { - limit: 2, - err: "exec exceeded limit of 2 with 3 series", - }, + fq := &datasource.FakeQuerier{} + fq.Add(metrics...) + rule.q = fq + tss, err := rule.execRange(context.TODO(), time.Now(), time.Now()) + if err != nil { + t.Fatalf("unexpected RecordingRule.execRange error: %s", err) + } + if err := compareTimeSeries(t, tssExpected, tss); err != nil { + t.Fatalf("timeseries missmatch: %s", err) + } } - testMetrics := []datasource.Metric{ + + timestamp := time.Now() + + f(&RecordingRule{ + Name: "foo", + }, []datasource.Metric{ + metricWithValuesAndLabels(t, []float64{10, 20, 30}, "__name__", "bar"), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{10, 20, 30}, []int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{ + "__name__": "foo", + }), + }) + + f(&RecordingRule{ + Name: "foobarbaz", + }, []datasource.Metric{ metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"), metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"), - } - rule := &RecordingRule{Name: "job:foo", - state: &ruleState{entries: make([]StateEntry, 10)}, + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + 
"job": "foo", + }), + newTimeSeries([]float64{2, 3}, []int64{timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + "job": "bar", + }), + newTimeSeries([]float64{4, 5, 6}, + []int64{timestamp.UnixNano(), timestamp.UnixNano(), timestamp.UnixNano()}, map[string]string{ + "__name__": "foobarbaz", + "job": "baz", + }), + }) + + f(&RecordingRule{ + Name: "job:foo", Labels: map[string]string{ - "source": "test_limit", + "source": "test", }, - metrics: &recordingRuleMetrics{ - errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`), - }, - } - var err error - for _, testCase := range testCases { - fq := &datasource.FakeQuerier{} - fq.Add(testMetrics...) - rule.q = fq - _, err = rule.exec(context.TODO(), timestamp, testCase.limit) - if err != nil && !strings.EqualFold(err.Error(), testCase.err) { - t.Fatal(err) - } - } + }, []datasource.Metric{ + metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), + metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"), + }, []prompbmarshal.TimeSeries{ + newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "job:foo", + "job": "foo", + "source": "test", + }), + newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ + "__name__": "job:foo", + "job": "bar", + "source": "test", + }), + }) } -func TestRecordingRule_ExecNegative(t *testing.T) { +func TestRecordingRuleLimit_Failure(t *testing.T) { + f := func(limit int, errStrExpected string) { + t.Helper() + + testMetrics := []datasource.Metric{ + metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"), + metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), + metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"), + } + + fq := &datasource.FakeQuerier{} + fq.Add(testMetrics...) 
+ + rule := &RecordingRule{Name: "job:foo", + state: &ruleState{entries: make([]StateEntry, 10)}, + Labels: map[string]string{ + "source": "test_limit", + }, + metrics: &recordingRuleMetrics{ + errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`), + }, + } + rule.q = fq + + _, err := rule.exec(context.TODO(), time.Now(), limit) + if err == nil { + t.Fatalf("expecting non-nil error") + } + errStr := err.Error() + if !strings.Contains(errStr, errStrExpected) { + t.Fatalf("missing %q in the error %q", errStrExpected, errStr) + } + } + + f(1, "exec exceeded limit of 1 with 3 series") + f(2, "exec exceeded limit of 2 with 3 series") +} + +func TestRecordingRuleLimit_Success(t *testing.T) { + f := func(limit int) { + t.Helper() + + testMetrics := []datasource.Metric{ + metricWithValuesAndLabels(t, []float64{1}, "__name__", "foo", "job", "foo"), + metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), + metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"), + } + + fq := &datasource.FakeQuerier{} + fq.Add(testMetrics...) 
+ + rule := &RecordingRule{Name: "job:foo", + state: &ruleState{entries: make([]StateEntry, 10)}, + Labels: map[string]string{ + "source": "test_limit", + }, + metrics: &recordingRuleMetrics{ + errors: utils.GetOrCreateCounter(`vmalert_recording_rules_errors_total{alertname="job:foo"}`), + }, + } + rule.q = fq + + _, err := rule.exec(context.TODO(), time.Now(), limit) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + } + + f(0) + f(-1) +} + +func TestRecordingRuleExec_Negative(t *testing.T) { rr := &RecordingRule{ Name: "job:foo", Labels: map[string]string{ @@ -256,6 +263,6 @@ func TestRecordingRule_ExecNegative(t *testing.T) { _, err = rr.exec(context.TODO(), time.Now(), 0) if err != nil { - t.Fatal(err) + t.Fatalf("cannot execute recroding rule: %s", err) } } diff --git a/app/vmalert/templates/template_test.go b/app/vmalert/templates/template_test.go index 56f7719e6..2293c34a8 100644 --- a/app/vmalert/templates/template_test.go +++ b/app/vmalert/templates/template_test.go @@ -7,10 +7,11 @@ import ( textTpl "text/template" ) -func TestTemplateFuncs(t *testing.T) { - funcs := templateFuncs() +func TestTemplateFuncs_StringConversion(t *testing.T) { f := func(funcName, s, resultExpected string) { t.Helper() + + funcs := templateFuncs() v := funcs[funcName] fLocal := v.(func(s string) string) result := fLocal(s) @@ -18,6 +19,7 @@ func TestTemplateFuncs(t *testing.T) { t.Fatalf("unexpected result for %s(%q); got\n%s\nwant\n%s", funcName, s, result, resultExpected) } } + f("title", "foo bar", "Foo Bar") f("toUpper", "foo", "FOO") f("toLower", "FOO", "foo") @@ -31,7 +33,10 @@ func TestTemplateFuncs(t *testing.T) { f("stripPort", "foo:1234", "foo") f("stripDomain", "foo.bar.baz", "foo") f("stripDomain", "foo.bar:123", "foo:123") +} +func TestTemplateFuncs_Match(t *testing.T) { + funcs := templateFuncs() // check "match" func matchFunc := funcs["match"].(func(pattern, s string) (bool, error)) if _, err := matchFunc("invalid[regexp", "abc"); err == nil { @@ 
-51,9 +56,13 @@ func TestTemplateFuncs(t *testing.T) { if !ok { t.Fatalf("unexpected mismatch") } +} - formatting := func(funcName string, p any, resultExpected string) { +func TestTemplateFuncs_Formatting(t *testing.T) { + f := func(funcName string, p any, resultExpected string) { t.Helper() + + funcs := templateFuncs() v := funcs[funcName] fLocal := v.(func(s any) (string, error)) result, err := fLocal(p) @@ -64,32 +73,33 @@ func TestTemplateFuncs(t *testing.T) { t.Fatalf("unexpected result for %s(%f); got\n%s\nwant\n%s", funcName, p, result, resultExpected) } } - formatting("humanize1024", float64(0), "0") - formatting("humanize1024", math.Inf(0), "+Inf") - formatting("humanize1024", math.NaN(), "NaN") - formatting("humanize1024", float64(127087), "124.1ki") - formatting("humanize1024", float64(130137088), "124.1Mi") - formatting("humanize1024", float64(133260378112), "124.1Gi") - formatting("humanize1024", float64(136458627186688), "124.1Ti") - formatting("humanize1024", float64(139733634239168512), "124.1Pi") - formatting("humanize1024", float64(143087241460908556288), "124.1Ei") - formatting("humanize1024", float64(146521335255970361638912), "124.1Zi") - formatting("humanize1024", float64(150037847302113650318245888), "124.1Yi") - formatting("humanize1024", float64(153638755637364377925883789312), "1.271e+05Yi") - formatting("humanize", float64(127087), "127.1k") - formatting("humanize", float64(136458627186688), "136.5T") + f("humanize1024", float64(0), "0") + f("humanize1024", math.Inf(0), "+Inf") + f("humanize1024", math.NaN(), "NaN") + f("humanize1024", float64(127087), "124.1ki") + f("humanize1024", float64(130137088), "124.1Mi") + f("humanize1024", float64(133260378112), "124.1Gi") + f("humanize1024", float64(136458627186688), "124.1Ti") + f("humanize1024", float64(139733634239168512), "124.1Pi") + f("humanize1024", float64(143087241460908556288), "124.1Ei") + f("humanize1024", float64(146521335255970361638912), "124.1Zi") + f("humanize1024", 
float64(150037847302113650318245888), "124.1Yi") + f("humanize1024", float64(153638755637364377925883789312), "1.271e+05Yi") - formatting("humanizeDuration", 1, "1s") - formatting("humanizeDuration", 0.2, "200ms") - formatting("humanizeDuration", 42000, "11h 40m 0s") - formatting("humanizeDuration", 16790555, "194d 8h 2m 35s") + f("humanize", float64(127087), "127.1k") + f("humanize", float64(136458627186688), "136.5T") - formatting("humanizePercentage", 1, "100%") - formatting("humanizePercentage", 0.8, "80%") - formatting("humanizePercentage", 0.015, "1.5%") + f("humanizeDuration", 1, "1s") + f("humanizeDuration", 0.2, "200ms") + f("humanizeDuration", 42000, "11h 40m 0s") + f("humanizeDuration", 16790555, "194d 8h 2m 35s") - formatting("humanizeTimestamp", 1679055557, "2023-03-17 12:19:17 +0000 UTC") + f("humanizePercentage", 1, "100%") + f("humanizePercentage", 0.8, "80%") + f("humanizePercentage", 0.015, "1.5%") + + f("humanizeTimestamp", 1679055557, "2023-03-17 12:19:17 +0000 UTC") } func mkTemplate(current, replacement any) textTemplate { @@ -138,224 +148,201 @@ func equalTemplates(tmpls ...*textTpl.Template) bool { return true } -func TestTemplates_Load(t *testing.T) { - testCases := []struct { - name string - initialTemplate textTemplate - pathPatterns []string - overwrite bool - expectedTemplate textTemplate - expErr string - }{ - { - "non existing path undefined template override", - mkTemplate(nil, nil), - []string{ - "templates/non-existing/good-*.tpl", - "templates/absent/good-*.tpl", - }, - true, - mkTemplate(``, nil), - "", - }, - { - "non existing path defined template override", - mkTemplate(` - {{- define "test.1" -}} - {{- printf "value" -}} - {{- end -}} - `, nil), - []string{ - "templates/non-existing/good-*.tpl", - "templates/absent/good-*.tpl", - }, - true, - mkTemplate(``, nil), - "", - }, - { - "existing path undefined template override", - mkTemplate(nil, nil), - []string{ - "templates/other/nested/good0-*.tpl", - 
"templates/test/good0-*.tpl", - }, - false, - mkTemplate(` - {{- define "good0-test.tpl" -}}{{- end -}} - {{- define "test.0" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.1" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.2" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.3" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - `, nil), - "", - }, - { - "existing path defined template override", - mkTemplate(` - {{- define "test.1" -}} - {{ printf "Hello %s!" "world" }} - {{- end -}} - `, nil), - []string{ - "templates/other/nested/good0-*.tpl", - "templates/test/good0-*.tpl", - }, - false, - mkTemplate(` - {{- define "good0-test.tpl" -}}{{- end -}} - {{- define "test.0" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.1" -}} - {{ printf "Hello %s!" "world" }} - {{- end -}} - {{- define "test.2" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.3" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - `, ` - {{- define "good0-test.tpl" -}}{{- end -}} - {{- define "test.0" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.1" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.2" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - {{- define "test.3" -}} - {{ printf "Hello %s!" externalURL }} - {{- end -}} - `), - "", - }, - { - "load template with syntax error", - mkTemplate(` - {{- define "test.1" -}} - {{ printf "Hello %s!" "world" }} - {{- end -}} - `, nil), - []string{ - "templates/other/nested/bad0-*.tpl", - "templates/test/good0-*.tpl", - }, - false, - mkTemplate(` - {{- define "test.1" -}} - {{ printf "Hello %s!" 
"world" }} - {{- end -}} - `, nil), - "failed to parse template glob", - }, +func TestTemplatesLoad_Failure(t *testing.T) { + f := func(pathPatterns []string, expectedErrStr string) { + t.Helper() + + err := Load(pathPatterns, false) + if err == nil { + t.Fatalf("expecting non-nil error") + } + + errStr := err.Error() + if !strings.Contains(errStr, expectedErrStr) { + t.Fatalf("the returned error %q doesn't contain %q", errStr, expectedErrStr) + } } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - masterTmpl = tc.initialTemplate - err := Load(tc.pathPatterns, tc.overwrite) - if tc.expErr == "" && err != nil { - t.Error("happened error that wasn't expected: %w", err) - } - if tc.expErr != "" && err == nil { - t.Error("%+w", err) - t.Error("expected error that didn't happened") - } - if err != nil && !strings.Contains(err.Error(), tc.expErr) { - t.Error("%+w", err) - t.Error("expected string doesn't exist in error message") - } - if !equalTemplates(masterTmpl.replacement, tc.expectedTemplate.replacement) { - t.Fatalf("replacement template is not as expected") - } - if !equalTemplates(masterTmpl.current, tc.expectedTemplate.current) { - t.Fatalf("current template is not as expected") - } - }) - } + // load template with syntax error + f([]string{ + "templates/other/nested/bad0-*.tpl", + "templates/test/good0-*.tpl", + }, "failed to parse template glob") } -func TestTemplates_Reload(t *testing.T) { - testCases := []struct { - name string - initialTemplate textTemplate - expectedTemplate textTemplate - }{ - { - "empty current and replacement templates", - mkTemplate(nil, nil), - mkTemplate(nil, nil), - }, - { - "empty current template only", - mkTemplate(` - {{- define "test.1" -}} - {{- printf "value" -}} - {{- end -}} - `, nil), - mkTemplate(` - {{- define "test.1" -}} - {{- printf "value" -}} - {{- end -}} - `, nil), - }, - { - "empty replacement template only", - mkTemplate(nil, ` - {{- define "test.1" -}} - {{- printf "value" -}} - {{- end 
-}} - `), - mkTemplate(` - {{- define "test.1" -}} - {{- printf "value" -}} - {{- end -}} - `, nil), - }, - { - "defined both templates", - mkTemplate(` - {{- define "test.0" -}} - {{- printf "value" -}} - {{- end -}} - {{- define "test.1" -}} - {{- printf "before" -}} - {{- end -}} - `, ` - {{- define "test.1" -}} - {{- printf "after" -}} - {{- end -}} - `), - mkTemplate(` - {{- define "test.1" -}} - {{- printf "after" -}} - {{- end -}} - `, nil), - }, +func TestTemplatesLoad_Success(t *testing.T) { + f := func(initialTmpl textTemplate, pathPatterns []string, overwrite bool, expectedTmpl textTemplate) { + t.Helper() + + masterTmplOrig := masterTmpl + masterTmpl = initialTmpl + defer func() { + masterTmpl = masterTmplOrig + }() + + if err := Load(pathPatterns, overwrite); err != nil { + t.Fatalf("cannot load templates: %s", err) + } + + if !equalTemplates(masterTmpl.replacement, expectedTmpl.replacement) { + t.Fatalf("unexpected replacement template\ngot\n%+v\nwant\n%+v", masterTmpl.replacement, expectedTmpl.replacement) + } + if !equalTemplates(masterTmpl.current, expectedTmpl.current) { + t.Fatalf("unexpected current template\ngot\n%+v\nwant\n%+v", masterTmpl.current, expectedTmpl.current) + } } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - masterTmpl = tc.initialTemplate - Reload() - if !equalTemplates(masterTmpl.replacement, tc.expectedTemplate.replacement) { - t.Fatalf("replacement template is not as expected") - } - if !equalTemplates(masterTmpl.current, tc.expectedTemplate.current) { - t.Fatalf("current template is not as expected") - } - }) + // non existing path undefined template override + initialTmpl := mkTemplate(nil, nil) + pathPatterns := []string{ + "templates/non-existing/good-*.tpl", + "templates/absent/good-*.tpl", } + overwrite := true + expectedTmpl := mkTemplate(``, nil) + f(initialTmpl, pathPatterns, overwrite, expectedTmpl) + + // non existing path defined template override + initialTmpl = mkTemplate(` + {{- define 
"test.1" -}} + {{- printf "value" -}} + {{- end -}} + `, nil) + pathPatterns = []string{ + "templates/non-existing/good-*.tpl", + "templates/absent/good-*.tpl", + } + overwrite = true + expectedTmpl = mkTemplate(``, nil) + f(initialTmpl, pathPatterns, overwrite, expectedTmpl) + + // existing path undefined template override + initialTmpl = mkTemplate(nil, nil) + pathPatterns = []string{ + "templates/other/nested/good0-*.tpl", + "templates/test/good0-*.tpl", + } + overwrite = false + expectedTmpl = mkTemplate(` + {{- define "good0-test.tpl" -}}{{- end -}} + {{- define "test.0" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.1" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.2" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.3" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + `, nil) + f(initialTmpl, pathPatterns, overwrite, expectedTmpl) + + // existing path defined template override + initialTmpl = mkTemplate(` + {{- define "test.1" -}} + {{ printf "Hello %s!" "world" }} + {{- end -}} + `, nil) + pathPatterns = []string{ + "templates/other/nested/good0-*.tpl", + "templates/test/good0-*.tpl", + } + overwrite = false + expectedTmpl = mkTemplate(` + {{- define "good0-test.tpl" -}}{{- end -}} + {{- define "test.0" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.1" -}} + {{ printf "Hello %s!" "world" }} + {{- end -}} + {{- define "test.2" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.3" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + `, ` + {{- define "good0-test.tpl" -}}{{- end -}} + {{- define "test.0" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.1" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.2" -}} + {{ printf "Hello %s!" externalURL }} + {{- end -}} + {{- define "test.3" -}} + {{ printf "Hello %s!" 
externalURL }} + {{- end -}} + `) + f(initialTmpl, pathPatterns, overwrite, expectedTmpl) +} + +func TestTemplatesReload(t *testing.T) { + f := func(initialTmpl, expectedTmpl textTemplate) { + t.Helper() + + masterTmplOrig := masterTmpl + masterTmpl = initialTmpl + defer func() { + masterTmpl = masterTmplOrig + }() + + Reload() + + if !equalTemplates(masterTmpl.replacement, expectedTmpl.replacement) { + t.Fatalf("unexpected replacement template\ngot\n%+v\nwant\n%+v", masterTmpl.replacement, expectedTmpl.replacement) + } + if !equalTemplates(masterTmpl.current, expectedTmpl.current) { + t.Fatalf("unexpected current template\ngot\n%+v\nwant\n%+v", masterTmpl.current, expectedTmpl.current) + } + } + + // empty current and replacement templates + f(mkTemplate(nil, nil), mkTemplate(nil, nil)) + + // empty current template only + f(mkTemplate(` + {{- define "test.1" -}} + {{- printf "value" -}} + {{- end -}} + `, nil), mkTemplate(` + {{- define "test.1" -}} + {{- printf "value" -}} + {{- end -}} + `, nil)) + + // empty replacement template only + f(mkTemplate(nil, ` + {{- define "test.1" -}} + {{- printf "value" -}} + {{- end -}} + `), mkTemplate(` + {{- define "test.1" -}} + {{- printf "value" -}} + {{- end -}} + `, nil)) + + // defined both templates + f(mkTemplate(` + {{- define "test.0" -}} + {{- printf "value" -}} + {{- end -}} + {{- define "test.1" -}} + {{- printf "before" -}} + {{- end -}} + `, ` + {{- define "test.1" -}} + {{- printf "after" -}} + {{- end -}} + `), mkTemplate(` + {{- define "test.1" -}} + {{- printf "after" -}} + {{- end -}} + `, nil)) } diff --git a/app/vmalert/utils/err_group_test.go b/app/vmalert/utils/err_group_test.go index 366b508ab..b1bb051af 100644 --- a/app/vmalert/utils/err_group_test.go +++ b/app/vmalert/utils/err_group_test.go @@ -7,35 +7,31 @@ import ( ) func TestErrGroup(t *testing.T) { - testCases := []struct { - errs []error - exp string - }{ - {nil, ""}, - {[]error{errors.New("timeout")}, "errors(1): timeout"}, - { - 
[]error{errors.New("timeout"), errors.New("deadline")}, - "errors(2): timeout\ndeadline", - }, - } - for _, tc := range testCases { - eg := new(ErrGroup) - for _, err := range tc.errs { + f := func(errs []error, resultExpected string) { + t.Helper() + + eg := &ErrGroup{} + for _, err := range errs { eg.Add(err) } - if len(tc.errs) == 0 { + if len(errs) == 0 { if eg.Err() != nil { t.Fatalf("expected to get nil error") } - continue + return } if eg.Err() == nil { t.Fatalf("expected to get non-nil error") } - if eg.Error() != tc.exp { - t.Fatalf("expected to have: \n%q\ngot:\n%q", tc.exp, eg.Error()) + result := eg.Error() + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%v", result, resultExpected) } } + + f(nil, "") + f([]error{errors.New("timeout")}, "errors(1): timeout") + f([]error{errors.New("timeout"), errors.New("deadline")}, "errors(2): timeout\ndeadline") } // TestErrGroupConcurrent supposed to test concurrent diff --git a/app/vmalert/web.go b/app/vmalert/web.go index 0b392ead8..84dc83bb0 100644 --- a/app/vmalert/web.go +++ b/app/vmalert/web.go @@ -19,7 +19,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" ) -var reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings.") +var reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.*") var ( apiLinks = [][2]string{ @@ -167,7 +167,7 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool { w.Write(data) return true case "/-/reload": - if !httpserver.CheckAuthFlag(w, r, reloadAuthKey.Get(), "reloadAuthKey") { + if !httpserver.CheckAuthFlag(w, r, reloadAuthKey) { return true } logger.Infof("api config reload was called, sending sighup") diff --git a/app/vmalert/web_test.go b/app/vmalert/web_test.go index 4d78c8039..266930431 100644 --- a/app/vmalert/web_test.go +++ b/app/vmalert/web_test.go @@ -43,16 +43,16 @@ func TestHandler(t *testing.T) { t.Fatalf("unexpected err %s", err) } if code != resp.StatusCode { - t.Errorf("unexpected status code %d want %d", resp.StatusCode, code) + t.Fatalf("unexpected status code %d want %d", resp.StatusCode, code) } defer func() { if err := resp.Body.Close(); err != nil { - t.Errorf("err closing body %s", err) + t.Fatalf("err closing body %s", err) } }() if to != nil { if err = json.NewDecoder(resp.Body).Decode(to); err != nil { - t.Errorf("unexpected err %s", err) + t.Fatalf("unexpected err %s", err) } } } @@ -92,13 +92,13 @@ func TestHandler(t *testing.T) { lr := listAlertsResponse{} getResp(t, ts.URL+"/api/v1/alerts", &lr, 200) if length := len(lr.Data.Alerts); length != 1 { - t.Errorf("expected 1 alert got %d", length) + t.Fatalf("expected 1 alert got %d", length) } lr = listAlertsResponse{} getResp(t, ts.URL+"/vmalert/api/v1/alerts", &lr, 200) if length := len(lr.Data.Alerts); length != 1 { - t.Errorf("expected 1 alert got %d", length) + t.Fatalf("expected 1 alert got %d", length) } }) t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) { @@ -106,13 +106,13 @@ func TestHandler(t *testing.T) { alert := &apiAlert{} getResp(t, ts.URL+"/"+expAlert.APILink(), alert, 200) if !reflect.DeepEqual(alert, expAlert) { - t.Errorf("expected %v is equal to %v", alert, expAlert) + t.Fatalf("expected %v is equal to %v", alert, expAlert) } alert = &apiAlert{} getResp(t, 
ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200) if !reflect.DeepEqual(alert, expAlert) { - t.Errorf("expected %v is equal to %v", alert, expAlert) + t.Fatalf("expected %v is equal to %v", alert, expAlert) } }) @@ -135,13 +135,13 @@ func TestHandler(t *testing.T) { lr := listGroupsResponse{} getResp(t, ts.URL+"/api/v1/rules", &lr, 200) if length := len(lr.Data.Groups); length != 1 { - t.Errorf("expected 1 group got %d", length) + t.Fatalf("expected 1 group got %d", length) } lr = listGroupsResponse{} getResp(t, ts.URL+"/vmalert/api/v1/rules", &lr, 200) if length := len(lr.Data.Groups); length != 1 { - t.Errorf("expected 1 group got %d", length) + t.Fatalf("expected 1 group got %d", length) } }) t.Run("/api/v1/rule?ruleID&groupID", func(t *testing.T) { @@ -150,14 +150,14 @@ func TestHandler(t *testing.T) { getResp(t, ts.URL+"/"+expRule.APILink(), &gotRule, 200) if expRule.ID != gotRule.ID { - t.Errorf("expected to get Rule %q; got %q instead", expRule.ID, gotRule.ID) + t.Fatalf("expected to get Rule %q; got %q instead", expRule.ID, gotRule.ID) } gotRule = apiRule{} getResp(t, ts.URL+"/vmalert/"+expRule.APILink(), &gotRule, 200) if expRule.ID != gotRule.ID { - t.Errorf("expected to get Rule %q; got %q instead", expRule.ID, gotRule.ID) + t.Fatalf("expected to get Rule %q; got %q instead", expRule.ID, gotRule.ID) } gotRuleWithUpdates := apiRuleWithUpdates{} @@ -173,7 +173,7 @@ func TestHandler(t *testing.T) { lr := listGroupsResponse{} getResp(t, ts.URL+url, &lr, 200) if length := len(lr.Data.Groups); length != expGroups { - t.Errorf("expected %d groups got %d", expGroups, length) + t.Fatalf("expected %d groups got %d", expGroups, length) } if len(lr.Data.Groups) < 1 { return @@ -183,7 +183,7 @@ func TestHandler(t *testing.T) { rulesN += len(gr.Rules) } if rulesN != expRules { - t.Errorf("expected %d rules got %d", expRules, rulesN) + t.Fatalf("expected %d rules got %d", expRules, rulesN) } } @@ -248,16 +248,16 @@ func TestEmptyResponse(t *testing.T) { 
t.Fatalf("unexpected err %s", err) } if code != resp.StatusCode { - t.Errorf("unexpected status code %d want %d", resp.StatusCode, code) + t.Fatalf("unexpected status code %d want %d", resp.StatusCode, code) } defer func() { if err := resp.Body.Close(); err != nil { - t.Errorf("err closing body %s", err) + t.Fatalf("err closing body %s", err) } }() if to != nil { if err = json.NewDecoder(resp.Body).Decode(to); err != nil { - t.Errorf("unexpected err %s", err) + t.Fatalf("unexpected err %s", err) } } } @@ -266,13 +266,13 @@ func TestEmptyResponse(t *testing.T) { lr := listAlertsResponse{} getResp(t, ts.URL+"/api/v1/alerts", &lr, 200) if lr.Data.Alerts == nil { - t.Errorf("expected /api/v1/alerts response to have non-nil data") + t.Fatalf("expected /api/v1/alerts response to have non-nil data") } lr = listAlertsResponse{} getResp(t, ts.URL+"/vmalert/api/v1/alerts", &lr, 200) if lr.Data.Alerts == nil { - t.Errorf("expected /api/v1/alerts response to have non-nil data") + t.Fatalf("expected /api/v1/alerts response to have non-nil data") } }) @@ -280,13 +280,13 @@ func TestEmptyResponse(t *testing.T) { lr := listGroupsResponse{} getResp(t, ts.URL+"/api/v1/rules", &lr, 200) if lr.Data.Groups == nil { - t.Errorf("expected /api/v1/rules response to have non-nil data") + t.Fatalf("expected /api/v1/rules response to have non-nil data") } lr = listGroupsResponse{} getResp(t, ts.URL+"/vmalert/api/v1/rules", &lr, 200) if lr.Data.Groups == nil { - t.Errorf("expected /api/v1/rules response to have non-nil data") + t.Fatalf("expected /api/v1/rules response to have non-nil data") } }) diff --git a/app/vmalert/web_types_test.go b/app/vmalert/web_types_test.go index 48c747af6..40f5404a3 100644 --- a/app/vmalert/web_types_test.go +++ b/app/vmalert/web_types_test.go @@ -13,11 +13,11 @@ func TestUrlValuesToStrings(t *testing.T) { res := urlValuesToStrings(mapQueryParams) if len(res) != len(expectedRes) { - t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res)) + 
t.Fatalf("Expected length %d, but got %d", len(expectedRes), len(res)) } for ind, val := range expectedRes { if val != res[ind] { - t.Errorf("Expected %v; but got %v", val, res[ind]) + t.Fatalf("Expected %v; but got %v", val, res[ind]) } } } diff --git a/app/vmauth/auth_config.go b/app/vmauth/auth_config.go index 18788c5c6..24aad3613 100644 --- a/app/vmauth/auth_config.go +++ b/app/vmauth/auth_config.go @@ -354,11 +354,12 @@ func (up *URLPrefix) discoverBackendAddrsIfNeeded() { // ips for the given host have been already discovered continue } + var resolvedAddrs []string if strings.HasPrefix(host, "srv+") { // The host has the format 'srv+realhost'. Strip 'srv+' prefix before performing the lookup. - host = strings.TrimPrefix(host, "srv+") - _, addrs, err := netutil.Resolver.LookupSRV(ctx, "", "", host) + srvHost := strings.TrimPrefix(host, "srv+") + _, addrs, err := netutil.Resolver.LookupSRV(ctx, "", "", srvHost) if err != nil { logger.Warnf("cannot discover backend SRV records for %s: %s; use it literally", bu, err) resolvedAddrs = []string{host} @@ -390,7 +391,6 @@ func (up *URLPrefix) discoverBackendAddrsIfNeeded() { var busNew []*backendURL for _, bu := range up.busOriginal { host := bu.Hostname() - host = strings.TrimPrefix(host, "srv+") port := bu.Port() for _, addr := range hostToAddrs[host] { buCopy := *bu @@ -696,22 +696,15 @@ func loadAuthConfig() (bool, error) { } logger.Infof("loaded information about %d users from -auth.config=%q", len(m), *authConfigPath) - prevAc := authConfig.Load() - if prevAc != nil { - metrics.UnregisterSet(prevAc.ms) + acPrev := authConfig.Load() + if acPrev != nil { + metrics.UnregisterSet(acPrev.ms, true) } metrics.RegisterSet(ac.ms) + authConfig.Store(ac) authConfigData.Store(&data) authUsers.Store(&m) - if prevAc != nil { - // explicilty unregister metrics, since all summary type metrics - // are registered at global state of metrics package - // and must be removed from it to release memory. 
- // Metrics must be unregistered only after atomic.Value.Store calls above - // Otherwise it may lead to metric gaps, since UnregisterAllMetrics is slow operation - prevAc.ms.UnregisterAllMetrics() - } return true, nil } @@ -1024,8 +1017,6 @@ func (up *URLPrefix) sanitizeAndInitialize() error { } } up.bus.Store(&bus) - up.nextDiscoveryDeadline.Store(0) - up.n.Store(0) return nil } diff --git a/app/vmauth/main.go b/app/vmauth/main.go index 04f949de9..91af81cd1 100644 --- a/app/vmauth/main.go +++ b/app/vmauth/main.go @@ -37,15 +37,15 @@ var ( "With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing") maxIdleConnsPerBackend = flag.Int("maxIdleConnsPerBackend", 100, "The maximum number of idle connections vmauth can open per each backend host. "+ "See also -maxConcurrentRequests") - idleConnTimeout = flag.Duration("idleConnTimeout", 50*time.Second, `Defines a duration for idle (keep-alive connections) to exist. - Consider setting this value less than "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors.`) + idleConnTimeout = flag.Duration("idleConnTimeout", 50*time.Second, "The timeout for HTTP keep-alive connections to backend services. "+ + "It is recommended setting this value to values smaller than -http.idleConnTimeout set at backend services") responseTimeout = flag.Duration("responseTimeout", 5*time.Minute, "The timeout for receiving a response from backend") maxConcurrentRequests = flag.Int("maxConcurrentRequests", 1000, "The maximum number of concurrent requests vmauth can process. Other requests are rejected with "+ "'429 Too Many Requests' http status code. See also -maxConcurrentPerUserRequests and -maxIdleConnsPerBackend command-line options") maxConcurrentPerUserRequests = flag.Int("maxConcurrentPerUserRequests", 300, "The maximum number of concurrent requests vmauth can process per each configured user. 
"+ "Other requests are rejected with '429 Too Many Requests' http status code. See also -maxConcurrentRequests command-line option and max_concurrent_requests option "+ "in per-user config") - reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings.") + reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.*") logInvalidAuthTokens = flag.Bool("logInvalidAuthTokens", false, "Whether to log requests with invalid auth tokens. "+ `Such requests are always counted at vmauth_http_request_errors_total{reason="invalid_auth_token"} metric, which is exposed at /metrics page`) failTimeout = flag.Duration("failTimeout", 3*time.Second, "Sets a delay period for load balancing to skip a malfunctioning backend") @@ -99,7 +99,7 @@ func main() { func requestHandler(w http.ResponseWriter, r *http.Request) bool { switch r.URL.Path { case "/-/reload": - if !httpserver.CheckAuthFlag(w, r, reloadAuthKey.Get(), "reloadAuthKey") { + if !httpserver.CheckAuthFlag(w, r, reloadAuthKey) { return true } configReloadRequests.Inc() @@ -272,8 +272,7 @@ again: ui.backendErrors.Inc() return true } - // one time retry trivial network errors, such as proxy idle timeout misconfiguration - // or socket close by OS + // one time retry trivial network errors, such as proxy idle timeout misconfiguration or socket close by OS if (netutil.IsTrivialNetworkError(err) || errors.Is(err, io.EOF)) && trivialRetries < 1 { trivialRetries++ goto again @@ -450,7 +449,7 @@ func newRoundTripper(caFileOpt, certFileOpt, keyFileOpt, serverNameOpt string, i if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost { tr.MaxIdleConns = tr.MaxIdleConnsPerHost } - tr.DialContext = netutil.DialMaybeSRV + tr.DialContext = netutil.NewStatDialFunc("vmauth_backend") rt := cfg.NewRoundTripper(tr) return rt, nil 
diff --git a/app/vmauth/target_url_test.go b/app/vmauth/target_url_test.go index 89fb52a7e..3d72653f3 100644 --- a/app/vmauth/target_url_test.go +++ b/app/vmauth/target_url_test.go @@ -86,6 +86,7 @@ func TestCreateTargetURLSuccess(t *testing.T) { f := func(ui *UserInfo, requestURI, expectedTarget, expectedRequestHeaders, expectedResponseHeaders string, expectedRetryStatusCodes []int, expectedLoadBalancingPolicy string, expectedDropSrcPathPrefixParts int) { t.Helper() + if err := ui.initURLs(); err != nil { t.Fatalf("cannot initialize urls inside UserInfo: %s", err) } @@ -122,6 +123,7 @@ func TestCreateTargetURLSuccess(t *testing.T) { t.Fatalf("unexpected dropSrcPathPrefixParts; got %d; want %d", up.dropSrcPathPrefixParts, expectedDropSrcPathPrefixParts) } } + // Simple routing with `url_prefix` f(&UserInfo{ URLPrefix: mustParseURL("http://foo.bar"), @@ -260,7 +262,32 @@ func TestCreateTargetURLSuccess(t *testing.T) { f(ui, `/api/v1/query?query=up{env="prod"}`, `http://vmselect/1/prometheus/api/v1/query?query=up%7Benv%3D%22prod%22%7D`, "", "", nil, "least_loaded", 0) f(ui, `/api/v1/query?query=up{foo="bar",env="dev",pod!=""}`, `http://vmselect/0/prometheus/api/v1/query?query=up%7Bfoo%3D%22bar%22%2Cenv%3D%22dev%22%2Cpod%21%3D%22%22%7D`, "", "", nil, "least_loaded", 0) f(ui, `/api/v1/query?query=up{foo="bar"}`, `http://default-server/api/v1/query?query=up%7Bfoo%3D%22bar%22%7D`, "", "", nil, "least_loaded", 0) +} +func TestUserInfoGetBackendURL_SRV(t *testing.T) { + f := func(ui *UserInfo, requestURI, expectedTarget string) { + t.Helper() + + u, err := url.Parse(requestURI) + if err != nil { + t.Fatalf("cannot parse %q: %s", requestURI, err) + } + u = normalizeURL(u) + up, _ := ui.getURLPrefixAndHeaders(u, nil) + if up == nil { + t.Fatalf("cannot match available backend: %s", err) + } + bu := up.getBackendURL() + target := mergeURLs(bu.url, u, up.dropSrcPathPrefixParts) + bu.put() + + gotTarget := target.String() + if gotTarget != expectedTarget { + 
t.Fatalf("unexpected target\ngot:\n%q\nwant\n%q", gotTarget, expectedTarget) + } + } + + // Discover backendURL with SRV hostnames customResolver := &fakeResolver{ Resolver: &net.Resolver{}, lookupSRVResults: map[string][]*net.SRV{ @@ -283,11 +310,14 @@ func TestCreateTargetURLSuccess(t *testing.T) { }, }, } + origResolver := netutil.Resolver netutil.Resolver = customResolver + defer func() { + netutil.Resolver = origResolver + }() - // Discover backendURL allowed := true - ui = &UserInfo{ + ui := &UserInfo{ URLMaps: []URLMap{ { SrcPaths: getRegexs([]string{"/select/.+"}), @@ -301,12 +331,15 @@ func TestCreateTargetURLSuccess(t *testing.T) { DiscoverBackendIPs: &allowed, URLPrefix: mustParseURL("http://non-exist-dns-addr"), } - f(ui, `/select/0/prometheus/api/v1/query?query=up`, "http://10.6.142.51:8481/select/0/prometheus/api/v1/query?query=up", "", "", nil, "least_loaded", 0) - // url_prefix counter will be reset, still go to 10.6.142.51 - f(ui, `/select/0/prometheus/api/v1/query?query=up`, "http://10.6.142.51:8481/select/0/prometheus/api/v1/query?query=up", "", "", nil, "least_loaded", 0) - f(ui, `/insert/0/prometheus/api/v1/write`, "http://10.6.142.52:8480/insert/0/prometheus/api/v1/write", "", "", nil, "least_loaded", 0) + if err := ui.initURLs(); err != nil { + t.Fatalf("cannot initialize urls inside UserInfo: %s", err) + } + + f(ui, `/select/0/prometheus/api/v1/query?query=up`, "http://10.6.142.51:8481/select/0/prometheus/api/v1/query?query=up") + f(ui, `/select/0/prometheus/api/v1/query?query=up`, "http://10.6.142.50:8481/select/0/prometheus/api/v1/query?query=up") + f(ui, `/insert/0/prometheus/api/v1/write`, "http://10.6.142.52:8480/insert/0/prometheus/api/v1/write") // unsuccessful dns resolve - f(ui, `/test`, "http://non-exist-dns-addr/test", "", "", nil, "least_loaded", 0) + f(ui, `/test`, "http://non-exist-dns-addr/test") } func TestCreateTargetURLFailure(t *testing.T) { diff --git a/app/vmctl/backoff/backoff_test.go b/app/vmctl/backoff/backoff_test.go 
index 3506c4a18..2205b0e34 100644 --- a/app/vmctl/backoff/backoff_test.go +++ b/app/vmctl/backoff/backoff_test.go @@ -7,103 +7,106 @@ import ( "time" ) -func TestRetry_Do(t *testing.T) { - counter := 0 - tests := []struct { - name string - backoffRetries int - backoffFactor float64 - backoffMinDuration time.Duration - retryableFunc retryableFunc - cancelTimeout time.Duration - want uint64 - wantErr bool - }{ - { - name: "return bad request", - retryableFunc: func() error { - return ErrBadRequest - }, - want: 0, - wantErr: true, - }, - { - name: "empty retries values", - retryableFunc: func() error { - time.Sleep(time.Millisecond * 100) - return nil - }, - want: 0, - wantErr: true, - }, - { - name: "only one retry test", - backoffRetries: 5, - backoffFactor: 1.7, - backoffMinDuration: time.Millisecond * 10, - retryableFunc: func() error { - t := time.NewTicker(time.Millisecond * 5) - defer t.Stop() - for range t.C { - counter++ - if counter%2 == 0 { - return fmt.Errorf("got some error") - } - if counter%3 == 0 { - return nil - } - } - return nil - }, - want: 1, - wantErr: false, - }, - { - name: "all retries failed test", - backoffRetries: 5, - backoffFactor: 0.1, - backoffMinDuration: time.Millisecond * 10, - retryableFunc: func() error { - t := time.NewTicker(time.Millisecond * 5) - defer t.Stop() - for range t.C { - return fmt.Errorf("got some error") - } - return nil - }, - want: 5, - wantErr: true, - }, - { - name: "cancel context", - backoffRetries: 5, - backoffFactor: 1.7, - backoffMinDuration: time.Millisecond * 10, - retryableFunc: func() error { - return fmt.Errorf("got some error") - }, - cancelTimeout: time.Millisecond * 40, - want: 3, - wantErr: true, - }, +func TestBackoffRetry_Failure(t *testing.T) { + f := func(backoffFactor float64, backoffRetries int, cancelTimeout time.Duration, retryFunc func() error, resultExpected int) { + t.Helper() + + r := &Backoff{ + retries: backoffRetries, + factor: backoffFactor, + minDuration: time.Millisecond * 10, + } 
+ ctx := context.Background() + if cancelTimeout != 0 { + newCtx, cancelFn := context.WithTimeout(context.Background(), cancelTimeout) + ctx = newCtx + defer cancelFn() + } + + result, err := r.Retry(ctx, retryFunc) + if err == nil { + t.Fatalf("expecting non-nil error") + } + if result != uint64(resultExpected) { + t.Fatalf("unexpected result: got %d; want %d", result, resultExpected) + } } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - r := &Backoff{retries: tt.backoffRetries, factor: tt.backoffFactor, minDuration: tt.backoffMinDuration} - ctx := context.Background() - if tt.cancelTimeout != 0 { - newCtx, cancelFn := context.WithTimeout(context.Background(), tt.cancelTimeout) - ctx = newCtx - defer cancelFn() - } - got, err := r.Retry(ctx, tt.retryableFunc) - if (err != nil) != tt.wantErr { - t.Errorf("Retry() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("Retry() got = %v, want %v", got, tt.want) - } - }) + + // return bad request + retryFunc := func() error { + return ErrBadRequest } + f(0, 0, 0, retryFunc, 0) + + // empty retries values + retryFunc = func() error { + time.Sleep(time.Millisecond * 100) + return nil + } + f(0, 0, 0, retryFunc, 0) + + // all retries failed test + backoffFactor := 0.1 + backoffRetries := 5 + cancelTimeout := time.Second * 0 + retryFunc = func() error { + t := time.NewTicker(time.Millisecond * 5) + defer t.Stop() + for range t.C { + return fmt.Errorf("got some error") + } + return nil + } + resultExpected := 5 + f(backoffFactor, backoffRetries, cancelTimeout, retryFunc, resultExpected) + + // cancel context + backoffFactor = 1.7 + backoffRetries = 5 + cancelTimeout = time.Millisecond * 40 + retryFunc = func() error { + return fmt.Errorf("got some error") + } + resultExpected = 3 + f(backoffFactor, backoffRetries, cancelTimeout, retryFunc, resultExpected) +} + +func TestBackoffRetry_Success(t *testing.T) { + f := func(retryFunc func() error, resultExpected int) { + 
t.Helper() + + r := &Backoff{ + retries: 5, + factor: 1.7, + minDuration: time.Millisecond * 10, + } + ctx := context.Background() + + result, err := r.Retry(ctx, retryFunc) + if err != nil { + t.Fatalf("Retry() error: %s", err) + } + if result != uint64(resultExpected) { + t.Fatalf("unexpected result: got %d; want %d", result, resultExpected) + } + } + + // only one retry test + counter := 0 + retryFunc := func() error { + t := time.NewTicker(time.Millisecond * 5) + defer t.Stop() + for range t.C { + counter++ + if counter%2 == 0 { + return fmt.Errorf("got some error") + } + if counter%3 == 0 { + return nil + } + } + return nil + } + resultExpected := 1 + f(retryFunc, resultExpected) } diff --git a/app/vmctl/influx/influx_test.go b/app/vmctl/influx/influx_test.go index 94a90bac7..d36fdb642 100644 --- a/app/vmctl/influx/influx_test.go +++ b/app/vmctl/influx/influx_test.go @@ -3,125 +3,97 @@ package influx import "testing" func TestFetchQuery(t *testing.T) { - testCases := []struct { - s Series - timeFilter string - expected string - }{ - { - s: Series{ - Measurement: "cpu", - Field: "value", - LabelPairs: []LabelPair{ - { - Name: "foo", - Value: "bar", - }, - }, - }, - expected: `select "value" from "cpu" where "foo"::tag='bar'`, - }, - { - s: Series{ - Measurement: "cpu", - Field: "value", - LabelPairs: []LabelPair{ - { - Name: "foo", - Value: "bar", - }, - { - Name: "baz", - Value: "qux", - }, - }, - }, - expected: `select "value" from "cpu" where "foo"::tag='bar' and "baz"::tag='qux'`, - }, - { - s: Series{ - Measurement: "cpu", - Field: "value", - LabelPairs: []LabelPair{ - { - Name: "foo", - Value: "b'ar", - }, - }, - }, - timeFilter: "time >= now()", - expected: `select "value" from "cpu" where "foo"::tag='b\'ar' and time >= now()`, - }, - { - s: Series{ - Measurement: "cpu", - Field: "value", - LabelPairs: []LabelPair{ - { - Name: "name", - Value: `dev-mapper-centos\x2dswap.swap`, - }, - { - Name: "state", - Value: "dev-mapp'er-c'en'tos", - }, - }, - }, - 
timeFilter: "time >= now()", - expected: `select "value" from "cpu" where "name"::tag='dev-mapper-centos\\x2dswap.swap' and "state"::tag='dev-mapp\'er-c\'en\'tos' and time >= now()`, - }, - { - s: Series{ - Measurement: "cpu", - Field: "value", - }, - timeFilter: "time >= now()", - expected: `select "value" from "cpu" where time >= now()`, - }, - { - s: Series{ - Measurement: "cpu", - Field: "value", - }, - expected: `select "value" from "cpu"`, - }, - } + f := func(s *Series, timeFilter, resultExpected string) { + t.Helper() - for _, tc := range testCases { - query := tc.s.fetchQuery(tc.timeFilter) - if query != tc.expected { - t.Fatalf("got: \n%s;\nexpected: \n%s", query, tc.expected) + result := s.fetchQuery(timeFilter) + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%s\nwant\n%s", result, resultExpected) } } + + f(&Series{ + Measurement: "cpu", + Field: "value", + LabelPairs: []LabelPair{ + { + Name: "foo", + Value: "bar", + }, + }, + }, "", `select "value" from "cpu" where "foo"::tag='bar'`) + + f(&Series{ + Measurement: "cpu", + Field: "value", + LabelPairs: []LabelPair{ + { + Name: "foo", + Value: "bar", + }, + { + Name: "baz", + Value: "qux", + }, + }, + }, "", `select "value" from "cpu" where "foo"::tag='bar' and "baz"::tag='qux'`) + + f(&Series{ + Measurement: "cpu", + Field: "value", + LabelPairs: []LabelPair{ + { + Name: "foo", + Value: "b'ar", + }, + }, + }, "time >= now()", `select "value" from "cpu" where "foo"::tag='b\'ar' and time >= now()`) + + f(&Series{ + Measurement: "cpu", + Field: "value", + LabelPairs: []LabelPair{ + { + Name: "name", + Value: `dev-mapper-centos\x2dswap.swap`, + }, + { + Name: "state", + Value: "dev-mapp'er-c'en'tos", + }, + }, + }, "time >= now()", `select "value" from "cpu" where "name"::tag='dev-mapper-centos\\x2dswap.swap' and "state"::tag='dev-mapp\'er-c\'en\'tos' and time >= now()`) + + f(&Series{ + Measurement: "cpu", + Field: "value", + }, "time >= now()", `select "value" from "cpu" where time >= 
now()`) + + f(&Series{ + Measurement: "cpu", + Field: "value", + }, "", `select "value" from "cpu"`) } func TestTimeFilter(t *testing.T) { - testCases := []struct { - start string - end string - expected string - }{ - { - start: "2020-01-01T20:07:00Z", - end: "2020-01-01T21:07:00Z", - expected: "time >= '2020-01-01T20:07:00Z' and time <= '2020-01-01T21:07:00Z'", - }, - { - expected: "", - }, - { - start: "2020-01-01T20:07:00Z", - expected: "time >= '2020-01-01T20:07:00Z'", - }, - { - end: "2020-01-01T21:07:00Z", - expected: "time <= '2020-01-01T21:07:00Z'", - }, - } - for _, tc := range testCases { - f := timeFilter(tc.start, tc.end) - if f != tc.expected { - t.Fatalf("got: \n%q;\nexpected: \n%q", f, tc.expected) + f := func(start, end, resultExpected string) { + t.Helper() + + result := timeFilter(start, end) + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%s", result, resultExpected) } } + + // no start and end filters + f("", "", "") + + // missing end filter + f("2020-01-01T20:07:00Z", "", "time >= '2020-01-01T20:07:00Z'") + + // missing start filter + f("", "2020-01-01T21:07:00Z", "time <= '2020-01-01T21:07:00Z'") + + // both start and end filters + f("2020-01-01T20:07:00Z", "2020-01-01T21:07:00Z", "time >= '2020-01-01T20:07:00Z' and time <= '2020-01-01T21:07:00Z'") } diff --git a/app/vmctl/influx/parser_test.go b/app/vmctl/influx/parser_test.go index 861f379cf..cc89eeef9 100644 --- a/app/vmctl/influx/parser_test.go +++ b/app/vmctl/influx/parser_test.go @@ -6,71 +6,71 @@ import ( "testing" ) -func TestSeries_Unmarshal(t *testing.T) { +func TestSeriesUnmarshal(t *testing.T) { + f := func(s string, resultExpected *Series) { + t.Helper() + + result := &Series{} + if err := result.unmarshal(s); err != nil { + t.Fatalf("cannot unmarshal series from %q: %s", s, err) + } + if !reflect.DeepEqual(result, resultExpected) { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%v", result, resultExpected) + } + } + tag := func(name, value string) 
LabelPair { return LabelPair{ Name: name, Value: value, } } - series := func(measurement string, lp ...LabelPair) Series { - return Series{ + series := func(measurement string, lp ...LabelPair) *Series { + return &Series{ Measurement: measurement, LabelPairs: lp, } } - testCases := []struct { - got string - want Series - }{ - { - got: "cpu", - want: series("cpu"), - }, - { - got: "cpu,host=localhost", - want: series("cpu", tag("host", "localhost")), - }, - { - got: "cpu,host=localhost,instance=instance", - want: series("cpu", tag("host", "localhost"), tag("instance", "instance")), - }, - { - got: `fo\,bar\=baz,x\=\b=\\a\,\=\q\ `, - want: series("fo,bar=baz", tag(`x=\b`, `\a,=\q `)), - }, - { - got: "cpu,host=192.168.0.1,instance=fe80::fdc8:5e36:c2c6:baac%utun1", - want: series("cpu", tag("host", "192.168.0.1"), tag("instance", "fe80::fdc8:5e36:c2c6:baac%utun1")), - }, - { - got: `cpu,db=db1,host=localhost,server=host\=localhost\ user\=user\ `, - want: series("cpu", tag("db", "db1"), - tag("host", "localhost"), tag("server", "host=localhost user=user ")), - }, - } - for _, tc := range testCases { - s := Series{} - if err := s.unmarshal(tc.got); err != nil { - t.Fatalf("%q: unmarshal err: %s", tc.got, err) - } - if !reflect.DeepEqual(s, tc.want) { - t.Fatalf("%q: expected\n%#v\nto be equal\n%#v", tc.got, s, tc.want) - } - } + + f("cpu", series("cpu")) + + f("cpu,host=localhost", series("cpu", tag("host", "localhost"))) + + f("cpu,host=localhost,instance=instance", series("cpu", tag("host", "localhost"), tag("instance", "instance"))) + + f(`fo\,bar\=baz,x\=\b=\\a\,\=\q\ `, series("fo,bar=baz", tag(`x=\b`, `\a,=\q `))) + + f("cpu,host=192.168.0.1,instance=fe80::fdc8:5e36:c2c6:baac%utun1", series("cpu", tag("host", "192.168.0.1"), tag("instance", "fe80::fdc8:5e36:c2c6:baac%utun1"))) + + f(`cpu,db=db1,host=localhost,server=host\=localhost\ user\=user\ `, series("cpu", tag("db", "db1"), tag("host", "localhost"), tag("server", "host=localhost user=user "))) } -func 
TestToFloat64(t *testing.T) { - f := func(in any, want float64) { +func TestToFloat64_Failure(t *testing.T) { + f := func(in any) { t.Helper() - got, err := toFloat64(in) - if err != nil { - t.Fatalf("unexpected err: %s", err) - } - if got != want { - t.Errorf("got %v; want %v", got, want) + + _, err := toFloat64(in) + if err == nil { + t.Fatalf("expecting non-nil error") } } + + f("text") +} + +func TestToFloat64_Success(t *testing.T) { + f := func(in any, resultExpected float64) { + t.Helper() + + result, err := toFloat64(in) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + if result != resultExpected { + t.Fatalf("unexpected result: got %v; want %v", result, resultExpected) + } + } + f("123.4", 123.4) f(float64(123.4), 123.4) f(float32(12), 12) @@ -78,9 +78,4 @@ func TestToFloat64(t *testing.T) { f(true, 1) f(false, 0) f(json.Number("123456.789"), 123456.789) - - _, err := toFloat64("text") - if err == nil { - t.Fatalf("expected to get err; got nil instead") - } } diff --git a/app/vmctl/prometheus/prometheus_test.go b/app/vmctl/prometheus/prometheus_test.go index 6fd738f4e..43ca81d2a 100644 --- a/app/vmctl/prometheus/prometheus_test.go +++ b/app/vmctl/prometheus/prometheus_test.go @@ -5,30 +5,27 @@ import ( ) func TestInRange(t *testing.T) { - testCases := []struct { - filterMin, filterMax int64 - blockMin, blockMax int64 - expected bool - }{ - {0, 0, 1, 2, true}, - {0, 3, 1, 2, true}, - {0, 3, 4, 5, false}, - {3, 0, 1, 2, false}, - {3, 0, 2, 4, true}, - {3, 10, 1, 2, false}, - {3, 10, 1, 4, true}, - {3, 10, 5, 9, true}, - {3, 10, 9, 12, true}, - {3, 10, 12, 15, false}, - } - for _, tc := range testCases { + f := func(filterMin, filterMax, blockMin, blockMax int64, resultExpected bool) { + t.Helper() + f := filter{ - min: tc.filterMin, - max: tc.filterMax, + min: filterMin, + max: filterMax, } - got := f.inRange(tc.blockMin, tc.blockMax) - if got != tc.expected { - t.Fatalf("got %v; expected %v: %v", got, tc.expected, tc) + result := 
f.inRange(blockMin, blockMax) + if result != resultExpected { + t.Fatalf("unexpected result; got %v; want %v", result, resultExpected) } } + + f(0, 0, 1, 2, true) + f(0, 3, 1, 2, true) + f(0, 3, 4, 5, false) + f(3, 0, 1, 2, false) + f(3, 0, 2, 4, true) + f(3, 10, 1, 2, false) + f(3, 10, 1, 4, true) + f(3, 10, 5, 9, true) + f(3, 10, 9, 12, true) + f(3, 10, 12, 15, false) } diff --git a/app/vmctl/prometheus_test.go b/app/vmctl/prometheus_test.go index 13281d025..ba1a4e508 100644 --- a/app/vmctl/prometheus_test.go +++ b/app/vmctl/prometheus_test.go @@ -23,7 +23,7 @@ const ( ) // This test simulates close process if user abort it -func Test_prometheusProcessor_run(t *testing.T) { +func TestPrometheusProcessorRun(t *testing.T) { t.Skip() defer func() { isSilent = false }() @@ -139,11 +139,11 @@ func Test_prometheusProcessor_run(t *testing.T) { _, err = w.Write(input) if err != nil { - t.Error(err) + t.Fatalf("cannot send 'Y' to importer: %s", err) } err = w.Close() if err != nil { - t.Error(err) + t.Fatalf("cannot close writer: %s", err) } stdin := os.Stdin @@ -151,7 +151,6 @@ func Test_prometheusProcessor_run(t *testing.T) { defer func() { os.Stdin = stdin _ = r.Close() - _ = w.Close() }() os.Stdin = r } @@ -162,7 +161,7 @@ func Test_prometheusProcessor_run(t *testing.T) { } if err := pp.run(); (err != nil) != tt.wantErr { - t.Errorf("run() error = %v, wantErr %v", err, tt.wantErr) + t.Fatalf("run() error = %v, wantErr %v", err, tt.wantErr) } }) } diff --git a/app/vmctl/stepper/split_test.go b/app/vmctl/stepper/split_test.go index a5f10cc8a..3586bdf82 100644 --- a/app/vmctl/stepper/split_test.go +++ b/app/vmctl/stepper/split_test.go @@ -16,535 +16,358 @@ func mustParseDatetime(t string) time.Time { return result } -func Test_splitDateRange(t *testing.T) { - type args struct { - start string - end string - granularity string - } - tests := []struct { - name string - args args - want []testTimeRange - wantErr bool - }{ - { - name: "validates start is before end", - args: 
args{ - start: "2022-02-01T00:00:00Z", - end: "2022-01-01T00:00:00Z", - granularity: StepMonth, - }, - want: nil, - wantErr: true, - }, - { - name: "validates granularity value", - args: args{ - start: "2022-01-01T00:00:00Z", - end: "2022-02-01T00:00:00Z", - granularity: "non-existent-format", - }, - want: nil, - wantErr: true, - }, - { - name: "month chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-03-03T12:12:12Z", - granularity: StepMonth, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-31T23:59:59.999999999Z", - }, - { - "2022-02-01T00:00:00Z", - "2022-02-28T23:59:59.999999999Z", - }, - { - "2022-03-01T00:00:00Z", - "2022-03-03T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "daily chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-05T12:12:12Z", - granularity: StepDay, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-04T11:11:11Z", - }, - { - "2022-01-04T11:11:11Z", - "2022-01-05T11:11:11Z", - }, - { - "2022-01-05T11:11:11Z", - "2022-01-05T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "hourly chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-03T14:14:14Z", - granularity: StepHour, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-03T12:11:11Z", - }, - { - "2022-01-03T12:11:11Z", - "2022-01-03T13:11:11Z", - }, - { - "2022-01-03T13:11:11Z", - "2022-01-03T14:11:11Z", - }, - { - "2022-01-03T14:11:11Z", - "2022-01-03T14:14:14Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with one day time range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-04T12:12:12Z", - granularity: StepMonth, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-04T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with same day time range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-03T12:12:12Z", - granularity: StepMonth, - }, - want: 
[]testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-03T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with one month and two days range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-02-03T00:00:00Z", - granularity: StepMonth, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-31T23:59:59.999999999Z", - }, - { - "2022-02-01T00:00:00Z", - "2022-02-03T00:00:00Z", - }, - }, - wantErr: false, - }, - { - name: "week chunking with not full week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-05T23:59:59.999999999Z", - granularity: StepWeek, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-05T23:59:59.999999999Z", - }, - }, - }, - { - name: "week chunking with start of the week and end of the week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-06T00:00:00Z", - granularity: StepWeek, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - }, - }, - { - name: "week chunking with next one day week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-07T01:12:00Z", - granularity: StepWeek, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - { - "2023-08-06T00:00:00Z", - "2023-08-07T01:12:00Z", - }, - }, - }, - { - name: "week chunking with month and not full week representation", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-09-01T01:12:00Z", - granularity: StepWeek, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - { - "2023-08-06T00:00:00Z", - "2023-08-13T00:00:00Z", - }, - { - "2023-08-13T00:00:00Z", - "2023-08-20T00:00:00Z", - }, - { - "2023-08-20T00:00:00Z", - "2023-08-27T00:00:00Z", - }, - { - "2023-08-27T00:00:00Z", - "2023-09-01T01:12:00Z", - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - start := mustParseDatetime(tt.args.start) - end := 
mustParseDatetime(tt.args.end) +func TestSplitDateRange_Failure(t *testing.T) { + f := func(startStr, endStr, granularity string) { + t.Helper() - got, err := SplitDateRange(start, end, tt.args.granularity, false) - if (err != nil) != tt.wantErr { - t.Errorf("splitDateRange() error = %v, wantErr %v", err, tt.wantErr) - return - } + start := mustParseDatetime(startStr) + end := mustParseDatetime(endStr) - var testExpectedResults [][]time.Time - if tt.want != nil { - testExpectedResults = make([][]time.Time, 0) - for _, dr := range tt.want { - testExpectedResults = append(testExpectedResults, []time.Time{ - mustParseDatetime(dr[0]), - mustParseDatetime(dr[1]), - }) - } - } - - if !reflect.DeepEqual(got, testExpectedResults) { - t.Errorf("splitDateRange() got = %v, want %v", got, testExpectedResults) - } - }) + _, err := SplitDateRange(start, end, granularity, false) + if err == nil { + t.Fatalf("expecting non-nil result") + } } + + // validates start is before end + f("2022-02-01T00:00:00Z", "2022-01-01T00:00:00Z", StepMonth) + + // validates granularity value + f("2022-01-01T00:00:00Z", "2022-02-01T00:00:00Z", "non-existent-format") } -func Test_splitDateRange_reverse(t *testing.T) { - type args struct { - start string - end string - granularity string - timeReverse bool - } - tests := []struct { - name string - args args - want []testTimeRange - wantErr bool - }{ - { - name: "validates start is before end", - args: args{ - start: "2022-02-01T00:00:00Z", - end: "2022-01-01T00:00:00Z", - granularity: StepMonth, - timeReverse: true, - }, - want: nil, - wantErr: true, - }, - { - name: "validates granularity value", - args: args{ - start: "2022-01-01T00:00:00Z", - end: "2022-02-01T00:00:00Z", - granularity: "non-existent-format", - timeReverse: true, - }, - want: nil, - wantErr: true, - }, - { - name: "month chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-03-03T12:12:12Z", - granularity: StepMonth, - timeReverse: true, - }, - want: 
[]testTimeRange{ - { - "2022-03-01T00:00:00Z", - "2022-03-03T12:12:12Z", - }, - { - "2022-02-01T00:00:00Z", - "2022-02-28T23:59:59.999999999Z", - }, - { - "2022-01-03T11:11:11Z", - "2022-01-31T23:59:59.999999999Z", - }, - }, - wantErr: false, - }, - { - name: "daily chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-05T12:12:12Z", - granularity: StepDay, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2022-01-05T11:11:11Z", - "2022-01-05T12:12:12Z", - }, - { - "2022-01-04T11:11:11Z", - "2022-01-05T11:11:11Z", - }, - { - "2022-01-03T11:11:11Z", - "2022-01-04T11:11:11Z", - }, - }, - wantErr: false, - }, - { - name: "hourly chunking", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-03T14:14:14Z", - granularity: StepHour, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2022-01-03T14:11:11Z", - "2022-01-03T14:14:14Z", - }, - { - "2022-01-03T13:11:11Z", - "2022-01-03T14:11:11Z", - }, - { - "2022-01-03T12:11:11Z", - "2022-01-03T13:11:11Z", - }, - { - "2022-01-03T11:11:11Z", - "2022-01-03T12:11:11Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with one day time range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-04T12:12:12Z", - granularity: StepMonth, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-04T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with same day time range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-01-03T12:12:12Z", - granularity: StepMonth, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2022-01-03T11:11:11Z", - "2022-01-03T12:12:12Z", - }, - }, - wantErr: false, - }, - { - name: "month chunking with one month and two days range", - args: args{ - start: "2022-01-03T11:11:11Z", - end: "2022-02-03T00:00:00Z", - granularity: StepMonth, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2022-02-01T00:00:00Z", - "2022-02-03T00:00:00Z", - }, - { - 
"2022-01-03T11:11:11Z", - "2022-01-31T23:59:59.999999999Z", - }, - }, - wantErr: false, - }, - { - name: "week chunking with not full week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-05T23:59:59.999999999Z", - granularity: StepWeek, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-05T23:59:59.999999999Z", - }, - }, - }, - { - name: "week chunking with start of the week and end of the week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-06T00:00:00Z", - granularity: StepWeek, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - }, - }, - { - name: "week chunking with next one day week", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-08-07T01:12:00Z", - granularity: StepWeek, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2023-08-06T00:00:00Z", - "2023-08-07T01:12:00Z", - }, - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - }, - }, - { - name: "week chunking with month and not full week representation", - args: args{ - start: "2023-07-30T00:00:00Z", - end: "2023-09-01T01:12:00Z", - granularity: StepWeek, - timeReverse: true, - }, - want: []testTimeRange{ - { - "2023-08-27T00:00:00Z", - "2023-09-01T01:12:00Z", - }, - { - "2023-08-20T00:00:00Z", - "2023-08-27T00:00:00Z", - }, - { - "2023-08-13T00:00:00Z", - "2023-08-20T00:00:00Z", - }, - { - "2023-08-06T00:00:00Z", - "2023-08-13T00:00:00Z", - }, - { - "2023-07-30T00:00:00Z", - "2023-08-06T00:00:00Z", - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - start := mustParseDatetime(tt.args.start) - end := mustParseDatetime(tt.args.end) +func TestSplitDateRange_Success(t *testing.T) { + f := func(startStr, endStr, granularity string, resultExpected []testTimeRange) { + t.Helper() - got, err := SplitDateRange(start, end, tt.args.granularity, tt.args.timeReverse) - if (err != nil) != tt.wantErr { - 
t.Errorf("splitDateRange() error = %v, wantErr %v", err, tt.wantErr) - return - } + start := mustParseDatetime(startStr) + end := mustParseDatetime(endStr) - var testExpectedResults [][]time.Time - if tt.want != nil { - testExpectedResults = make([][]time.Time, 0) - for _, dr := range tt.want { - testExpectedResults = append(testExpectedResults, []time.Time{ - mustParseDatetime(dr[0]), - mustParseDatetime(dr[1]), - }) - } - } + result, err := SplitDateRange(start, end, granularity, false) + if err != nil { + t.Fatalf("SplitDateRange() error: %s", err) + } - if !reflect.DeepEqual(got, testExpectedResults) { - t.Errorf("splitDateRange() got = %v, want %v", got, testExpectedResults) - } - }) + var testExpectedResults [][]time.Time + for _, dr := range resultExpected { + testExpectedResults = append(testExpectedResults, []time.Time{ + mustParseDatetime(dr[0]), + mustParseDatetime(dr[1]), + }) + } + + if !reflect.DeepEqual(result, testExpectedResults) { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%v", result, testExpectedResults) + } } + + // month chunking + f("2022-01-03T11:11:11Z", "2022-03-03T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-31T23:59:59.999999999Z", + }, + { + "2022-02-01T00:00:00Z", + "2022-02-28T23:59:59.999999999Z", + }, + { + "2022-03-01T00:00:00Z", + "2022-03-03T12:12:12Z", + }, + }) + + // daily chunking + f("2022-01-03T11:11:11Z", "2022-01-05T12:12:12Z", StepDay, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-04T11:11:11Z", + }, + { + "2022-01-04T11:11:11Z", + "2022-01-05T11:11:11Z", + }, + { + "2022-01-05T11:11:11Z", + "2022-01-05T12:12:12Z", + }, + }) + + // hourly chunking + f("2022-01-03T11:11:11Z", "2022-01-03T14:14:14Z", StepHour, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-03T12:11:11Z", + }, + { + "2022-01-03T12:11:11Z", + "2022-01-03T13:11:11Z", + }, + { + "2022-01-03T13:11:11Z", + "2022-01-03T14:11:11Z", + }, + { + "2022-01-03T14:11:11Z", + "2022-01-03T14:14:14Z", + }, + }) + 
+ // month chunking with one day time range + f("2022-01-03T11:11:11Z", "2022-01-04T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-04T12:12:12Z", + }, + }) + + // month chunking with same day time range + f("2022-01-03T11:11:11Z", "2022-01-03T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-03T12:12:12Z", + }, + }) + + // month chunking with one month and two days range + f("2022-01-03T11:11:11Z", "2022-02-03T00:00:00Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-31T23:59:59.999999999Z", + }, + { + "2022-02-01T00:00:00Z", + "2022-02-03T00:00:00Z", + }, + }) + + // week chunking with not full week + f("2023-07-30T00:00:00Z", "2023-08-05T23:59:59.999999999Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-05T23:59:59.999999999Z", + }, + }) + + // week chunking with start of the week and end of the week + f("2023-07-30T00:00:00Z", "2023-08-06T00:00:00Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + }) + + // week chunking with next one day week + f("2023-07-30T00:00:00Z", "2023-08-07T01:12:00Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + { + "2023-08-06T00:00:00Z", + "2023-08-07T01:12:00Z", + }, + }) + + // week chunking with month and not full week representation + f("2023-07-30T00:00:00Z", "2023-09-01T01:12:00Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + { + "2023-08-06T00:00:00Z", + "2023-08-13T00:00:00Z", + }, + { + "2023-08-13T00:00:00Z", + "2023-08-20T00:00:00Z", + }, + { + "2023-08-20T00:00:00Z", + "2023-08-27T00:00:00Z", + }, + { + "2023-08-27T00:00:00Z", + "2023-09-01T01:12:00Z", + }, + }) +} + +func TestSplitDateRange_Reverse_Failure(t *testing.T) { + f := func(startStr, endStr, granularity string) { + t.Helper() + + start := mustParseDatetime(startStr) + end := mustParseDatetime(endStr) + + _, err := 
SplitDateRange(start, end, granularity, true) + if err == nil { + t.Fatalf("expecting non-nil error") + } + } + + // validates start is before end + f("2022-02-01T00:00:00Z", "2022-01-01T00:00:00Z", StepMonth) + + // validates granularity value + f("2022-01-01T00:00:00Z", "2022-02-01T00:00:00Z", "non-existent-format") +} + +func TestSplitDateRange_Reverse_Success(t *testing.T) { + f := func(startStr, endStr, granularity string, resultExpected []testTimeRange) { + t.Helper() + + start := mustParseDatetime(startStr) + end := mustParseDatetime(endStr) + + result, err := SplitDateRange(start, end, granularity, true) + if err != nil { + t.Fatalf("SplitDateRange() error: %s", err) + } + + var testExpectedResults [][]time.Time + for _, dr := range resultExpected { + testExpectedResults = append(testExpectedResults, []time.Time{ + mustParseDatetime(dr[0]), + mustParseDatetime(dr[1]), + }) + } + + if !reflect.DeepEqual(result, testExpectedResults) { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%v", result, testExpectedResults) + } + } + + // month chunking + f("2022-01-03T11:11:11Z", "2022-03-03T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-03-01T00:00:00Z", + "2022-03-03T12:12:12Z", + }, + { + "2022-02-01T00:00:00Z", + "2022-02-28T23:59:59.999999999Z", + }, + { + "2022-01-03T11:11:11Z", + "2022-01-31T23:59:59.999999999Z", + }, + }) + + // daily chunking + f("2022-01-03T11:11:11Z", "2022-01-05T12:12:12Z", StepDay, []testTimeRange{ + { + "2022-01-05T11:11:11Z", + "2022-01-05T12:12:12Z", + }, + { + "2022-01-04T11:11:11Z", + "2022-01-05T11:11:11Z", + }, + { + "2022-01-03T11:11:11Z", + "2022-01-04T11:11:11Z", + }, + }) + + // hourly chunking + f("2022-01-03T11:11:11Z", "2022-01-03T14:14:14Z", StepHour, []testTimeRange{ + { + "2022-01-03T14:11:11Z", + "2022-01-03T14:14:14Z", + }, + { + "2022-01-03T13:11:11Z", + "2022-01-03T14:11:11Z", + }, + { + "2022-01-03T12:11:11Z", + "2022-01-03T13:11:11Z", + }, + { + "2022-01-03T11:11:11Z", + "2022-01-03T12:11:11Z", + }, + }) + + // 
month chunking with one day time range + f("2022-01-03T11:11:11Z", "2022-01-04T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-04T12:12:12Z", + }, + }) + + // month chunking with same day time range + f("2022-01-03T11:11:11Z", "2022-01-03T12:12:12Z", StepMonth, []testTimeRange{ + { + "2022-01-03T11:11:11Z", + "2022-01-03T12:12:12Z", + }, + }) + + // month chunking with one month and two days range + f("2022-01-03T11:11:11Z", "2022-02-03T00:00:00Z", StepMonth, []testTimeRange{ + { + "2022-02-01T00:00:00Z", + "2022-02-03T00:00:00Z", + }, + { + "2022-01-03T11:11:11Z", + "2022-01-31T23:59:59.999999999Z", + }, + }) + + // week chunking with not full week + f("2023-07-30T00:00:00Z", "2023-08-05T23:59:59.999999999Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-05T23:59:59.999999999Z", + }, + }) + + // week chunking with start of the week and end of the week + f("2023-07-30T00:00:00Z", "2023-08-06T00:00:00Z", StepWeek, []testTimeRange{ + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + }) + + // week chunking with next one day week + f("2023-07-30T00:00:00Z", "2023-08-07T01:12:00Z", StepWeek, []testTimeRange{ + { + "2023-08-06T00:00:00Z", + "2023-08-07T01:12:00Z", + }, + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + }) + + // week chunking with month and not full week representation + f("2023-07-30T00:00:00Z", "2023-09-01T01:12:00Z", StepWeek, []testTimeRange{ + { + "2023-08-27T00:00:00Z", + "2023-09-01T01:12:00Z", + }, + { + "2023-08-20T00:00:00Z", + "2023-08-27T00:00:00Z", + }, + { + "2023-08-13T00:00:00Z", + "2023-08-20T00:00:00Z", + }, + { + "2023-08-06T00:00:00Z", + "2023-08-13T00:00:00Z", + }, + { + "2023-07-30T00:00:00Z", + "2023-08-06T00:00:00Z", + }, + }) } diff --git a/app/vmctl/testdata/servers_integration_test/remote_write_server.go b/app/vmctl/testdata/servers_integration_test/remote_write_server.go index ece2bde21..500acc114 100644 --- 
a/app/vmctl/testdata/servers_integration_test/remote_write_server.go +++ b/app/vmctl/testdata/servers_integration_test/remote_write_server.go @@ -255,8 +255,7 @@ func (rws *RemoteWriteServer) importNativeHandler(t *testing.T) http.Handler { if !reflect.DeepEqual(gotTimeSeries, rws.expectedSeries) { w.WriteHeader(http.StatusInternalServerError) - t.Errorf("datasets not equal, expected: %#v;\n got: %#v", rws.expectedSeries, gotTimeSeries) - return + t.Fatalf("datasets not equal, expected: %#v;\n got: %#v", rws.expectedSeries, gotTimeSeries) } w.WriteHeader(http.StatusNoContent) diff --git a/app/vmctl/utils/time_test.go b/app/vmctl/utils/time_test.go index 95f5fbdf4..b469a5899 100644 --- a/app/vmctl/utils/time_test.go +++ b/app/vmctl/utils/time_test.go @@ -5,175 +5,87 @@ import ( "time" ) -func TestGetTime(t *testing.T) { - tests := []struct { - name string - s string - want func() time.Time - wantErr bool - }{ - { - name: "empty string", - s: "", - want: func() time.Time { return time.Time{} }, - wantErr: true, - }, - { - name: "only year", - s: "2019", - want: func() time.Time { - t := time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year and month", - s: "2019-01", - want: func() time.Time { - t := time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year and not first month", - s: "2019-02", - want: func() time.Time { - t := time.Date(2019, 2, 1, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year, month and day", - s: "2019-02-01", - want: func() time.Time { - t := time.Date(2019, 2, 1, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year, month and not first day", - s: "2019-02-10", - want: func() time.Time { - t := time.Date(2019, 2, 10, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year, month, day and time", - s: "2019-02-02T00", - want: func() time.Time { - t := time.Date(2019, 2, 2, 0, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "year, month, day and one hour time", - 
s: "2019-02-02T01", - want: func() time.Time { - t := time.Date(2019, 2, 2, 1, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "time with zero minutes", - s: "2019-02-02T01:00", - want: func() time.Time { - t := time.Date(2019, 2, 2, 1, 0, 0, 0, time.UTC) - return t - }, - }, - { - name: "time with one minute", - s: "2019-02-02T01:01", - want: func() time.Time { - t := time.Date(2019, 2, 2, 1, 1, 0, 0, time.UTC) - return t - }, - }, - { - name: "time with zero seconds", - s: "2019-02-02T01:01:00", - want: func() time.Time { - t := time.Date(2019, 2, 2, 1, 1, 0, 0, time.UTC) - return t - }, - }, - { - name: "timezone with one second", - s: "2019-02-02T01:01:01", - want: func() time.Time { - t := time.Date(2019, 2, 2, 1, 1, 1, 0, time.UTC) - return t - }, - }, - { - name: "time with two second and timezone", - s: "2019-07-07T20:01:02Z", - want: func() time.Time { - t := time.Date(2019, 7, 7, 20, 1, 02, 0, time.UTC) - return t - }, - }, - { - name: "time with seconds and timezone", - s: "2019-07-07T20:47:40+03:00", - want: func() time.Time { - l, _ := time.LoadLocation("Europe/Kiev") - t := time.Date(2019, 7, 7, 20, 47, 40, 0, l) - return t - }, - }, - { - name: "negative time", - s: "-292273086-05-16T16:47:06Z", - want: func() time.Time { return time.Time{} }, - wantErr: true, - }, - { - name: "float timestamp representation", - s: "1562529662.324", - want: func() time.Time { - t := time.Date(2019, 7, 7, 20, 01, 02, 324e6, time.UTC) - return t - }, - }, - { - name: "negative timestamp", - s: "-9223372036.855", - want: func() time.Time { - return time.Date(1970, 01, 01, 00, 00, 00, 00, time.UTC) - }, - wantErr: false, - }, - { - name: "big timestamp", - s: "1223372036855", - want: func() time.Time { - t := time.Date(2008, 10, 7, 9, 33, 56, 855e6, time.UTC) - return t - }, - wantErr: false, - }, - { - name: "duration time", - s: "1h5m", - want: func() time.Time { - t := time.Now().Add(-1 * time.Hour).Add(-5 * time.Minute) - return t - }, - }, - } - for _, tt := 
range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseTime(tt.s) - if (err != nil) != tt.wantErr { - t.Errorf("ParseTime() error = %v, wantErr %v", err, tt.wantErr) - return - } - w := tt.want() - if got.Unix() != w.Unix() { - t.Errorf("ParseTime() got = %v, want %v", got, w) - } - }) +func TestGetTime_Failure(t *testing.T) { + f := func(s string) { + t.Helper() + + _, err := ParseTime(s) + if err == nil { + t.Fatalf("expecting non-nil error") + } } + + // empty string + f("") + + // negative time + f("-292273086-05-16T16:47:06Z") +} + +func TestGetTime_Success(t *testing.T) { + f := func(s string, resultExpected time.Time) { + t.Helper() + + result, err := ParseTime(s) + if err != nil { + t.Fatalf("ParseTime() error: %s", err) + } + if result.Unix() != resultExpected.Unix() { + t.Fatalf("unexpected result; got %s; want %s", result, resultExpected) + } + } + + // only year + f("2019", time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC)) + + // year and month + f("2019-01", time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC)) + + // year and not first month + f("2019-02", time.Date(2019, 2, 1, 0, 0, 0, 0, time.UTC)) + + // year, month and day + f("2019-02-01", time.Date(2019, 2, 1, 0, 0, 0, 0, time.UTC)) + + // year, month and not first day + f("2019-02-10", time.Date(2019, 2, 10, 0, 0, 0, 0, time.UTC)) + + // year, month, day and time + f("2019-02-02T00", time.Date(2019, 2, 2, 0, 0, 0, 0, time.UTC)) + + // year, month, day and one hour time + f("2019-02-02T01", time.Date(2019, 2, 2, 1, 0, 0, 0, time.UTC)) + + // time with zero minutes + f("2019-02-02T01:00", time.Date(2019, 2, 2, 1, 0, 0, 0, time.UTC)) + + // time with one minute + f("2019-02-02T01:01", time.Date(2019, 2, 2, 1, 1, 0, 0, time.UTC)) + + // time with zero seconds + f("2019-02-02T01:01:00", time.Date(2019, 2, 2, 1, 1, 0, 0, time.UTC)) + + // timezone with one second + f("2019-02-02T01:01:01", time.Date(2019, 2, 2, 1, 1, 1, 0, time.UTC)) + + // time with two second and timezone + 
f("2019-07-07T20:01:02Z", time.Date(2019, 7, 7, 20, 1, 02, 0, time.UTC)) + + // time with seconds and timezone + f("2019-07-07T20:47:40+03:00", func() time.Time { + l, _ := time.LoadLocation("Europe/Kiev") + return time.Date(2019, 7, 7, 20, 47, 40, 0, l) + }()) + + // float timestamp representation", + f("1562529662.324", time.Date(2019, 7, 7, 20, 01, 02, 324e6, time.UTC)) + + // negative timestamp + f("-9223372036.855", time.Date(1970, 01, 01, 00, 00, 00, 00, time.UTC)) + + // big timestamp + f("1223372036855", time.Date(2008, 10, 7, 9, 33, 56, 855e6, time.UTC)) + + // duration time + f("1h5m", time.Now().Add(-1*time.Hour).Add(-5*time.Minute)) } diff --git a/app/vmctl/vm/timeseries_test.go b/app/vmctl/vm/timeseries_test.go index f020df96f..e983f76cc 100644 --- a/app/vmctl/vm/timeseries_test.go +++ b/app/vmctl/vm/timeseries_test.go @@ -7,83 +7,68 @@ import ( "testing" ) -func TestTimeSeries_Write(t *testing.T) { - var testCases = []struct { - name string - ts *TimeSeries - exp string - }{ - { - name: "one datapoint", - ts: &TimeSeries{ - Name: "foo", - LabelPairs: []LabelPair{ - { - Name: "key", - Value: "val", - }, - }, - Timestamps: []int64{1577877162200}, - Values: []float64{1}, - }, - exp: `{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200],"values":[1]}`, - }, - { - name: "multiple samples", - ts: &TimeSeries{ - Name: "foo", - LabelPairs: []LabelPair{ - { - Name: "key", - Value: "val", - }, - }, - Timestamps: []int64{1577877162200, 15778771622400, 15778771622600}, - Values: []float64{1, 1.6263, 32.123}, - }, - exp: `{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200,15778771622400,15778771622600],"values":[1,1.6263,32.123]}`, - }, - { - name: "no samples", - ts: &TimeSeries{ - Name: "foo", - LabelPairs: []LabelPair{ - { - Name: "key", - Value: "val", - }, - }, - }, - exp: ``, - }, - { - name: "inf values", - ts: &TimeSeries{ - Name: "foo", - LabelPairs: []LabelPair{ - { - Name: "key", - Value: "val", - }, - }, - Timestamps: 
[]int64{1577877162200, 1577877162200, 1577877162200}, - Values: []float64{0, math.Inf(-1), math.Inf(1)}, - }, - exp: `{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200,1577877162200,1577877162200],"values":[0,-Inf,+Inf]}`, - }, +func TestTimeSeriesWrite(t *testing.T) { + f := func(ts *TimeSeries, resultExpected string) { + t.Helper() + + var b bytes.Buffer + _, err := ts.write(&b) + if err != nil { + t.Fatalf("error in TimeSeries.write: %s", err) + } + result := strings.TrimSpace(b.String()) + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%v\nwant\n%v", result, resultExpected) + } } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - b := &bytes.Buffer{} - _, err := tc.ts.write(b) - if err != nil { - t.Error(err) - } - got := strings.TrimSpace(b.String()) - if got != tc.exp { - t.Fatalf("\ngot: %q\nwant: %q", got, tc.exp) - } - }) - } + // one datapoint + f(&TimeSeries{ + Name: "foo", + LabelPairs: []LabelPair{ + { + Name: "key", + Value: "val", + }, + }, + Timestamps: []int64{1577877162200}, + Values: []float64{1}, + }, `{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200],"values":[1]}`) + + // multiple samples + f(&TimeSeries{ + Name: "foo", + LabelPairs: []LabelPair{ + { + Name: "key", + Value: "val", + }, + }, + Timestamps: []int64{1577877162200, 15778771622400, 15778771622600}, + Values: []float64{1, 1.6263, 32.123}, + }, `{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200,15778771622400,15778771622600],"values":[1,1.6263,32.123]}`) + + // no samples + f(&TimeSeries{ + Name: "foo", + LabelPairs: []LabelPair{ + { + Name: "key", + Value: "val", + }, + }, + }, ``) + + // inf values + f(&TimeSeries{ + Name: "foo", + LabelPairs: []LabelPair{ + { + Name: "key", + Value: "val", + }, + }, + Timestamps: []int64{1577877162200, 1577877162200, 1577877162200}, + Values: []float64{0, math.Inf(-1), math.Inf(1)}, + }, 
`{"metric":{"__name__":"foo","key":"val"},"timestamps":[1577877162200,1577877162200,1577877162200],"values":[0,-Inf,+Inf]}`) } diff --git a/app/vmctl/vm/vm_test.go b/app/vmctl/vm/vm_test.go index 1d9d42523..3e9a1b85b 100644 --- a/app/vmctl/vm/vm_test.go +++ b/app/vmctl/vm/vm_test.go @@ -2,68 +2,42 @@ package vm import "testing" -func TestAddExtraLabelsToImportPath(t *testing.T) { - type args struct { - path string - extraLabels []string - } - tests := []struct { - name string - args args - want string - wantErr bool - }{ - { - name: "ok w/o extra labels", - args: args{ - path: "/api/v1/import", - }, - want: "/api/v1/import", - }, - { - name: "ok one extra label", - args: args{ - path: "/api/v1/import", - extraLabels: []string{"instance=host-1"}, - }, - want: "/api/v1/import?extra_label=instance=host-1", - }, - { - name: "ok two extra labels", - args: args{ - path: "/api/v1/import", - extraLabels: []string{"instance=host-2", "job=vmagent"}, - }, - want: "/api/v1/import?extra_label=instance=host-2&extra_label=job=vmagent", - }, - { - name: "ok two extra with exist param", - args: args{ - path: "/api/v1/import?timeout=50", - extraLabels: []string{"instance=host-2", "job=vmagent"}, - }, - want: "/api/v1/import?timeout=50&extra_label=instance=host-2&extra_label=job=vmagent", - }, - { - name: "bad incorrect format for extra label", - args: args{ - path: "/api/v1/import", - extraLabels: []string{"label=value", "bad_label_wo_value"}, - }, - want: "/api/v1/import", - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := AddExtraLabelsToImportPath(tt.args.path, tt.args.extraLabels) - if (err != nil) != tt.wantErr { - t.Errorf("AddExtraLabelsToImportPath() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("AddExtraLabelsToImportPath() got = %v, want %v", got, tt.want) - } - }) +func TestAddExtraLabelsToImportPath_Failure(t *testing.T) { + f := func(path string, extraLabels []string) { + 
t.Helper() + + _, err := AddExtraLabelsToImportPath(path, extraLabels) + if err == nil { + t.Fatalf("expecting non-nil error") + } } + + // bad incorrect format for extra label + f("/api/v1/import", []string{"label=value", "bad_label_wo_value"}) +} + +func TestAddExtraLabelsToImportPath_Success(t *testing.T) { + f := func(path string, extraLabels []string, resultExpected string) { + t.Helper() + + result, err := AddExtraLabelsToImportPath(path, extraLabels) + if err != nil { + t.Fatalf("AddExtraLabelsToImportPath() error: %s", err) + } + if result != resultExpected { + t.Fatalf("unexpected result; got %q; want %q", result, resultExpected) + } + } + + // ok w/o extra labels + f("/api/v1/import", nil, "/api/v1/import") + + // ok one extra label + f("/api/v1/import", []string{"instance=host-1"}, "/api/v1/import?extra_label=instance=host-1") + + // ok two extra labels + f("/api/v1/import", []string{"instance=host-2", "job=vmagent"}, "/api/v1/import?extra_label=instance=host-2&extra_label=job=vmagent") + + // ok two extra with exist param + f("/api/v1/import?timeout=50", []string{"instance=host-2", "job=vmagent"}, "/api/v1/import?timeout=50&extra_label=instance=host-2&extra_label=job=vmagent") } diff --git a/app/vmctl/vm_native_test.go b/app/vmctl/vm_native_test.go index 1f24dad56..6060d79c4 100644 --- a/app/vmctl/vm_native_test.go +++ b/app/vmctl/vm_native_test.go @@ -13,7 +13,6 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/backoff" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/native" - "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/stepper" remote_read_integration "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/testdata/servers_integration_test" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql" @@ -27,7 +26,82 @@ const ( retentionPeriod = "100y" ) -func Test_vmNativeProcessor_run(t *testing.T) { +func TestVMNativeProcessorRun(t *testing.T) { + f := 
func(startStr, endStr string, numOfSeries, numOfSamples int, resultExpected []vm.TimeSeries) { + t.Helper() + + src := remote_read_integration.NewRemoteWriteServer(t) + dst := remote_read_integration.NewRemoteWriteServer(t) + + defer func() { + src.Close() + dst.Close() + }() + + start, err := time.Parse(time.RFC3339, startStr) + if err != nil { + t.Fatalf("cannot parse start time: %s", err) + } + + end, err := time.Parse(time.RFC3339, endStr) + if err != nil { + t.Fatalf("cannot parse end time: %s", err) + } + + matchName := "__name__" + matchValue := ".*" + filter := native.Filter{ + Match: fmt.Sprintf("{%s=~%q}", matchName, matchValue), + TimeStart: startStr, + TimeEnd: endStr, + } + + rws := remote_read_integration.GenerateVNSeries(start.Unix(), end.Unix(), int64(numOfSeries), int64(numOfSamples)) + + src.Series(rws) + dst.ExpectedSeries(resultExpected) + + if err := fillStorage(rws); err != nil { + t.Fatalf("cannot add series to storage: %s", err) + } + + srcClient := &native.Client{ + AuthCfg: nil, + Addr: src.URL(), + ExtraLabels: []string{}, + HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}}, + } + dstClient := &native.Client{ + AuthCfg: nil, + Addr: dst.URL(), + ExtraLabels: []string{}, + HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}}, + } + + isSilent = true + defer func() { isSilent = false }() + + p := &vmNativeProcessor{ + filter: filter, + dst: dstClient, + src: srcClient, + backoff: backoff.New(), + cc: 1, + isNative: true, + } + + ctx := context.Background() + if err := p.run(ctx); err != nil { + t.Fatalf("run() error: %s", err) + } + deleted, err := deleteSeries(matchName, matchValue) + if err != nil { + t.Fatalf("cannot delete series: %s", err) + } + if deleted != numOfSeries { + t.Fatalf("unexpected number of deleted series; got %d; want %d", deleted, numOfSeries) + } + } processFlags() vmstorage.Init(promql.ResetRollupResultCacheIfNeeded) @@ -42,214 +116,78 @@ func 
Test_vmNativeProcessor_run(t *testing.T) { defer func() { barpool.Disable(false) }() - defer func() { isSilent = false }() - type fields struct { - filter native.Filter - dst *native.Client - src *native.Client - backoff *backoff.Backoff - s *stats - rateLimit int64 - interCluster bool - cc int - matchName string - matchValue string - } - type args struct { - ctx context.Context - silent bool - } - - tests := []struct { - name string - fields fields - args args - vmSeries func(start, end, numOfSeries, numOfSamples int64) []vm.TimeSeries - expectedSeries []vm.TimeSeries - start string - end string - numOfSamples int64 - numOfSeries int64 - chunk string - wantErr bool - }{ + // step minute on minute time range + start := "2022-11-25T11:23:05+02:00" + end := "2022-11-27T11:24:05+02:00" + numOfSeries := 3 + numOfSamples := 2 + resultExpected := []vm.TimeSeries{ { - name: "step minute on minute time range", - start: "2022-11-25T11:23:05+02:00", - end: "2022-11-27T11:24:05+02:00", - numOfSamples: 2, - numOfSeries: 3, - chunk: stepper.StepMinute, - fields: fields{ - filter: native.Filter{}, - backoff: backoff.New(), - rateLimit: 0, - interCluster: false, - cc: 1, - matchName: "__name__", - matchValue: ".*", - }, - args: args{ - ctx: context.Background(), - silent: true, - }, - vmSeries: remote_read_integration.GenerateVNSeries, - expectedSeries: []vm.TimeSeries{ - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, - Timestamps: []int64{1669368185000, 1669454615000}, - Values: []float64{0, 0}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, - Timestamps: []int64{1669368185000, 1669454615000}, - Values: []float64{100, 100}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, - Timestamps: []int64{1669368185000, 1669454615000}, - Values: []float64{200, 200}, - }, - }, - wantErr: false, + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, + 
Timestamps: []int64{1669368185000, 1669454615000}, + Values: []float64{0, 0}, }, { - name: "step month on month time range", - start: "2022-09-26T11:23:05+02:00", - end: "2022-11-26T11:24:05+02:00", - numOfSamples: 2, - numOfSeries: 3, - chunk: stepper.StepMonth, - fields: fields{ - filter: native.Filter{}, - backoff: backoff.New(), - rateLimit: 0, - interCluster: false, - cc: 1, - matchName: "__name__", - matchValue: ".*", - }, - args: args{ - ctx: context.Background(), - silent: true, - }, - vmSeries: remote_read_integration.GenerateVNSeries, - expectedSeries: []vm.TimeSeries{ - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, - Timestamps: []int64{1664184185000}, - Values: []float64{0}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, - Timestamps: []int64{1666819415000}, - Values: []float64{0}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, - Timestamps: []int64{1664184185000}, - Values: []float64{100}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, - Timestamps: []int64{1666819415000}, - Values: []float64{100}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, - Timestamps: []int64{1664184185000}, - Values: []float64{200}, - }, - { - Name: "vm_metric_1", - LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, - Timestamps: []int64{1666819415000}, - Values: []float64{200}, - }, - }, - wantErr: false, + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, + Timestamps: []int64{1669368185000, 1669454615000}, + Values: []float64{100, 100}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, + Timestamps: []int64{1669368185000, 1669454615000}, + Values: []float64{200, 200}, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - src := remote_read_integration.NewRemoteWriteServer(t) - dst := 
remote_read_integration.NewRemoteWriteServer(t) + f(start, end, numOfSeries, numOfSamples, resultExpected) - defer func() { - src.Close() - dst.Close() - }() - - start, err := time.Parse(time.RFC3339, tt.start) - if err != nil { - t.Fatalf("Error parse start time: %s", err) - } - - end, err := time.Parse(time.RFC3339, tt.end) - if err != nil { - t.Fatalf("Error parse end time: %s", err) - } - - tt.fields.filter.Match = fmt.Sprintf("{%s=~%q}", tt.fields.matchName, tt.fields.matchValue) - tt.fields.filter.TimeStart = tt.start - tt.fields.filter.TimeEnd = tt.end - - rws := tt.vmSeries(start.Unix(), end.Unix(), tt.numOfSeries, tt.numOfSamples) - - src.Series(rws) - dst.ExpectedSeries(tt.expectedSeries) - - if err := fillStorage(rws); err != nil { - t.Fatalf("error add series to storage: %s", err) - } - - tt.fields.src = &native.Client{ - AuthCfg: nil, - Addr: src.URL(), - ExtraLabels: []string{}, - HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}}, - } - tt.fields.dst = &native.Client{ - AuthCfg: nil, - Addr: dst.URL(), - ExtraLabels: []string{}, - HTTPClient: &http.Client{Transport: &http.Transport{DisableKeepAlives: false}}, - } - - isSilent = tt.args.silent - p := &vmNativeProcessor{ - filter: tt.fields.filter, - dst: tt.fields.dst, - src: tt.fields.src, - backoff: tt.fields.backoff, - s: tt.fields.s, - rateLimit: tt.fields.rateLimit, - interCluster: tt.fields.interCluster, - cc: tt.fields.cc, - isNative: true, - } - - if err := p.run(tt.args.ctx); (err != nil) != tt.wantErr { - t.Errorf("run() error = %v, wantErr %v", err, tt.wantErr) - } - deleted, err := deleteSeries(tt.fields.matchName, tt.fields.matchValue) - if err != nil { - t.Fatalf("error delete series: %s", err) - } - if int64(deleted) != tt.numOfSeries { - t.Fatalf("expected deleted series %d; got deleted series %d", tt.numOfSeries, deleted) - } - }) + // step month on month time range + start = "2022-09-26T11:23:05+02:00" + end = "2022-11-26T11:24:05+02:00" + numOfSeries = 3 
+ numOfSamples = 2 + resultExpected = []vm.TimeSeries{ + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, + Timestamps: []int64{1664184185000}, + Values: []float64{0}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "0"}}, + Timestamps: []int64{1666819415000}, + Values: []float64{0}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, + Timestamps: []int64{1664184185000}, + Values: []float64{100}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "1"}}, + Timestamps: []int64{1666819415000}, + Values: []float64{100}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, + Timestamps: []int64{1664184185000}, + Values: []float64{200}, + }, + { + Name: "vm_metric_1", + LabelPairs: []vm.LabelPair{{Name: "job", Value: "2"}}, + Timestamps: []int64{1666819415000}, + Values: []float64{200}, + }, } + f(start, end, numOfSeries, numOfSamples, resultExpected) } func processFlags() { @@ -311,95 +249,57 @@ func deleteSeries(name, value string) (int, error) { return vmstorage.DeleteSeries(nil, []*storage.TagFilters{tfs}) } -func Test_buildMatchWithFilter(t *testing.T) { - tests := []struct { - name string - filter string - metricName string - want string - wantErr bool - }{ - { - name: "parsed metric with label", - filter: `{__name__="http_request_count_total",cluster="kube1"}`, - metricName: "http_request_count_total", - want: `{cluster="kube1",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "metric name with label", - filter: `http_request_count_total{cluster="kube1"}`, - metricName: "http_request_count_total", - want: `{cluster="kube1",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "parsed metric with regexp value", - filter: `{__name__="http_request_count_total",cluster=~"kube.*"}`, - metricName: "http_request_count_total", - want: 
`{cluster=~"kube.*",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "only label with regexp", - filter: `{cluster=~".*"}`, - metricName: "http_request_count_total", - want: `{cluster=~".*",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "many labels in filter with regexp", - filter: `{cluster=~".*",job!=""}`, - metricName: "http_request_count_total", - want: `{cluster=~".*",job!="",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "match with error", - filter: `{cluster~=".*"}`, - metricName: "http_request_count_total", - want: ``, - wantErr: true, - }, - { - name: "all names", - filter: `{__name__!=""}`, - metricName: "http_request_count_total", - want: `{__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "with many underscores labels", - filter: `{__name__!="", __meta__!=""}`, - metricName: "http_request_count_total", - want: `{__meta__!="",__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "metric name has regexp", - filter: `{__name__=~".*"}`, - metricName: "http_request_count_total", - want: `{__name__="http_request_count_total"}`, - wantErr: false, - }, - { - name: "metric name has negative regexp", - filter: `{__name__!~".*"}`, - metricName: "http_request_count_total", - want: `{__name__="http_request_count_total"}`, - wantErr: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := buildMatchWithFilter(tt.filter, tt.metricName) - if (err != nil) != tt.wantErr { - t.Errorf("buildMatchWithFilter() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("buildMatchWithFilter() got = %v, want %v", got, tt.want) - } - }) +func TestBuildMatchWithFilter_Failure(t *testing.T) { + f := func(filter, metricName string) { + t.Helper() + + _, err := buildMatchWithFilter(filter, metricName) + if err == nil { + t.Fatalf("expecting non-nil error") + } } + + // match with error + 
f(`{cluster~=".*"}`, "http_request_count_total") +} + +func TestBuildMatchWithFilter_Success(t *testing.T) { + f := func(filter, metricName, resultExpected string) { + t.Helper() + + result, err := buildMatchWithFilter(filter, metricName) + if err != nil { + t.Fatalf("buildMatchWithFilter() error: %s", err) + } + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%s\nwant\n%s", result, resultExpected) + } + } + + // parsed metric with label + f(`{__name__="http_request_count_total",cluster="kube1"}`, "http_request_count_total", `{cluster="kube1",__name__="http_request_count_total"}`) + + // metric name with label + f(`http_request_count_total{cluster="kube1"}`, "http_request_count_total", `{cluster="kube1",__name__="http_request_count_total"}`) + + // parsed metric with regexp value + f(`{__name__="http_request_count_total",cluster=~"kube.*"}`, "http_request_count_total", `{cluster=~"kube.*",__name__="http_request_count_total"}`) + + // only label with regexp + f(`{cluster=~".*"}`, "http_request_count_total", `{cluster=~".*",__name__="http_request_count_total"}`) + + // many labels in filter with regexp + f(`{cluster=~".*",job!=""}`, "http_request_count_total", `{cluster=~".*",job!="",__name__="http_request_count_total"}`) + + // all names + f(`{__name__!=""}`, "http_request_count_total", `{__name__="http_request_count_total"}`) + + // with many underscores labels + f(`{__name__!="", __meta__!=""}`, "http_request_count_total", `{__meta__!="",__name__="http_request_count_total"}`) + + // metric name has regexp + f(`{__name__=~".*"}`, "http_request_count_total", `{__name__="http_request_count_total"}`) + + // metric name has negative regexp + f(`{__name__!~".*"}`, "http_request_count_total", `{__name__="http_request_count_total"}`) } diff --git a/app/vminsert/common/streamaggr.go b/app/vminsert/common/streamaggr.go index 851656d08..3cc649c52 100644 --- a/app/vminsert/common/streamaggr.go +++ b/app/vminsert/common/streamaggr.go @@ -57,14 +57,13 @@ func 
CheckStreamAggrConfig() error { return nil } pushNoop := func(_ []prompbmarshal.TimeSeries) {} - opts := streamaggr.Options{ + opts := &streamaggr.Options{ DedupInterval: *streamAggrDedupInterval, DropInputLabels: *streamAggrDropInputLabels, IgnoreOldSamples: *streamAggrIgnoreOldSamples, IgnoreFirstIntervals: *streamAggrIgnoreFirstIntervals, - Alias: "global", } - sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushNoop, opts) + sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushNoop, opts, "global") if err != nil { return fmt.Errorf("error when loading -streamAggr.config=%q: %w", *streamAggrConfig, err) } @@ -77,25 +76,22 @@ func CheckStreamAggrConfig() error { // MustStopStreamAggr must be called when stream aggr is no longer needed. func InitStreamAggr() { saCfgReloaderStopCh = make(chan struct{}) - rwctx := "global" - if *streamAggrConfig == "" { if *streamAggrDedupInterval > 0 { - deduplicator = streamaggr.NewDeduplicator(pushAggregateSeries, *streamAggrDedupInterval, *streamAggrDropInputLabels, rwctx) + deduplicator = streamaggr.NewDeduplicator(pushAggregateSeries, *streamAggrDedupInterval, *streamAggrDropInputLabels, "global") } return } sighupCh := procutil.NewSighupChan() - opts := streamaggr.Options{ + opts := &streamaggr.Options{ DedupInterval: *streamAggrDedupInterval, DropInputLabels: *streamAggrDropInputLabels, IgnoreOldSamples: *streamAggrIgnoreOldSamples, IgnoreFirstIntervals: *streamAggrIgnoreFirstIntervals, - Alias: rwctx, } - sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, opts) + sas, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, opts, "global") if err != nil { logger.Fatalf("cannot load -streamAggr.config=%q: %s", *streamAggrConfig, err) } @@ -123,14 +119,13 @@ func reloadStreamAggrConfig() { logger.Infof("reloading -streamAggr.config=%q", *streamAggrConfig) saCfgReloads.Inc() - opts := streamaggr.Options{ + opts := &streamaggr.Options{ DedupInterval: *streamAggrDedupInterval, 
DropInputLabels: *streamAggrDropInputLabels, IgnoreOldSamples: *streamAggrIgnoreOldSamples, IgnoreFirstIntervals: *streamAggrIgnoreFirstIntervals, - Alias: "global", } - sasNew, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, opts) + sasNew, err := streamaggr.LoadFromFile(*streamAggrConfig, pushAggregateSeries, opts, "global") if err != nil { saCfgSuccess.Set(0) saCfgReloadErr.Inc() diff --git a/app/vminsert/main.go b/app/vminsert/main.go index 6dac8da31..e5c639654 100644 --- a/app/vminsert/main.go +++ b/app/vminsert/main.go @@ -64,10 +64,10 @@ var ( "See also -opentsdbHTTPListenAddr.useProxyProtocol") opentsdbHTTPUseProxyProtocol = flag.Bool("opentsdbHTTPListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted "+ "at -opentsdbHTTPListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt") - configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings.") + configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.*") reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings.") maxLabelsPerTimeseries = flag.Int("maxLabelsPerTimeseries", 30, "The maximum number of labels accepted per time series. Superfluous labels are dropped. In this case the vm_metrics_with_dropped_labels_total metric at /metrics page is incremented") - maxLabelValueLen = flag.Int("maxLabelValueLen", 1024, "The maximum length of label values in the accepted time series. Longer label values are truncated. 
In this case the vm_too_long_label_values_total metric at /metrics page is incremented") + maxLabelValueLen = flag.Int("maxLabelValueLen", 4*1024, "The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented") ) var ( @@ -327,7 +327,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } return true case "/prometheus/config", "/config": - if !httpserver.CheckAuthFlag(w, r, configAuthKey.Get(), "configAuthKey") { + if !httpserver.CheckAuthFlag(w, r, configAuthKey) { return true } promscrapeConfigRequests.Inc() @@ -336,7 +336,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true case "/prometheus/api/v1/status/config", "/api/v1/status/config": // See https://prometheus.io/docs/prometheus/latest/querying/api/#config - if !httpserver.CheckAuthFlag(w, r, configAuthKey.Get(), "configAuthKey") { + if !httpserver.CheckAuthFlag(w, r, configAuthKey) { return true } promscrapeStatusConfigRequests.Inc() @@ -346,7 +346,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { fmt.Fprintf(w, `{"status":"success","data":{"yaml":%q}}`, bb.B) return true case "/prometheus/-/reload", "/-/reload": - if !httpserver.CheckAuthFlag(w, r, reloadAuthKey.Get(), "reloadAuthKey") { + if !httpserver.CheckAuthFlag(w, r, reloadAuthKey) { return true } promscrapeConfigReloadRequests.Inc() diff --git a/app/vmselect/main.go b/app/vmselect/main.go index 6746ecd41..42d039f7b 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -30,13 +30,13 @@ import ( ) var ( - deleteAuthKey = flagutil.NewPassword("deleteAuthKey", "authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series and /tags/delSeries") + deleteAuthKey = flagutil.NewPassword("deleteAuthKey", "authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series and /tags/delSeries. 
It overrides -httpAuth.*") maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+ "It shouldn't be high, since a single request can saturate all the CPU cores, while many concurrently executed requests may require high amounts of memory. "+ "See also -search.maxQueueDuration and -search.maxMemoryPerQuery") maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests "+ "limit is reached; see also -search.maxQueryDuration") - resetCacheAuthKey = flagutil.NewPassword("search.resetCacheAuthKey", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call") + resetCacheAuthKey = flagutil.NewPassword("search.resetCacheAuthKey", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call. It overrides -httpAuth.*") logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging. "+ "See also -search.logQueryMemoryUsage") vmalertProxyURL = flag.String("vmalert.proxyURL", "", "Optional URL for proxying requests to vmalert. 
For example, if -vmalert.proxyURL=http://vmalert:8880 , then alerting API requests such as /api/v1/rules from Grafana will be proxied to http://vmalert:8880/api/v1/rules") @@ -172,7 +172,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } if path == "/internal/resetRollupResultCache" { - if !httpserver.CheckAuthFlag(w, r, resetCacheAuthKey.Get(), "resetCacheAuthKey") { + if !httpserver.CheckAuthFlag(w, r, resetCacheAuthKey) { return true } promql.ResetRollupResultCache() @@ -369,7 +369,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } return true case "/tags/delSeries": - if !httpserver.CheckAuthFlag(w, r, deleteAuthKey.Get(), "deleteAuthKey") { + if !httpserver.CheckAuthFlag(w, r, deleteAuthKey) { return true } graphiteTagsDelSeriesRequests.Inc() @@ -388,7 +388,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } return true case "/api/v1/admin/tsdb/delete_series": - if !httpserver.CheckAuthFlag(w, r, deleteAuthKey.Get(), "deleteAuthKey") { + if !httpserver.CheckAuthFlag(w, r, deleteAuthKey) { return true } deleteRequests.Inc() diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index c40ab47bd..e1a72bc5c 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -27,9 +27,9 @@ import ( var ( retentionPeriod = flagutil.NewDuration("retentionPeriod", "1", "Data with timestamps outside the retentionPeriod is automatically deleted. The minimum retentionPeriod is 24h or 1d. 
See also -retentionFilter") - snapshotAuthKey = flagutil.NewPassword("snapshotAuthKey", "authKey, which must be passed in query string to /snapshot* pages") - forceMergeAuthKey = flagutil.NewPassword("forceMergeAuthKey", "authKey, which must be passed in query string to /internal/force_merge pages") - forceFlushAuthKey = flagutil.NewPassword("forceFlushAuthKey", "authKey, which must be passed in query string to /internal/force_flush pages") + snapshotAuthKey = flagutil.NewPassword("snapshotAuthKey", "authKey, which must be passed in query string to /snapshot* pages. It overrides -httpAuth.*") + forceMergeAuthKey = flagutil.NewPassword("forceMergeAuthKey", "authKey, which must be passed in query string to /internal/force_merge pages. It overrides -httpAuth.*") + forceFlushAuthKey = flagutil.NewPassword("forceFlushAuthKey", "authKey, which must be passed in query string to /internal/force_flush pages. It overrides -httpAuth.*") snapshotsMaxAge = flagutil.NewDuration("snapshotsMaxAge", "0", "Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. 
Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted") _ = flag.Duration("snapshotCreateTimeout", 0, "Deprecated: this flag does nothing") @@ -240,7 +240,7 @@ func GetSeriesCount(deadline uint64) (uint64, error) { // Stop stops the vmstorage func Stop() { // deregister storage metrics - metrics.UnregisterSet(storageMetrics) + metrics.UnregisterSet(storageMetrics, true) storageMetrics = nil logger.Infof("gracefully closing the storage at %s", *DataPath) @@ -257,7 +257,7 @@ func Stop() { func RequestHandler(w http.ResponseWriter, r *http.Request) bool { path := r.URL.Path if path == "/internal/force_merge" { - if !httpserver.CheckAuthFlag(w, r, forceMergeAuthKey.Get(), "forceMergeAuthKey") { + if !httpserver.CheckAuthFlag(w, r, forceMergeAuthKey) { return true } // Run force merge in background @@ -275,7 +275,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true } if path == "/internal/force_flush" { - if !httpserver.CheckAuthFlag(w, r, forceFlushAuthKey.Get(), "forceFlushAuthKey") { + if !httpserver.CheckAuthFlag(w, r, forceFlushAuthKey) { return true } logger.Infof("flushing storage to make pending data available for reading") @@ -288,10 +288,10 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { prometheusCompatibleResponse = true path = "/snapshot/create" } - if !strings.HasPrefix(path, "/snapshot") { + if !strings.HasPrefix(path, "/snapshot/") { return false } - if !httpserver.CheckAuthFlag(w, r, snapshotAuthKey.Get(), "snapshotAuthKey") { + if !httpserver.CheckAuthFlag(w, r, snapshotAuthKey) { return true } path = path[len("/snapshot"):] diff --git a/dashboards/vm/vmagent.json b/dashboards/vm/vmagent.json index 27ad605a1..9afcc9109 100644 --- a/dashboards/vm/vmagent.json +++ b/dashboards/vm/vmagent.json @@ -5178,7 +5178,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": 
"sum(rate(vm_streamaggr_flushed_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) without (instance, pod) > 0", + "expr": "sum(rate(vm_streamaggr_output_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) without (instance, pod) > 0", "instant": false, "legendFormat": "{{url}} ({{job}}): match={{match}}; output={{output}}", "range": true, @@ -5496,103 +5496,6 @@ "title": "Dedup flush duration 0.99 quantile ($instance)", "type": "timeseries" }, - { - "datasource": { - "type": "victoriametrics-datasource", - "uid": "$ds" - }, - "description": "Shows the eviction rate of time series because of staleness.\n\nThere are two stages where series can be marked as stale.\n1. Input. Aggregator keeps in memory each received unique time series. The time series becomes stale and gets removed if no samples were received during [staleness interval](https://docs.victoriametrics.com/stream-aggregation/#staleness) for this series. \n\n2. Output. The output key is a resulting time series produced by aggregating many input series. 
The time series becomes stale and gets removed if no samples were received during [staleness interval](https://docs.victoriametrics.com/stream-aggregation/#staleness) for any of input series for this aggregation.\n\nIncrease in `input` keys shows that series previously matched by the aggregation rule now became stale.\n\nIncrease in `output` keys shows that series previously produced by the aggregation rule now became stale.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 31 - }, - "id": 144, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "victoriametrics-datasource", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "increase(vm_streamaggr_stale_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval]) > 0", - "instant": false, - "legendFormat": "{{url}} ({{job}}): match={{match}}; key={{key}}", - "range": true, - "refId": "A" - } - ], - "title": "Staleness rate 
($instance)", - "type": "timeseries" - }, { "datasource": { "type": "victoriametrics-datasource", diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 3cf857e91..557cdc55b 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -5177,7 +5177,7 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(rate(vm_streamaggr_flushed_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) without (instance, pod) > 0", + "expr": "sum(rate(vm_streamaggr_output_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) without (instance, pod) > 0", "instant": false, "legendFormat": "{{url}} ({{job}}): match={{match}}; output={{output}}", "range": true, @@ -5495,103 +5495,6 @@ "title": "Dedup flush duration 0.99 quantile ($instance)", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "description": "Shows the eviction rate of time series because of staleness.\n\nThere are two stages where series can be marked as stale.\n1. Input. Aggregator keeps in memory each received unique time series. The time series becomes stale and gets removed if no samples were received during [staleness interval](https://docs.victoriametrics.com/stream-aggregation/#staleness) for this series. \n\n2. Output. The output key is a resulting time series produced by aggregating many input series. 
The time series becomes stale and gets removed if no samples were received during [staleness interval](https://docs.victoriametrics.com/stream-aggregation/#staleness) for any of input series for this aggregation.\n\nIncrease in `input` keys shows that series previously matched by the aggregation rule now became stale.\n\nIncrease in `output` keys shows that series previously produced by the aggregation rule now became stale.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 31 - }, - "id": 144, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "$ds" - }, - "editorMode": "code", - "expr": "increase(vm_streamaggr_stale_samples_total{job=~\"$job\",instance=~\"$instance\", url=~\"$url\"}[$__rate_interval]) > 0", - "instant": false, - "legendFormat": "{{url}} ({{job}}): match={{match}}; key={{key}}", - "range": true, - "refId": "A" - } - ], - "title": "Staleness rate ($instance)", - "type": 
"timeseries" - }, { "datasource": { "type": "prometheus", diff --git a/deployment/docker/Makefile b/deployment/docker/Makefile index 00ee79f5f..07470e48d 100644 --- a/deployment/docker/Makefile +++ b/deployment/docker/Makefile @@ -1,24 +1,19 @@ # All these commands must run from repository root. -DOCKER_REGISTRY ?= docker.io DOCKER_NAMESPACE ?= victoriametrics ROOT_IMAGE ?= alpine:3.20.1 ROOT_IMAGE_SCRATCH ?= scratch -SKIP_SCRATCH_BUILD ?= false CERTS_IMAGE := alpine:3.20.1 GO_BUILDER_IMAGE := golang:1.22.5-alpine BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr :/ __)-1 BASE_IMAGE := local/base:1.1.4-$(shell echo $(ROOT_IMAGE) | tr :/ __)-$(shell echo $(CERTS_IMAGE) | tr :/ __) DOCKER ?= docker -DOCKER_BUILD_OPTS ?= DOCKER_RUN ?= $(DOCKER) run DOCKER_BUILD ?= $(DOCKER) build DOCKER_COMPOSE ?= $(DOCKER) compose DOCKER_IMAGE_LS ?= $(DOCKER) image ls --format '{{.Repository}}:{{.Tag}}' -TARGET_PLATFORM ?= linux/amd64,linux/arm,linux/arm64,linux/ppc64le,linux/386 -COMMA := , package-base: ($(DOCKER_IMAGE_LS) | grep -q '$(BASE_IMAGE)$$') \ @@ -26,7 +21,6 @@ package-base: --build-arg root_image=$(ROOT_IMAGE) \ --build-arg certs_image=$(CERTS_IMAGE) \ --tag $(BASE_IMAGE) \ - $(DOCKER_BUILD_OPTS) \ deployment/docker/base package-builder: @@ -34,7 +28,6 @@ package-builder: || $(DOCKER_BUILD) \ --build-arg go_builder_image=$(GO_BUILDER_IMAGE) \ --tag $(BUILDER_IMAGE) \ - $(DOCKER_BUILD_OPTS) \ deployment/docker/builder app-via-docker: package-builder @@ -68,52 +61,57 @@ app-via-docker-windows: package-builder -o bin/$(APP_NAME)-windows$(APP_SUFFIX)-prod.exe $(PKG_PREFIX)/app/$(APP_NAME) package-via-docker: package-base - ($(DOCKER_IMAGE_LS) | grep -q '$(DOCKER_REGISTRY)/$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE)$$') || (\ + ($(DOCKER_IMAGE_LS) | grep -q '$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE)$$') || (\ $(MAKE) app-via-docker && \ $(DOCKER_BUILD) \ --build-arg src_binary=$(APP_NAME)$(APP_SUFFIX)-prod \ 
--build-arg base_image=$(BASE_IMAGE) \ - --tag $(DOCKER_REGISTRY)/$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) \ + --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) \ -f app/$(APP_NAME)/deployment/Dockerfile bin) publish-via-docker: - $(eval TARGET_PLATFORMS := $(subst $(COMMA), ,$(TARGET_PLATFORM))) - $(MAKE_PARALLEL) $(foreach PLATFORM,$(TARGET_PLATFORMS),app-via-docker-$(subst /,-,$(PLATFORM))) + $(MAKE_PARALLEL) app-via-docker-linux-amd64 \ + app-via-docker-linux-arm \ + app-via-docker-linux-arm64 \ + app-via-docker-linux-ppc64le \ + app-via-docker-linux-386 $(DOCKER) buildx build \ - --platform=$(TARGET_PLATFORM) \ + --platform=linux/amd64,linux/arm,linux/arm64,linux/ppc64le,linux/386 \ --build-arg certs_image=$(CERTS_IMAGE) \ --build-arg root_image=$(ROOT_IMAGE) \ --build-arg APP_NAME=$(APP_NAME) \ - --tag $(DOCKER_REGISTRY)/$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) \ - --tag $(DOCKER_REGISTRY)/$(DOCKER_NAMESPACE)/$(APP_NAME):$(LATEST_TAG)$(RACE) \ + --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) \ + --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(LATEST_TAG)$(RACE) \ -o type=image \ --provenance=false \ - $(DOCKER_BUILD_OPTS) \ -f app/$(APP_NAME)/multiarch/Dockerfile \ --push \ bin - $(if $(findstring $(SKIP_SCRATCH_BUILD),true),, \ - $(DOCKER) buildx build \ - --platform=$(TARGET_PLATFORM) \ - --build-arg certs_image=$(CERTS_IMAGE) \ - --build-arg root_image=$(ROOT_IMAGE_SCRATCH) \ - --build-arg APP_NAME=$(APP_NAME) \ - --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE)-scratch \ - --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(LATEST_TAG)$(RACE)-scratch \ - -o type=image \ - --provenance=false \ - $(DOCKER_BUILD_OPTS) \ - -f app/$(APP_NAME)/multiarch/Dockerfile \ - --push \ - bin) - cd bin && rm -rf $(foreach PLATFORM,$(TARGET_PLATFORMS),$(APP_NAME)-$(subst /,-,$(PLATFORM))-prod) + $(DOCKER) buildx build \ + --platform=linux/amd64,linux/arm,linux/arm64,linux/ppc64le,linux/386 \ + --build-arg 
certs_image=$(CERTS_IMAGE) \ + --build-arg root_image=$(ROOT_IMAGE_SCRATCH) \ + --build-arg APP_NAME=$(APP_NAME) \ + --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE)-scratch \ + --tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(LATEST_TAG)$(RACE)-scratch \ + -o type=image \ + --provenance=false \ + -f app/$(APP_NAME)/multiarch/Dockerfile \ + --push \ + bin + cd bin && rm -rf \ + $(APP_NAME)-linux-amd64-prod \ + $(APP_NAME)-linux-arm-prod \ + $(APP_NAME)-linux-arm64-prod \ + $(APP_NAME)-linux-ppc64le-prod \ + $(APP_NAME)-linux-386-prod run-via-docker: package-via-docker $(DOCKER_RUN) -it --rm \ --user $(shell id -u):$(shell id -g) \ --net host \ $(DOCKER_OPTS) \ - $(DOCKER_REGISTRY)/$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) $(ARGS) + $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) $(ARGS) app-via-docker-goos-goarch: APP_SUFFIX='-$(GOOS)-$(GOARCH)' \ diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index c97af984d..3a25c3b62 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -37,19 +37,22 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * SECURITY: upgrade Go builder from Go1.22.4 to Go1.22.5. See the list of issues addressed in [Go1.22.5](https://github.com/golang/go/issues?q=milestone%3AGo1.22.5+label%3ACherryPickApproved). * SECURITY: upgrade base docker image (Alpine) from 3.20.0 to 3.20.1. See [alpine 3.20.1 release notes](https://www.alpinelinux.org/posts/Alpine-3.20.1-released.html). -* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): allow overriding `Host` header with a target host before sending to a downstream. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6453) +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): allow overriding `Host` header with backend host before sending the request to the configured backend. 
See [these docs](https://docs.victoriametrics.com/vmauth/#modifying-http-headers) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6453) * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): reduces CPU usage by reusing request body buffer. Allows to disable requests caching with `-maxRequestBodySizeToRetry=0`. See this [PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6533) for details. * FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): add [Grafana dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vmauth.json) and [alerting rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmauth.yml) for [vmauth](https://docs.victoriametrics.com/vmauth/) dashboard. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4313) for details. * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): reduces CPU usage by reusing request body buffer. Allows to disable requests caching with `-maxRequestBodySizeToRetry=0`. See this [PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6533) for details. -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): added `yandexcloud_sd` AWS API IMDSv2 support. 
-* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): expose metrics related to [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): - * `vm_streamaggr_matched_samples_total` - shows the number of samples matched by the aggregation rule; - * `vm_streamaggr_flushed_samples_total` - shows the number of samples produced by the aggregation rule; - * `vm_streamaggr_samples_lag_seconds` - shows the max lag between samples timestamps within one batch received by the aggregation; - * `vm_streamaggr_stale_samples_total` - shows the number of time series that became [stale](https://docs.victoriametrics.com/stream-aggregation/#staleness) during aggregation; - * metrics related to stream aggregation got additional labels `match` (matching param), `group` (`by` or `without` param), `url` (address of `remoteWrite.url` where aggregation is applied), `position` (the position of the aggregation rule in config file). - * These and other metrics were reflected on the [vmagent dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vmagent.json) in `stream aggregation` section. +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): [`yandexcloud_sd_configs`](https://docs.victoriametrics.com/sd_configs/#yandexcloud_sd_configs): add support for obtaining IAM token in [GCE format](https://yandex.cloud/en-ru/docs/compute/operations/vm-connect/auth-inside-vm#auth-inside-vm) additionally to the [deprecated Amazon EC2 IMDSv1 format](https://yandex.cloud/en/docs/security/standard/authentication#aws-token). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5513). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/) and [Single-node VictoriaMetrics](https://docs.victoriametrics.com/): add `-graphite.sanitizeMetricName` cmd-line flag for sanitizing metrics ingested via [Graphite protocol](https://docs.victoriametrics.com/#how-to-send-data-from-graphite-compatible-agents-such-as-statsd). 
See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6077). +* FEATURE: [streaming aggregation](https://docs.victoriametrics.com/stream-aggregation/): expose the following metrics at `/metrics` page of [vmagent](https://docs.victoriametrics.com/vmagent/) and [single-node VictoriaMetrics](https://docs.victoriametrics.com/): + * `vm_streamaggr_matched_samples_total` - the number of input samples matched by the corresponding aggregation rule + * `vm_streamaggr_output_samples_total` - the number of output samples produced by the corresponding aggregation rule + * `vm_streamaggr_samples_lag_seconds` - [histogram](https://docs.victoriametrics.com/keyconcepts/#histogram) with the lag between the current time and the timestamp seen in the aggregated input samples +* FEATURE: [streaming aggregation](https://docs.victoriametrics.com/stream-aggregation/): add new labels to `vm_streamaggr_*` metrics: + * `name` - the name of the streaming aggregation rule, which can be configured via `name` option - see [these docs](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). + * `url` - `-remoteWrite.url` for the corresponding `-remoteWrite.streamAggr.config` + * `path` - path to the corresponding streaming aggregation config file + * `position` - the position of the aggregation rule in the corresponding streaming aggregation config file +* FEATURE: [vmagent dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vmagent.json): `stream aggregation` section: add graphs based on newly exposed streaming aggregation metrics. * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): do not retry RPC calls to vmstorage nodes if [complexity limits](https://docs.victoriametrics.com/#resource-usage-limits) were exceeded. 
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): make `-replay.timeTo` optional in [replay mode](https://docs.victoriametrics.com/vmalert/#rules-backfilling). When omitted, the current timestamp will be used. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6492). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show compacted result in the JSON tab for query results. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6559). @@ -57,6 +60,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/index.html): allow overriding Azure storage domain when performing backups. See configuration docs [here](https://docs.victoriametrics.com/vmbackup/#providing-credentials-via-env-variables). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5984) for the details. Thanks to @justinrush for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6518). * FEATURE: [streaming aggregation](https://docs.victoriametrics.com/stream-aggregation/): prevent having duplicated aggregation function as `outputs` in one [aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config). It also prevents using `outputs: ["quantiles(0.5)", "quantiles(0.9)"]` instead of `outputs: ["quantiles(0.5, 0.9)"]`, as the former has higher computation cost for producing the same result. +* BUGFIX: [vmgateway](https://docs.victoriametrics.com/vmgateway/): properly apply read and write based rate limits. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6148) for details. 
* BUGFIX: [docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#docker-compose-environment-for-victoriametrics): fix incorrect link to vmui from [VictoriaMetrics plugin in Grafana](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#grafana). * BUGFIX: [docker-compose](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#docker-compose-environment-for-victoriametrics): fix incorrect link to vmui from [VictoriaMetrics plugin in Grafana](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker#grafana). * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): Fix the dateMetricIDCache consistency issue that leads to duplicate per-day index entries when new time series are inserted concurrently. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6534) for details. @@ -64,6 +68,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix input cursor position reset in modal settings. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6530). * BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): fix `vm_backup_last_run_failed` metric not being properly initialized during startup. Previously, it could imply an error even if the backup have been completed successfully. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6550) for the details. * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate [histogram_quantile](https://docs.victoriametrics.com/MetricsQL.html#histogram_quantile) over Prometheus buckets with inconsistent values. 
See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4580#issuecomment-2186659102) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6547). Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2819). +* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): fix panic when using multiple topics with the same name when [ingesting metrics from Kafka](https://docs.victoriametrics.com/vmagent/#kafka-integration). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6636) for the details. ## [v1.102.0-rc2](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc2) @@ -75,10 +80,10 @@ Released at 2024-06-24 * FEATURE: [alerts-vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmagent.yml): add new alerting rules `StreamAggrFlushTimeout` and `StreamAggrDedupFlushTimeout` to notify about issues during stream aggregation. * FEATURE: [dashboards/vmagent](https://grafana.com/grafana/dashboards/12683): add row `Streaming aggregation` with panels related to [streaming aggregation](https://docs.victoriametrics.com/stream-aggregation/) process. -* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs. -* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add auto request retry for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends. +* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `-idleConnTimeout` flag set to 50s by default. It should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmauth logs. 
+* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add automatic retry for requests to backend for trivial network errors, such as `broken pipe` and `connection reset` for requests to the configured backends. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): increase default value of `-promscrape.maxDroppedTargets` command-line flag to 10_000 from 1000. This makes it easier to track down large number of dropped targets. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6381). -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `max_scrape_size` parameter to a scrape config for setting a custom scrape limit for a job. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` was added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612). +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `max_scrape_size` option to [scrape config](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for setting custom limit on the response size target can send. The new [automatically generated metric](https://docs.victoriametrics.com/vmagent/#automatically-generated-metrics) `scrape_response_size_bytes` is added to reflect the response size of the target. See these issues: [1](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6429), [2](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2992), [3](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6123), [4](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5612). 
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): check for ranged vector arguments in non-rollup expressions when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `sum(up[5m])` or `absent(up[5m])` will fail to execute if these flags are set. * FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/): validate that rollup expressions has ranged vector arguments passed when `-search.disableImplicitConversion` or `-search.logImplicitConversion` are enabled. For example, `rate(metric)` or `count_over_time(metric)` will fail to execute if these flags are set. * FEATURE: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): support file path with hierarchical patterns and regexpes, and http url in unittest cmd-line flag `-files`, e.g. `-files="http:///path/to/rules"` or `-files="dir/**/*.yaml"`. @@ -96,16 +101,17 @@ Released at 2024-06-24 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): add validation for the max value specified for `-retentionPeriod`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6330) for details. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): **copy row** button in Table view produces unexpected result. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6421) and [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6495). * BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): prevent hanging when processing groups without rules. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6500). +* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): fix discovering backend IPs when `url_prefix` contains hostname with srv+ prefix. 
Thanks to @shichanglin5 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6401). ## [v1.102.0-rc1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0-rc1) Released at 2024-06-07 -**Update note 1: the `-remoteWrite.multitenantURL` command-line flag at `vmagent` was removed starting from this release. This flag was deprecated since [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0). Use `-enableMultitenantHandlers` instead, as it is easier to use and combine with [multitenant URL at vminsert](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy-via-labels). See these [docs for details](https://docs.victoriametrics.com/vmagent.html#multitenancy).** +**Update note 1: the `-remoteWrite.multitenantURL` command-line flag at `vmagent` was removed starting from this release. This flag was deprecated since [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0). Use `-enableMultitenantHandlers` instead, as it is easier to use and combine with [multitenant URL at vminsert](https://docs.victoriametrics.com/cluster-victoriametrics/#multitenancy-via-labels). See [these docs](https://docs.victoriametrics.com/vmagent/#multitenancy) for details.** **Update note 2: the `-streamAggr.dropInputLabels` command-line flag at `vmagent` was renamed to `-remoteWrite.streamAggr.dropInputLabels`. `-streamAggr.dropInputLabels` is now used for global streaming aggregation.** -**Update note 3: the `-maxLabelValueLen` command-line flag default value was changed from 16kB to 1kB. It may lead to truncating of labels with enormous values.** +**Update note 3: the `-maxLabelValueLen` command-line flag default value was changed from 16KiB to 4KiB. It may lead to truncating of labels with too long values.** * SECURITY: upgrade Go builder from Go1.22.2 to Go1.22.4. 
See the list of issues addressed in [Go1.22.3](https://github.com/golang/go/issues?q=milestone%3AGo1.22.3+label%3ACherryPickApproved) and [Go1.22.4](https://github.com/golang/go/issues?q=milestone%3AGo1.22.4+label%3ACherryPickApproved). * SECURITY: upgrade base docker image (Alpine) from 3.19.1 to 3.20.0. See [alpine 3.20.0 release notes](https://www.alpinelinux.org/posts/Alpine-3.20.0-released.html). @@ -117,14 +123,13 @@ Released at 2024-06-07 * FEATURE: [dashboards/operator](https://grafana.com/grafana/dashboards/17869), [dashboards/backupmanager](https://grafana.com/grafana/dashboards/17798) and [dashboard/tenant-statistic](https://grafana.com/grafana/dashboards/16399): update dashboard to be compatible with Grafana 10+ version. * FEATURE: [dashboards/cluster](https://grafana.com/grafana/dashboards/11176): add new panel `Concurrent selects` to `vmstorage` row. The panel will show how many ongoing select queries are processed by vmstorage and should help to identify resource bottlenecks. See panel description for more details. * FEATURE: [dashboards](https://grafana.com/orgs/victoriametrics): use `$__interval` variable for offsets and look-behind windows in annotations. This should improve precision of `restarts` and `version change` annotations when zooming-in/zooming-out on the dashboards. -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support aggregation and deduplication configs before replicating data to configured `-remoteWrite.url` destinations. This saves CPU and memory resources when incoming data needs to be aggregated or deduplicated once and then replicated to multiple destinations. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5467). -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add service discovery support for [Vultr](https://www.vultr.com/). 
See [these docs](https://docs.victoriametrics.com/sd_configs/#vultr_sd_configs) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6041). -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow configuring `-remoteWrite.disableOnDiskQueue` and `-remoteWrite.dropSamplesOnOverload` cmd-line flags per each `-remoteWrite.url`. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementation! -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add labels `path` and `url` to metrics `vmagent_remotewrite_push_failures_total` and `vmagent_remotewrite_samples_dropped_total`. Now number of failed pushes and dropped samples can be tracked per `-remoteWrite.url`. -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support Statsd plaintext protocol. See [these docs](https://docs.victoriametrics.com/vmagent/#how-to-push-data-to-vmagent) and this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5053). Thanks to @Koilanetroc for implementation! +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support aggregation and deduplication configs before replicating data to configured `-remoteWrite.url` destinations. This saves CPU and memory resources when incoming data needs to be aggregated or deduplicated once and then replicated to multiple destinations. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5467). +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add service discovery support for [Vultr](https://www.vultr.com/). See [these docs](https://docs.victoriametrics.com/sd_configs/#vultr_sd_configs) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6041). +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): allow specifying `-remoteWrite.disableOnDiskQueue` command-line flag per each `-remoteWrite.url`. 
If multiple `-remoteWrite.disableOnDiskQueue` command-line flags are configured, then the `-remoteWrite.dropSamplesOnOverload` is automatically set to true, so samples are automatically dropped if they cannot be sent to the corresponding `-remoteWrite.url` in a timely manner. See this [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6065). Thanks to @rbizos for implementation! +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): add `path` and `url` labels to `vmagent_remotewrite_push_failures_total` and `vmagent_remotewrite_samples_dropped_total` [metrics](https://docs.victoriametrics.com/vmagent/#monitoring), so the number of failed pushes and dropped samples can be tracked per each `-remoteWrite.url`. * FEATURE: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): add [rate_sum](https://docs.victoriametrics.com/stream-aggregation/#rate_sum) and [rate_avg](https://docs.victoriametrics.com/stream-aggregation/#rate_avg) aggregation outputs. * FEATURE: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): reduce the number of allocated objects in heap during deduplication and aggregation. The change supposed to reduce pressure on Garbage Collector, as it will need to scan less objects. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6402). -* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): add `datasource.idleConnTimeout`, `remoteWrite.idleConnTimeout` and `remoteRead.idleConnTimeout` flags. These flags are set to 50s by default and should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmalert logs. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661) for details. +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): add `-datasource.idleConnTimeout`, `-remoteWrite.idleConnTimeout` and `-remoteRead.idleConnTimeout` flags. 
These flags are set to 50s by default and should reduce the probability of `broken pipe` or `connection reset by peer` errors in vmalert logs. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661) for details. * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): add auto request retry for trivial network errors, such as `broken pipe` and `connection reset` for requests to `remoteRead`, `remoteWrite` and `datasource` URLs. See this [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5661) for details. * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): reduce CPU usage when evaluating high number of alerting and recording rules. * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert/): speed up retrieving rules files from object storages by skipping unchanged objects during reloading. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6210). @@ -148,14 +153,13 @@ Released at 2024-06-07 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): correctly apply `-inmemoryDataFlushInterval` when it's set to minimum supported value 1s. * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): reduce the default value for `-maxLabelValueLen` command-line flag from `16KiB` to `1KiB`. This should prevent from issues like [this one](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6176) when time series with too long labels are ingested into VictoriaMetrics. * BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): properly release memory used for metrics during config reload. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6247). 
-* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): fix discovering backend IPs when `url_prefix` contains hostname with srv+ prefix. Thanks to @shichanglin5 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6401). * BUGFIX: [dashboards](https://grafana.com/orgs/victoriametrics): fix `AnnotationQueryRunner` error in Grafana when executing annotations query against Prometheus backend. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6309) for details. * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): filter deleted label names and values from [`/api/v1/labels`](https://docs.victoriametrics.com/url-examples/#apiv1labels) and [`/api/v1/label/.../values`](https://docs.victoriametrics.com/url-examples/#apiv1labelvalues) responses when `match[]` filter matches small number of time series. The issue was introduced [v1.81.0](https://docs.victoriametrics.com/changelog_2022/#v1810). * BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): fix float values template in `input_series`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6391). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/): fix misleading error logs in vmalert's stdout when unsupported HTTP path is requested. * BUGFIX: retry files delete attempts on vXFS file system for `EEXIST` error type. See the [related issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6396). Thanks to @pludov for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6398). -* DEPRECATION: [vmagent](https://docs.victoriametrics.com/vmagent/): removed deprecated `-remoteWrite.multitenantURL` flag from vmagent. This flag was deprecated since [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0). 
Use `-enableMultitenantHandlers` instead, as it is easier to use and combine with [multitenant URL at vminsert](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy-via-labels). See these [docs for details](https://docs.victoriametrics.com/vmagent.html#multitenancy). +* DEPRECATION: [vmagent](https://docs.victoriametrics.com/vmagent/): removed deprecated `-remoteWrite.multitenantURL` flag from vmagent. This flag was deprecated since [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0). Use `-enableMultitenantHandlers` instead, as it is easier to use and combine with [multitenant URL at vminsert](https://docs.victoriametrics.com/cluster-victoriametrics/#multitenancy-via-labels). See [these docs](https://docs.victoriametrics.com/vmagent/#multitenancy) for details. ## [v1.101.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.101.0) @@ -1121,4 +1125,3 @@ See changes [here](https://docs.victoriametrics.com/changelog_2020/#v1420) ## Previous releases See [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases). - diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 1aa287fd2..55c9e33bc 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -12,9 +12,9 @@ aliases: # Cluster version - - - VictoriaMetrics logo + + + VictoriaMetrics logo VictoriaMetrics is a fast, cost-effective and scalable time series database. It can be used as a long-term remote storage for Prometheus. @@ -1084,7 +1084,7 @@ Below is the output for `/path/to/vminsert -help`: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. 
It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -1189,7 +1189,7 @@ Below is the output for `/path/to/vminsert -help`: The maximum size in bytes of a single Prometheus remote_write API request Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432) -maxLabelValueLen int - The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 16384) + The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 4096) -maxLabelsPerTimeseries int The maximum number of labels accepted per time series. Superfluous labels are dropped. In this case the vm_metrics_with_dropped_labels_total metric at /metrics page is incremented (default 30) -memory.allowedBytes size @@ -1200,7 +1200,7 @@ Below is the output for `/path/to/vminsert -help`: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. 
For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1231,7 +1231,7 @@ Below is the output for `/path/to/vminsert -help`: -opentsdbhttpTrimTimestamp duration Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -prevCacheRemovalPercent float Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. 
See also -cacheExpireDuration (default 0.1) @@ -1378,7 +1378,7 @@ Below is the output for `/path/to/vmselect -help`: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -1453,7 +1453,7 @@ Below is the output for `/path/to/vmselect -help`: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1464,7 +1464,7 @@ Below is the output for `/path/to/vmselect -help`: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -prevCacheRemovalPercent float Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1) @@ -1692,7 +1692,7 @@ Below is the output for `/path/to/vmstorage -help`: -finalMergeDelay duration Deprecated: this flag does nothing -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -forceFlushAuthKey value authKey, which must be passed in query string to /internal/force_flush pages @@ -1779,7 +1779,7 @@ Below is the output for `/path/to/vmstorage -help`: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1790,7 +1790,7 @@ Below is the output for `/path/to/vmstorage -help`: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. 
It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -precisionBits int The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64) diff --git a/docs/README.md b/docs/README.md index 27cc0ba9b..6510b8015 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,7 +5,6 @@ title: VictoriaMetrics [![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) [![Docker Pulls](https://img.shields.io/docker/pulls/victoriametrics/victoria-metrics.svg?maxAge=604800)](https://hub.docker.com/r/victoriametrics/victoria-metrics) -[![victoriametrics](https://snapcraft.io/victoriametrics/badge.svg)](https://snapcraft.io/victoriametrics) [![Slack](https://img.shields.io/badge/join%20slack-%23victoriametrics-brightgreen.svg)](https://slack.victoriametrics.com/) [![GitHub license](https://img.shields.io/github/license/VictoriaMetrics/VictoriaMetrics.svg)](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE) [![Go Report](https://goreportcard.com/badge/github.com/VictoriaMetrics/VictoriaMetrics)](https://goreportcard.com/report/github.com/VictoriaMetrics/VictoriaMetrics) @@ -184,10 +183,6 @@ Additionally, all the VictoriaMetrics components allow setting flag values via e * For repeating flags an alternative syntax can be used by joining the different values into one using `,` char as separator (for example `-storageNode -storageNode ` will translate to `storageNode=,`). 
* Environment var prefix can be set via `-envflag.prefix` flag. For instance, if `-envflag.prefix=VM_`, then env vars must be prepended with `VM_`. -### Configuration with snap package - -Snap packages for VictoriaMetrics are supported by community and are available at [https://snapcraft.io/victoriametrics](https://snapcraft.io/victoriametrics). - ### Running as Windows service In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw) @@ -2731,7 +2726,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -cacheExpireDuration duration Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s) -configAuthKey value - Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -configAuthKey=file:///abs/path/to/file or -configAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -configAuthKey=http://host/path or -configAuthKey=https://host/path -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) @@ -2768,7 +2763,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -finalMergeDelay duration Deprecated: this flag does nothing -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -forceFlushAuthKey value authKey, which must be passed in query string to /internal/force_flush pages @@ -2883,7 +2878,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum size in bytes of a single Prometheus remote_write API request Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432) -maxLabelValueLen int - The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 1024) + The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 4096) -maxLabelsPerTimeseries int The maximum number of labels accepted per time series. Superfluous labels are dropped. In this case the vm_metrics_with_dropped_labels_total metric at /metrics page is incremented (default 30) -memory.allowedBytes size @@ -2894,7 +2889,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -2925,7 +2920,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -opentsdbhttpTrimTimestamp duration Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -precisionBits int The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64) @@ -3047,7 +3042,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -relabelConfig string Optional path to a file with relabeling rules, which are applied to all the ingested metrics. The path can point either to local file or to http url. 
See https://docs.victoriametrics.com/#relabeling for details. The config is reloaded on SIGHUP signal -reloadAuthKey value - Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -retentionFilter array Retention filter in the format 'filter:retention'. For example, '{env="dev"}:3d' configures the retention for time series with env="dev" label to 3 days. See https://docs.victoriametrics.com/#retention-filters for details. This flag is available only in VictoriaMetrics enterprise. See https://docs.victoriametrics.com/enterprise/ diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index e2dcce274..060702d7b 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -13,7 +13,6 @@ aliases: [![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) [![Docker Pulls](https://img.shields.io/docker/pulls/victoriametrics/victoria-metrics.svg?maxAge=604800)](https://hub.docker.com/r/victoriametrics/victoria-metrics) -[![victoriametrics](https://snapcraft.io/victoriametrics/badge.svg)](https://snapcraft.io/victoriametrics) [![Slack](https://img.shields.io/badge/join%20slack-%23victoriametrics-brightgreen.svg)](https://slack.victoriametrics.com/) [![GitHub license](https://img.shields.io/github/license/VictoriaMetrics/VictoriaMetrics.svg)](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/LICENSE) [![Go 
Report](https://goreportcard.com/badge/github.com/VictoriaMetrics/VictoriaMetrics)](https://goreportcard.com/report/github.com/VictoriaMetrics/VictoriaMetrics) @@ -192,10 +191,6 @@ Additionally, all the VictoriaMetrics components allow setting flag values via e * For repeating flags an alternative syntax can be used by joining the different values into one using `,` char as separator (for example `-storageNode -storageNode ` will translate to `storageNode=,`). * Environment var prefix can be set via `-envflag.prefix` flag. For instance, if `-envflag.prefix=VM_`, then env vars must be prepended with `VM_`. -### Configuration with snap package - -Snap packages for VictoriaMetrics are supported by community and are available at [https://snapcraft.io/victoriametrics](https://snapcraft.io/victoriametrics). - ### Running as Windows service In order to run VictoriaMetrics as a Windows service it is required to create a service configuration for [WinSW](https://github.com/winsw/winsw) @@ -2739,7 +2734,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -cacheExpireDuration duration Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s) -configAuthKey value - Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -configAuthKey=file:///abs/path/to/file or -configAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -configAuthKey=http://host/path or -configAuthKey=https://host/path -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. 
Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) @@ -2776,7 +2771,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -finalMergeDelay duration Deprecated: this flag does nothing -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -forceFlushAuthKey value authKey, which must be passed in query string to /internal/force_flush pages @@ -2891,7 +2886,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum size in bytes of a single Prometheus remote_write API request Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 33554432) -maxLabelValueLen int - The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 1024) + The maximum length of label values in the accepted time series. Longer label values are truncated. In this case the vm_too_long_label_values_total metric at /metrics page is incremented (default 4096) -maxLabelsPerTimeseries int The maximum number of labels accepted per time series. Superfluous labels are dropped. 
In this case the vm_metrics_with_dropped_labels_total metric at /metrics page is incremented (default 30) -memory.allowedBytes size @@ -2902,7 +2897,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -2933,7 +2928,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -opentsdbhttpTrimTimestamp duration Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -precisionBits int The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss (default 64) @@ -3055,7 +3050,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -relabelConfig string Optional path to a file with relabeling rules, which are applied to all the ingested metrics. The path can point either to local file or to http url. See https://docs.victoriametrics.com/#relabeling for details. The config is reloaded on SIGHUP signal -reloadAuthKey value - Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -retentionFilter array Retention filter in the format 'filter:retention'. For example, '{env="dev"}:3d' configures the retention for time series with env="dev" label to 3 days. See https://docs.victoriametrics.com/#retention-filters for details. This flag is available only in VictoriaMetrics enterprise. 
See https://docs.victoriametrics.com/enterprise/ diff --git a/docs/VictoriaLogs/README.md b/docs/VictoriaLogs/README.md index fe46fa772..7a80eafc4 100644 --- a/docs/VictoriaLogs/README.md +++ b/docs/VictoriaLogs/README.md @@ -181,7 +181,7 @@ Pass `-help` to VictoriaLogs in order to see the list of supported command-line -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -266,10 +266,10 @@ Pass `-help` to VictoriaLogs in order to see the list of supported command-line -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -prevCacheRemovalPercent float Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1) diff --git a/docs/VictoriaLogs/Roadmap.md b/docs/VictoriaLogs/Roadmap.md index d5c6a0079..29878bb20 100644 --- a/docs/VictoriaLogs/Roadmap.md +++ b/docs/VictoriaLogs/Roadmap.md @@ -25,13 +25,14 @@ See [these docs](https://docs.victoriametrics.com/victorialogs/) for details. The following functionality is planned in the future versions of VictoriaLogs: - Support for [data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/) from popular log collectors and formats: - - OpenTelemetry for logs - - Fluentd - - Journald (systemd) - - Datadog protocol for logs -- Integration with Grafana ([partially done](https://github.com/VictoriaMetrics/victorialogs-datasource)). 
-- Ability to make instant snapshots and backups in the way [similar to VictoriaMetrics](https://docs.victoriametrics.com/#how-to-work-with-snapshots). -- Cluster version of VictoriaLogs. -- Ability to store data to object storage (such as S3, GCS, Minio). -- Alerting on LogsQL queries. -- Data migration tool from Grafana Loki to VictoriaLogs (similar to [vmctl](https://docs.victoriametrics.com/vmctl/)). + - [ ] [OpenTelemetry for logs](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4839) + - [ ] Fluentd + - [ ] [Journald](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4618) (systemd) + - [ ] [Datadog protocol for logs](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6632) + - [ ] [Telegraf http output](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5310) +- [ ] Integration with Grafana. Partially done, check the [documentation](https://docs.victoriametrics.com/victorialogs/victorialogs-datasource/) and [datasource repository](https://github.com/VictoriaMetrics/victorialogs-datasource). +- [ ] Ability to make instant snapshots and backups in the way [similar to VictoriaMetrics](https://docs.victoriametrics.com/#how-to-work-with-snapshots). +- [ ] Cluster version of VictoriaLogs. +- [ ] Ability to store data to object storage (such as S3, GCS, Minio). +- [ ] Alerting on LogsQL queries. +- [ ] Data migration tool from Grafana Loki to VictoriaLogs (similar to [vmctl](https://docs.victoriametrics.com/vmctl/)). diff --git a/docs/VictoriaLogs/querying/README.md b/docs/VictoriaLogs/querying/README.md index 6607ece91..8f4857e71 100644 --- a/docs/VictoriaLogs/querying/README.md +++ b/docs/VictoriaLogs/querying/README.md @@ -639,6 +639,10 @@ There are three modes of displaying query results: See also [command line interface](#command-line). 
+## Visualization in Grafana + +[VictoriaLogs Grafana Datasource](https://docs.victoriametrics.com/victorialogs/victorialogs-datasource/) allows you to query and visualize VictoriaLogs data in Grafana + ## Command-line VictoriaLogs integrates well with `curl` and other command-line tools during querying because of the following features: diff --git a/docs/VictoriaLogs/victorialogs-datasource.md b/docs/VictoriaLogs/victorialogs-datasource.md index 27d15cf6b..6a85d031d 100644 --- a/docs/VictoriaLogs/victorialogs-datasource.md +++ b/docs/VictoriaLogs/victorialogs-datasource.md @@ -94,7 +94,7 @@ docker-compose -f docker-compose.yaml up After Grafana starts successfully, datasource should be available in the datasources tab -Configuration +Configuration ### Install in Kubernetes diff --git a/docs/anomaly-detection/CHANGELOG.md b/docs/anomaly-detection/CHANGELOG.md index f81360202..73e119ac8 100644 --- a/docs/anomaly-detection/CHANGELOG.md +++ b/docs/anomaly-detection/CHANGELOG.md @@ -17,6 +17,13 @@ Please find the changelog for VictoriaMetrics Anomaly Detection below. > **Important note: Users are strongly encouraged to upgrade to `vmanomaly` [v1.9.2](https://hub.docker.com/repository/docker/victoriametrics/vmanomaly/tags?page=1&ordering=name) or newer for optimal performance and accuracy.

This recommendation is crucial for configurations with a low `infer_every` parameter [in your scheduler](https://docs.victoriametrics.com/anomaly-detection/components/scheduler/#parameters-1), and in scenarios where data exhibits significant high-order seasonality patterns (such as hourly or daily cycles). Previous versions from v1.5.1 to v1.8.0 were identified to contain a critical issue impacting model training, where models were inadvertently trained on limited data subsets, leading to suboptimal fits, affecting the accuracy of anomaly detection.

Upgrading to v1.9.2 addresses this issue, ensuring proper model training and enhanced reliability. For users utilizing Helm charts, it is recommended to upgrade to version [1.0.0](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-anomaly/CHANGELOG.md#100) or newer.** +## v1.13.2 +Released: 2024-07-15 +- IMPROVEMENT: update `node-exporter` [preset](/anomaly-detection/presets/#node-exporter) to reduce [false positives](https://victoriametrics.com/blog/victoriametrics-anomaly-detection-handbook-chapter-1/index.html#false-positive) +- FIX: add `verify_tls` arg for [`push`](/anomaly-detection/components/monitoring/#push-config-parameters) monitoring section. Also, `verify_tls` is now correctly used in [VmWriter](/anomaly-detection/components/writer/#vm-writer). +- FIX: now [`AutoTuned`](/anomaly-detection/components/models/#autotuned) model wrapper works correctly in [on-disk model storage mode](/anomaly-detection/faq/#resource-consumption-of-vmanomaly). +- FIX: now [rolling models](/anomaly-detection/components/models/#rolling-models), like [`RollingQuantile`](/anomaly-detection/components/models/#rolling-quantile) are properly handled in [One-off scheduler](/anomaly-detection/components/scheduler/#oneoff-scheduler), when wrapped in [`AutoTuned`](/anomaly-detection/components/models/#autotuned) + ## v1.13.0 Released: 2024-06-11 - FEATURE: Introduced `preset` [mode to run vmanomaly service](/anomaly-detection/presets) with minimal user input and on widely-known metrics, like those produced by [`node_exporter`](/anomaly-detection/presets#node-exporter). diff --git a/docs/anomaly-detection/components/models.md b/docs/anomaly-detection/components/models.md index eddec6d58..961c42fc1 100644 --- a/docs/anomaly-detection/components/models.md +++ b/docs/anomaly-detection/components/models.md @@ -353,7 +353,10 @@ models: # ... ``` -**Note**: Autotune can't be made on your [custom model](#custom-model-guide). 
Also, it can't be applied to itself (like `tuned_class_name: 'model.auto.AutoTunedModel'`) +> **Note**: There are some expected limitations of Autotune mode: +> - It can't be made on your [custom model](#custom-model-guide). +> - It can't be applied to itself (like `tuned_class_name: 'model.auto.AutoTunedModel'`) +> - `AutoTunedModel` can't be used on [rolling models](/anomaly-detection/components/models/#rolling-models) like [`RollingQuantile`](/anomaly-detection/components/models/#rolling-quantile) in combination with [on-disk model storage mode](/anomaly-detection/faq/#resource-consumption-of-vmanomaly), as the rolling models exist only during `infer` calls and aren't persisted either in RAM or on disk. ### [Prophet](https://facebook.github.io/prophet/) diff --git a/docs/anomaly-detection/components/monitoring.md b/docs/anomaly-detection/components/monitoring.md index b86981728..27be41d0b 100644 --- a/docs/anomaly-detection/components/monitoring.md +++ b/docs/anomaly-detection/components/monitoring.md @@ -75,6 +75,11 @@ There are 2 models to monitor VictoriaMetrics Anomaly Detection behavior - [push BasicAuth password + + verify_tls + False + Allows disabling TLS verification of the remote certificate.
+ timeout "5s" @@ -100,6 +105,7 @@ monitoring: tenant_id: "0:0" # For cluster version only user: "USERNAME" password: "PASSWORD" + verify_tls: False timeout: "5s" extra_labels: job: "vmanomaly-push" diff --git a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md index 239d254ec..c9e942584 100644 --- a/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md +++ b/docs/anomaly-detection/guides/guide-vmanomaly-vmalert.md @@ -26,7 +26,7 @@ aliases: - [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/) - [Node exporter](https://github.com/prometheus/node_exporter#node-exporter) (v1.7.0) and [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) (v0.27.0) -vmanomaly typical setup diagram +vmanomaly typical setup diagram > **Note: Configurations used throughout this guide can be found [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/deployment/docker/vmanomaly/vmanomaly-integration/)** @@ -108,7 +108,7 @@ In this context, the metric `node_cpu_seconds_total` provides a comprehensive br The `node_cpu_seconds_total` metric is classified as a [counter](https://docs.victoriametrics.com/keyconcepts/#counter) type. To analyze the duration each CPU core spends in these modes, it is necessary to compute the rate of change per second using the [rate function](https://docs.victoriametrics.com/metricsql/#rate): `rate(node_cpu_seconds_total)`. For a more refined and smoother aggregation of data by mode, we apply the sum function. The resulting query is formulated as follows: `sum(rate(node_cpu_seconds_total[5m])) by (mode, instance, job)`. Below is an illustrative example of how this query might be visualized in Grafana: -node_cpu_rate_graph +node_cpu_rate_graph This query will yield a total of eight time series, each corresponding to a CPU mode. 
The number of series is unaffected by the number of CPU cores, due to the `by` aggregation applied. These series serve as the input for `vmanomaly`, where the service independently fits a separate instance of the configured model type to each of time series. @@ -452,7 +452,7 @@ networks: Before running our docker-compose make sure that your directory contains all required files: -all files +all files This docker-compose file will pull docker images, set up each service and run them all together with the command: @@ -489,7 +489,7 @@ Each of these metrics will contain same labels our query `sum(rate(node_cpu_seco ### Anomaly scores for each metric with its according labels. Query: `anomaly_score` -Anomaly score graph +Anomaly score graph
Check out if the anomaly score is high for datapoints you think are anomalies. If not, you can try other parameters in the config file or try another model type. @@ -500,7 +500,7 @@ As you may notice a lot of data shows anomaly score greater than 1. It is expect Queries: `yhat_lower`, `yhat_upper` and `yhat` -yhat lower and yhat upper +yhat lower and yhat upper Boundaries of 'normal' metric values according to model inference. @@ -508,10 +508,10 @@ Boundaries of 'normal' metric values according to model inference. On the page `http://localhost:8880/vmalert/groups` you can find our configured Alerting rule: -alert rule +alert rule According to the rule configured for vmalert we will see Alert when anomaly score exceeds 1. You will see an alert on Alert tab. `http://localhost:8880/vmalert/alerts`: -alerts firing +alerts firing ## 10. Conclusion diff --git a/docs/data-ingestion/Proxmox.md b/docs/data-ingestion/Proxmox.md new file mode 100644 index 000000000..5411ed88f --- /dev/null +++ b/docs/data-ingestion/Proxmox.md @@ -0,0 +1,70 @@ +--- +title: Proxmox +weight: 1 +sort: 1 +menu: + docs: + identifier: "proxmox" + parent: "data-ingestion" + weight: 1 + # sort: 1 +aliases: + - /data-ingestion/proxmox.html + - /data-ingestion/Proxmox.html +--- + +# Proxmox Data Ingestion +Since Proxmox Virtual Environment (PVE) and Proxmox Backup Server (PBS) support sending data using the InfluxDB protocol, we can use the InfluxDB write support built into VictoriaMetrics. +Currently PVE and PBS only support using an Authorization Token for authentication and do not support basic auth or a username and password. + +## Proxmox Virtual Environment (PVE) +If you want help sending your data to Managed VictoriaMetrics, check out [our blog](https://victoriametrics.com/blog/proxmox-monitoring-with-dbaas/). + +1. Login to PVE as an administrator +2. Go to DataCenter > MetricServer > Add > InfluxDB + +PVE Metric Navigation + +3.
Set the parameters as follows: + - Name: VictoriaMetrics (can be changed to any string) + - Server: the hostname or IP of your VictoriaMetrics Instance + - Port: This will vary depending on how you are sending data to VictoriaMetrics, but the defaults for all components are listed in the [data ingestion documentation](https://docs.victoriametrics.com/data-ingestion.html) + - Protocol: use HTTPS if you have TLS/SSL configured otherwise use HTTP + - Organization: leave empty since it doesn't get used + - Bucket: leave empty since it doesn't get used + - Token: your token from vmauth or leave blank if you don't have authentication enabled + - If you need to ignore TLS/SSL errors check the advanced box and uncheck the verify certificate box +4. Click the `Create` button + +PVE Metric Form + +5. Run `system_uptime{object="nodes"}` in vmui or in the explore view in Grafana to verify metrics from PVE are being sent to VictoriaMetrics. +You should see 1 time series per node in your PVE cluster. + +## Proxmox Backup Server (PBS) +1. Login to PBS as an administrator +2. Go to Configuration > Metrics Server > Add > InfluxDB + + +PBS Metric Navigation + + +3. Set the parameters as follows: + - Name: VictoriaMetrics (can be set to any string) + - URL: http(s)://: + - set the URL to https if you have TLS enabled and http if you do not + - Port: This will vary depending on how you are sending data to VictoriaMetrics, but the defaults for all components are listed in the [data ingestion documentation](https://docs.victoriametrics.com/data-ingestion.html) + - Organization: leave empty since it doesn't get used + - Bucket: leave empty since it doesn't get used + - Token: your token from vmauth or leave blank if you don't have authentication enabled +4. Click the `Create` button + + +PBS Metric Form + + +5. Run `cpustat_idle{object="host"}` in vmui or in the explore view in Grafana to verify metrics from PBS are being sent to VictoriaMetrics.
+ + +# References +- [Blog Post for configuring Managed VictoriaMetrics and Proxmox VE](https://victoriametrics.com/blog/proxmox-monitoring-with-dbaas/) diff --git a/docs/data-ingestion/README.md b/docs/data-ingestion/README.md new file mode 100644 index 000000000..ae16413d7 --- /dev/null +++ b/docs/data-ingestion/README.md @@ -0,0 +1,35 @@ +--- +# sort: 14 +title: Data Ingestion +weight: 0 +menu: + docs: + parent: 'victoriametrics' + identifier: 'data-ingestion' + weight: 7 +aliases: +- /data-ingestion.html +- /data-ingestion.html +- /dataingestion/ +--- + +# Data Ingestion +In this folder you will find instructions for sending data to VictoriaMetrics from a variety of platforms. +If your tool is not listed, it is likely you can ingest your data into VictoriaMetrics using one of the protocols listed in our [Prominent features]({{< ref "/Single-server-VictoriaMetrics.md#prominent-features" >}}) section. + +If you are unsure what port number to use when pushing data to VictoriaMetrics single node, vminsert, vmagent, and vmauth, we have listed the default ports below. + +- VictoriaMetrics Single: 8428 +- vmagent: 8429 +- vmauth: 8427 +- vminsert: 8480 + +In the rest of the documentation we will assume you have configured your push endpoint to use TLS/SSL on port 443 so the urls in the rest of the documentation will look like `https://` instead of `http://:8428` for VictoriaMetrics single.
+ +## Documented Collectors/Agents +* [Telegraf]({{< relref "Telegraf.md" >}}) +* [Vector]({{< relref "Vector.md" >}}) + +## Supported Platforms +* [Proxmox Virtual Environment and Proxmox Backup Server]({{< relref "Proxmox.md" >}}) + diff --git a/docs/data-ingestion/Telegraf.md b/docs/data-ingestion/Telegraf.md new file mode 100644 index 000000000..744dcd1cd --- /dev/null +++ b/docs/data-ingestion/Telegraf.md @@ -0,0 +1,105 @@ +--- +title: Telegraf +weight: 1 +sort: 1 +menu: + docs: + identifier: "telegraf" + parent: "data-ingestion" + weight: 1 + # sort: 1 +aliases: + - /data-ingestion/telegraf.html + - /data-ingestion/Telegraf.html +--- +# Telegraf Setup +You will need to add the following output section to a Telegraf configuration file and reload Telegraf to enable shipping data from Telegraf to VictoriaMetrics. +All the options examples below can be combined to fit your use case. + +To avoid storing passwords in configuration files you can store them as key-value pairs in `/etc/default/telegraf` on Linux as follows +``` +victoriametrics_url="https://metrics.example.com" +victoriametrics_user="telegraf" +victoriametrics_password="password" +victoriametrics_token="my_token" +``` +and they can be referenced in a Telegraf configuration file by prepending the variable name with `$` ex. `$victoriametrics_url` will be translated to `https://metrics.example.com` if it is referenced in a Telegraf configuration using the values from `/etc/default/telegraf` in the values seen above. +Otherwise please replace the variables below to fit your setup.
+ +If you want to mimic this behavior on Windows please read [Influx Data's blog on storing variables in the registry](https://www.influxdata.com/blog/using-telegraf-on-windows/) + +## Minimum Configuration with no Authentication +```toml +[[outputs.influxdb]] + urls = ["$victoriametrics_url"] + database = "victoriametrics" + skip_database_creation = true + exclude_retention_policy_tag = true + content_encoding = "gzip" +``` + + +## HTTP Basic Authentication (Username and Password) +This is the same as the minimum configuration, but adds the `username` and `password` options + +```toml +[[outputs.influxdb]] + urls = ["$victoriametrics_url"] + username = "$victoriametrics_user" + password = "$victoriametrics_password" + database = "victoriametrics" + skip_database_creation = true + exclude_retention_policy_tag = true + content_encoding = "gzip" +``` + +## Bearer Authentication (Token) + +This is the same as the minimum configuration but adds the authorization header + +``` +[[outputs.influxdb]] + urls = ["$victoriametrics_url"] + http_headers = {"Authorization" = "Bearer $victoriametrics_token"} + database = "victoriametrics" + skip_database_creation = true + exclude_retention_policy_tag = true + content_encoding = "gzip" +``` + +## Route certain metrics +If you only want to route certain metrics to VictoriaMetrics use the `namepass` option with a comma separated list of the measurements you wish to send to VictoriaMetrics. + +``` +[[outputs.influxdb]] + urls = ["$victoriametrics_url"] + username = "$victoriametrics_user" + password = "$victoriametrics_password" + database = "victoriametrics" + skip_database_creation = true + exclude_retention_policy_tag = true + content_encoding = "gzip" + namepass = ["cpu","disk","measurement1","measurement2"] +``` + +## Ignore TLS/SSL Certificate errors +This is the same as the minimum configuration but adds `insecure_skip_verify = true` to the configuration to ignore TLS certificate errors.
+This is not recommended since it can allow sending metrics to a compromised site. + +``` +[[outputs.influxdb]] + urls = ["$victoriametrics_url"] + username = "$victoriametrics_user" + password = "$victoriametrics_password" + database = "victoriametrics" + skip_database_creation = true + exclude_retention_policy_tag = true + content_encoding = "gzip" + insecure_skip_verify = true +``` + +# References +- [Install Telegraf](https://docs.influxdata.com/telegraf/v1/install/) +- [InfluxDBv1 output for Telegraf](https://github.com/influxdata/telegraf/tree/master/plugins/outputs/influxdb) +- [Storing Telegraf variables in the Windows registry](https://www.influxdata.com/blog/using-telegraf-on-windows/) +- [Telegraf variables](https://docs.influxdata.com/telegraf/v1/configuration/#example-telegraf-environment-variables) diff --git a/docs/data-ingestion/Vector.md b/docs/data-ingestion/Vector.md new file mode 100644 index 000000000..482d851ec --- /dev/null +++ b/docs/data-ingestion/Vector.md @@ -0,0 +1,121 @@ +--- +title: Vector +weight: 1 +sort: 1 +menu: + docs: + identifier: "Vector" + parent: "data-ingestion" + weight: 1 + # sort: 1 +aliases: + - /data-ingestion/Vector.html + - /data-ingestion/vector.html +--- +# Vector +To send data from Vector to VictoriaMetrics you need to configure Vector with a Prometheus remote write sink and forward metrics to that sink from at least 1 source. +You will need to replace the values in `<>` with values that match your setup.
+ +## Minimum Config +```yaml +sources: + host_metrics_source: + type: host_metrics +sinks: + victoriametrics_sink: + type: prometheus_remote_write + inputs: + - host_metrics_source + endpoint: "https:///api/v1/write" + healthcheck: + enabled: false +``` + +## Basic Authentication + +This adds support for basic authentication by defining the auth strategy, user, and password fields: + + +```yaml +sources: + host_metrics_source: + type: host_metrics +sinks: + victoriametrics_sink: + type: prometheus_remote_write + inputs: + - host_metrics_source + endpoint: "https:///api/v1/write" + auth: + strategy: "basic" + user: "" + healthcheck: + enabled: false + +``` + +## Bearer / Token Authentication + +This adds support for bearer/token authentication by defining the auth strategy and token fields: + + +```yaml +sources: + host_metrics_source: + type: host_metrics +sinks: + victoriametrics_sink: + type: prometheus_remote_write + inputs: + - host_metrics_source + endpoint: "https:///api/v1/write" + auth: + strategy: "bearer" + token: "" + healthcheck: + enabled: false +``` + +## VictoriaMetrics and VictoriaLogs + +This combines the Bearer Authentication section with the [VictoriaLogs docs for Vector](https://docs.victoriametrics.com/victorialogs/data-ingestion/vector/), +so you can send metrics and logs with 1 agent to multiple sources: + + +```yaml +sources: + host_metrics_source: + type: host_metrics + journald_source: + type: journald +sinks: + victoriametrics_sink: + type: prometheus_remote_write + inputs: + - host_metrics_source + endpoint: "https:///api/v1/write" + auth: + strategy: "bearer" + token: "" + healthcheck: + enabled: false + victorialogs_sink: + inputs: + - journald_source + type: elasticsearch + endpoints: + - "https:///insert/elasticsearch/" + mode: bulk + api_version: "v8" + healthcheck: + enabled: false + query: + _msg_field: "message" + _time_field: "timestamp" + _stream_fields: "host,container_name" +``` + +# References +- [Vector 
documentation](https://vector.dev/docs/) +- [VictoriaLogs documentation for using vector]({{< ref "/victorialogs/data-ingestion/vector" >}}) diff --git a/docs/data-ingestion/pbs-form.webp b/docs/data-ingestion/pbs-form.webp new file mode 100644 index 000000000..7a16bb204 Binary files /dev/null and b/docs/data-ingestion/pbs-form.webp differ diff --git a/docs/data-ingestion/pbs-nav.webp b/docs/data-ingestion/pbs-nav.webp new file mode 100644 index 000000000..a77563821 Binary files /dev/null and b/docs/data-ingestion/pbs-nav.webp differ diff --git a/docs/data-ingestion/pve-form.webp b/docs/data-ingestion/pve-form.webp new file mode 100644 index 000000000..7293c5b15 Binary files /dev/null and b/docs/data-ingestion/pve-form.webp differ diff --git a/docs/data-ingestion/pve-nav.webp b/docs/data-ingestion/pve-nav.webp new file mode 100644 index 000000000..ed5d9a13e Binary files /dev/null and b/docs/data-ingestion/pve-nav.webp differ diff --git a/docs/keyConcepts.md b/docs/keyConcepts.md index c6bb3acd1..c44074116 100644 --- a/docs/keyConcepts.md +++ b/docs/keyConcepts.md @@ -349,7 +349,7 @@ This limit can be changed via `-maxLabelsPerTimeseries` command-line flag if nec Every label value can contain an arbitrary string value. The good practice is to use short and meaningful label values to describe the attribute of the metric, not to tell the story about it. For example, label-value pair `environment="prod"` is ok, but `log_message="long log message with a lot of details..."` is not ok. By default, -VictoriaMetrics limits label's value size with 1kB. This limit can be changed via `-maxLabelValueLen` command-line flag. +VictoriaMetrics limits label's value size with 4KiB. This limit can be changed via `-maxLabelValueLen` command-line flag. It is very important to keep under control the number of unique label values, since every unique label value leads to a new [time series](#time-series).
Try to avoid using volatile label values such as session ID or query ID in order to diff --git a/docs/sd_configs.md b/docs/sd_configs.md index 9a959d833..dda1248f2 100644 --- a/docs/sd_configs.md +++ b/docs/sd_configs.md @@ -1693,7 +1693,7 @@ scrape_configs: # scrape_timeout: # max_scrape_size is an optional parameter for limiting the response size in bytes from scraped targets. - # By default, uses limit from -promscrape.maxScrapeSize command-line flag. + # If max_scrape_size isn't set, then the limit from -promscrape.maxScrapeSize command-line flag is used instead. # Example values: # - "10MiB" - 10 * 1024 * 1024 bytes # - "100MB" - 100 * 1000 * 1000 bytes diff --git a/docs/stream-aggregation.md b/docs/stream-aggregation.md index b791aaeb2..d72603a88 100644 --- a/docs/stream-aggregation.md +++ b/docs/stream-aggregation.md @@ -19,8 +19,8 @@ The aggregation is applied to all the metrics received via any [supported data i and/or scraped from [Prometheus-compatible targets](https://docs.victoriametrics.com/#how-to-scrape-prometheus-exporters-such-as-node-exporter) after applying all the configured [relabeling stages](https://docs.victoriametrics.com/vmagent/#relabeling). -_By default, stream aggregation ignores timestamps associated with the input [samples](https://docs.victoriametrics.com/keyconcepts/#raw-samples). -It expects that the ingested samples have timestamps close to the current time. See [how to ignore old samples](#ignoring-old-samples)._ +**By default, stream aggregation ignores timestamps associated with the input [samples](https://docs.victoriametrics.com/keyconcepts/#raw-samples). +It expects that the ingested samples have timestamps close to the current time. 
See [how to ignore old samples](#ignoring-old-samples).** ## Configuration @@ -28,9 +28,9 @@ Stream aggregation can be configured via the following command-line flags: - `-streamAggr.config` at [single-node VictoriaMetrics](https://docs.victoriametrics.com/single-server-victoriametrics/) and at [vmagent](https://docs.victoriametrics.com/vmagent/). -- `-remoteWrite.streamAggr.config` at [vmagent](https://docs.victoriametrics.com/vmagent/) only. - This flag can be specified individually per each `-remoteWrite.url` and aggregation will happen independently for each of them. - This allows writing different aggregates to different remote storage destinations. +- `-remoteWrite.streamAggr.config` at [vmagent](https://docs.victoriametrics.com/vmagent/) only. This flag can be specified individually + per each `-remoteWrite.url`, so the aggregation happens independently per each remote storage destination. + This allows writing different aggregates to different remote storage systems. These flags must point to a file containing [stream aggregation config](#stream-aggregation-config). The file may contain `%{ENV_VAR}` placeholders which are substituted by the corresponding `ENV_VAR` environment variable values. @@ -60,26 +60,24 @@ The processed data is then stored in local storage and **can't be forwarded furt [vmagent](https://docs.victoriametrics.com/vmagent/) supports relabeling, deduplication and stream aggregation for all the received data, scraped or pushed. Then, the collected data will be forwarded to specified `-remoteWrite.url` destinations. The data processing order is the following: -1. All the received data is [relabeled](https://docs.victoriametrics.com/vmagent/#relabeling) according to - specified `-remoteWrite.relabelConfig`; -1. All the received data is [deduplicated](https://docs.victoriametrics.com/stream-aggregation/#deduplication) - according to specified `-streamAggr.dedupInterval`; -1. 
All the received data is aggregated according to specified `-streamAggr.config`; -1. The resulting data from p1 and p2 is then replicated to each `-remoteWrite.url`; -1. Data sent to each `-remoteWrite.url` can be additionally relabeled according to the - corresponding `-remoteWrite.urlRelabelConfig` (set individually per URL); -1. Data sent to each `-remoteWrite.url` can be additionally deduplicated according to the - corresponding `-remoteWrite.streamAggr.dedupInterval` (set individually per URL); -1. Data sent to each `-remoteWrite.url` can be additionally aggregated according to the - corresponding `-remoteWrite.streamAggr.config` (set individually per URL). Please note, it is not recommended - to use `-streamAggr.config` and `-remoteWrite.streamAggr.config` together, unless you understand the complications. -Typical scenarios for data routing with vmagent: -1. **Aggregate incoming data and replicate to N destinations**. For this one should configure `-streamAggr.config` -to aggregate the incoming data before replicating it to all the configured `-remoteWrite.url` destinations. -2. **Individually aggregate incoming data for each destination**. For this on should configure `-remoteWrite.streamAggr.config` -for each `-remoteWrite.url` destination. [Relabeling](https://docs.victoriametrics.com/vmagent/#relabeling) -via `-remoteWrite.urlRelabelConfig` can be used for routing only selected metrics to each `-remoteWrite.url` destination. +1. all the received data is relabeled according to the specified [`-remoteWrite.relabelConfig`](https://docs.victoriametrics.com/vmagent/#relabeling) (if it is set) +1. all the received data is deduplicated according to specified [`-streamAggr.dedupInterval`](https://docs.victoriametrics.com/stream-aggregation/#deduplication) + (if it is set to duration bigger than 0) +1. 
all the received data is aggregated according to specified [`-streamAggr.config`](https://docs.victoriametrics.com/stream-aggregation/#configuration) (if it is set) +1. the resulting data is then replicated to each `-remoteWrite.url` +1. data sent to each `-remoteWrite.url` can be additionally relabeled according to the corresponding `-remoteWrite.urlRelabelConfig` (set individually per URL) +1. data sent to each `-remoteWrite.url` can be additionally deduplicated according to the corresponding `-remoteWrite.streamAggr.dedupInterval` (set individually per URL) +1. data sent to each `-remoteWrite.url` can be additionally aggregated according to the corresponding `-remoteWrite.streamAggr.config` (set individually per URL) + It isn't recommended using `-streamAggr.config` and `-remoteWrite.streamAggr.config` simultaneously, unless you understand the complications. + +Typical scenarios for data routing with `vmagent`: + +1. **Aggregate incoming data and replicate to N destinations**. Specify [`-streamAggr.config`](https://docs.victoriametrics.com/stream-aggregation/#configuration) command-line flag + to aggregate the incoming data before replicating it to all the configured `-remoteWrite.url` destinations. +2. **Individually aggregate incoming data for each destination**. Specify [`-remoteWrite.streamAggr.config`](https://docs.victoriametrics.com/stream-aggregation/#configuration) + command-line flag for each `-remoteWrite.url` destination. [Relabeling](https://docs.victoriametrics.com/vmagent/#relabeling) via `-remoteWrite.urlRelabelConfig` + can be used for routing only the selected metrics to each `-remoteWrite.url` destination. 
## Deduplication @@ -562,14 +560,14 @@ Below are aggregation functions that can be put in the `outputs` list at [stream * [avg](#avg) * [count_samples](#count_samples) * [count_series](#count_series) +* [histogram_bucket](#histogram_bucket) * [increase](#increase) * [increase_prometheus](#increase_prometheus) -* [rate_sum](#rate_sum) -* [rate_avg](#rate_avg) -* [histogram_bucket](#histogram_bucket) * [last](#last) * [max](#max) * [min](#min) +* [rate_avg](#rate_avg) +* [rate_sum](#rate_sum) * [stddev](#stddev) * [stdvar](#stdvar) * [sum_samples](#sum_samples) @@ -593,7 +591,13 @@ For example, see below time series produced by config with aggregation interval avg aggregation -See also [min](#min), [max](#max), [sum_samples](#sum_samples) and [count_samples](#count_samples). +See also: + +- [max](#max) +- [min](#min) +- [quantiles](#quantiles) +- [sum_samples](#sum_samples) +- [count_samples](#count_samples) ### count_samples @@ -605,7 +609,10 @@ The results of `count_samples` is equal to the following [MetricsQL](https://doc sum(count_over_time(some_metric[interval])) ``` -See also [count_series](#count_series) and [sum_samples](#sum_samples). +See also: + +- [count_series](#count_series) +- [sum_samples](#sum_samples) ### count_series @@ -617,7 +624,33 @@ The results of `count_series` is equal to the following [MetricsQL](https://docs count(last_over_time(some_metric[interval])) ``` -See also [count_samples](#count_samples) and [unique_samples](#unique_samples). +See also: + +- [count_samples](#count_samples) +- [unique_samples](#unique_samples) + +### histogram_bucket + +`histogram_bucket` returns [VictoriaMetrics histogram buckets](https://valyala.medium.com/improving-histogram-usability-for-prometheus-and-grafana-bc7e5df0e350) + for the input [sample values](https://docs.victoriametrics.com/keyconcepts/#raw-samples) over the given `interval`. +`histogram_bucket` makes sense only for aggregating [gauges](https://docs.victoriametrics.com/keyconcepts/#gauge). 
+See how to aggregate regular histograms [here](#aggregating-histograms). + +The results of `histogram_bucket` is equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: + +```metricsql +sum(histogram_over_time(some_histogram_bucket[interval])) by (vmrange) +``` + +Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) +or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. + +See also: + +- [quantiles](#quantiles) +- [avg](#avg) +- [max](#max) +- [min](#min) ### increase @@ -641,33 +674,12 @@ For example, see below time series produced by config with aggregation interval Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. -See also [increase_prometheus](#increase_prometheus) and [total](#total). +See also: -### rate_sum - -`rate_sum` returns the sum of average per-second change of input [time series](https://docs.victoriametrics.com/keyconcepts/#time-series) over the given `interval`. -`rate_sum` makes sense only for aggregating [counters](https://docs.victoriametrics.com/keyconcepts/#counter). - -The results of `rate_sum` are equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: - -```metricsql -sum(rate(some_counter[interval])) -``` - -See also [rate_avg](#rate_avg) and [total](#total) outputs. - -### rate_avg - -`rate_avg` returns the average of average per-second of input [time series](https://docs.victoriametrics.com/keyconcepts/#time-series) over the given `interval`. -`rate_avg` makes sense only for aggregating [counters](https://docs.victoriametrics.com/keyconcepts/#counter). 
- -The results of `rate_avg` are equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: - -```metricsql -avg(rate(some_counter[interval])) -``` - -See also [rate_sum](#rate_avg) and [total](#total) outputs. +- [increase_prometheus](#increase_prometheus) +- [total](#total) +- [rate_avg](#rate_avg) +- [rate_sum](#rate_sum) ### increase_prometheus @@ -686,25 +698,13 @@ If you need taking into account the first sample per time series, then take a lo Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. -See also [increase](#increase), [total](#total) and [total_prometheus](#total_prometheus). +See also: -### histogram_bucket - -`histogram_bucket` returns [VictoriaMetrics histogram buckets](https://valyala.medium.com/improving-histogram-usability-for-prometheus-and-grafana-bc7e5df0e350) - for the input [sample values](https://docs.victoriametrics.com/keyconcepts/#raw-samples) over the given `interval`. -`histogram_bucket` makes sense only for aggregating [gauges](https://docs.victoriametrics.com/keyconcepts/#gauge). -See how to aggregate regular histograms [here](#aggregating-histograms). - -The results of `histogram_bucket` is equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: - -Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) -or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. - -```metricsql -sum(histogram_over_time(some_histogram_bucket[interval])) by (vmrange) -``` - -See also [quantiles](#quantiles), [min](#min), [max](#max) and [avg](#avg). 
+- [increase](#increase) +- [rate_avg](#rate_avg) +- [rate_sum](#rate_sum) +- [total](#total) +- [total_prometheus](#total_prometheus) ### last @@ -716,7 +716,12 @@ The results of `last` is roughly equal to the following [MetricsQL](https://docs last_over_time(some_metric[interval]) ``` -See also [min](#min), [max](#max) and [avg](#avg). +See also: + +- [avg](#avg) +- [max](#max) +- [min](#min) +- [quantiles](#quantiles) ### max @@ -732,7 +737,12 @@ For example, see below time series produced by config with aggregation interval total aggregation -See also [min](#min) and [avg](#avg). +See also: + +- [min](#min) +- [avg](#avg) +- [last](#last) +- [quantiles](#quantiles) ### min @@ -748,7 +758,46 @@ For example, see below time series produced by config with aggregation interval min aggregation -See also [max](#max) and [avg](#avg). +See also: + +- [max](#max) +- [avg](#avg) +- [last](#last) +- [quantiles](#quantiles) + +### rate_avg + +`rate_avg` returns the average of average per-second increase rates across input [time series](https://docs.victoriametrics.com/keyconcepts/#time-series) over the given `interval`. +`rate_avg` makes sense only for aggregating [counters](https://docs.victoriametrics.com/keyconcepts/#counter). + +The results of `rate_avg` are equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: + +```metricsql +avg(rate(some_counter[interval])) +``` + +See also: + +- [rate_sum](#rate_sum) +- [increase](#increase) +- [total](#total) + +### rate_sum + +`rate_sum` returns the sum of average per-second increase rates across input [time series](https://docs.victoriametrics.com/keyconcepts/#time-series) over the given `interval`. +`rate_sum` makes sense only for aggregating [counters](https://docs.victoriametrics.com/keyconcepts/#counter). 
+ +The results of `rate_sum` are equal to the following [MetricsQL](https://docs.victoriametrics.com/metricsql/) query: + +```metricsql +sum(rate(some_counter[interval])) +``` + +See also: + +- [rate_avg](#rate_avg) +- [increase](#increase) +- [total](#total) ### stddev @@ -762,7 +811,11 @@ The results of `stddev` is roughly equal to the following [MetricsQL](https://do histogram_stddev(sum(histogram_over_time(some_metric[interval])) by (vmrange)) ``` -See also [stdvar](#stdvar) and [avg](#avg). +See also: + +- [stdvar](#stdvar) +- [avg](#avg) +- [quantiles](#quantiles) ### stdvar @@ -780,7 +833,11 @@ For example, see below time series produced by config with aggregation interval stdvar aggregation -See also [stddev](#stddev) and [avg](#avg). +See also: + +- [stddev](#stddev) +- [avg](#avg) +- [quantiles](#quantiles) ### sum_samples @@ -797,7 +854,10 @@ For example, see below time series produced by config with aggregation interval sum_samples aggregation -See also [count_samples](#count_samples) and [count_series](#count_series). +See also: + +- [count_samples](#count_samples) +- [count_series](#count_series) ### total @@ -834,7 +894,13 @@ This changes pod name label, but the `total` accounts for such a scenario and do Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. -See also [total_prometheus](#total_prometheus), [increase](#increase) and [increase_prometheus](#increase_prometheus). +See also: + +- [total_prometheus](#total_prometheus) +- [increase](#increase) +- [increase_prometheus](#increase_prometheus) +- [rate_sum](#rate_sum) +- [rate_avg](#rate_avg) ### total_prometheus @@ -857,7 +923,13 @@ The counters are most often reset when the application is restarted. 
Aggregating irregular and sporadic metrics (received from [Lambdas](https://aws.amazon.com/lambda/) or [Cloud Functions](https://cloud.google.com/functions)) can be controlled via [staleness_interval](#staleness) option. -See also [total](#total), [increase](#increase) and [increase_prometheus](#increase_prometheus). +See also: + +- [total](#total) +- [increase](#increase) +- [increase_prometheus](#increase_prometheus) +- [rate_sum](#rate_sum) +- [rate_avg](#rate_avg) ### unique_samples @@ -870,7 +942,10 @@ The results of `unique_samples` is equal to the following [MetricsQL](https://do count(count_values_over_time(some_metric[interval])) ``` -See also [sum_samples](#sum_samples) and [count_series](#count_series). +See also: + +- [sum_samples](#sum_samples) +- [count_series](#count_series) ### quantiles @@ -885,7 +960,12 @@ The results of `quantiles(phi1, ..., phiN)` is equal to the following [MetricsQL histogram_quantiles("quantile", phi1, ..., phiN, sum(histogram_over_time(some_metric[interval])) by (vmrange)) ``` -See also [histogram_bucket](#histogram_bucket), [min](#min), [max](#max) and [avg](#avg). +See also: + +- [histogram_bucket](#histogram_bucket) +- [avg](#avg) +- [max](#max) +- [min](#min) ## Aggregating by labels @@ -936,6 +1016,13 @@ At [vmagent](https://docs.victoriametrics.com/vmagent/) `-remoteWrite.streamAggr specified individually per each `-remoteWrite.url`: ```yaml + # name is an optional name of the given streaming aggregation config. + # + # If it is set, then it is used as `name` label in the exposed metrics + # for the given aggregation config at /metrics page. + # See https://docs.victoriametrics.com/vmagent/#monitoring and https://docs.victoriametrics.com/#monitoring +- name: 'foobar' + # match is an optional filter for incoming samples to aggregate. # It can contain arbitrary Prometheus series selector # according to https://docs.victoriametrics.com/keyconcepts/#filtering . 
@@ -962,11 +1049,13 @@ specified individually per each `-remoteWrite.url`: # staleness_interval is an optional interval for resetting the per-series state if no new samples # are received during this interval for the following outputs: - # - total - # - total_prometheus + # - histogram_bucket # - increase # - increase_prometheus - # - histogram_bucket + # - rate_avg + # - rate_sum + # - total + # - total_prometheus # See https://docs.victoriametrics.com/stream-aggregation/#staleness for more details. # # staleness_interval: 2m @@ -1071,13 +1160,13 @@ support the following approaches for hot reloading stream aggregation configs fr The following outputs track the last seen per-series values in order to properly calculate output values: -- [rate_sum](#rate_sum) -- [rate_avg](#rate_avg) -- [total](#total) -- [total_prometheus](#total_prometheus) +- [histogram_bucket](#histogram_bucket) - [increase](#increase) - [increase_prometheus](#increase_prometheus) -- [histogram_bucket](#histogram_bucket) +- [rate_avg](#rate_avg) +- [rate_sum](#rate_sum) +- [total](#total) +- [total_prometheus](#total_prometheus) The last seen per-series value is dropped if no new samples are received for the given time series during two consecutive aggregation intervals specified in [stream aggregation config](#stream-aggregation-config) via `interval` option. diff --git a/docs/vmagent.md b/docs/vmagent.md index ccf9a1544..f0732d54c 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -486,14 +486,6 @@ and attaches `instance`, `job` and other target-specific labels to these metrics scrape_duration_seconds > 1.5 ``` -* `scrape_response_size_bytes` - response size in bytes for the given target. This allows to monitor amount of data scraped - and to adjust `max_scrape_size` for scraped targets. 
For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) - returns targets with scrape response > 10MiB: - - ```metricsql - max_scrape_size > 10MiB - ``` - * `scrape_timeout_seconds` - the configured timeout for the current scrape target (aka `scrape_timeout`). This allows detecting targets with scrape durations close to the configured scrape timeout. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets (identified by `instance` label), @@ -503,6 +495,15 @@ and attaches `instance`, `job` and other target-specific labels to these metrics scrape_duration_seconds / scrape_timeout_seconds > 0.8 ``` +* `scrape_response_size_bytes` - response size in bytes for the given target. This allows to monitor amount of data scraped + and to adjust [`max_scrape_size` option](https://docs.victoriametrics.com/sd_configs/#scrape_configs) for scraped targets. + For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets with scrape response + bigger than `10MiB`: + + ```metricsql + scrape_response_size_bytes > 10MiB + ``` + * `scrape_samples_scraped` - the number of samples (aka metrics) parsed per each scrape. This allows detecting targets, which expose too many metrics. For example, the following [MetricsQL query](https://docs.victoriametrics.com/metricsql/) returns targets, which expose more than 10000 metrics: @@ -1021,9 +1022,9 @@ scrape_configs: ## Disabling on-disk persistence -By default `vmagent` stores pending data, which cannot be sent to the configured remote storage systems in a timely manner, in the folder configured -via `-remoteWrite.tmpDataPath` command-line flag. By default `vmagent` writes all the pending data to this folder until this data is sent to the configured -remote storage systems or until the folder becomes full. 
The maximum data size, which can be saved to `-remoteWrite.tmpDataPath` +By default `vmagent` stores pending data, which cannot be sent to the configured remote storage systems in a timely manner, in the folder set +by `-remoteWrite.tmpDataPath` command-line flag. By default `vmagent` writes all the pending data to this folder until this data is sent to the configured +`-remoteWrite.url` systems or until the folder becomes full. The maximum data size, which can be saved to `-remoteWrite.tmpDataPath` per every configured `-remoteWrite.url`, can be limited via `-remoteWrite.maxDiskUsagePerURL` command-line flag. When this limit is reached, `vmagent` drops the oldest data from disk in order to save newly ingested data. @@ -1031,21 +1032,25 @@ There are cases when it is better disabling on-disk persistence for pending data - When the persistent disk performance isn't enough for the given data processing rate. - When it is better to buffer pending data at the client side instead of bufferring it at `vmagent` side in the `-remoteWrite.tmpDataPath` folder. -- When the data is already buffered at [Kafka side](#reading-metrics-from-kafka) or [Google PubSub side](#reading-metrics-from-pubsub). +- When the data is already buffered at [Kafka side](#reading-metrics-from-kafka) or at [Google PubSub side](#reading-metrics-from-pubsub). - When it is better to drop pending data instead of buffering it. -In this case `-remoteWrite.disableOnDiskQueue` command-line flag can be passed to `vmagent`. -When this flag is specified, `vmagent` works in the following way if the configured remote storage systems cannot keep up with the data ingestion rate: +In this case `-remoteWrite.disableOnDiskQueue` command-line flag can be passed to `vmagent` per each configured `-remoteWrite.url`. 
+`vmagent` works in the following way if the corresponding remote storage system at `-remoteWrite.url` cannot keep up with the data ingestion rate +and the `-remoteWrite.disableOnDiskQueue` command-line flag is set: - It returns `429 Too Many Requests` HTTP error to clients, which send data to `vmagent` via [supported HTTP endpoints](#how-to-push-data-to-vmagent). - You can specify `-remoteWrite.dropSamplesOnOverload` command-line flag in order to drop the ingested samples instead of returning the error to clients in this case. + If `-remoteWrite.dropSamplesOnOverload` command-line flag is set or if multiple `-remoteWrite.disableOnDiskQueue` command-line flags are set + for different `-remoteWrite.url` options, then the ingested samples are silently dropped instead of returning the error to clients. - It suspends consuming data from [Kafka side](#reading-metrics-from-kafka) or [Google PubSub side](#google-pubsub-integration) until the remote storage becomes available. - You can specify `-remoteWrite.dropSamplesOnOverload` command-line flag in order to drop the fetched samples instead of suspending data consumption from Kafka or Google PubSub. -- It drops samples pushed to `vmagent` via non-HTTP protocols and logs the error. Pass `-remoteWrite.dropSamplesOnOverload` on order to suppress error messages in this case. -- It drops samples [scraped from Prometheus-compatible targets](#how-to-collect-metrics-in-prometheus-format), because it is better to drop samples - instead of blocking the scrape process. -- It drops [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/) output samples, because it is better to drop output samples - instead of blocking the stream aggregation process. 
+ If `-remoteWrite.dropSamplesOnOverload` command-line flag is set or if multiple `-remoteWrite.disableOnDiskQueue` command-line flags are set + for different `-remoteWrite.url` options, then the fetched samples are silently dropped instead of suspending data consumption from Kafka or Google PubSub. +- It drops samples pushed to `vmagent` via non-HTTP protocols and logs the error. Pass `-remoteWrite.dropSamplesOnOverload` command-line flag in order + to suppress error messages in this case. +- It drops samples [scraped from Prometheus-compatible targets](#how-to-collect-metrics-in-prometheus-format), because it is better from operations perspective + to drop samples instead of blocking the scrape process. +- It drops [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/) output samples, because it is better from operations perspective + to drop output samples instead of blocking the stream aggregation process. The number of dropped samples because of overloaded remote storage can be [monitored](#monitoring) via `vmagent_remotewrite_samples_dropped_total` metric. The number of unsuccessful attempts to send data to overloaded remote storage can be [monitored](#monitoring) via `vmagent_remotewrite_push_failures_total` metric. @@ -1057,7 +1062,7 @@ on spiky workloads, since `vmagent` may buffer more data in memory before return if `-remoteWrite.disableOnDiskQueue` command-line flag is specified. It may also read buffered data from `-remoteWrite.tmpDataPath` on startup. -When `-remoteWrite.disableOnDiskQueue` command-line flag is set, then `vmagent` may send the same samples multiple times to the configured remote storage +When `-remoteWrite.disableOnDiskQueue` command-line flag is set, `vmagent` may send the same samples multiple times to the configured remote storage if it cannot keep up with the data ingestion rate. 
In this case the [deduplication](https://docs.victoriametrics.com/#deduplication) must be enabled on all the configured remote storage systems. @@ -1196,7 +1201,7 @@ If you have suggestions for improvements or have found a bug - please open an is with `-remoteWrite.maxDiskUsagePerURL` command-line flag. If you don't want to send all the buffered data from the directory to remote storage then simply stop `vmagent` and delete the directory. -* If `vmagent` runs on a host with slow persistent storage, which cannot keep up with the volume of processed samples, then is possible to disable +* If `vmagent` runs on a host with slow persistent storage, which cannot keep up with the volume of processed samples, then it is possible to disable the persistent storage with `-remoteWrite.disableOnDiskQueue` command-line flag. See [these docs](#disabling-on-disk-persistence) for more details. * By default `vmagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because @@ -1439,7 +1444,25 @@ by passing multiple `-kafka.consumer.topic` command-line flags to `vmagent`. `vmagent` consumes messages from Kafka brokers specified by `-kafka.consumer.topic.brokers` command-line flag. Multiple brokers can be specified per each `-kafka.consumer.topic` by passing a list of brokers delimited by `;`. -For example, `-kafka.consumer.topic.brokers='host1:9092;host2:9092'`. +For example: +```sh +./bin/vmagent + -kafka.consumer.topic='topic-a' + -kafka.consumer.topic.brokers='host1:9092;host2:9092' + -kafka.consumer.topic='topic-b' + -kafka.consumer.topic.brokers='host3:9092;host4:9092' +``` +This command starts `vmagent` which reads messages from `topic-a` at `host1:9092` and `host2:9092` brokers and messages +from `topic-b` at `host3:9092` and `host4:9092` brokers. 
+ +Note that when using YAML configuration (for example, when using [Helm charts](https://github.com/VictoriaMetrics/helm-charts) or [Kubernetes operator](https://docs.victoriametrics.com/operator/)) +keys provided in `extraArgs` must be unique, so in order to achieve the same configuration as in the example above, the following configuration must be used: +```yaml +extraArgs: + "kafka.consumer.topic": "topic-a,topic-b" + "kafka.consumer.topic.brokers": "host1:9092;host2:9092,host3:9092;host4:9092" +``` +Note that list of brokers for the same topic is separated by `;` and different groups of brokers are separated by `,`. The following command starts `vmagent`, which reads metrics in InfluxDB line protocol format from Kafka broker at `localhost:9092` from the topic `metrics-by-telegraf` and sends them to remote storage at `http://localhost:8428/api/v1/write`: @@ -1645,7 +1668,7 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . -cacheExpireDuration duration Items are removed from in-memory caches after they aren't accessed for this duration. Lower values may reduce memory usage at the cost of higher CPU usage. See also -prevCacheRemovalPercent (default 30m0s) -configAuthKey value - Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -configAuthKey=file:///abs/path/to/file or -configAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -configAuthKey=http://host/path or -configAuthKey=https://host/path -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 
1s) may be used for reducing disk space usage for timestamp data (default 1ms) @@ -1671,7 +1694,7 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -1857,7 +1880,7 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1888,7 +1911,7 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . -opentsdbhttpTrimTimestamp duration Trim timestamps for OpenTSDB HTTP data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -prevCacheRemovalPercent float Items in the previous caches are removed when the percent of requests it serves becomes lower than this value. Higher values reduce memory usage at the cost of higher CPU usage. See also -cacheExpireDuration (default 0.1) @@ -2010,7 +2033,7 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -reloadAuthKey value - Auth key for /-/reload http endpoint. 
It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -remoteWrite.aws.accessKey array Optional AWS AccessKey to use for the corresponding -remoteWrite.url if -remoteWrite.aws.useSigv4 is set @@ -2065,13 +2088,11 @@ See the docs at https://docs.victoriametrics.com/vmagent/ . Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -remoteWrite.disableOnDiskQueue array - Whether to disable storing pending data to -remoteWrite.tmpDataPath when the configured remote storage systems cannot keep up with the data ingestion rate. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence .See also -remoteWrite.dropSamplesOnOverload - Supports array of values separated by comma or specified via multiple flags. - Empty values are set to false. - -remoteWrite.dropSamplesOnOverload array - Whether to drop samples when -remoteWrite.disableOnDiskQueue is set and if the samples cannot be pushed into the configured remote storage systems in a timely manner. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence + Whether to disable storing pending data to -remoteWrite.tmpDataPath when the remote storage system at the corresponding -remoteWrite.url cannot keep up with the data ingestion rate. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence . See also -remoteWrite.dropSamplesOnOverload Supports array of values separated by comma or specified via multiple flags. Empty values are set to false. 
+ -remoteWrite.dropSamplesOnOverload + Whether to drop samples when -remoteWrite.disableOnDiskQueue is set and if the samples cannot be pushed into the configured -remoteWrite.url systems in a timely manner. See https://docs.victoriametrics.com/vmagent#disabling-on-disk-persistence -remoteWrite.flushInterval duration Interval for flushing the data to remote storage. This option takes effect only when less than 10K data points per second are pushed to -remoteWrite.url (default 1s) -remoteWrite.forcePromProto array diff --git a/docs/vmalert.md b/docs/vmalert.md index a15f73bf1..b1f5dce73 100644 --- a/docs/vmalert.md +++ b/docs/vmalert.md @@ -1106,7 +1106,7 @@ The shortlist of configuration flags is the following: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. 
mmap() is usually faster for reading small data chunks than pread() @@ -1179,7 +1179,7 @@ The shortlist of configuration flags is the following: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1266,7 +1266,7 @@ The shortlist of configuration flags is the following: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -promscrape.consul.waitTime duration Wait time used by Consul service discovery. Default value is used if not set @@ -1295,7 +1295,7 @@ The shortlist of configuration flags is the following: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -reloadAuthKey value - Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -remoteRead.basicAuth.password string Optional basic auth password for -remoteRead.url diff --git a/docs/vmauth.md b/docs/vmauth.md index d1ae36cb4..ee36f6209 100644 --- a/docs/vmauth.md +++ b/docs/vmauth.md @@ -647,13 +647,13 @@ unauthorized_user: - "X-Forwarded-For:" ``` -it's also possible to update `Host` header to a backend's host name +It is also possible to update `Host` request header to the backend host specified in `url_prefix` by setting an empty value for `Host` header: ```yaml unauthorized_user: url_prefix: "http://backend:1234/" headers: - - "Host:" # Update host header to a backend's host + - "Host:" # Update host header to backend:1234 ``` `vmauth` also supports the ability to set and remove HTTP response headers before returning the response from the backend to client. @@ -1178,7 +1178,7 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. 
The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -1218,7 +1218,7 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . Supports array of values separated by comma or specified via multiple flags. Empty values are set to false. -idleConnTimeout duration - Defines a duration for idle (keep-alive connections) to exist. Consider setting this value less than "-http.idleConnTimeout". It must prevent possible "write: broken pipe" and "read: connection reset by peer" errors. (default 50s) + The timeout for HTTP keep-alive connections to backend services. It is recommended setting this value to values smaller than -http.idleConnTimeout set at backend services (default 50s) -internStringCacheExpireDuration duration The expiry duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s) -internStringDisableCache @@ -1270,7 +1270,7 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . 
-metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -1281,7 +1281,7 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -pushmetrics.disableCompression Whether to disable request body compression when pushing metrics to every -pushmetrics.url @@ -1300,7 +1300,7 @@ See the docs at https://docs.victoriametrics.com/vmauth/ . Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -reloadAuthKey value - Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings. + Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -reloadAuthKey=file:///abs/path/to/file or -reloadAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -reloadAuthKey=http://host/path or -reloadAuthKey=https://host/path -responseTimeout duration The timeout for receiving a response from backend (default 5m0s) diff --git a/docs/vmbackup.md b/docs/vmbackup.md index 9ccd2c420..134152661 100644 --- a/docs/vmbackup.md +++ b/docs/vmbackup.md @@ -337,7 +337,7 @@ Run `vmbackup -help` in order to see all the available options: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -407,7 +407,7 @@ Run `vmbackup -help` in order to see all the available options: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -420,7 +420,7 @@ Run `vmbackup -help` in order to see all the available options: -origin string Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. 
It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -pushmetrics.disableCompression Whether to disable request body compression when pushing metrics to every -pushmetrics.url diff --git a/docs/vmbackupmanager.md b/docs/vmbackupmanager.md index 580f46e23..fecbe199b 100644 --- a/docs/vmbackupmanager.md +++ b/docs/vmbackupmanager.md @@ -462,7 +462,7 @@ command-line flags: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. 
mmap() is usually faster for reading small data chunks than pread() @@ -541,7 +541,7 @@ command-line flags: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -552,7 +552,7 @@ command-line flags: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -pushmetrics.disableCompression Whether to disable request body compression when pushing metrics to every -pushmetrics.url diff --git a/docs/vmgateway.md b/docs/vmgateway.md index d84b4dd81..a9f89b867 100644 --- a/docs/vmgateway.md +++ b/docs/vmgateway.md @@ -363,7 +363,7 @@ Below is the list of configuration flags (it can be viewed by running `./vmgatew -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -438,7 +438,7 @@ Below is the list of configuration flags (it can be viewed by running `./vmgatew -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. 
For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -449,7 +449,7 @@ Below is the list of configuration flags (it can be viewed by running `./vmgatew Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -pushmetrics.disableCompression Whether to disable request body compression when pushing metrics to every -pushmetrics.url diff --git a/docs/vmrestore.md b/docs/vmrestore.md index b500752db..dcb0a1bc2 100644 --- a/docs/vmrestore.md +++ b/docs/vmrestore.md @@ -80,7 +80,7 @@ Run `vmrestore -help` in order to see all the available options: -filestream.disableFadvise Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU -flagsAuthKey value - Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path -fs.disableMmap Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() @@ -150,7 +150,7 @@ Run `vmrestore -help` in order to see all the available options: -metrics.exposeMetadata Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. 
For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type -metricsAuthKey value - Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path -mtls array Whether to require valid client certificate for https requests to the corresponding -httpListenAddr . This flag works only if -tls flag is set. See also -mtlsCAFile . This flag is available only in Enterprise binaries. See https://docs.victoriametrics.com/enterprise/ @@ -161,7 +161,7 @@ Run `vmrestore -help` in order to see all the available options: Supports an array of values separated by comma or specified via multiple flags. Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. -pprofAuthKey value - Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . 
Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path -pushmetrics.disableCompression Whether to disable request body compression when pushing metrics to every -pushmetrics.url diff --git a/go.mod b/go.mod index eeaa83abc..073dcd31f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.3.2 github.com/VictoriaMetrics/easyproto v0.1.4 github.com/VictoriaMetrics/fastcache v1.12.2 - github.com/VictoriaMetrics/metrics v1.34.1 + github.com/VictoriaMetrics/metrics v1.35.1 github.com/VictoriaMetrics/metricsql v0.76.0 github.com/aws/aws-sdk-go-v2 v1.30.1 github.com/aws/aws-sdk-go-v2/config v1.27.24 diff --git a/go.sum b/go.sum index cc4b286ed..d3ecc1393 100644 --- a/go.sum +++ b/go.sum @@ -72,8 +72,8 @@ github.com/VictoriaMetrics/easyproto v0.1.4/go.mod h1:QlGlzaJnDfFd8Lk6Ci/fuLxfTo github.com/VictoriaMetrics/fastcache v1.12.2 h1:N0y9ASrJ0F6h0QaC3o6uJb3NIZ9VKLjCM7NQbSmF7WI= github.com/VictoriaMetrics/fastcache v1.12.2/go.mod h1:AmC+Nzz1+3G2eCPapF6UcsnkThDcMsQicp4xDukwJYI= github.com/VictoriaMetrics/metrics v1.34.0/go.mod h1:r7hveu6xMdUACXvB8TYdAj8WEsKzWB0EkpJN+RDtOf8= -github.com/VictoriaMetrics/metrics v1.34.1 h1:7EUEObv45ekfyY6PWat0K/ytluZ4q6aujzXN3g41g/A= -github.com/VictoriaMetrics/metrics v1.34.1/go.mod h1:r7hveu6xMdUACXvB8TYdAj8WEsKzWB0EkpJN+RDtOf8= +github.com/VictoriaMetrics/metrics v1.35.1 h1:o84wtBKQbzLdDy14XeskkCZih6anG+veZ1SwJHFGwrU= +github.com/VictoriaMetrics/metrics v1.35.1/go.mod h1:r7hveu6xMdUACXvB8TYdAj8WEsKzWB0EkpJN+RDtOf8= github.com/VictoriaMetrics/metricsql v0.76.0 h1:hl7vqJqyH2d8zKImzalkFrkFiD5q4ACF8gl3s86DqKA= github.com/VictoriaMetrics/metricsql v0.76.0/go.mod h1:1g4hdCwlbJZ851PU9VN65xy9Rdlzupo6fx3SNZ8Z64U= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= diff --git a/lib/backup/azremote/azblob_test.go b/lib/backup/azremote/azblob_test.go index 13fda039e..7ca219ccb 100644 --- 
a/lib/backup/azremote/azblob_test.go +++ b/lib/backup/azremote/azblob_test.go @@ -5,20 +5,22 @@ import ( "testing" ) -func Test_cleanDirectory(t *testing.T) { +func TestCleanDirectory(t *testing.T) { f := func(dir, exp string) { t.Helper() + got := cleanDirectory(dir) if got != exp { - t.Errorf("expected dir %q, got %q", exp, got) + t.Fatalf("expected dir %q, got %q", exp, got) } } + f("/foo/", "foo/") f("//foo/", "foo/") f("foo", "foo/") } -func Test_FSInit(t *testing.T) { +func TestFSInit(t *testing.T) { f := func(expErr string, params ...string) { t.Helper() diff --git a/lib/flagutil/password.go b/lib/flagutil/password.go index 0830e8bbf..28761b135 100644 --- a/lib/flagutil/password.go +++ b/lib/flagutil/password.go @@ -46,6 +46,11 @@ type Password struct { sourcePath string } +// Name returns the name of p flag. +func (p *Password) Name() string { + return p.flagname +} + // Get returns the current p value. // // It re-reads p value from the file:///path/to/file or http://host/path diff --git a/lib/httpserver/httpserver.go b/lib/httpserver/httpserver.go index fe5f08753..c3078d72d 100644 --- a/lib/httpserver/httpserver.go +++ b/lib/httpserver/httpserver.go @@ -47,9 +47,9 @@ var ( "See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus") httpAuthUsername = flag.String("httpAuth.username", "", "Username for HTTP server's Basic Auth. The authentication is disabled if empty. See also -httpAuth.password") httpAuthPassword = flagutil.NewPassword("httpAuth.password", "Password for HTTP server's Basic Auth. The authentication is disabled if -httpAuth.username is empty") - metricsAuthKey = flagutil.NewPassword("metricsAuthKey", "Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides httpAuth.* settings") - flagsAuthKey = flagutil.NewPassword("flagsAuthKey", "Auth key for /flags endpoint. It must be passed via authKey query arg. 
It overrides httpAuth.* settings") - pprofAuthKey = flagutil.NewPassword("pprofAuthKey", "Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides httpAuth.* settings") + metricsAuthKey = flagutil.NewPassword("metricsAuthKey", "Auth key for /metrics endpoint. It must be passed via authKey query arg. It overrides -httpAuth.*") + flagsAuthKey = flagutil.NewPassword("flagsAuthKey", "Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.*") + pprofAuthKey = flagutil.NewPassword("pprofAuthKey", "Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.*") disableResponseCompression = flag.Bool("http.disableResponseCompression", false, "Disable compression of HTTP responses to save CPU resources. By default, compression is enabled to save network bandwidth") maxGracefulShutdownDuration = flag.Duration("http.maxGracefulShutdownDuration", 7*time.Second, `The maximum duration for a graceful shutdown of the HTTP server. 
A highly loaded server may require increased value for a graceful shutdown`) @@ -366,7 +366,7 @@ func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh Reques return case "/metrics": metricsRequests.Inc() - if !CheckAuthFlag(w, r, metricsAuthKey.Get(), "metricsAuthKey") { + if !CheckAuthFlag(w, r, metricsAuthKey) { return } startTime := time.Now() @@ -375,7 +375,7 @@ func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh Reques metricsHandlerDuration.UpdateDuration(startTime) return case "/flags": - if !CheckAuthFlag(w, r, flagsAuthKey.Get(), "flagsAuthKey") { + if !CheckAuthFlag(w, r, flagsAuthKey) { return } h.Set("Content-Type", "text/plain; charset=utf-8") @@ -396,29 +396,17 @@ func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh Reques // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4128 fmt.Fprintf(w, "User-agent: *\nDisallow: /\n") return - case "/config", "/-/reload": - // only some components (vmagent, vmalert, etc.) support these handlers - // these components are responsible for CheckAuthFlag call - // see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6329 - w = &responseWriterWithAbort{ - ResponseWriter: w, - } - if !rh(w, r) { - Errorf(w, r, "unsupported path requested: %q", r.URL.Path) - unsupportedRequestErrors.Inc() - } - return default: if strings.HasPrefix(r.URL.Path, "/debug/pprof/") { pprofRequests.Inc() - if !CheckAuthFlag(w, r, pprofAuthKey.Get(), "pprofAuthKey") { + if !CheckAuthFlag(w, r, pprofAuthKey) { return } pprofHandler(r.URL.Path[len("/debug/pprof/"):], w, r) return } - if !CheckBasicAuth(w, r) { + if !isProtectedByAuthFlag(r.URL.Path) && !CheckBasicAuth(w, r) { return } @@ -435,16 +423,26 @@ func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh Reques } } +func isProtectedByAuthFlag(path string) bool { + // These paths must explicitly call CheckAuthFlag(). 
+ // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6329 + return strings.HasSuffix(path, "/config") || strings.HasSuffix(path, "/reload") || + strings.HasSuffix(path, "/resetRollupResultCache") || strings.HasSuffix(path, "/delSeries") || strings.HasSuffix(path, "/delete_series") || + strings.HasSuffix(path, "/force_merge") || strings.HasSuffix(path, "/force_flush") || strings.HasSuffix(path, "/snapshot") || + strings.HasPrefix(path, "/snapshot/") +} + // CheckAuthFlag checks whether the given authKey is set and valid // // Falls back to checkBasicAuth if authKey is not set -func CheckAuthFlag(w http.ResponseWriter, r *http.Request, flagValue string, flagName string) bool { - if flagValue == "" { +func CheckAuthFlag(w http.ResponseWriter, r *http.Request, expectedKey *flagutil.Password) bool { + expectedValue := expectedKey.Get() + if expectedValue == "" { return CheckBasicAuth(w, r) } - if r.FormValue("authKey") != flagValue { + if r.FormValue("authKey") != expectedValue { authKeyRequestErrors.Inc() - http.Error(w, fmt.Sprintf("The provided authKey doesn't match -%s", flagName), http.StatusUnauthorized) + http.Error(w, fmt.Sprintf("The provided authKey doesn't match -%s", expectedKey.Name()), http.StatusUnauthorized) return false } return true diff --git a/lib/httpserver/httpserver_test.go b/lib/httpserver/httpserver_test.go index 198cfffa4..3cca86e27 100644 --- a/lib/httpserver/httpserver_test.go +++ b/lib/httpserver/httpserver_test.go @@ -6,6 +6,8 @@ import ( "net/http/httptest" "strings" "testing" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" ) func TestGetQuotedRemoteAddr(t *testing.T) { @@ -97,7 +99,11 @@ func TestAuthKeyMetrics(t *testing.T) { req.Header.Set("Content-Type", "application/x-www-form-urlencoded;param=value") w := httptest.NewRecorder() - CheckAuthFlag(w, req, "rightKey", "metricsAuthkey") + p := &flagutil.Password{} + if err := p.Set("rightKey"); err != nil { + t.Fatalf("cannot set password: %s", err) + } + 
CheckAuthFlag(w, req, p) res := w.Result() defer res.Body.Close() @@ -115,7 +121,11 @@ func TestAuthKeyMetrics(t *testing.T) { req.SetBasicAuth(user, pass) w := httptest.NewRecorder() - CheckAuthFlag(w, req, "", "metricsAuthkey") + p := &flagutil.Password{} + if err := p.Set(""); err != nil { + t.Fatalf("cannot set password: %s", err) + } + CheckAuthFlag(w, req, p) res := w.Result() _ = res.Body.Close() diff --git a/lib/httputils/statconn.go b/lib/httputils/statconn.go deleted file mode 100644 index a7494a87c..000000000 --- a/lib/httputils/statconn.go +++ /dev/null @@ -1,146 +0,0 @@ -package httputils - -import ( - "context" - "fmt" - "net" - "strconv" - "strings" - "sync" - "sync/atomic" - - "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" - "github.com/VictoriaMetrics/metrics" -) - -var statConnMetricsRegistry sync.Map - -type statConnMetrics struct { - dialsTotal *metrics.Counter - dialErrors *metrics.Counter - conns *metrics.Counter - - connReadsTotal *metrics.Counter - connWritesTotal *metrics.Counter - connReadErrors *metrics.Counter - connWriteErrors *metrics.Counter - connBytesRead *metrics.Counter - connBytesWritten *metrics.Counter -} - -func newStatConnMetrics(metricPrefix string) statConnMetrics { - scm := statConnMetrics{} - - scm.dialsTotal = metrics.NewCounter(fmt.Sprintf(`%s_dials_total`, metricPrefix)) - scm.dialErrors = metrics.NewCounter(fmt.Sprintf(`%s_dial_errors_total`, metricPrefix)) - scm.conns = metrics.NewCounter(fmt.Sprintf(`%s_conns`, metricPrefix)) - - scm.connReadsTotal = metrics.NewCounter(fmt.Sprintf(`%s_conn_reads_total`, metricPrefix)) - scm.connWritesTotal = metrics.NewCounter(fmt.Sprintf(`%s_conn_writes_total`, metricPrefix)) - scm.connReadErrors = metrics.NewCounter(fmt.Sprintf(`%s_conn_read_errors_total`, metricPrefix)) - scm.connWriteErrors = metrics.NewCounter(fmt.Sprintf(`%s_conn_write_errors_total`, metricPrefix)) - scm.connBytesRead = metrics.NewCounter(fmt.Sprintf(`%s_conn_bytes_read_total`, metricPrefix)) - 
scm.connBytesWritten = metrics.NewCounter(fmt.Sprintf(`%s_conn_bytes_written_total`, metricPrefix)) - - return scm -} - -// GetStatDialFunc returns dial function that supports DNS SRV records, -// and register stats metrics for conns. -func GetStatDialFunc(metricPrefix string) func(ctx context.Context, network, addr string) (net.Conn, error) { - v, ok := statConnMetricsRegistry.Load(metricPrefix) - if !ok { - v = newStatConnMetrics(metricPrefix) - statConnMetricsRegistry.Store(metricPrefix, v) - } - sm := v.(statConnMetrics) - return func(ctx context.Context, _, addr string) (net.Conn, error) { - network := netutil.GetTCPNetwork() - conn, err := netutil.DialMaybeSRV(ctx, network, addr) - sm.dialsTotal.Inc() - if err != nil { - sm.dialErrors.Inc() - if !netutil.TCP6Enabled() && !isTCPv4Addr(addr) { - err = fmt.Errorf("%w; try -enableTCP6 command-line flag for dialing ipv6 addresses", err) - } - return nil, err - } - sm.conns.Inc() - sc := &statConn{ - Conn: conn, - statConnMetrics: sm, - } - return sc, nil - } -} - -type statConn struct { - closed atomic.Int32 - net.Conn - statConnMetrics -} - -func (sc *statConn) Read(p []byte) (int, error) { - n, err := sc.Conn.Read(p) - sc.connReadsTotal.Inc() - if err != nil { - sc.connReadErrors.Inc() - } - sc.connBytesRead.Add(n) - return n, err -} - -func (sc *statConn) Write(p []byte) (int, error) { - n, err := sc.Conn.Write(p) - sc.connWritesTotal.Inc() - if err != nil { - sc.connWriteErrors.Inc() - } - sc.connBytesWritten.Add(n) - return n, err -} - -func (sc *statConn) Close() error { - err := sc.Conn.Close() - if sc.closed.Add(1) == 1 { - sc.conns.Dec() - } - return err -} - -func isTCPv4Addr(addr string) bool { - s := addr - for i := 0; i < 3; i++ { - n := strings.IndexByte(s, '.') - if n < 0 { - return false - } - if !isUint8NumString(s[:n]) { - return false - } - s = s[n+1:] - } - n := strings.IndexByte(s, ':') - if n < 0 { - return false - } - if !isUint8NumString(s[:n]) { - return false - } - s = s[n+1:] - - // 
Verify TCP port - n, err := strconv.Atoi(s) - if err != nil { - return false - } - return n >= 0 && n < (1<<16) -} - -func isUint8NumString(s string) bool { - n, err := strconv.Atoi(s) - if err != nil { - return false - } - return n >= 0 && n < (1<<8) -} diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index 69911bddf..f4b7edcdc 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -47,8 +47,7 @@ const maxPartSize = 400e9 // The interval for flushing buffered data to parts, so it becomes visible to search. const pendingItemsFlushInterval = time.Second -// The interval for guaranteed flush of recently ingested data from memory to on-disk parts, -// so they survive process crash. +// The interval for guaranteed flush of recently ingested data from memory to on-disk parts so they survive process crash. var dataFlushInterval = 5 * time.Second // SetDataFlushInterval sets the interval for guaranteed flush of recently ingested data from memory to disk. @@ -57,9 +56,13 @@ var dataFlushInterval = 5 * time.Second // // This function must be called before initializing the indexdb. func SetDataFlushInterval(d time.Duration) { - if d >= time.Second { - dataFlushInterval = d + if d < pendingItemsFlushInterval { + // There is no sense in setting dataFlushInterval to values smaller than pendingItemsFlushInterval, + // since pending rows unconditionally remain in memory for up to pendingItemsFlushInterval. + d = pendingItemsFlushInterval } + + dataFlushInterval = d } // maxItemsPerCachedPart is the maximum items per created part by the merge, @@ -638,7 +641,7 @@ func (tb *Table) UpdateMetrics(m *TableMetrics) { m.IndexBlocksCacheRequests = idxbCache.Requests() m.IndexBlocksCacheMisses = idxbCache.Misses() - m.TooLongItemsDroppedTotal += tooLongItemsTotal.Load() + m.TooLongItemsDroppedTotal = tooLongItemsTotal.Load() } // AddItems adds the given items to the tb. 
diff --git a/lib/netutil/conn.go b/lib/netutil/conn.go index 41aefb0b9..b5c7b1e0c 100644 --- a/lib/netutil/conn.go +++ b/lib/netutil/conn.go @@ -24,7 +24,7 @@ type connMetrics struct { closeErrors *metrics.Counter - conns *metrics.Counter + conns *metrics.Gauge } func (cm *connMetrics) init(ms *metrics.Set, group, name, addr string) { @@ -40,7 +40,7 @@ func (cm *connMetrics) init(ms *metrics.Set, group, name, addr string) { cm.closeErrors = ms.NewCounter(fmt.Sprintf(`%s_errors_total{name=%q, addr=%q, type="close"}`, group, name, addr)) - cm.conns = ms.NewCounter(fmt.Sprintf(`%s_conns{name=%q, addr=%q}`, group, name, addr)) + cm.conns = ms.NewGauge(fmt.Sprintf(`%s_conns{name=%q, addr=%q}`, group, name, addr), nil) } type statConn struct { diff --git a/lib/netutil/statdial.go b/lib/netutil/statdial.go new file mode 100644 index 000000000..bd3f339bc --- /dev/null +++ b/lib/netutil/statdial.go @@ -0,0 +1,125 @@ +package netutil + +import ( + "context" + "fmt" + "net" + "strconv" + "strings" + "sync/atomic" + + "github.com/VictoriaMetrics/metrics" +) + +// NewStatDialFunc returns dialer function that supports DNS SRV records and registers stats metrics for conns. 
+func NewStatDialFunc(metricPrefix string) func(ctx context.Context, network, addr string) (net.Conn, error) { + return func(ctx context.Context, _, addr string) (net.Conn, error) { + sc := &statDialConn{ + dialsTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_dials_total`, metricPrefix)), + dialErrors: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_dial_errors_total`, metricPrefix)), + conns: metrics.GetOrCreateGauge(fmt.Sprintf(`%s_conns`, metricPrefix), nil), + + readsTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_reads_total`, metricPrefix)), + writesTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_writes_total`, metricPrefix)), + readErrorsTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_read_errors_total`, metricPrefix)), + writeErrorsTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_write_errors_total`, metricPrefix)), + bytesReadTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_bytes_read_total`, metricPrefix)), + bytesWrittenTotal: metrics.GetOrCreateCounter(fmt.Sprintf(`%s_conn_bytes_written_total`, metricPrefix)), + } + + network := GetTCPNetwork() + conn, err := DialMaybeSRV(ctx, network, addr) + sc.dialsTotal.Inc() + if err != nil { + sc.dialErrors.Inc() + if !TCP6Enabled() && !isTCPv4Addr(addr) { + err = fmt.Errorf("%w; try -enableTCP6 command-line flag for dialing ipv6 addresses", err) + } + return nil, err + } + sc.Conn = conn + sc.conns.Inc() + return sc, nil + } +} + +type statDialConn struct { + closed atomic.Int32 + net.Conn + + dialsTotal *metrics.Counter + dialErrors *metrics.Counter + conns *metrics.Gauge + + readsTotal *metrics.Counter + writesTotal *metrics.Counter + readErrorsTotal *metrics.Counter + writeErrorsTotal *metrics.Counter + bytesReadTotal *metrics.Counter + bytesWrittenTotal *metrics.Counter +} + +func (sc *statDialConn) Read(p []byte) (int, error) { + n, err := sc.Conn.Read(p) + sc.readsTotal.Inc() + if err != nil { + sc.readErrorsTotal.Inc() + } + sc.bytesReadTotal.Add(n) + return n, err +} + +func 
(sc *statDialConn) Write(p []byte) (int, error) { + n, err := sc.Conn.Write(p) + sc.writesTotal.Inc() + if err != nil { + sc.writeErrorsTotal.Inc() + } + sc.bytesWrittenTotal.Add(n) + return n, err +} + +func (sc *statDialConn) Close() error { + err := sc.Conn.Close() + if sc.closed.Add(1) == 1 { + sc.conns.Dec() + } + return err +} + +func isTCPv4Addr(addr string) bool { + s := addr + for i := 0; i < 3; i++ { + n := strings.IndexByte(s, '.') + if n < 0 { + return false + } + if !isUint8NumString(s[:n]) { + return false + } + s = s[n+1:] + } + n := strings.IndexByte(s, ':') + if n < 0 { + return false + } + if !isUint8NumString(s[:n]) { + return false + } + s = s[n+1:] + + // Verify TCP port + n, err := strconv.Atoi(s) + if err != nil { + return false + } + return n >= 0 && n < (1<<16) +} + +func isUint8NumString(s string) bool { + n, err := strconv.Atoi(s) + if err != nil { + return false + } + return n >= 0 && n < (1<<8) +} diff --git a/lib/httputils/statconn_test.go b/lib/netutil/statdial_test.go similarity index 97% rename from lib/httputils/statconn_test.go rename to lib/netutil/statdial_test.go index 1978b09bf..83f48e827 100644 --- a/lib/httputils/statconn_test.go +++ b/lib/netutil/statdial_test.go @@ -1,4 +1,4 @@ -package httputils +package netutil import ( "testing" @@ -7,6 +7,7 @@ import ( func TestIsTCPv4Addr(t *testing.T) { f := func(addr string, resultExpected bool) { t.Helper() + result := isTCPv4Addr(addr) if result != resultExpected { t.Fatalf("unexpected result for isIPv4Addr(%q); got %v; want %v", addr, result, resultExpected) diff --git a/lib/persistentqueue/fastqueue.go b/lib/persistentqueue/fastqueue.go index 396d7a69b..c1ba7b8ce 100644 --- a/lib/persistentqueue/fastqueue.go +++ b/lib/persistentqueue/fastqueue.go @@ -62,11 +62,21 @@ func MustOpenFastQueue(path, name string, maxInmemoryBlocks int, maxPendingBytes fq.mu.Unlock() return float64(n) }) + pendingBytes := fq.GetPendingBytes() - logger.Infof("opened fast persistent queue at %q with 
maxInmemoryBlocks=%d isPQDisabled=%t, it contains %d pending bytes", path, maxInmemoryBlocks, isPQDisabled, pendingBytes) + persistenceStatus := "enabled" + if isPQDisabled { + persistenceStatus = "disabled" + } + logger.Infof("opened fast queue at %q with maxInmemoryBlocks=%d, it contains %d pending bytes, persistence is %s", path, maxInmemoryBlocks, pendingBytes, persistenceStatus) return fq } +// IsPersistentQueueDisabled returns true if persistent queue at fq is disabled. +func (fq *FastQueue) IsPersistentQueueDisabled() bool { + return fq.isPQDisabled +} + // IsWriteBlocked checks if data can be pushed into fq func (fq *FastQueue) IsWriteBlocked() bool { if !fq.isPQDisabled { diff --git a/lib/promrelabel/if_expression.go b/lib/promrelabel/if_expression.go index e49010ebf..f7f2f3741 100644 --- a/lib/promrelabel/if_expression.go +++ b/lib/promrelabel/if_expression.go @@ -1,7 +1,6 @@ package promrelabel import ( - "bytes" "encoding/json" "fmt" @@ -144,13 +143,13 @@ func (ie *IfExpression) String() string { if len(ie.ies) == 1 { return ie.ies[0].String() } - var buf bytes.Buffer - buf.WriteString(ie.ies[0].String()) + + b := append([]byte{}, ie.ies[0].String()...) for _, e := range ie.ies[1:] { - buf.WriteString(",") - buf.WriteString(e.String()) + b = append(b, ',') + b = append(b, e.String()...)
} - return buf.String() + return string(b) } type ifExpression struct { diff --git a/lib/promrelabel/if_expression_test.go b/lib/promrelabel/if_expression_test.go index 434dfa6ad..6938abdba 100644 --- a/lib/promrelabel/if_expression_test.go +++ b/lib/promrelabel/if_expression_test.go @@ -12,11 +12,13 @@ import ( func TestIfExpressionParseFailure(t *testing.T) { f := func(s string) { t.Helper() + var ie IfExpression if err := ie.Parse(s); err == nil { t.Fatalf("expecting non-nil error when parsing %q", s) } } + f(`{`) f(`{foo`) f(`foo{`) @@ -26,11 +28,13 @@ func TestIfExpressionParseFailure(t *testing.T) { func TestIfExpressionParseSuccess(t *testing.T) { f := func(s string) { t.Helper() + var ie IfExpression if err := ie.Parse(s); err != nil { t.Fatalf("unexpected error: %s", err) } } + f(`foo`) f(`{foo="bar"}`) f(`foo{bar=~"baz", x!="y"}`) @@ -45,6 +49,7 @@ func TestIfExpressionParseSuccess(t *testing.T) { func TestIfExpressionMarshalUnmarshalJSON(t *testing.T) { f := func(s, jsonExpected string) { t.Helper() + var ie IfExpression if err := ie.Parse(s); err != nil { t.Fatalf("cannot parse ifExpression %q: %s", s, err) @@ -68,6 +73,7 @@ func TestIfExpressionMarshalUnmarshalJSON(t *testing.T) { t.Fatalf("unexpected data after unmarshal/marshal cycle;\ngot\n%s\nwant\n%s", data2, jsonExpected) } } + f("foo", `"foo"`) f(`{foo="bar",baz=~"x.*"}`, `"{foo=\"bar\",baz=~\"x.*\"}"`) f(`{a="b" or c="d",x="z"}`, `"{a=\"b\" or c=\"d\",x=\"z\"}"`) @@ -76,12 +82,14 @@ func TestIfExpressionMarshalUnmarshalJSON(t *testing.T) { func TestIfExpressionUnmarshalFailure(t *testing.T) { f := func(s string) { t.Helper() + var ie IfExpression err := yaml.UnmarshalStrict([]byte(s), &ie) if err == nil { t.Fatalf("expecting non-nil error") } } + f(`{`) f(`{x:y}`) f(`[1]`) @@ -103,6 +111,7 @@ func TestIfExpressionUnmarshalFailure(t *testing.T) { func TestIfExpressionUnmarshalSuccess(t *testing.T) { f := func(s string) { t.Helper() + var ie IfExpression if err := yaml.UnmarshalStrict([]byte(s), 
&ie); err != nil { t.Fatalf("unexpected error during unmarshal: %s", err) @@ -116,6 +125,7 @@ func TestIfExpressionUnmarshalSuccess(t *testing.T) { t.Fatalf("unexpected marshaled data;\ngot\n%s\nwant\n%s", b, s) } } + f(`'{}'`) f(`foo`) f(`foo{bar="baz"}`) @@ -126,9 +136,35 @@ - bar{baz="abc"}`) } +func TestIfExpressionString(t *testing.T) { + f := func(s, resultExpected string) { + t.Helper() + + var ie IfExpression + if err := yaml.UnmarshalStrict([]byte(s), &ie); err != nil { + t.Fatalf("cannot unmarshal if expression: %s", err) + } + result := ie.String() + if result != resultExpected { + t.Fatalf("unexpected result\ngot\n%s\nwant\n%s", result, resultExpected) + } + } + + // empty filters + f(`'{}'`, `{}`) + + // multiple filters + f(`foo{bar="baz",a=~"bc.+",d!="e",g!~".*qwe"}`, `foo{bar="baz",a=~"bc.+",d!="e",g!~".*qwe"}`) + + // multiple if expressions + f(`- foo +- bar{baz="abc"}`, `foo,bar{baz="abc"}`) +} + func TestIfExpressionMatch(t *testing.T) { f := func(ifExpr, metricWithLabels string) { t.Helper() + var ie IfExpression if err := yaml.UnmarshalStrict([]byte(ifExpr), &ie); err != nil { t.Fatalf("unexpected error during unmarshal: %s", err) @@ -138,6 +174,7 @@ func TestIfExpressionMatch(t *testing.T) { t.Fatalf("unexpected mismatch of ifExpr=%s for %s", ifExpr, metricWithLabels) } } + f(`foo`, `foo`) f(`foo`, `foo{bar="baz",a="b"}`) f(`foo{bar="a"}`, `foo{bar="a"}`) @@ -165,6 +202,7 @@ func TestIfExpressionMismatch(t *testing.T) { f := func(ifExpr, metricWithLabels string) { t.Helper() + var ie IfExpression if err := yaml.UnmarshalStrict([]byte(ifExpr), &ie); err != nil { t.Fatalf("unexpected error during unmarshal: %s", err) @@ -174,6 +212,7 @@ func TestIfExpressionMismatch(t *testing.T) { t.Fatalf("unexpected match of ifExpr=%s for %s", ifExpr, metricWithLabels) } } + f(`foo`, `bar`) f(`foo`, `a{foo="bar"}`) f(`foo{bar="a"}`, `foo`) diff --git 
a/lib/promscrape/client.go b/lib/promscrape/client.go index 239f2da7e..bf3bb6e21 100644 --- a/lib/promscrape/client.go +++ b/lib/promscrape/client.go @@ -14,7 +14,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" - "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputils" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil" ) var ( @@ -70,7 +70,7 @@ func newClient(ctx context.Context, sw *ScrapeWork) (*client, error) { IdleConnTimeout: 2 * sw.ScrapeInterval, DisableCompression: *disableCompression || sw.DisableCompression, DisableKeepAlives: *disableKeepAlive || sw.DisableKeepAlive, - DialContext: httputils.GetStatDialFunc("vm_promscrape"), + DialContext: netutil.NewStatDialFunc("vm_promscrape"), MaxIdleConnsPerHost: 100, MaxResponseHeaderBytes: int64(maxResponseHeadersSize.N), }), @@ -155,9 +155,9 @@ func (c *client) ReadData(dst *bytesutil.ByteBuffer) error { } if int64(len(dst.B)) >= c.maxScrapeSize { maxScrapeSizeExceeded.Inc() - return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize=%d or max_scrape_size in a scrape config. "+ + return fmt.Errorf("the response from %q exceeds -promscrape.maxScrapeSize or max_scrape_size in the scrape config (%d bytes). "+ "Possible solutions are: reduce the response size for the target, increase -promscrape.maxScrapeSize command-line flag, "+ - "increase max_scrape_size value in scrape config", c.scrapeURL, maxScrapeSize.N) + "increase max_scrape_size value in scrape config for the given target", c.scrapeURL, maxScrapeSize.N) } return nil } diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go index 2c1ac2051..7c94f6ae5 100644 --- a/lib/promscrape/config.go +++ b/lib/promscrape/config.go @@ -78,7 +78,7 @@ var ( "then each cluster must have unique name in order to properly de-duplicate samples received from these clusters. 
"+ "See https://docs.victoriametrics.com/vmagent/#scraping-big-number-of-targets for more info") maxScrapeSize = flagutil.NewBytes("promscrape.maxScrapeSize", 16*1024*1024, "The maximum size of scrape response in bytes to process from Prometheus targets. "+ - "Bigger responses are rejected") + "Bigger responses are rejected. See also max_scrape_size option at https://docs.victoriametrics.com/sd_configs/#scrape_configs") ) var clusterMemberID int @@ -852,12 +852,14 @@ func getScrapeWorkConfig(sc *ScrapeConfig, baseDir string, globalCfg *GlobalConf // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1281#issuecomment-840538907 scrapeTimeout = scrapeInterval } - var err error mss := maxScrapeSize.N - if len(sc.MaxScrapeSize) > 0 { - mss, err = flagutil.ParseBytes(sc.MaxScrapeSize) + if sc.MaxScrapeSize != "" { + n, err := flagutil.ParseBytes(sc.MaxScrapeSize) if err != nil { - return nil, fmt.Errorf("unexpected `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err) + return nil, fmt.Errorf("cannot parse `max_scrape_size` value %q for `job_name` %q`: %w", sc.MaxScrapeSize, jobName, err) + } + if n > 0 { + mss = n } } honorLabels := sc.HonorLabels diff --git a/lib/promscrape/config_test.go b/lib/promscrape/config_test.go index ac1b12146..bd4a39c44 100644 --- a/lib/promscrape/config_test.go +++ b/lib/promscrape/config_test.go @@ -993,7 +993,7 @@ scrape_configs: scrape_configs: - job_name: foo scheme: https - max_scrape_size: 0 + max_scrape_size: 1 relabel_configs: - action: keep source_labels: [__address__] @@ -1015,7 +1015,7 @@ scrape_configs: ScrapeURL: "http://foo.bar:1234/metrics", ScrapeInterval: defaultScrapeInterval, ScrapeTimeout: defaultScrapeTimeout, - MaxScrapeSize: 0, + MaxScrapeSize: 1, Labels: promutils.NewLabelsFromMap(map[string]string{ "instance": "foo.bar:1234", "job": "3", diff --git a/lib/promscrape/discovery/yandexcloud/api.go b/lib/promscrape/discovery/yandexcloud/api.go index 3d5d81341..6ceb5df8d 100644 --- 
a/lib/promscrape/discovery/yandexcloud/api.go +++ b/lib/promscrape/discovery/yandexcloud/api.go @@ -7,7 +7,6 @@ import ( "io" "net/http" "net/url" - "strconv" "sync" "time" @@ -37,10 +36,6 @@ type apiConfig struct { // credsLock protects the refresh of creds credsLock sync.Mutex creds *apiCredentials - - // metadataCredsLock protects the refresh of metadataCreds - metadataCredsLock sync.Mutex - metadataCreds *apiCredentials } func getAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) { @@ -124,55 +119,80 @@ func getCreds(cfg *apiConfig) (*apiCredentials, error) { }, nil } -// getMetadataCreds gets Yandex Cloud IAM metadata token -func getMetadataCreds(cfg *apiConfig) (*apiCredentials, error) { - cfg.metadataCredsLock.Lock() - defer cfg.metadataCredsLock.Unlock() - - if cfg.metadataCreds != nil && time.Until(cfg.metadataCreds.Expiration) > 10*time.Second { - // Credentials aren't expired yet. - return cfg.metadataCreds, nil - } - - endpoint := "http://169.254.169.254/latest/api/token" - req, err := http.NewRequest(http.MethodPut, endpoint, nil) - if err != nil { - return nil, fmt.Errorf("cannot create metadata token request: %w", err) - } - ttl := 1800 - expiration := time.Now().Add(time.Duration(ttl) * time.Second) - req.Header.Add("X-aws-ec2-metadata-token-ttl-seconds", strconv.Itoa(ttl)) - resp, err := cfg.client.Do(req) - if err != nil { - return nil, fmt.Errorf("cannot perform metadata token request: %w", err) - } - data, err := readResponseBody(resp, endpoint) - if err != nil { - return nil, fmt.Errorf("cannot read metadata creds from %s: %w", endpoint, err) - } - return &apiCredentials{ - Token: string(data), - Expiration: expiration, - }, nil -} - // getInstanceCreds gets Yandex Cloud IAM token using instance Service Account // // See https://cloud.yandex.com/en-ru/docs/compute/operations/vm-connect/auth-inside-vm func getInstanceCreds(cfg *apiConfig) (*apiCredentials, error) { - metadataCreds, err := getMetadataCreds(cfg) - if err != nil { - 
return nil, err + // Try obtaining GCE-like creds at first. + // See https://yandex.cloud/en-ru/docs/compute/operations/vm-connect/auth-inside-vm#auth-inside-vm + creds, err := getGCEInstanceCreds(cfg) + if err == nil { + return creds, nil } - endpoint := "http://169.254.169.254/latest/meta-data/iam/security-credentials/default" + errGCE := err + + // Fall back to the disabled IMDSv1 - see https://yandex.cloud/en/docs/security/standard/authentication#aws-token + // + // TODO: remove this when it is completely removed from Yandex Cloud. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5513 + // and https://yandex.cloud/en/docs/security/standard/authentication#aws-token + creds, err = getEC2IMDBSv1Creds(cfg) + if err == nil { + return creds, nil + } + + // Return errGCE, since it is likely the IMDBSv1 is disabled. + return nil, errGCE +} + +// getGCEInstanceCreds gets Yandex Cloud IAM token using GCE API +// +// See https://yandex.cloud/en-ru/docs/compute/operations/vm-connect/auth-inside-vm#auth-inside-vm +func getGCEInstanceCreds(cfg *apiConfig) (*apiCredentials, error) { + endpoint := "http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token" req, err := http.NewRequest(http.MethodGet, endpoint, nil) if err != nil { - return nil, fmt.Errorf("cannot create instance creds request: %w", err) + logger.Panicf("BUG: cannot create GCE token request for %s: %s", endpoint, err) } - req.Header.Add("X-aws-ec2-metadata-token", metadataCreds.Token) + req.Header.Add("Metadata-Flavor", "Google") + resp, err := cfg.client.Do(req) if err != nil { - return nil, fmt.Errorf("cannot read instance creds from %s: %w", endpoint, err) + return nil, fmt.Errorf("cannot obtain GCE token from %s: %w", endpoint, err) + } + data, err := readResponseBody(resp, endpoint) + if err != nil { + return nil, fmt.Errorf("cannot read GCE token from %s: %w", endpoint, err) + } + + var ac gceAPICredentials + if err := json.Unmarshal(data, &ac); err != nil { + 
return nil, fmt.Errorf("cannot unmarshal GCE token from %s: %w; data=%s", endpoint, err, data) + } + if ac.TokenType != "Bearer" { + return nil, fmt.Errorf("unsupported GCE token type received from %s: %q; supported: %q", endpoint, ac.TokenType, "Bearer") + } + + expiration := time.Now().Add(time.Duration(ac.ExpiresIn) * time.Second) + return &apiCredentials{ + Token: ac.AccessToken, + Expiration: expiration, + }, nil +} + +// See https://yandex.cloud/en-ru/docs/compute/operations/vm-connect/auth-inside-vm#auth-inside-vm +type gceAPICredentials struct { + AccessToken string `json:"access_token"` + ExpiresIn int `json:"expires_in"` + TokenType string `json:"token_type"` +} + +// getEC2IMDBSv1Creds gets Yandex Cloud IAM token using Amazon EC2 IMDBSv1 +func getEC2IMDBSv1Creds(cfg *apiConfig) (*apiCredentials, error) { + endpoint := "http://169.254.169.254/latest/meta-data/iam/security-credentials/default" + resp, err := cfg.client.Get(endpoint) + if err != nil { + return nil, fmt.Errorf("cannot read Amazon EC2 IMDBSv1 token from %s: %w", endpoint, err) } data, err := readResponseBody(resp, endpoint) if err != nil { @@ -181,7 +201,7 @@ func getInstanceCreds(cfg *apiConfig) (*apiCredentials, error) { var ac apiCredentials if err := json.Unmarshal(data, &ac); err != nil { - return nil, fmt.Errorf("cannot parse auth credentials response from %s: %w", endpoint, err) + return nil, fmt.Errorf("cannot parse Amazon EC2 IMDBSv1 token from %s: %w; data=%s", endpoint, err, data) } return &ac, nil } @@ -198,7 +218,7 @@ func getIAMToken(cfg *apiConfig) (*iamToken, error) { body := bytes.NewBuffer(passport) resp, err := cfg.client.Post(iamURL, "application/json", body) if err != nil { - logger.Panicf("BUG: cannot create request to yandex cloud iam api %q: %s", iamURL, err) + return nil, fmt.Errorf("cannot send request to yandex cloud iam api %q: %s", iamURL, err) } data, err := readResponseBody(resp, iamURL) if err != nil { @@ -206,7 +226,7 @@ func getIAMToken(cfg *apiConfig) 
(*iamToken, error) { } var it iamToken if err := json.Unmarshal(data, &it); err != nil { - return nil, fmt.Errorf("cannot parse iam token: %w; data: %s", err, data) + return nil, fmt.Errorf("cannot parse iam token from %s: %w; data: %s", iamURL, err, data) } return &it, nil } diff --git a/lib/promscrape/discoveryutils/client.go b/lib/promscrape/discoveryutils/client.go index f95543808..590669ae5 100644 --- a/lib/promscrape/discoveryutils/client.go +++ b/lib/promscrape/discoveryutils/client.go @@ -95,7 +95,7 @@ func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxy return nil, fmt.Errorf("cannot parse apiServer=%q: %w", apiServer, err) } - dialFunc := netutil.DialMaybeSRV + dialFunc := netutil.NewStatDialFunc("vm_promscrape_discovery") if u.Scheme == "unix" { // special case for unix socket connection dialAddr := u.Path diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go index 24e882bc5..32f365334 100644 --- a/lib/promscrape/scrapework.go +++ b/lib/promscrape/scrapework.go @@ -500,10 +500,11 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b if sw.seriesLimitExceeded || !areIdenticalSeries { samplesDropped = sw.applySeriesLimit(wc) } + responseSize := len(bodyString) am := &autoMetrics{ up: up, scrapeDurationSeconds: scrapeDurationSeconds, - scrapeResponseSize: float64(len(bodyString)), + scrapeResponseSize: responseSize, samplesScraped: samplesScraped, samplesPostRelabeling: samplesPostRelabeling, seriesAdded: seriesAdded, @@ -512,7 +513,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b sw.addAutoMetrics(am, wc, scrapeTimestamp) sw.pushData(sw.Config.AuthToken, &wc.writeRequest) sw.prevLabelsLen = len(wc.labels) - sw.prevBodyLen = len(bodyString) + sw.prevBodyLen = responseSize wc.reset() writeRequestCtxPool.Put(wc) // body must be released only after wc is released, since wc refers to body. 
@@ -523,7 +524,7 @@ func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, b sw.storeLastScrape(body) } sw.finalizeLastScrape() - tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err) + tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err) return err } @@ -581,10 +582,11 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int // This is a trade-off between performance and accuracy. seriesAdded = sw.getSeriesAdded(lastScrape, bodyString) } + responseSize := len(bodyString) am := &autoMetrics{ up: up, scrapeDurationSeconds: scrapeDurationSeconds, - scrapeResponseSize: float64(len(bodyString)), + scrapeResponseSize: responseSize, samplesScraped: samplesScraped, samplesPostRelabeling: samplesPostRelabeling, seriesAdded: seriesAdded, @@ -593,7 +595,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int sw.addAutoMetrics(am, wc, scrapeTimestamp) sw.pushData(sw.Config.AuthToken, &wc.writeRequest) sw.prevLabelsLen = len(wc.labels) - sw.prevBodyLen = len(bodyString) + sw.prevBodyLen = responseSize wc.reset() writeRequestCtxPool.Put(wc) if !areIdenticalSeries { @@ -603,7 +605,7 @@ func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int sw.storeLastScrape(body.B) } sw.finalizeLastScrape() - tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), float64(len(bodyString)), samplesScraped, err) + tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err) // Do not track active series in streaming mode, since this may need too big amounts of memory // when the target exports too big number of metrics. 
return err @@ -815,7 +817,7 @@ func (sw *scrapeWork) getLabelsHash(labels []prompbmarshal.Label) uint64 { type autoMetrics struct { up int scrapeDurationSeconds float64 - scrapeResponseSize float64 + scrapeResponseSize int samplesScraped int samplesPostRelabeling int seriesAdded int @@ -823,35 +825,47 @@ type autoMetrics struct { } func isAutoMetric(s string) bool { - switch s { - case "up", "scrape_duration_seconds", "scrape_samples_scraped", - "scrape_samples_post_metric_relabeling", "scrape_series_added", - "scrape_timeout_seconds", "scrape_samples_limit", - "scrape_series_limit_samples_dropped", "scrape_series_limit", - "scrape_series_current", "scrape_response_size_bytes": + if s == "up" { return true } - return false + if !strings.HasPrefix(s, "scrape_") { + return false + } + switch s { + case "scrape_duration_seconds", + "scrape_response_size_bytes", + "scrape_samples_limit", + "scrape_samples_post_metric_relabeling", + "scrape_samples_scraped", + "scrape_series_added", + "scrape_series_current", + "scrape_series_limit", + "scrape_series_limit_samples_dropped", + "scrape_timeout_seconds": + return true + default: + return false + } } func (sw *scrapeWork) addAutoMetrics(am *autoMetrics, wc *writeRequestCtx, timestamp int64) { - sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp) sw.addAutoTimeseries(wc, "scrape_duration_seconds", am.scrapeDurationSeconds, timestamp) - sw.addAutoTimeseries(wc, "scrape_response_size_bytes", am.scrapeResponseSize, timestamp) - sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp) - sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp) - sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp) - sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp) + sw.addAutoTimeseries(wc, "scrape_response_size_bytes", float64(am.scrapeResponseSize), timestamp) if sampleLimit 
:= sw.Config.SampleLimit; sampleLimit > 0 { // Expose scrape_samples_limit metric if sample_limit config is set for the target. // See https://github.com/VictoriaMetrics/operator/issues/497 sw.addAutoTimeseries(wc, "scrape_samples_limit", float64(sampleLimit), timestamp) } + sw.addAutoTimeseries(wc, "scrape_samples_post_metric_relabeling", float64(am.samplesPostRelabeling), timestamp) + sw.addAutoTimeseries(wc, "scrape_samples_scraped", float64(am.samplesScraped), timestamp) + sw.addAutoTimeseries(wc, "scrape_series_added", float64(am.seriesAdded), timestamp) if sl := sw.seriesLimiter; sl != nil { + sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp) sw.addAutoTimeseries(wc, "scrape_series_limit_samples_dropped", float64(am.seriesLimitSamplesDropped), timestamp) sw.addAutoTimeseries(wc, "scrape_series_limit", float64(sl.MaxItems()), timestamp) - sw.addAutoTimeseries(wc, "scrape_series_current", float64(sl.CurrentItems()), timestamp) } + sw.addAutoTimeseries(wc, "scrape_timeout_seconds", sw.Config.ScrapeTimeout.Seconds(), timestamp) + sw.addAutoTimeseries(wc, "up", float64(am.up), timestamp) } // addAutoTimeseries adds automatically generated time series with the given name, value and timestamp. 
diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go index 859fb68d8..2a74a3f69 100644 --- a/lib/promscrape/targetstatus.go +++ b/lib/promscrape/targetstatus.go @@ -178,7 +178,7 @@ func (tsm *targetStatusMap) Unregister(sw *scrapeWork) { tsm.mu.Unlock() } -func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize float64, samplesScraped int, err error) { +func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, scrapeResponseSize, samplesScraped int, err error) { jobName := sw.Config.jobNameOriginal tsm.mu.Lock() @@ -300,7 +300,7 @@ type targetStatus struct { up bool scrapeTime int64 scrapeDuration int64 - scrapeResponseSize float64 + scrapeResponseSize int samplesScraped int scrapesTotal int scrapesFailed int @@ -319,7 +319,7 @@ func (ts *targetStatus) getSizeFromLastScrape() string { if ts.scrapeResponseSize <= 0 { return "never scraped" } - return fmt.Sprintf("%.3f kb", float64(ts.scrapeResponseSize)/1024) + return fmt.Sprintf("%.3fKiB", float64(ts.scrapeResponseSize)/1024) } type droppedTargets struct { diff --git a/lib/promscrape/targetstatus.qtpl b/lib/promscrape/targetstatus.qtpl index 8bacbcba1..6f0cd7a95 100644 --- a/lib/promscrape/targetstatus.qtpl +++ b/lib/promscrape/targetstatus.qtpl @@ -27,9 +27,9 @@ {% if filter.showOriginalLabels %}originalLabels={%s= ts.sw.Config.OriginalLabels.String() %},{% space %}{% endif %} scrapes_total={%d ts.scrapesTotal %},{% space %} scrapes_failed={%d ts.scrapesFailed %},{% space %} - last_scrape={%s ts.getDurationFromLastScrape() %},{% space %} + last_scrape={%s= ts.getDurationFromLastScrape() %},{% space %} scrape_duration={%d int(ts.scrapeDuration) %}ms,{% space %} - scrape_response_size={%s ts.getSizeFromLastScrape() %},{% space %} + scrape_response_size={%s= ts.getSizeFromLastScrape() %},{% space %} samples_scraped={%d ts.samplesScraped %},{% space %} error={% if ts.err != nil %}{%s= ts.err.Error() 
%}{% endif %} {% newline %} diff --git a/lib/promscrape/targetstatus.qtpl.go b/lib/promscrape/targetstatus.qtpl.go index 53e820a52..10a15586d 100644 --- a/lib/promscrape/targetstatus.qtpl.go +++ b/lib/promscrape/targetstatus.qtpl.go @@ -127,7 +127,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes //line lib/promscrape/targetstatus.qtpl:29 qw422016.N().S(`last_scrape=`) //line lib/promscrape/targetstatus.qtpl:30 - qw422016.E().S(ts.getDurationFromLastScrape()) + qw422016.N().S(ts.getDurationFromLastScrape()) //line lib/promscrape/targetstatus.qtpl:30 qw422016.N().S(`,`) //line lib/promscrape/targetstatus.qtpl:30 @@ -143,7 +143,7 @@ func StreamTargetsResponsePlain(qw422016 *qt422016.Writer, tsr *targetsStatusRes //line lib/promscrape/targetstatus.qtpl:31 qw422016.N().S(`scrape_response_size=`) //line lib/promscrape/targetstatus.qtpl:32 - qw422016.E().S(ts.getSizeFromLastScrape()) + qw422016.N().S(ts.getSizeFromLastScrape()) //line lib/promscrape/targetstatus.qtpl:32 qw422016.N().S(`,`) //line lib/promscrape/targetstatus.qtpl:32 diff --git a/lib/protoparser/opentelemetry/firehose/http.go b/lib/protoparser/opentelemetry/firehose/http.go index d17b14a0d..a222e7279 100644 --- a/lib/protoparser/opentelemetry/firehose/http.go +++ b/lib/protoparser/opentelemetry/firehose/http.go @@ -2,7 +2,6 @@ package firehose import ( "fmt" - "html" "net/http" "time" ) @@ -13,12 +12,11 @@ import ( func WriteSuccessResponse(w http.ResponseWriter, r *http.Request) { requestID := r.Header.Get("X-Amz-Firehose-Request-Id") if requestID == "" { - // This isn't an AWS firehose request - just return an empty response in this case. + // This isn't an AWS firehose request - just return an empty response in this case. 
w.WriteHeader(http.StatusOK) return } - requestID = html.EscapeString(requestID) body := fmt.Sprintf(`{"requestId":%q,"timestamp":%d}`, requestID, time.Now().UnixMilli()) h := w.Header() diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index e43ead76a..74a0b21e9 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -656,7 +656,7 @@ func (is *indexSearch) searchLabelNamesWithFiltersOnDate(qt *querytracer.Tracer, if filter != nil && filter.Len() <= 100e3 { // It is faster to obtain label names by metricIDs from the filter // instead of scanning the inverted index for the matching filters. - // This would help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 + // This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 metricIDs := filter.AppendTo(nil) qt.Printf("sort %d metricIDs", len(metricIDs)) is.getLabelNamesForMetricIDs(qt, metricIDs, lns, maxLabelNames) @@ -749,13 +749,12 @@ func (is *indexSearch) getLabelNamesForMetricIDs(qt *querytracer.Tracer, metricI } dmis := is.db.s.getDeletedMetricIDs() - checkDeleted := dmis.Len() > 0 var mn MetricName foundLabelNames := 0 var buf []byte for _, metricID := range metricIDs { - if checkDeleted && dmis.Has(metricID) { + if dmis.Has(metricID) { // skip deleted IDs from result continue } @@ -882,7 +881,7 @@ func (is *indexSearch) searchLabelValuesWithFiltersOnDate(qt *querytracer.Tracer if filter != nil && filter.Len() <= 100e3 { // It is faster to obtain label values by metricIDs from the filter // instead of scanning the inverted index for the matching filters. 
- // This would help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 + // This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2978 metricIDs := filter.AppendTo(nil) qt.Printf("sort %d metricIDs", len(metricIDs)) is.getLabelValuesForMetricIDs(qt, lvs, labelName, metricIDs, maxLabelValues) @@ -956,13 +955,12 @@ func (is *indexSearch) getLabelValuesForMetricIDs(qt *querytracer.Tracer, lvs ma } dmis := is.db.s.getDeletedMetricIDs() - checkDeleted := dmis.Len() > 0 var mn MetricName foundLabelValues := 0 var buf []byte for _, metricID := range metricIDs { - if checkDeleted && dmis.Has(metricID) { + if dmis.Has(metricID) { // skip deleted IDs from result continue } @@ -1858,12 +1856,9 @@ func (is *indexSearch) getTSIDByMetricNameNoExtDB(dst *TSID, metricName []byte, if len(tail) > 0 { logger.Panicf("FATAL: unexpected non-empty tail left after unmarshaling TSID: %X", tail) } - if dmis.Len() > 0 { - // Verify whether the dst is marked as deleted. - if dmis.Has(dst.MetricID) { - // The dst is deleted. Continue searching. - continue - } + if dmis.Has(dst.MetricID) { + // The dst is deleted. Continue searching. + continue } // Found valid dst. return true diff --git a/lib/storage/partition.go b/lib/storage/partition.go index f58710a6c..ab70c3791 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -48,8 +48,7 @@ var rawRowsShardsPerPartition = cgroup.AvailableCPUs() // The interval for flushing buffered rows into parts, so they become visible to search. const pendingRowsFlushInterval = 2 * time.Second -// The interval for guaranteed flush of recently ingested data from memory to on-disk parts, -// so they survive process crash. +// The interval for guaranteed flush of recently ingested data from memory to on-disk parts, so they survive process crash. var dataFlushInterval = 5 * time.Second // SetDataFlushInterval sets the interval for guaranteed flush of recently ingested data from memory to disk. 
@@ -58,10 +57,14 @@ var dataFlushInterval = 5 * time.Second // // This function must be called before initializing the storage. func SetDataFlushInterval(d time.Duration) { - if d >= time.Second { - dataFlushInterval = d - mergeset.SetDataFlushInterval(d) + if d < pendingRowsFlushInterval { + // There is no sense in setting dataFlushInterval to values smaller than pendingRowsFlushInterval, + // since pending rows unconditionally remain in memory for up to pendingRowsFlushInterval. + d = pendingRowsFlushInterval } + + dataFlushInterval = d + mergeset.SetDataFlushInterval(d) } // The maximum number of rawRow items in rawRowsShard. diff --git a/lib/streamaggr/deduplicator.go b/lib/streamaggr/deduplicator.go index c49580dcc..273fd6eef 100644 --- a/lib/streamaggr/deduplicator.go +++ b/lib/streamaggr/deduplicator.go @@ -35,6 +35,8 @@ type Deduplicator struct { // An optional dropLabels list may contain label names, which must be dropped before de-duplicating samples. // Common case is to drop `replica`-like labels from samples received from HA datasources. // +// alias is url label used in metrics exposed by the returned Deduplicator. +// // MustStop must be called on the returned deduplicator in order to free up occupied resources. 
func NewDeduplicator(pushFunc PushFunc, dedupInterval time.Duration, dropLabels []string, alias string) *Deduplicator { d := &Deduplicator{ @@ -47,7 +49,8 @@ func NewDeduplicator(pushFunc PushFunc, dedupInterval time.Duration, dropLabels ms := d.ms - metricLabels := fmt.Sprintf(`url=%q`, alias) + metricLabels := fmt.Sprintf(`name="dedup",url=%q`, alias) + _ = ms.NewGauge(fmt.Sprintf(`vm_streamaggr_dedup_state_size_bytes{%s}`, metricLabels), func() float64 { return float64(d.da.sizeBytes()) }) @@ -55,8 +58,8 @@ func NewDeduplicator(pushFunc PushFunc, dedupInterval time.Duration, dropLabels return float64(d.da.itemsCount()) }) - d.dedupFlushDuration = ms.GetOrCreateHistogram(fmt.Sprintf(`vm_streamaggr_dedup_flush_duration_seconds{%s}`, metricLabels)) - d.dedupFlushTimeouts = ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_dedup_flush_timeouts_total{%s}`, metricLabels)) + d.dedupFlushDuration = ms.NewHistogram(fmt.Sprintf(`vm_streamaggr_dedup_flush_duration_seconds{%s}`, metricLabels)) + d.dedupFlushTimeouts = ms.NewCounter(fmt.Sprintf(`vm_streamaggr_dedup_flush_timeouts_total{%s}`, metricLabels)) metrics.RegisterSet(ms) @@ -71,7 +74,7 @@ func NewDeduplicator(pushFunc PushFunc, dedupInterval time.Duration, dropLabels // MustStop stops d. 
func (d *Deduplicator) MustStop() { - metrics.UnregisterSet(d.ms) + metrics.UnregisterSet(d.ms, true) d.ms = nil close(d.stopCh) diff --git a/lib/streamaggr/histogram_bucket.go b/lib/streamaggr/histogram_bucket.go index c5fe06630..92982dcec 100644 --- a/lib/streamaggr/histogram_bucket.go +++ b/lib/streamaggr/histogram_bucket.go @@ -66,9 +66,8 @@ func (as *histogramBucketAggrState) pushSamples(samples []pushSample) { } } -func (as *histogramBucketAggrState) removeOldEntries(ctx *flushCtx, currentTime uint64) { +func (as *histogramBucketAggrState) removeOldEntries(currentTime uint64) { m := &as.m - var staleOutputSamples int m.Range(func(k, v any) bool { sv := v.(*histogramBucketStateValue) @@ -77,7 +76,6 @@ func (as *histogramBucketAggrState) removeOldEntries(ctx *flushCtx, currentTime if deleted { // Mark the current entry as deleted sv.deleted = deleted - staleOutputSamples++ } sv.mu.Unlock() @@ -86,14 +84,13 @@ func (as *histogramBucketAggrState) removeOldEntries(ctx *flushCtx, currentTime } return true }) - ctx.a.staleOutputSamples["histogram_bucket"].Add(staleOutputSamples) } func (as *histogramBucketAggrState) flushState(ctx *flushCtx, _ bool) { currentTime := fasttime.UnixTimestamp() currentTimeMsec := int64(currentTime) * 1000 - as.removeOldEntries(ctx, currentTime) + as.removeOldEntries(currentTime) m := &as.m m.Range(func(k, v any) bool { diff --git a/lib/streamaggr/rate.go b/lib/streamaggr/rate.go index 6fe401d37..55bd393db 100644 --- a/lib/streamaggr/rate.go +++ b/lib/streamaggr/rate.go @@ -8,11 +8,12 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime" ) -// rateAggrState calculates output=rate, e.g. the counter per-second change. +// rateAggrState calculates output=rate_avg and rate_sum, e.g. the average per-second increase rate for counter metrics. type rateAggrState struct { m sync.Map - suffix string + // isAvg is set to true if rate_avg() must be calculated instead of rate_sum(). 
+ isAvg bool // Time series state is dropped if no new samples are received during stalenessSecs. stalenessSecs uint64 @@ -30,18 +31,17 @@ type rateLastValueState struct { timestamp int64 deleteDeadline uint64 - // total stores cumulative difference between registered values - // in the aggregation interval - total float64 - // prevTimestamp stores timestamp of the last registered value - // in the previous aggregation interval + // increase stores cumulative increase for the current time series on the current aggregation interval + increase float64 + + // prevTimestamp is the timestamp of the last registered sample in the previous aggregation interval prevTimestamp int64 } -func newRateAggrState(stalenessInterval time.Duration, suffix string) *rateAggrState { +func newRateAggrState(stalenessInterval time.Duration, isAvg bool) *rateAggrState { stalenessSecs := roundDurationToSecs(stalenessInterval) return &rateAggrState{ - suffix: suffix, + isAvg: isAvg, stalenessSecs: stalenessSecs, } } @@ -78,15 +78,15 @@ func (as *rateAggrState) pushSamples(samples []pushSample) { sv.mu.Unlock() continue } - if lv.prevTimestamp == 0 { - lv.prevTimestamp = lv.timestamp - } + if s.value >= lv.value { - lv.total += s.value - lv.value + lv.increase += s.value - lv.value } else { // counter reset - lv.total += s.value + lv.increase += s.value } + } else { + lv.prevTimestamp = s.timestamp } lv.value = s.value lv.timestamp = s.timestamp @@ -105,57 +105,82 @@ func (as *rateAggrState) pushSamples(samples []pushSample) { } } -func (as *rateAggrState) flushState(ctx *flushCtx, _ bool) { +func (as *rateAggrState) flushState(ctx *flushCtx, resetState bool) { currentTime := fasttime.UnixTimestamp() currentTimeMsec := int64(currentTime) * 1000 - var staleOutputSamples, staleInputSamples int + + suffix := as.getSuffix() + + as.removeOldEntries(currentTime) m := &as.m m.Range(func(k, v any) bool { sv := v.(*rateStateValue) - sv.mu.Lock() - // check for stale entries - deleted := currentTime > 
sv.deleteDeadline - if deleted { + sv.mu.Lock() + lvs := sv.lastValues + sumRate := 0.0 + countSeries := 0 + for k1, lv := range lvs { + d := float64(lv.timestamp-lv.prevTimestamp) / 1000 + if d > 0 { + sumRate += lv.increase / d + countSeries++ + } + if resetState { + lv.prevTimestamp = lv.timestamp + lv.increase = 0 + lvs[k1] = lv + } + } + deleted := sv.deleted + sv.mu.Unlock() + + if countSeries == 0 || deleted { + // Nothing to update + return true + } + + result := sumRate + if as.isAvg { + result /= float64(countSeries) + } + + key := k.(string) + ctx.appendSeries(key, suffix, currentTimeMsec, result) + return true + }) +} + +func (as *rateAggrState) getSuffix() string { + if as.isAvg { + return "rate_avg" + } + return "rate_sum" +} + +func (as *rateAggrState) removeOldEntries(currentTime uint64) { + m := &as.m + m.Range(func(k, v any) bool { + sv := v.(*rateStateValue) + + sv.mu.Lock() + if currentTime > sv.deleteDeadline { // Mark the current entry as deleted - sv.deleted = deleted + sv.deleted = true sv.mu.Unlock() - staleOutputSamples++ m.Delete(k) return true } // Delete outdated entries in sv.lastValues - var rate float64 lvs := sv.lastValues - for k1, v1 := range lvs { - if currentTime > v1.deleteDeadline { + for k1, lv := range lvs { + if currentTime > lv.deleteDeadline { delete(lvs, k1) - staleInputSamples++ - continue - } - rateInterval := v1.timestamp - v1.prevTimestamp - if v1.prevTimestamp > 0 && rateInterval > 0 { - // calculate rate only if value was seen at least twice with different timestamps - rate += v1.total * 1000 / float64(rateInterval) - v1.prevTimestamp = v1.timestamp - v1.total = 0 - lvs[k1] = v1 } } - // capture m length after deleted items were removed - totalItems := len(lvs) sv.mu.Unlock() - - if as.suffix == "rate_avg" && totalItems > 0 { - rate /= float64(totalItems) - } - - key := k.(string) - ctx.appendSeries(key, as.suffix, currentTimeMsec, rate) return true }) - ctx.a.staleOutputSamples[as.suffix].Add(staleOutputSamples) - 
ctx.a.staleInputSamples[as.suffix].Add(staleInputSamples) } diff --git a/lib/streamaggr/streamaggr.go b/lib/streamaggr/streamaggr.go index 7691b9b4a..c8571b9d7 100644 --- a/lib/streamaggr/streamaggr.go +++ b/lib/streamaggr/streamaggr.go @@ -27,29 +27,26 @@ import ( ) var supportedOutputs = []string{ - "rate_sum", - "rate_avg", - "total", - "total_prometheus", + "avg", + "count_samples", + "count_series", + "histogram_bucket", "increase", "increase_prometheus", - "count_series", - "count_samples", - "unique_samples", - "sum_samples", "last", - "min", "max", - "avg", + "min", + "quantiles(phi1, ..., phiN)", + "rate_avg", + "rate_sum", "stddev", "stdvar", - "histogram_bucket", - "quantiles(phi1, ..., phiN)", + "sum_samples", + "total", + "total_prometheus", + "unique_samples", } -// maxLabelValueLen is maximum match expression label value length in stream aggregation metrics -const maxLabelValueLen = 64 - var ( // lc contains information about all compressed labels for streaming aggregation lc promutils.LabelsCompressor @@ -67,8 +64,10 @@ var ( // // opts can contain additional options. If opts is nil, then default options are used. // +// alias is used as url label in metrics exposed for the returned Aggregators. +// // The returned Aggregators must be stopped with MustStop() when no longer needed. 
-func LoadFromFile(path string, pushFunc PushFunc, opts Options) (*Aggregators, error) { +func LoadFromFile(path string, pushFunc PushFunc, opts *Options, alias string) (*Aggregators, error) { data, err := fscore.ReadFileOrHTTP(path) if err != nil { return nil, fmt.Errorf("cannot load aggregators: %w", err) @@ -78,7 +77,7 @@ func LoadFromFile(path string, pushFunc PushFunc, opts Options) (*Aggregators, e return nil, fmt.Errorf("cannot expand environment variables in %q: %w", path, err) } - as, err := LoadFromData(data, pushFunc, opts) + as, err := loadFromData(data, path, pushFunc, opts, alias) if err != nil { return nil, fmt.Errorf("cannot initialize aggregators from %q: %w; see https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config", path, err) } @@ -136,16 +135,15 @@ type Options struct { // // This option can be overridden individually per each aggregation via ignore_first_intervals option. IgnoreFirstIntervals int - - // Alias is name or url of remote write context - Alias string - - // aggrID is aggregators id number starting from 1, which is used in metrics labels - aggrID int } // Config is a configuration for a single stream aggregation. type Config struct { + // Name is an optional name of the Config. + // + // It is used as `name` label in the exposed metrics for the given Config. + Name string `yaml:"name,omitempty"` + // Match is a label selector for filtering time series for the given selector. // // If the match isn't set, then all the input time series are processed. 
@@ -175,24 +173,24 @@ type Config struct { // // The following names are allowed: // - // - rate_sum - calculates sum of rate for input counters - // - rate_avg - calculates average of rate for input counters - // - total - aggregates input counters - // - total_prometheus - aggregates input counters, ignoring the first sample in new time series + // - avg - the average value across all the samples + // - count_samples - counts the input samples + // - count_series - counts the number of unique input series + // - histogram_bucket - creates VictoriaMetrics histogram for input samples // - increase - calculates the increase over input series // - increase_prometheus - calculates the increase over input series, ignoring the first sample in new time series - // - count_series - counts the number of unique input series - // - count_samples - counts the input samples - // - unique_samples - counts the number of unique sample values - // - sum_samples - sums the input sample values // - last - the last biggest sample value - // - min - the minimum sample value // - max - the maximum sample value - // - avg - the average value across all the samples + // - min - the minimum sample value + // - quantiles(phi1, ..., phiN) - quantiles' estimation for phi in the range [0..1] + // - rate_avg - calculates average of rate for input counters + // - rate_sum - calculates sum of rate for input counters // - stddev - standard deviation across all the samples // - stdvar - standard variance across all the samples - // - histogram_bucket - creates VictoriaMetrics histogram for input samples - // - quantiles(phi1, ..., phiN) - quantiles' estimation for phi in the range [0..1] + // - sum_samples - sums the input sample values + // - total - aggregates input counters + // - total_prometheus - aggregates input counters, ignoring the first sample in new time series + // - unique_samples - counts the number of unique sample values // // The output time series will have the following names 
by default: // @@ -249,11 +247,26 @@ type Aggregators struct { // It is used in Equal() for comparing Aggregators. configData []byte + // filePath is the path to config file used for creating the Aggregators. + filePath string + + // ms contains metrics associated with the Aggregators. ms *metrics.Set } +// FilePath returns path to file with the configuration used for creating the given Aggregators. +func (a *Aggregators) FilePath() string { + return a.filePath +} + // LoadFromData loads aggregators from data. -func LoadFromData(data []byte, pushFunc PushFunc, opts Options) (*Aggregators, error) { +// +// opts can contain additional options. If opts is nil, then default options are used. +func LoadFromData(data []byte, pushFunc PushFunc, opts *Options, alias string) (*Aggregators, error) { + return loadFromData(data, "inmemory", pushFunc, opts, alias) +} + +func loadFromData(data []byte, filePath string, pushFunc PushFunc, opts *Options, alias string) (*Aggregators, error) { var cfgs []*Config if err := yaml.UnmarshalStrict(data, &cfgs); err != nil { return nil, fmt.Errorf("cannot parse stream aggregation config: %w", err) @@ -262,8 +275,7 @@ func LoadFromData(data []byte, pushFunc PushFunc, opts Options) (*Aggregators, e ms := metrics.NewSet() as := make([]*aggregator, len(cfgs)) for i, cfg := range cfgs { - opts.aggrID = i + 1 - a, err := newAggregator(cfg, pushFunc, ms, opts) + a, err := newAggregator(cfg, filePath, pushFunc, ms, opts, alias, i+1) if err != nil { // Stop already initialized aggregators before returning the error. 
for _, a := range as[:i] { @@ -278,30 +290,11 @@ func LoadFromData(data []byte, pushFunc PushFunc, opts Options) (*Aggregators, e logger.Panicf("BUG: cannot marshal the provided configs: %s", err) } - metricLabels := fmt.Sprintf("url=%q", opts.Alias) - _ = ms.NewGauge(fmt.Sprintf(`vm_streamaggr_dedup_state_size_bytes{%s}`, metricLabels), func() float64 { - n := uint64(0) - for _, aggr := range as { - if aggr.da != nil { - n += aggr.da.sizeBytes() - } - } - return float64(n) - }) - _ = ms.NewGauge(fmt.Sprintf(`vm_streamaggr_dedup_state_items_count{%s}`, metricLabels), func() float64 { - n := uint64(0) - for _, aggr := range as { - if aggr.da != nil { - n += aggr.da.itemsCount() - } - } - return float64(n) - }) - metrics.RegisterSet(ms) return &Aggregators{ as: as, configData: configData, + filePath: filePath, ms: ms, }, nil } @@ -323,7 +316,7 @@ func (a *Aggregators) MustStop() { return } - metrics.UnregisterSet(a.ms) + metrics.UnregisterSet(a.ms, true) a.ms = nil for _, aggr := range a.as { @@ -380,11 +373,17 @@ type aggregator struct { without []string aggregateOnlyByTime bool + // interval is the interval between flushes + interval time.Duration + + // dedupInterval is optional deduplication interval for incoming samples + dedupInterval time.Duration + // da is set to non-nil if input samples must be de-duplicated da *dedupAggr - // aggrStates contains aggregate states for the given outputs - aggrStates map[string]aggrState + // aggrOutputs contains aggregate states for the given outputs + aggrOutputs []aggrOutput // minTimestamp is used for ignoring old samples when ignoreOldSamples is set minTimestamp atomic.Int64 @@ -406,11 +405,14 @@ type aggregator struct { flushTimeouts *metrics.Counter dedupFlushTimeouts *metrics.Counter ignoredOldSamples *metrics.Counter - ignoredNanSamples *metrics.Counter + ignoredNaNSamples *metrics.Counter matchedSamples *metrics.Counter - staleInputSamples map[string]*metrics.Counter - staleOutputSamples map[string]*metrics.Counter - 
flushedSamples map[string]*metrics.Counter +} + +type aggrOutput struct { + as aggrState + + outputSamples *metrics.Counter } type aggrState interface { @@ -419,6 +421,13 @@ type aggrState interface { // samples[].key must be cloned by aggrState, since it may change after returning from pushSamples. pushSamples(samples []pushSample) + // flushState must flush aggrState data to ctx. + // + // if resetState is true, then aggrState must be reset after flushing the data to ctx, + // otherwise the aggrState data must be kept unchanged. + // + // The resetState is set to false only in the benchmark, which measures flushState() performance + // over the same aggrState. flushState(ctx *flushCtx, resetState bool) } @@ -430,7 +439,7 @@ type PushFunc func(tss []prompbmarshal.TimeSeries) // opts can contain additional options. If opts is nil, then default options are used. // // The returned aggregator must be stopped when no longer needed by calling MustStop(). -func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options) (*aggregator, error) { +func newAggregator(cfg *Config, path string, pushFunc PushFunc, ms *metrics.Set, opts *Options, alias string, aggrID int) (*aggregator, error) { // check cfg.Interval if cfg.Interval == "" { return nil, fmt.Errorf("missing `interval` option") @@ -443,6 +452,10 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options return nil, fmt.Errorf("aggregation interval cannot be smaller than 1s; got %s", interval) } + if opts == nil { + opts = &Options{} + } + // check cfg.DedupInterval dedupInterval := opts.DedupInterval if cfg.DedupInterval != "" { @@ -491,7 +504,7 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options by := sortAndRemoveDuplicates(cfg.By) without := sortAndRemoveDuplicates(cfg.Without) if len(by) > 0 && len(without) > 0 { - return nil, fmt.Errorf("`by: %s` and `without: %s` lists cannot be set simultaneously", by, without) + return nil, 
fmt.Errorf("`by: %s` and `without: %s` lists cannot be set simultaneously; see https://docs.victoriametrics.com/stream-aggregation/", by, without) } aggregateOnlyByTime := (len(by) == 0 && len(without) == 0) if !aggregateOnlyByTime && len(without) == 0 { @@ -505,10 +518,12 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options } if keepMetricNames { if len(cfg.Outputs) != 1 { - return nil, fmt.Errorf("`outputs` list must contain only a single entry if `keep_metric_names` is set; got %q", cfg.Outputs) + return nil, fmt.Errorf("`outputs` list must contain only a single entry if `keep_metric_names` is set; got %q; "+ + "see https://docs.victoriametrics.com/stream-aggregation/#output-metric-names", cfg.Outputs) } if cfg.Outputs[0] == "histogram_bucket" || strings.HasPrefix(cfg.Outputs[0], "quantiles(") && strings.Contains(cfg.Outputs[0], ",") { - return nil, fmt.Errorf("`keep_metric_names` cannot be applied to `outputs: %q`, since they can generate multiple time series", cfg.Outputs) + return nil, fmt.Errorf("`keep_metric_names` cannot be applied to `outputs: %q`, since they can generate multiple time series; "+ + "see https://docs.victoriametrics.com/stream-aggregation/#output-metric-names", cfg.Outputs) } } @@ -524,105 +539,42 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options ignoreFirstIntervals = *v } - // initialize outputs list - if len(cfg.Outputs) == 0 { - return nil, fmt.Errorf("`outputs` list must contain at least a single entry from the list %s", supportedOutputs) + // Initialize common metric labels + name := cfg.Name + if name == "" { + name = "none" } - aggrStates := make(map[string]aggrState, len(cfg.Outputs)) - for _, output := range cfg.Outputs { - // check for duplicated output - if _, ok := aggrStates[output]; ok { - return nil, fmt.Errorf("`outputs` list contains duplicated aggregation function: %s", output) + metricLabels := fmt.Sprintf(`name=%q,path=%q,url=%q,position="%d"`, name, 
path, alias, aggrID) + + // initialize aggrOutputs + if len(cfg.Outputs) == 0 { + return nil, fmt.Errorf("`outputs` list must contain at least a single entry from the list %s; "+ + "see https://docs.victoriametrics.com/stream-aggregation/", supportedOutputs) + } + aggrOutputs := make([]aggrOutput, len(cfg.Outputs)) + outputsSeen := make(map[string]struct{}, len(cfg.Outputs)) + for i, output := range cfg.Outputs { + as, err := newAggrState(output, outputsSeen, stalenessInterval) + if err != nil { + return nil, err } - if strings.HasPrefix(output, "quantiles(") { - if !strings.HasSuffix(output, ")") { - return nil, fmt.Errorf("missing closing brace for `quantiles()` output") - } - argsStr := output[len("quantiles(") : len(output)-1] - if len(argsStr) == 0 { - return nil, fmt.Errorf("`quantiles()` must contain at least one phi") - } - args := strings.Split(argsStr, ",") - phis := make([]float64, len(args)) - for j, arg := range args { - arg = strings.TrimSpace(arg) - phi, err := strconv.ParseFloat(arg, 64) - if err != nil { - return nil, fmt.Errorf("cannot parse phi=%q for quantiles(%s): %w", arg, argsStr, err) - } - if phi < 0 || phi > 1 { - return nil, fmt.Errorf("phi inside quantiles(%s) must be in the range [0..1]; got %v", argsStr, phi) - } - phis[j] = phi - } - if _, ok := aggrStates["quantiles"]; ok { - return nil, fmt.Errorf("`outputs` list contains duplicated `quantiles()` function, please combine multiple phi* like `quantiles(0.5, 0.9)`") - } - aggrStates["quantiles"] = newQuantilesAggrState(phis) - continue - } - switch output { - case "total": - aggrStates[output] = newTotalAggrState(stalenessInterval, false, true) - case "total_prometheus": - aggrStates[output] = newTotalAggrState(stalenessInterval, false, false) - case "increase": - aggrStates[output] = newTotalAggrState(stalenessInterval, true, true) - case "increase_prometheus": - aggrStates[output] = newTotalAggrState(stalenessInterval, true, false) - case "rate_sum": - aggrStates[output] = 
newRateAggrState(stalenessInterval, "rate_sum") - case "rate_avg": - aggrStates[output] = newRateAggrState(stalenessInterval, "rate_avg") - case "count_series": - aggrStates[output] = newCountSeriesAggrState() - case "count_samples": - aggrStates[output] = newCountSamplesAggrState() - case "unique_samples": - aggrStates[output] = newUniqueSamplesAggrState() - case "sum_samples": - aggrStates[output] = newSumSamplesAggrState() - case "last": - aggrStates[output] = newLastAggrState() - case "min": - aggrStates[output] = newMinAggrState() - case "max": - aggrStates[output] = newMaxAggrState() - case "avg": - aggrStates[output] = newAvgAggrState() - case "stddev": - aggrStates[output] = newStddevAggrState() - case "stdvar": - aggrStates[output] = newStdvarAggrState() - case "histogram_bucket": - aggrStates[output] = newHistogramBucketAggrState(stalenessInterval) - default: - return nil, fmt.Errorf("unsupported output=%q; supported values: %s;", output, supportedOutputs) + aggrOutputs[i] = aggrOutput{ + as: as, + + outputSamples: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_output_samples_total{output=%q,%s}`, output, metricLabels)), } } // initialize suffix to add to metric names after aggregation suffix := ":" + cfg.Interval - group := "none" if labels := removeUnderscoreName(by); len(labels) > 0 { - group = fmt.Sprintf("by: %s", strings.Join(labels, ",")) suffix += fmt.Sprintf("_by_%s", strings.Join(labels, "_")) } if labels := removeUnderscoreName(without); len(labels) > 0 { - group = fmt.Sprintf("without: %s", strings.Join(labels, ",")) suffix += fmt.Sprintf("_without_%s", strings.Join(labels, "_")) } suffix += "_" - outputs := strings.Join(cfg.Outputs, ",") - - matchExpr := cfg.Match.String() - if len(matchExpr) > maxLabelValueLen { - matchExpr = matchExpr[:maxLabelValueLen-3] + "..." 
- } - - metricLabels := fmt.Sprintf(`match=%q, group=%q, url=%q, position="%d"`, matchExpr, group, opts.Alias, opts.aggrID) - // initialize the aggregator a := &aggregator{ match: cfg.Match, @@ -638,36 +590,37 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options without: without, aggregateOnlyByTime: aggregateOnlyByTime, - aggrStates: aggrStates, + interval: interval, + dedupInterval: dedupInterval, + + aggrOutputs: aggrOutputs, suffix: suffix, stopCh: make(chan struct{}), - flushDuration: ms.GetOrCreateHistogram(fmt.Sprintf(`vm_streamaggr_flush_duration_seconds{outputs=%q, %s}`, outputs, metricLabels)), - dedupFlushDuration: ms.GetOrCreateHistogram(fmt.Sprintf(`vm_streamaggr_dedup_flush_duration_seconds{outputs=%q, %s}`, outputs, metricLabels)), - samplesLag: ms.GetOrCreateHistogram(fmt.Sprintf(`vm_streamaggr_samples_lag_seconds{outputs=%q, %s}`, outputs, metricLabels)), + flushDuration: ms.NewHistogram(fmt.Sprintf(`vm_streamaggr_flush_duration_seconds{%s}`, metricLabels)), + dedupFlushDuration: ms.NewHistogram(fmt.Sprintf(`vm_streamaggr_dedup_flush_duration_seconds{%s}`, metricLabels)), + samplesLag: ms.NewHistogram(fmt.Sprintf(`vm_streamaggr_samples_lag_seconds{%s}`, metricLabels)), - matchedSamples: ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_matched_samples_total{outputs=%q, %s}`, outputs, metricLabels)), - flushTimeouts: ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_flush_timeouts_total{outputs=%q, %s}`, outputs, metricLabels)), - dedupFlushTimeouts: ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_dedup_flush_timeouts_total{outputs=%q, %s}`, outputs, metricLabels)), - ignoredNanSamples: ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_ignored_samples_total{reason="nan", outputs=%q, %s}`, outputs, metricLabels)), - ignoredOldSamples: ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_ignored_samples_total{reason="too_old", outputs=%q, %s}`, outputs, metricLabels)), - staleInputSamples: make(map[string]*metrics.Counter, 
len(cfg.Outputs)), - staleOutputSamples: make(map[string]*metrics.Counter, len(cfg.Outputs)), - flushedSamples: make(map[string]*metrics.Counter, len(cfg.Outputs)), - } - for _, output := range cfg.Outputs { - // Removing output args for metric label value in outputs like quantile(arg1, arg2) - if ri := strings.IndexRune(output, '('); ri >= 0 { - output = output[:ri] - } - a.staleInputSamples[output] = ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_stale_samples_total{key="input", output=%q, %s}`, output, metricLabels)) - a.staleOutputSamples[output] = ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_stale_samples_total{key="output", output=%q, %s}`, output, metricLabels)) - a.flushedSamples[output] = ms.GetOrCreateCounter(fmt.Sprintf(`vm_streamaggr_flushed_samples_total{output=%q, %s}`, output, metricLabels)) + matchedSamples: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_matched_samples_total{%s}`, metricLabels)), + flushTimeouts: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_flush_timeouts_total{%s}`, metricLabels)), + dedupFlushTimeouts: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_dedup_flush_timeouts_total{%s}`, metricLabels)), + ignoredNaNSamples: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_ignored_samples_total{reason="nan",%s}`, metricLabels)), + ignoredOldSamples: ms.NewCounter(fmt.Sprintf(`vm_streamaggr_ignored_samples_total{reason="too_old",%s}`, metricLabels)), } + if dedupInterval > 0 { a.da = newDedupAggr() + + _ = ms.NewGauge(fmt.Sprintf(`vm_streamaggr_dedup_state_size_bytes{%s}`, metricLabels), func() float64 { + n := a.da.sizeBytes() + return float64(n) + }) + _ = ms.NewGauge(fmt.Sprintf(`vm_streamaggr_dedup_state_items_count{%s}`, metricLabels), func() float64 { + n := a.da.itemsCount() + return float64(n) + }) } alignFlushToInterval := !opts.NoAlignFlushToInterval @@ -682,14 +635,89 @@ func newAggregator(cfg *Config, pushFunc PushFunc, ms *metrics.Set, opts Options a.wg.Add(1) go func() { - a.runFlusher(pushFunc, alignFlushToInterval, skipIncompleteFlush, 
interval, dedupInterval, ignoreFirstIntervals) + a.runFlusher(pushFunc, alignFlushToInterval, skipIncompleteFlush, ignoreFirstIntervals) a.wg.Done() }() return a, nil } -func (a *aggregator) runFlusher(pushFunc PushFunc, alignFlushToInterval, skipIncompleteFlush bool, interval, dedupInterval time.Duration, ignoreFirstIntervals int) { +func newAggrState(output string, outputsSeen map[string]struct{}, stalenessInterval time.Duration) (aggrState, error) { + // check for duplicated output + if _, ok := outputsSeen[output]; ok { + return nil, fmt.Errorf("`outputs` list contains duplicate aggregation function: %s", output) + } + outputsSeen[output] = struct{}{} + + if strings.HasPrefix(output, "quantiles(") { + if !strings.HasSuffix(output, ")") { + return nil, fmt.Errorf("missing closing brace for `quantiles()` output") + } + argsStr := output[len("quantiles(") : len(output)-1] + if len(argsStr) == 0 { + return nil, fmt.Errorf("`quantiles()` must contain at least one phi") + } + args := strings.Split(argsStr, ",") + phis := make([]float64, len(args)) + for i, arg := range args { + arg = strings.TrimSpace(arg) + phi, err := strconv.ParseFloat(arg, 64) + if err != nil { + return nil, fmt.Errorf("cannot parse phi=%q for quantiles(%s): %w", arg, argsStr, err) + } + if phi < 0 || phi > 1 { + return nil, fmt.Errorf("phi inside quantiles(%s) must be in the range [0..1]; got %v", argsStr, phi) + } + phis[i] = phi + } + if _, ok := outputsSeen["quantiles"]; ok { + return nil, fmt.Errorf("`outputs` list contains duplicated `quantiles()` function, please combine multiple phi* like `quantiles(0.5, 0.9)`") + } + outputsSeen["quantiles"] = struct{}{} + return newQuantilesAggrState(phis), nil + } + + switch output { + case "avg": + return newAvgAggrState(), nil + case "count_samples": + return newCountSamplesAggrState(), nil + case "count_series": + return newCountSeriesAggrState(), nil + case "histogram_bucket": + return newHistogramBucketAggrState(stalenessInterval), nil + case 
"increase": + return newTotalAggrState(stalenessInterval, true, true), nil + case "increase_prometheus": + return newTotalAggrState(stalenessInterval, true, false), nil + case "last": + return newLastAggrState(), nil + case "max": + return newMaxAggrState(), nil + case "min": + return newMinAggrState(), nil + case "rate_avg": + return newRateAggrState(stalenessInterval, true), nil + case "rate_sum": + return newRateAggrState(stalenessInterval, false), nil + case "stddev": + return newStddevAggrState(), nil + case "stdvar": + return newStdvarAggrState(), nil + case "sum_samples": + return newSumSamplesAggrState(), nil + case "total": + return newTotalAggrState(stalenessInterval, false, true), nil + case "total_prometheus": + return newTotalAggrState(stalenessInterval, false, false), nil + case "unique_samples": + return newUniqueSamplesAggrState(), nil + default: + return nil, fmt.Errorf("unsupported output=%q; supported values: %s; see https://docs.victoriametrics.com/stream-aggregation/", output, supportedOutputs) + } +} + +func (a *aggregator) runFlusher(pushFunc PushFunc, alignFlushToInterval, skipIncompleteFlush bool, ignoreFirstIntervals int) { alignedSleep := func(d time.Duration) { if !alignFlushToInterval { return @@ -714,22 +742,22 @@ func (a *aggregator) runFlusher(pushFunc PushFunc, alignFlushToInterval, skipInc } } - if dedupInterval <= 0 { - alignedSleep(interval) - t := time.NewTicker(interval) + if a.dedupInterval <= 0 { + alignedSleep(a.interval) + t := time.NewTicker(a.interval) defer t.Stop() if alignFlushToInterval && skipIncompleteFlush { - a.flush(nil, interval, true) + a.flush(nil) ignoreFirstIntervals-- } for tickerWait(t) { if ignoreFirstIntervals > 0 { - a.flush(nil, interval, true) + a.flush(nil) ignoreFirstIntervals-- } else { - a.flush(pushFunc, interval, true) + a.flush(pushFunc) } if alignFlushToInterval { @@ -740,30 +768,30 @@ func (a *aggregator) runFlusher(pushFunc PushFunc, alignFlushToInterval, skipInc } } } else { - 
alignedSleep(dedupInterval) - t := time.NewTicker(dedupInterval) + alignedSleep(a.dedupInterval) + t := time.NewTicker(a.dedupInterval) defer t.Stop() - flushDeadline := time.Now().Add(interval) + flushDeadline := time.Now().Add(a.interval) isSkippedFirstFlush := false for tickerWait(t) { - a.dedupFlush(dedupInterval) + a.dedupFlush() ct := time.Now() if ct.After(flushDeadline) { // It is time to flush the aggregated state if alignFlushToInterval && skipIncompleteFlush && !isSkippedFirstFlush { - a.flush(nil, interval, true) + a.flush(nil) ignoreFirstIntervals-- isSkippedFirstFlush = true } else if ignoreFirstIntervals > 0 { - a.flush(nil, interval, true) + a.flush(nil) ignoreFirstIntervals-- } else { - a.flush(pushFunc, interval, true) + a.flush(pushFunc) } for ct.After(flushDeadline) { - flushDeadline = flushDeadline.Add(interval) + flushDeadline = flushDeadline.Add(a.interval) } } @@ -777,13 +805,13 @@ func (a *aggregator) runFlusher(pushFunc PushFunc, alignFlushToInterval, skipInc } if !skipIncompleteFlush && ignoreFirstIntervals <= 0 { - a.dedupFlush(dedupInterval) - a.flush(pushFunc, interval, true) + a.dedupFlush() + a.flush(pushFunc) } } -func (a *aggregator) dedupFlush(dedupInterval time.Duration) { - if dedupInterval <= 0 { +func (a *aggregator) dedupFlush() { + if a.dedupInterval <= 0 { // The de-duplication is disabled. 
return } @@ -794,15 +822,22 @@ func (a *aggregator) dedupFlush(dedupInterval time.Duration) { d := time.Since(startTime) a.dedupFlushDuration.Update(d.Seconds()) - if d > dedupInterval { + if d > a.dedupInterval { a.dedupFlushTimeouts.Inc() logger.Warnf("deduplication couldn't be finished in the configured dedup_interval=%s; it took %.03fs; "+ "possible solutions: increase dedup_interval; use match filter matching smaller number of series; "+ - "reduce samples' ingestion rate to stream aggregation", dedupInterval, d.Seconds()) + "reduce samples' ingestion rate to stream aggregation", a.dedupInterval, d.Seconds()) } } -func (a *aggregator) flush(pushFunc PushFunc, interval time.Duration, resetState bool) { +// flush flushes aggregator state to pushFunc. +// +// If pushFunc is nil, then the aggregator state is just reset. +func (a *aggregator) flush(pushFunc PushFunc) { + a.flushInternal(pushFunc, true) +} + +func (a *aggregator) flushInternal(pushFunc PushFunc, resetState bool) { startTime := time.Now() // Update minTimestamp before flushing samples to the storage, @@ -811,31 +846,31 @@ func (a *aggregator) flush(pushFunc PushFunc, interval time.Duration, resetState a.minTimestamp.Store(startTime.UnixMilli() - 5_000) var wg sync.WaitGroup - for output, as := range a.aggrStates { + for i := range a.aggrOutputs { + ao := &a.aggrOutputs[i] flushConcurrencyCh <- struct{}{} wg.Add(1) - go func(as aggrState) { + go func(ao *aggrOutput) { defer func() { <-flushConcurrencyCh wg.Done() }() - ctx := getFlushCtx(a, pushFunc) - as.flushState(ctx, resetState) - ctx.flushSeries(output) - ctx.resetSeries() + ctx := getFlushCtx(a, ao, pushFunc) + ao.as.flushState(ctx, resetState) + ctx.flushSeries() putFlushCtx(ctx) - }(as) + }(ao) } wg.Wait() d := time.Since(startTime) a.flushDuration.Update(d.Seconds()) - if d > interval { + if d > a.interval { a.flushTimeouts.Inc() logger.Warnf("stream aggregation couldn't be finished in the configured interval=%s; it took %.03fs; "+ "possible 
solutions: increase interval; use match filter matching smaller number of series; "+ - "reduce samples' ingestion rate to stream aggregation", interval, d.Seconds()) + "reduce samples' ingestion rate to stream aggregation", a.interval, d.Seconds()) } } @@ -851,7 +886,6 @@ func (a *aggregator) MustStop() { // Push pushes tss to a. func (a *aggregator) Push(tss []prompbmarshal.TimeSeries, matchIdxs []byte) { - now := time.Now().UnixMilli() ctx := getPushCtx() defer putPushCtx(ctx) @@ -864,7 +898,9 @@ func (a *aggregator) Push(tss []prompbmarshal.TimeSeries, matchIdxs []byte) { dropLabels := a.dropInputLabels ignoreOldSamples := a.ignoreOldSamples minTimestamp := a.minTimestamp.Load() - var maxLag int64 + + nowMsec := time.Now().UnixMilli() + var maxLagMsec int64 for idx, ts := range tss { if !a.match.Match(ts.Labels) { continue @@ -896,30 +932,31 @@ func (a *aggregator) Push(tss []prompbmarshal.TimeSeries, matchIdxs []byte) { // key remains valid only by the end of this function and can't be reused after // do not intern key because number of unique keys could be too high key := bytesutil.ToUnsafeString(buf[bufLen:]) - for _, sample := range ts.Samples { - if math.IsNaN(sample.Value) { - a.ignoredNanSamples.Inc() + for _, s := range ts.Samples { + if math.IsNaN(s.Value) { + a.ignoredNaNSamples.Inc() // Skip NaN values continue } - if ignoreOldSamples && sample.Timestamp < minTimestamp { + if ignoreOldSamples && s.Timestamp < minTimestamp { a.ignoredOldSamples.Inc() // Skip old samples outside the current aggregation interval continue } - if maxLag < now-sample.Timestamp { - maxLag = now - sample.Timestamp + lagMsec := nowMsec - s.Timestamp + if lagMsec > maxLagMsec { + maxLagMsec = lagMsec } samples = append(samples, pushSample{ key: key, - value: sample.Value, - timestamp: sample.Timestamp, + value: s.Value, + timestamp: s.Timestamp, }) } } if len(samples) > 0 { a.matchedSamples.Add(len(samples)) - a.samplesLag.Update(float64(maxLag) / 1_000) + 
a.samplesLag.Update(float64(maxLagMsec) / 1_000) } ctx.samples = samples ctx.buf = buf @@ -969,8 +1006,8 @@ func getInputOutputKey(key string) (string, string) { } func (a *aggregator) pushSamples(samples []pushSample) { - for _, as := range a.aggrStates { - as.pushSamples(samples) + for _, ao := range a.aggrOutputs { + ao.as.pushSamples(samples) } } @@ -1036,13 +1073,14 @@ func getInputOutputLabels(dstInput, dstOutput, labels []prompbmarshal.Label, by, return dstInput, dstOutput } -func getFlushCtx(a *aggregator, pushFunc PushFunc) *flushCtx { +func getFlushCtx(a *aggregator, ao *aggrOutput, pushFunc PushFunc) *flushCtx { v := flushCtxPool.Get() if v == nil { v = &flushCtx{} } ctx := v.(*flushCtx) ctx.a = a + ctx.ao = ao ctx.pushFunc = pushFunc return ctx } @@ -1056,6 +1094,7 @@ var flushCtxPool sync.Pool type flushCtx struct { a *aggregator + ao *aggrOutput pushFunc PushFunc tss []prompbmarshal.TimeSeries @@ -1065,6 +1104,7 @@ type flushCtx struct { func (ctx *flushCtx) reset() { ctx.a = nil + ctx.ao = nil ctx.pushFunc = nil ctx.resetSeries() } @@ -1079,7 +1119,9 @@ func (ctx *flushCtx) resetSeries() { ctx.samples = ctx.samples[:0] } -func (ctx *flushCtx) flushSeries(aggrStateSuffix string) { +func (ctx *flushCtx) flushSeries() { + defer ctx.resetSeries() + tss := ctx.tss if len(tss) == 0 { // nothing to flush @@ -1091,7 +1133,7 @@ func (ctx *flushCtx) flushSeries(aggrStateSuffix string) { // Fast path - push the output metrics. 
if ctx.pushFunc != nil { ctx.pushFunc(tss) - ctx.a.flushedSamples[aggrStateSuffix].Add(len(tss)) + ctx.ao.outputSamples.Add(len(tss)) } return } @@ -1113,7 +1155,7 @@ func (ctx *flushCtx) flushSeries(aggrStateSuffix string) { } if ctx.pushFunc != nil { ctx.pushFunc(dst) - ctx.a.flushedSamples[aggrStateSuffix].Add(len(dst)) + ctx.ao.outputSamples.Add(len(dst)) } auxLabels.Labels = dstLabels promutils.PutLabels(auxLabels) @@ -1137,8 +1179,7 @@ func (ctx *flushCtx) appendSeries(key, suffix string, timestamp int64, value flo // Limit the maximum length of ctx.tss in order to limit memory usage. if len(ctx.tss) >= 10_000 { - ctx.flushSeries(suffix) - ctx.resetSeries() + ctx.flushSeries() } } @@ -1161,7 +1202,11 @@ func (ctx *flushCtx) appendSeriesWithExtraLabel(key, suffix string, timestamp in Labels: ctx.labels[labelsLen:], Samples: ctx.samples[samplesLen:], }) - ctx.a.flushedSamples[suffix].Add(len(ctx.tss)) + + // Limit the maximum length of ctx.tss in order to limit memory usage. + if len(ctx.tss) >= 10_000 { + ctx.flushSeries() + } } func addMetricSuffix(labels []prompbmarshal.Label, offset int, firstSuffix, lastSuffix string) []prompbmarshal.Label { diff --git a/lib/streamaggr/streamaggr_test.go b/lib/streamaggr/streamaggr_test.go index 28170cb96..ae64a6ac7 100644 --- a/lib/streamaggr/streamaggr_test.go +++ b/lib/streamaggr/streamaggr_test.go @@ -19,7 +19,7 @@ func TestAggregatorsFailure(t *testing.T) { pushFunc := func(_ []prompbmarshal.TimeSeries) { panic(fmt.Errorf("pushFunc shouldn't be called")) } - a, err := LoadFromData([]byte(config), pushFunc, Options{}) + a, err := LoadFromData([]byte(config), pushFunc, nil, "some_alias") if err == nil { t.Fatalf("expecting non-nil error") } @@ -200,11 +200,11 @@ func TestAggregatorsEqual(t *testing.T) { t.Helper() pushFunc := func(_ []prompbmarshal.TimeSeries) {} - aa, err := LoadFromData([]byte(a), pushFunc, Options{}) + aa, err := LoadFromData([]byte(a), pushFunc, nil, "some_alias") if err != nil { t.Fatalf("cannot 
initialize aggregators: %s", err) } - ab, err := LoadFromData([]byte(b), pushFunc, Options{}) + ab, err := LoadFromData([]byte(b), pushFunc, nil, "some_alias") if err != nil { t.Fatalf("cannot initialize aggregators: %s", err) } @@ -263,11 +263,11 @@ func TestAggregatorsSuccess(t *testing.T) { tssOutput = appendClonedTimeseries(tssOutput, tss) tssOutputLock.Unlock() } - opts := Options{ + opts := &Options{ FlushOnShutdown: true, NoAlignFlushToInterval: true, } - a, err := LoadFromData([]byte(config), pushFunc, opts) + a, err := LoadFromData([]byte(config), pushFunc, opts, "some_alias") if err != nil { t.Fatalf("cannot initialize aggregators: %s", err) } @@ -515,6 +515,7 @@ foo-1m-without-abc-sum-samples 12.5 without: [abc] outputs: [count_samples, sum_samples, count_series] match: '{non_existing_label!=""}' + name: foobar `, ` foo{abc="123"} 4 bar 5 @@ -527,6 +528,7 @@ foo{abc="456",de="fg"} 8 - interval: 1m by: [abc] outputs: [count_samples, sum_samples, count_series] + name: abcdef match: - foo{abc=~".+"} - '{non_existing_label!=""}' @@ -891,21 +893,28 @@ foo{abc="123", cde="1"} 4 foo{abc="123", cde="1"} 8.5 10 foo{abc="456", cde="1"} 8 foo{abc="456", cde="1"} 10 10 +foo 12 34 `, `foo:1m_by_cde_rate_avg{cde="1"} 0.325 foo:1m_by_cde_rate_sum{cde="1"} 0.65 -`, "1111") +`, "11111") - // rate with duplicated events + // rate_sum and rate_avg with duplicated events f(` - interval: 1m - by: [cde] outputs: [rate_sum, rate_avg] `, ` foo{abc="123", cde="1"} 4 10 foo{abc="123", cde="1"} 4 10 -`, `foo:1m_by_cde_rate_avg{cde="1"} 0 -foo:1m_by_cde_rate_sum{cde="1"} 0 -`, "11") +`, ``, "11") + + // rate_sum and rate_avg for a single sample + f(` +- interval: 1m + outputs: [rate_sum, rate_avg] +`, ` +foo 4 10 +bar 5 10 +`, ``, "11") // unique_samples output f(` @@ -973,11 +982,11 @@ func TestAggregatorsWithDedupInterval(t *testing.T) { } tssOutputLock.Unlock() } - opts := Options{ + opts := &Options{ DedupInterval: 30 * time.Second, FlushOnShutdown: true, } - a, err := 
LoadFromData([]byte(config), pushFunc, opts) + a, err := LoadFromData([]byte(config), pushFunc, opts, "some_alias") if err != nil { t.Fatalf("cannot initialize aggregators: %s", err) } diff --git a/lib/streamaggr/streamaggr_timing_test.go b/lib/streamaggr/streamaggr_timing_test.go index 26d3446bd..530ac9d6b 100644 --- a/lib/streamaggr/streamaggr_timing_test.go +++ b/lib/streamaggr/streamaggr_timing_test.go @@ -11,24 +11,24 @@ import ( ) var benchOutputs = []string{ - "total", - "total_prometheus", + "avg", + "count_samples", + "count_series", + "histogram_bucket", "increase", "increase_prometheus", - "rate_sum", - "rate_avg", - "count_series", - "count_samples", - "unique_samples", - "sum_samples", "last", - "min", "max", - "avg", + "min", + "quantiles(0, 0.5, 1)", + "rate_avg", + "rate_sum", "stddev", "stdvar", - "histogram_bucket", - "quantiles(0, 0.5, 1)", + "sum_samples", + "total", + "total_prometheus", + "unique_samples", } func BenchmarkAggregatorsPush(b *testing.B) { @@ -39,23 +39,18 @@ func BenchmarkAggregatorsPush(b *testing.B) { } } -func BenchmarkAggregatorsFlushSerial(b *testing.B) { - outputs := []string{ - "total", "sum_samples", "count_samples", "min", - "max", "avg", "increase", "count_series", - "last", "stddev", "stdvar", "total_prometheus", "increase_prometheus", - } +func BenchmarkAggregatorsFlushInternalSerial(b *testing.B) { pushFunc := func(_ []prompbmarshal.TimeSeries) {} - a := newBenchAggregators(outputs, pushFunc) + a := newBenchAggregators(benchOutputs, pushFunc) defer a.MustStop() _ = a.Push(benchSeries, nil) b.ResetTimer() b.ReportAllocs() - b.SetBytes(int64(len(benchSeries) * len(outputs))) + b.SetBytes(int64(len(benchSeries) * len(benchOutputs))) for i := 0; i < b.N; i++ { for _, aggr := range a.as { - aggr.flush(pushFunc, time.Hour, false) + aggr.flushInternal(pushFunc, false) } } } @@ -92,7 +87,7 @@ func newBenchAggregators(outputs []string, pushFunc PushFunc) *Aggregators { outputs: [%s] `, strings.Join(outputsQuoted, ",")) - a, 
err := LoadFromData([]byte(config), pushFunc, Options{}) + a, err := LoadFromData([]byte(config), pushFunc, nil, "some_alias") if err != nil { panic(fmt.Errorf("unexpected error when initializing aggregators: %s", err)) } diff --git a/lib/streamaggr/total.go b/lib/streamaggr/total.go index e0c26a0fe..fda700b4c 100644 --- a/lib/streamaggr/total.go +++ b/lib/streamaggr/total.go @@ -9,12 +9,10 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime" ) -// totalAggrState calculates output=total, e.g. the summary counter over input counters. +// totalAggrState calculates output=total, total_prometheus, increase and increase_prometheus. type totalAggrState struct { m sync.Map - suffix string - // Whether to reset the output value on every flushState call. resetTotalOnFlush bool @@ -50,15 +48,8 @@ type totalLastValueState struct { func newTotalAggrState(stalenessInterval time.Duration, resetTotalOnFlush, keepFirstSample bool) *totalAggrState { stalenessSecs := roundDurationToSecs(stalenessInterval) ignoreFirstSampleDeadline := fasttime.UnixTimestamp() + stalenessSecs - suffix := "total" - if resetTotalOnFlush { - suffix = "increase" - } - if !keepFirstSample { - suffix += "_prometheus" - } + return &totalAggrState{ - suffix: suffix, resetTotalOnFlush: resetTotalOnFlush, keepFirstSample: keepFirstSample, stalenessSecs: stalenessSecs, @@ -124,48 +115,18 @@ func (as *totalAggrState) pushSamples(samples []pushSample) { } } -func (as *totalAggrState) removeOldEntries(ctx *flushCtx, currentTime uint64) { - m := &as.m - var staleInputSamples, staleOutputSamples int - m.Range(func(k, v any) bool { - sv := v.(*totalStateValue) - - sv.mu.Lock() - deleted := currentTime > sv.deleteDeadline - if deleted { - // Mark the current entry as deleted - sv.deleted = deleted - staleOutputSamples++ - } else { - // Delete outdated entries in sv.lastValues - m := sv.lastValues - for k1, v1 := range m { - if currentTime > v1.deleteDeadline { - delete(m, k1) - staleInputSamples++ - } 
- } - } - sv.mu.Unlock() - - if deleted { - m.Delete(k) - } - return true - }) - ctx.a.staleInputSamples[as.suffix].Add(staleInputSamples) - ctx.a.staleOutputSamples[as.suffix].Add(staleOutputSamples) -} - func (as *totalAggrState) flushState(ctx *flushCtx, resetState bool) { currentTime := fasttime.UnixTimestamp() currentTimeMsec := int64(currentTime) * 1000 - as.removeOldEntries(ctx, currentTime) + suffix := as.getSuffix() + + as.removeOldEntries(currentTime) m := &as.m m.Range(func(k, v any) bool { sv := v.(*totalStateValue) + sv.mu.Lock() total := sv.total if resetState { @@ -178,10 +139,51 @@ func (as *totalAggrState) flushState(ctx *flushCtx, resetState bool) { } deleted := sv.deleted sv.mu.Unlock() + if !deleted { key := k.(string) - ctx.appendSeries(key, as.suffix, currentTimeMsec, total) + ctx.appendSeries(key, suffix, currentTimeMsec, total) } return true }) } + +func (as *totalAggrState) getSuffix() string { + // Note: this function is at hot path, so it shouldn't allocate. + if as.resetTotalOnFlush { + if as.keepFirstSample { + return "increase" + } + return "increase_prometheus" + } + if as.keepFirstSample { + return "total" + } + return "total_prometheus" +} + +func (as *totalAggrState) removeOldEntries(currentTime uint64) { + m := &as.m + m.Range(func(k, v any) bool { + sv := v.(*totalStateValue) + + sv.mu.Lock() + if currentTime > sv.deleteDeadline { + // Mark the current entry as deleted + sv.deleted = true + sv.mu.Unlock() + m.Delete(k) + return true + } + + // Delete outdated entries in sv.lastValues + lvs := sv.lastValues + for k1, lv := range lvs { + if currentTime > lv.deleteDeadline { + delete(lvs, k1) + } + } + sv.mu.Unlock() + return true + }) +} diff --git a/lib/uint64set/uint64set.go b/lib/uint64set/uint64set.go index edd8bf042..95fee3ee0 100644 --- a/lib/uint64set/uint64set.go +++ b/lib/uint64set/uint64set.go @@ -183,6 +183,10 @@ func (s *Set) Has(x uint64) bool { if s == nil { return false } + return s.hasSlow(x) +} + +func (s *Set) 
hasSlow(x uint64) bool { hi32 := uint32(x >> 32) lo32 := uint32(x) bs := s.buckets diff --git a/vendor/github.com/VictoriaMetrics/metrics/histogram.go b/vendor/github.com/VictoriaMetrics/metrics/histogram.go index ccb63d99e..d703ae82f 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/histogram.go +++ b/vendor/github.com/VictoriaMetrics/metrics/histogram.go @@ -47,13 +47,21 @@ var bucketMultiplier = math.Pow(10, 1.0/bucketsPerDecimal) // Zero histogram is usable. type Histogram struct { // Mu gurantees synchronous update for all the counters and sum. - mu sync.RWMutex + // + // Do not use sync.RWMutex, since it has zero sense from performance PoV. + // It only complicates the code. + mu sync.Mutex + // decimalBuckets contains counters for histogram buckets decimalBuckets [decimalBucketsCount]*[bucketsPerDecimal]uint64 + // lower is the number of values, which hit the lower bucket lower uint64 + + // upper is the number of values, which hit the upper bucket upper uint64 + // sum is the sum of all the values put into Histogram sum float64 } @@ -109,28 +117,30 @@ func (h *Histogram) Update(v float64) { h.mu.Unlock() } -// Merge merges histograms -func (h *Histogram) Merge(b *Histogram) { +// Merge merges src to h +func (h *Histogram) Merge(src *Histogram) { h.mu.Lock() defer h.mu.Unlock() - b.mu.RLock() - defer b.mu.RUnlock() + src.mu.Lock() + defer src.mu.Unlock() - h.lower += b.lower - h.upper += b.upper - h.sum += b.sum + h.lower += src.lower + h.upper += src.upper + h.sum += src.sum - for i, db := range b.decimalBuckets { - if db == nil { + for i, dbSrc := range src.decimalBuckets { + if dbSrc == nil { continue } - if h.decimalBuckets[i] == nil { + dbDst := h.decimalBuckets[i] + if dbDst == nil { var b [bucketsPerDecimal]uint64 - h.decimalBuckets[i] = &b + dbDst = &b + h.decimalBuckets[i] = dbDst } - for j := range db { - h.decimalBuckets[i][j] += db[j] + for j := range dbSrc { + dbDst[j] += dbSrc[j] } } } @@ -142,7 +152,7 @@ func (h *Histogram) Merge(b 
*Histogram) { // This is required to be compatible with Prometheus-style histogram buckets // with `le` (less or equal) labels. func (h *Histogram) VisitNonZeroBuckets(f func(vmrange string, count uint64)) { - h.mu.RLock() + h.mu.Lock() if h.lower > 0 { f(lowerBucketRange, h.lower) } @@ -161,7 +171,7 @@ func (h *Histogram) VisitNonZeroBuckets(f func(vmrange string, count uint64)) { if h.upper > 0 { f(upperBucketRange, h.upper) } - h.mu.RUnlock() + h.mu.Unlock() } // NewHistogram creates and returns new histogram with the given name. @@ -249,9 +259,9 @@ func (h *Histogram) marshalTo(prefix string, w io.Writer) { } func (h *Histogram) getSum() float64 { - h.mu.RLock() + h.mu.Lock() sum := h.sum - h.mu.RUnlock() + h.mu.Unlock() return sum } diff --git a/vendor/github.com/VictoriaMetrics/metrics/metrics.go b/vendor/github.com/VictoriaMetrics/metrics/metrics.go index 51f236e32..74e97352c 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/metrics.go +++ b/vendor/github.com/VictoriaMetrics/metrics/metrics.go @@ -55,11 +55,16 @@ func RegisterSet(s *Set) { // UnregisterSet stops exporting metrics for the given s via global WritePrometheus() call. // -// Call s.UnregisterAllMetrics() after unregistering s if it is no longer used. -func UnregisterSet(s *Set) { +// If destroySet is set to true, then s.UnregisterAllMetrics() is called on s after unregistering it, +// so s becomes destroyed. Otherwise the s can be registered again in the set by passing it to RegisterSet(). +func UnregisterSet(s *Set, destroySet bool) { registeredSetsLock.Lock() delete(registeredSets, s) registeredSetsLock.Unlock() + + if destroySet { + s.UnregisterAllMetrics() + } } // RegisterMetricsWriter registers writeMetrics callback for including metrics in the output generated by WritePrometheus. 
diff --git a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go index 22a9e6ec9..e4587b717 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go +++ b/vendor/github.com/VictoriaMetrics/metrics/process_metrics_linux.go @@ -16,6 +16,11 @@ import ( // See https://github.com/prometheus/procfs/blob/a4ac0826abceb44c40fc71daed2b301db498b93e/proc_stat.go#L40 . const userHZ = 100 +// Different environments may have different page size. +// +// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6457 +var pageSizeBytes = uint64(os.Getpagesize()) + // See http://man7.org/linux/man-pages/man5/proc.5.html type procStat struct { State byte @@ -80,7 +85,7 @@ func writeProcessMetrics(w io.Writer) { WriteCounterUint64(w, "process_major_pagefaults_total", uint64(p.Majflt)) WriteCounterUint64(w, "process_minor_pagefaults_total", uint64(p.Minflt)) WriteGaugeUint64(w, "process_num_threads", uint64(p.NumThreads)) - WriteGaugeUint64(w, "process_resident_memory_bytes", uint64(p.Rss)*uint64(os.Getpagesize())) + WriteGaugeUint64(w, "process_resident_memory_bytes", uint64(p.Rss)*pageSizeBytes) WriteGaugeUint64(w, "process_start_time_seconds", uint64(startTimeSeconds)) WriteGaugeUint64(w, "process_virtual_memory_bytes", uint64(p.Vsize)) writeProcessMemMetrics(w) diff --git a/vendor/github.com/VictoriaMetrics/metrics/push.go b/vendor/github.com/VictoriaMetrics/metrics/push.go index 63a3b1b31..f33886f9b 100644 --- a/vendor/github.com/VictoriaMetrics/metrics/push.go +++ b/vendor/github.com/VictoriaMetrics/metrics/push.go @@ -32,7 +32,8 @@ type PushOptions struct { // By default the compression is enabled. DisableCompression bool - // Method is an optional of HTTP request method. + // Method is HTTP request method to use when pushing metrics to pushURL. + // // By default the Method is GET. 
Method string @@ -301,7 +302,7 @@ func newPushContext(pushURL string, opts *PushOptions) (*pushContext, error) { } method := opts.Method - if len(method) == 0 { + if method == "" { method = http.MethodGet } diff --git a/vendor/modules.txt b/vendor/modules.txt index 1950fd32b..712a41d76 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -115,7 +115,7 @@ github.com/VictoriaMetrics/easyproto # github.com/VictoriaMetrics/fastcache v1.12.2 ## explicit; go 1.13 github.com/VictoriaMetrics/fastcache -# github.com/VictoriaMetrics/metrics v1.34.1 +# github.com/VictoriaMetrics/metrics v1.35.1 ## explicit; go 1.17 github.com/VictoriaMetrics/metrics # github.com/VictoriaMetrics/metricsql v0.76.0