diff --git a/Makefile b/Makefile index d6a1f85bc..9dfff4396 100644 --- a/Makefile +++ b/Makefile @@ -261,7 +261,7 @@ golangci-lint: install-golangci-lint golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m install-golangci-lint: - which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.40.1 + which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.42.1 install-wwhrd: which wwhrd || GO111MODULE=off go get github.com/frapposelli/wwhrd diff --git a/README.md b/README.md index bc947e1d7..f7c7118ba 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Enterprise binaries can be downloaded and evaluated for free from [the releases Case studies: +* [AbiosGaming](https://docs.victoriametrics.com/CaseStudies.html#abiosgaming) * [adidas](https://docs.victoriametrics.com/CaseStudies.html#adidas) * [Adsterra](https://docs.victoriametrics.com/CaseStudies.html#adsterra) * [ARNES](https://docs.victoriametrics.com/CaseStudies.html#arnes) @@ -35,12 +36,16 @@ Case studies: * [CERN](https://docs.victoriametrics.com/CaseStudies.html#cern) * [COLOPL](https://docs.victoriametrics.com/CaseStudies.html#colopl) * [Dreamteam](https://docs.victoriametrics.com/CaseStudies.html#dreamteam) +* [Fly.io](https://docs.victoriametrics.com/CaseStudies.html#flyio) * [German Research Center for Artificial Intelligence](https://docs.victoriametrics.com/CaseStudies.html#german-research-center-for-artificial-intelligence) * [Grammarly](https://docs.victoriametrics.com/CaseStudies.html#grammarly) * [Groove X](https://docs.victoriametrics.com/CaseStudies.html#groove-x) * [Idealo.de](https://docs.victoriametrics.com/CaseStudies.html#idealode) * [MHI Vestas Offshore Wind](https://docs.victoriametrics.com/CaseStudies.html#mhi-vestas-offshore-wind) +* 
[Razorpay](https://docs.victoriametrics.com/CaseStudies.html#razorpay) +* [Percona](https://docs.victoriametrics.com/CaseStudies.html#percona) * [Sensedia](https://docs.victoriametrics.com/CaseStudies.html#sensedia) +* [Smarkets](https://docs.victoriametrics.com/CaseStudies.html#smarkets) * [Synthesio](https://docs.victoriametrics.com/CaseStudies.html#synthesio) * [Wedos.com](https://docs.victoriametrics.com/CaseStudies.html#wedoscom) * [Wix.com](https://docs.victoriametrics.com/CaseStudies.html#wixcom) @@ -604,6 +609,12 @@ The UI allows exploring query results via graphs and tables. Graphs support scro * Drag the graph to the left / right in order to move the displayed time range into the past / future. * Hold `Ctrl` (or `Cmd` on MacOS) and scroll up / down in order to zoom in / out the graph. +Query history can be navigated by holding `Ctrl` (or `Cmd` on MacOS) and pressing `up` or `down` arrows on the keyboard while the cursor is located in the query input field. + +When querying the [backfilled data](https://docs.victoriametrics.com/#backfilling), it may be useful to disable the response cache by clicking the `Enable cache` checkbox. + +See the [example VMUI at VictoriaMetrics playground](https://play.victoriametrics.com/select/accounting/1/6a716b0f-38bc-4856-90ce-448fd713e3fe/prometheus/graph/?g0.expr=100%20*%20sum(rate(process_cpu_seconds_total))%20by%20(job)&g0.range_input=1d). + ## How to build from sources @@ -1540,6 +1551,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for big merges. Default value is used if set to 0 -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 
1s) may be used for reducing disk space usage for timestamp data (default 1ms) + -datadog.maxInsertRequestSize size + The maximum size in bytes of a single DataDog POST request to /api/v1/series + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 67108864) -dedup.minScrapeInterval duration Leave only the first sample in every time series per each discrete interval equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details -deleteAuthKey string @@ -1705,8 +1719,11 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -promscrape.maxScrapeSize size The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216) + -promscrape.minResponseSizeForStreamParse size + The minimum target response size for automatic switching to stream parsing mode, which can reduce memory usage. See https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 1000000) -promscrape.noStaleMarkers - Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. See also https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode + Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. This option also disables populating the scrape_series_added metric. See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series -promscrape.openstackSDCheckInterval duration Interval for checking for changes in openstack API server. 
This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s) -promscrape.seriesLimitPerTarget int @@ -1718,7 +1735,7 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li -promscrape.suppressScrapeErrors Whether to suppress scrape errors logging. The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed -relabelConfig string - Optional path to a file with relabeling rules, which are applied to all the ingested metrics. See https://docs.victoriametrics.com/#relabeling for details + Optional path to a file with relabeling rules, which are applied to all the ingested metrics. See https://docs.victoriametrics.com/#relabeling for details. The config is reloaded on SIGHUP signal -relabelDebug Whether to log metrics before and after relabeling with -relabelConfig. If the -relabelDebug is enabled, then the metrics aren't sent to storage. This is useful for debugging the relabeling configs -retentionPeriod value @@ -1795,6 +1812,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of unique series can be added to the storage during the last 24 hours. Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries -storage.maxHourlySeries int The maximum number of unique series can be added to the storage during the last hour. Excess series are logged and dropped. This can be useful for limiting series cardinality. 
See also -storage.maxDailySeries + -storage.minFreeDiskSpaceBytes size + The minimum free disk space at -storageDataPath after which the storage stops accepting new data + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 10000000) -storageDataPath string Path to storage data (default "victoria-metrics-data") -tls diff --git a/app/victoria-metrics/main.go b/app/victoria-metrics/main.go index 25066ca36..025974808 100644 --- a/app/victoria-metrics/main.go +++ b/app/victoria-metrics/main.go @@ -99,7 +99,9 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { {"/vmui", "Web UI"}, {"/targets", "discovered targets list"}, {"/api/v1/targets", "advanced information about discovered targets in JSON format"}, + {"/config", "-promscrape.config contents"}, {"/metrics", "available service metrics"}, + {"/flags", "command-line flags"}, {"/api/v1/status/tsdb", "tsdb status page"}, {"/api/v1/status/top_queries", "top queries"}, {"/api/v1/status/active_queries", "active queries"}, diff --git a/app/vmagent/README.md b/app/vmagent/README.md index 93b612d0a..564b582b6 100644 --- a/app/vmagent/README.md +++ b/app/vmagent/README.md @@ -302,12 +302,14 @@ You can read more about relabeling in the following articles: * If the scrape target is removed from the list of targets, then stale markers are sent for all the metrics scraped from this target. * Stale markers are sent for all the scraped metrics on graceful shutdown of `vmagent`. -Prometheus staleness markers aren't sent to `-remoteWrite.url` in [stream parsing mode](#stream-parsing-mode) or if `-promscrape.noStaleMarkers` command-line is set. +Prometheus staleness markers' tracking needs additional memory, since it must store the previous response body per each scrape target in order to compare it to the current response body. The memory usage may be reduced by passing `-promscrape.noStaleMarkers` command-line flag to `vmagent`. This disables staleness tracking. 
This also disables tracking the number of new time series per each scrape with the auto-generated `scrape_series_added` metric. See [these docs](https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series) for details. ## Stream parsing mode -By default `vmagent` reads the full response from scrape target into memory, then parses it, applies [relabeling](#relabeling) and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works good for the majority of cases when the scrape target exposes small number of metrics (e.g. less than 10 thousand). But this mode may take big amounts of memory when the scrape target exposes big number of metrics. In this case it is recommended enabling stream parsing mode. When this mode is enabled, then `vmagent` reads response from scrape target in chunks, then immediately processes every chunk and pushes the processed metrics to remote storage. This allows saving memory when scraping targets that expose millions of metrics. Stream parsing mode may be enabled in the following places: +By default `vmagent` reads the full response body from the scrape target into memory, then parses it, applies [relabeling](#relabeling) and then pushes the resulting metrics to the configured `-remoteWrite.url`. This mode works well for the majority of cases when the scrape target exposes a small number of metrics (e.g. less than 10 thousand). But this mode may take big amounts of memory when the scrape target exposes a big number of metrics. In this case it is recommended to enable stream parsing mode. When this mode is enabled, `vmagent` reads the response from the scrape target in chunks, then immediately processes every chunk and pushes the processed metrics to remote storage. This allows saving memory when scraping targets that expose millions of metrics. 
+ +Stream parsing mode is automatically enabled for scrape targets returning response bodies with sizes bigger than the `-promscrape.minResponseSizeForStreamParse` command-line flag value. Additionally, the stream parsing mode can be explicitly enabled in the following places: - Via `-promscrape.streamParse` command-line flag. In this case all the scrape targets defined in the file pointed by `-promscrape.config` are scraped in stream parsing mode. - Via `stream_parse: true` option at `scrape_configs` section. In this case all the scrape targets defined in this section are scraped in stream parsing mode. @@ -329,7 +331,7 @@ scrape_configs: 'match[]': ['{__name__!=""}'] ``` -Note that `sample_limit` option doesn't prevent from data push to remote storage if stream parsing is enabled because the parsed data is pushed to remote storage as soon as it is parsed. +Note that `sample_limit` and `series_limit` options cannot be used in stream parsing mode because the parsed data is pushed to remote storage as soon as it is parsed. ## Scraping big number of targets @@ -449,7 +451,8 @@ It may be useful to perform `vmagent` rolling update without any scrape loss. as `vmagent` establishes at least a single TCP connection per target. * If `vmagent` uses too big amounts of memory, then the following options can help: - * Enabling stream parsing. See [these docs](#stream-parsing-mode). + * Disabling staleness tracking with `-promscrape.noStaleMarkers` option. See [these docs](#prometheus-staleness-markers). + * Enabling stream parsing mode. See [these docs](#stream-parsing-mode). * Reducing the number of output queues with `-remoteWrite.queues` command-line option. * Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option. Another option is to reduce memory limits in Docker and/or Kuberntes if `vmagent` runs under these systems. 
* Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`, where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems. @@ -706,6 +709,9 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . -csvTrimTimestamp duration Trim timestamps when importing csv data to this duration. Minimum practical duration is 1ms. Higher duration (i.e. 1s) may be used for reducing disk space usage for timestamp data (default 1ms) + -datadog.maxInsertRequestSize size + The maximum size in bytes of a single DataDog POST request to /api/v1/series + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 67108864) -dryRun Whether to check only config files without running vmagent. The following files are checked: -promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse -enableTCP6 @@ -853,8 +859,11 @@ See the docs at https://docs.victoriametrics.com/vmagent.html . -promscrape.maxScrapeSize size The maximum size of scrape response in bytes to process from Prometheus targets. Bigger responses are rejected Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16777216) + -promscrape.minResponseSizeForStreamParse size + The minimum target response size for automatic switching to stream parsing mode, which can reduce memory usage. See https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode + Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 1000000) -promscrape.noStaleMarkers - Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. 
See also https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode + Whether to disable sending Prometheus stale markers for metrics when scrape target disappears. This option may reduce memory usage if stale markers aren't needed for your setup. This option also disables populating the scrape_series_added metric. See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series -promscrape.openstackSDCheckInterval duration Interval for checking for changes in openstack API server. This works only if openstack_sd_configs is configured in '-promscrape.config' file. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config for details (default 30s) -promscrape.seriesLimitPerTarget int diff --git a/app/vmagent/main.go b/app/vmagent/main.go index 7b2e02ace..befe066eb 100644 --- a/app/vmagent/main.go +++ b/app/vmagent/main.go @@ -159,7 +159,9 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { httpserver.WriteAPIHelp(w, [][2]string{ {"/targets", "discovered targets list"}, {"/api/v1/targets", "advanced information about discovered targets in JSON format"}, + {"/config", "-promscrape.config contents"}, {"/metrics", "available service metrics"}, + {"/flags", "command-line flags"}, {"/-/reload", "reload configuration"}, }) return true @@ -259,6 +261,11 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { promscrapeTargetsRequests.Inc() promscrape.WriteHumanReadableTargetsStatus(w, r) return true + case "/config": + promscrapeConfigRequests.Inc() + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + promscrape.WriteConfigData(w) + return true case "/api/v1/targets": promscrapeAPIV1TargetsRequests.Inc() w.Header().Set("Content-Type", "application/json; charset=utf-8") @@ -427,6 +434,8 @@ var ( promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`) promscrapeAPIV1TargetsRequests = 
metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/targets"}`) + promscrapeConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/config"}`) + promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`) ) diff --git a/app/vmalert/README.md b/app/vmalert/README.md index d62cf6ae4..d7169df6c 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -351,12 +351,12 @@ See full description for these flags in `./vmalert --help`. ## Monitoring -`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. -We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported +`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. +We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported metrics may be analyzed later. Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. -If you have suggestions for improvements or have found a bug - please open an issue on github or add +If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard. @@ -496,6 +496,8 @@ The shortlist of configuration flags is the following: Optional bearer auth token to use for -remoteRead.url. -remoteRead.bearerTokenFile string Optional path to bearer token file to use for -remoteRead.url. + -remoteRead.disablePathAppend + Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url. -remoteRead.ignoreRestoreErrors Whether to ignore errors from remote storage when restoring alerts state on startup. 
(default true) -remoteRead.lookback duration @@ -511,7 +513,7 @@ The shortlist of configuration flags is the following: -remoteRead.tlsServerName string Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used -remoteRead.url vmalert - Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428 + Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428. See also -remoteRead.disablePathAppend -remoteWrite.basicAuth.password string Optional basic auth password for -remoteWrite.url -remoteWrite.basicAuth.passwordFile string diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 4aa77adea..d654246c8 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -163,7 +163,13 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([] // so the hash key will be consistent on restore s.SetLabel(k, v) } - + // set additional labels to identify group and rule name + if ar.Name != "" { + s.SetLabel(alertNameLabel, ar.Name) + } + if !*disableAlertGroupLabel && ar.GroupName != "" { + s.SetLabel(alertGroupNameLabel, ar.GroupName) + } a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert if err != nil { return nil, fmt.Errorf("failed to create alert: %s", err) @@ -178,13 +184,11 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([] // if alert with For > 0 prevT := time.Time{} - //activeAt := time.Time{} for i := range s.Values { at := time.Unix(s.Timestamps[i], 0) if at.Sub(prevT) > ar.EvalInterval { // reset to Pending if there are gaps > 
EvalInterval between DPs a.State = notifier.StatePending - //activeAt = at a.Start = at } else if at.Sub(a.Start) >= ar.For { a.State = notifier.StateFiring @@ -231,6 +235,14 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e // so the hash key will be consistent on restore m.SetLabel(k, v) } + // set additional labels to identify group and rule name + // set additional labels to identify group and rule name + if ar.Name != "" { + m.SetLabel(alertNameLabel, ar.Name) + } + if !*disableAlertGroupLabel && ar.GroupName != "" { + m.SetLabel(alertGroupNameLabel, ar.GroupName) + } h := hash(m) if _, ok := updated[h]; ok { // duplicate may be caused by extra labels @@ -352,11 +364,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notif Start: start, Expr: ar.Expr, } - // label defined here to make override possible by - // time series labels. - if !*disableAlertGroupLabel && ar.GroupName != "" { - a.Labels[alertGroupNameLabel] = ar.GroupName - } for _, l := range m.Labels { // drop __name__ to be consistent with Prometheus alerting if l.Name == "__name__" { @@ -415,7 +422,7 @@ func (ar *AlertingRule) AlertsAPI() []*APIAlert { } func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert { - return &APIAlert{ + aa := &APIAlert{ // encode as strings to avoid rounding ID: fmt.Sprintf("%d", a.ID), GroupID: fmt.Sprintf("%d", a.GroupID), @@ -427,8 +434,13 @@ func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert { Annotations: a.Annotations, State: a.State.String(), ActiveAt: a.Start, + Restored: a.Restored, Value: strconv.FormatFloat(a.Value, 'f', -1, 32), } + if alertURLGeneratorFn != nil { + aa.SourceLink = alertURLGeneratorFn(a) + } + return aa } const ( @@ -443,43 +455,42 @@ const ( alertStateLabel = "alertstate" // alertGroupNameLabel defines the label name attached for generated time series. + // attaching this label may be disabled via `-disableAlertgroupLabel` flag. 
alertGroupNameLabel = "alertgroup" ) // alertToTimeSeries converts the given alert with the given timestamp to timeseries func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries { var tss []prompbmarshal.TimeSeries - tss = append(tss, alertToTimeSeries(ar.Name, a, timestamp)) + tss = append(tss, alertToTimeSeries(a, timestamp)) if ar.For > 0 { - tss = append(tss, alertForToTimeSeries(ar.Name, a, timestamp)) + tss = append(tss, alertForToTimeSeries(a, timestamp)) } return tss } -func alertToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries { +func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries { labels := make(map[string]string) for k, v := range a.Labels { labels[k] = v } labels["__name__"] = alertMetricName - labels[alertNameLabel] = name labels[alertStateLabel] = a.State.String() return newTimeSeries([]float64{1}, []int64{timestamp}, labels) } // alertForToTimeSeries returns a timeseries that represents // state of active alerts, where value is time when alert become active -func alertForToTimeSeries(name string, a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries { +func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries { labels := make(map[string]string) for k, v := range a.Labels { labels[k] = v } labels["__name__"] = alertForStateMetricName - labels[alertNameLabel] = name return newTimeSeries([]float64{float64(a.Start.Unix())}, []int64{timestamp}, labels) } -// Restore restores the state of active alerts basing on previously written timeseries. +// Restore restores the state of active alerts basing on previously written time series. // Restore restores only Start field. Field State will be always Pending and supposed // to be updated on next Exec, as well as Value field. // Only rules with For > 0 will be restored. 
@@ -507,23 +518,13 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb } for _, m := range qMetrics { - labels := m.Labels - m.Labels = make([]datasource.Label, 0) - // drop all extra labels, so hash key will - // be identical to time series received in Exec - for _, l := range labels { - if l.Name == alertNameLabel || l.Name == alertGroupNameLabel { - continue - } - m.Labels = append(m.Labels, l) - } - a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn) if err != nil { return fmt.Errorf("failed to create alert: %w", err) } a.ID = hash(m) a.State = notifier.StatePending + a.Restored = true ar.alerts[a.ID] = a logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start) } diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go index 0be0c055a..cff88f53b 100644 --- a/app/vmalert/alerting_test.go +++ b/app/vmalert/alerting_test.go @@ -27,7 +27,6 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ "__name__": alertMetricName, alertStateLabel: notifier.StateFiring.String(), - alertNameLabel: "instant", }), }, }, @@ -41,7 +40,6 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ "__name__": alertMetricName, alertStateLabel: notifier.StateFiring.String(), - alertNameLabel: "instant extra labels", "job": "foo", "instance": "bar", }), @@ -57,7 +55,6 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ "__name__": alertMetricName, alertStateLabel: notifier.StateFiring.String(), - alertNameLabel: "instant labels override", }), }, }, @@ -68,13 +65,11 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ "__name__": alertMetricName, alertStateLabel: notifier.StateFiring.String(), - 
alertNameLabel: "for", }), newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertForStateMetricName, - alertNameLabel: "for", + "__name__": alertForStateMetricName, }), }, }, @@ -85,13 +80,11 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{ "__name__": alertMetricName, alertStateLabel: notifier.StatePending.String(), - alertNameLabel: "for pending", }), newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())}, []int64{timestamp.UnixNano()}, map[string]string{ - "__name__": alertForStateMetricName, - alertNameLabel: "for pending", + "__name__": alertForStateMetricName, }), }, }, @@ -109,23 +102,27 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) { func TestAlertingRule_Exec(t *testing.T) { const defaultStep = 5 * time.Millisecond + type testAlert struct { + labels []string + alert *notifier.Alert + } testCases := []struct { rule *AlertingRule steps [][]datasource.Metric - expAlerts map[uint64]*notifier.Alert + expAlerts []testAlert }{ { newTestAlertingRule("empty", 0), [][]datasource.Metric{}, - map[uint64]*notifier.Alert{}, + nil, }, { newTestAlertingRule("empty labels", 0), [][]datasource.Metric{ {datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}}, }, - map[uint64]*notifier.Alert{ - hash(datasource.Metric{}): {State: notifier.StateFiring}, + []testAlert{ + {alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -133,8 +130,8 @@ func TestAlertingRule_Exec(t *testing.T) { [][]datasource.Metric{ {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -143,8 +140,8 @@ func TestAlertingRule_Exec(t *testing.T) { {metricWithLabels(t, "name", "foo")}, {}, }, - 
map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}, }, }, { @@ -154,8 +151,8 @@ func TestAlertingRule_Exec(t *testing.T) { {}, {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -166,8 +163,8 @@ func TestAlertingRule_Exec(t *testing.T) { {metricWithLabels(t, "name", "foo")}, {}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}, }, }, { @@ -179,7 +176,7 @@ func TestAlertingRule_Exec(t *testing.T) { {}, {}, }, - map[uint64]*notifier.Alert{}, + nil, }, { newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0), @@ -191,8 +188,8 @@ func TestAlertingRule_Exec(t *testing.T) { {}, {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -204,10 +201,10 @@ func TestAlertingRule_Exec(t *testing.T) { metricWithLabels(t, "name", "foo2"), }, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, - hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateFiring}, - hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, + {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}}, + {labels: []string{"name", "foo2"}, alert: 
&notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -220,9 +217,9 @@ func TestAlertingRule_Exec(t *testing.T) { // 1: fire first alert // 2: fire second alert, set first inactive // 3: fire third alert, set second inactive, delete first one - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateInactive}, - hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateInactive}}, + {labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -230,8 +227,8 @@ func TestAlertingRule_Exec(t *testing.T) { [][]datasource.Metric{ {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}, }, }, { @@ -240,8 +237,8 @@ func TestAlertingRule_Exec(t *testing.T) { {metricWithLabels(t, "name", "foo")}, {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, { @@ -252,7 +249,7 @@ func TestAlertingRule_Exec(t *testing.T) { // empty step to reset and delete pending alerts {}, }, - map[uint64]*notifier.Alert{}, + nil, }, { newTestAlertingRule("for-pending=>firing=>inactive", defaultStep), @@ -262,8 +259,8 @@ func TestAlertingRule_Exec(t *testing.T) { // empty step to reset pending alerts {}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}, }, }, { @@ -275,8 +272,8 @@ func TestAlertingRule_Exec(t *testing.T) { {}, {metricWithLabels(t, "name", "foo")}, }, - 
map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}, }, }, { @@ -289,8 +286,8 @@ func TestAlertingRule_Exec(t *testing.T) { {metricWithLabels(t, "name", "foo")}, {metricWithLabels(t, "name", "foo")}, }, - map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring}, + []testAlert{ + {labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}, }, }, } @@ -312,7 +309,15 @@ func TestAlertingRule_Exec(t *testing.T) { if len(tc.rule.alerts) != len(tc.expAlerts) { t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts)) } - for key, exp := range tc.expAlerts { + expAlerts := make(map[uint64]*notifier.Alert) + for _, ta := range tc.expAlerts { + labels := ta.labels + labels = append(labels, alertNameLabel) + labels = append(labels, tc.rule.Name) + h := hash(metricWithLabels(t, labels...)) + expAlerts[h] = ta.alert + } + for key, exp := range expAlerts { got, ok := tc.rule.alerts[key] if !ok { t.Fatalf("expected to have key %d", key) @@ -468,6 +473,11 @@ func TestAlertingRule_ExecRange(t *testing.T) { var j int for _, series := range tc.data { for _, timestamp := range series.Timestamps { + a := tc.expAlerts[j] + if a.Labels == nil { + a.Labels = make(map[string]string) + } + a.Labels[alertNameLabel] = tc.rule.Name expTS = append(expTS, tc.rule.alertToTimeSeries(tc.expAlerts[j], timestamp)...) 
j++ } @@ -496,7 +506,6 @@ func TestAlertingRule_Restore(t *testing.T) { []datasource.Metric{ metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()), "__name__", alertForStateMetricName, - alertNameLabel, "", ), }, map[uint64]*notifier.Alert{ @@ -509,7 +518,7 @@ func TestAlertingRule_Restore(t *testing.T) { []datasource.Metric{ metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()), "__name__", alertForStateMetricName, - alertNameLabel, "", + alertNameLabel, "metric labels", alertGroupNameLabel, "groupID", "foo", "bar", "namespace", "baz", @@ -517,6 +526,8 @@ func TestAlertingRule_Restore(t *testing.T) { }, map[uint64]*notifier.Alert{ hash(metricWithLabels(t, + alertNameLabel, "metric labels", + alertGroupNameLabel, "groupID", "foo", "bar", "namespace", "baz", )): {State: notifier.StatePending, @@ -528,7 +539,6 @@ func TestAlertingRule_Restore(t *testing.T) { []datasource.Metric{ metricWithValueAndLabels(t, float64(time.Now().Truncate(time.Hour).Unix()), "__name__", alertForStateMetricName, - alertNameLabel, "", "foo", "bar", "namespace", "baz", // extra labels set by rule @@ -645,18 +655,20 @@ func TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 1, "instance", "bar"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "region", "east", "instance", "foo")): { + hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "foo")): { Annotations: map[string]string{}, Labels: map[string]string{ - "region": "east", - "instance": "foo", + alertNameLabel: "common", + "region": "east", + "instance": "foo", }, }, - hash(metricWithLabels(t, "region", "east", "instance", "bar")): { + hash(metricWithLabels(t, alertNameLabel, "common", "region", "east", "instance", "bar")): { Annotations: map[string]string{}, Labels: map[string]string{ - "region": "east", - "instance": "bar", + alertNameLabel: "common", + "region": "east", + "instance": "bar", }, }, }, @@ -679,20 +691,22 @@ func 
TestAlertingRule_Template(t *testing.T) { metricWithValueAndLabels(t, 10, "instance", "bar"), }, map[uint64]*notifier.Alert{ - hash(metricWithLabels(t, "region", "east", "instance", "foo")): { + hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "foo")): { Labels: map[string]string{ - "instance": "foo", - "region": "east", + alertNameLabel: "override label", + "instance": "foo", + "region": "east", }, Annotations: map[string]string{ "summary": `Too high connection number for "foo" for region east`, "description": `It is 2 connections for "foo"`, }, }, - hash(metricWithLabels(t, "region", "east", "instance", "bar")): { + hash(metricWithLabels(t, alertNameLabel, "override label", "region", "east", "instance", "bar")): { Labels: map[string]string{ - "instance": "bar", - "region": "east", + alertNameLabel: "override label", + "instance": "bar", + "region": "east", }, Annotations: map[string]string{ "summary": `Too high connection number for "bar" for region east`, diff --git a/app/vmalert/datasource/init.go b/app/vmalert/datasource/init.go index 3e28d6e43..72cfddfc9 100644 --- a/app/vmalert/datasource/init.go +++ b/app/vmalert/datasource/init.go @@ -52,6 +52,9 @@ func Init(extraParams []Param) (QuerierBuilder, error) { return nil, fmt.Errorf("failed to create transport: %w", err) } tr.MaxIdleConnsPerHost = *maxIdleConnections + if tr.MaxIdleConns != 0 && tr.MaxIdleConns < tr.MaxIdleConnsPerHost { + tr.MaxIdleConns = tr.MaxIdleConnsPerHost + } if *roundDigits > 0 { extraParams = append(extraParams, Param{ diff --git a/app/vmalert/datasource/vm.go b/app/vmalert/datasource/vm.go index a6058dec8..5d3dc6267 100644 --- a/app/vmalert/datasource/vm.go +++ b/app/vmalert/datasource/vm.go @@ -24,18 +24,20 @@ type VMStorage struct { evaluationInterval time.Duration extraLabels []string extraParams []Param + disablePathAppend bool } // Clone makes clone of VMStorage, shares http client. 
func (s *VMStorage) Clone() *VMStorage { return &VMStorage{ - c: s.c, - authCfg: s.authCfg, - datasourceURL: s.datasourceURL, - lookBack: s.lookBack, - queryStep: s.queryStep, - appendTypePrefix: s.appendTypePrefix, - dataSourceType: s.dataSourceType, + c: s.c, + authCfg: s.authCfg, + datasourceURL: s.datasourceURL, + lookBack: s.lookBack, + queryStep: s.queryStep, + appendTypePrefix: s.appendTypePrefix, + dataSourceType: s.dataSourceType, + disablePathAppend: s.disablePathAppend, } } @@ -57,15 +59,16 @@ func (s *VMStorage) BuildWithParams(params QuerierParams) Querier { } // NewVMStorage is a constructor for VMStorage -func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Duration, queryStep time.Duration, appendTypePrefix bool, c *http.Client) *VMStorage { +func NewVMStorage(baseURL string, authCfg *promauth.Config, lookBack time.Duration, queryStep time.Duration, appendTypePrefix bool, c *http.Client, disablePathAppend bool) *VMStorage { return &VMStorage{ - c: c, - authCfg: authCfg, - datasourceURL: strings.TrimSuffix(baseURL, "/"), - appendTypePrefix: appendTypePrefix, - lookBack: lookBack, - queryStep: queryStep, - dataSourceType: NewPrometheusType(), + c: c, + authCfg: authCfg, + datasourceURL: strings.TrimSuffix(baseURL, "/"), + appendTypePrefix: appendTypePrefix, + lookBack: lookBack, + queryStep: queryStep, + dataSourceType: NewPrometheusType(), + disablePathAppend: disablePathAppend, } } @@ -132,12 +135,12 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) { resp, err := s.c.Do(req.WithContext(ctx)) if err != nil { - return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err) + return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err) } if resp.StatusCode != http.StatusOK { body, _ := ioutil.ReadAll(resp.Body) _ = resp.Body.Close() - return nil, fmt.Errorf("unexpected 
response code %d for %s. Response body %s", resp.StatusCode, req.URL, body) + return nil, fmt.Errorf("unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL.Redacted(), body) } return resp, nil } diff --git a/app/vmalert/datasource/vm_graphite_api.go b/app/vmalert/datasource/vm_graphite_api.go index 3c6a2ab34..36b89e187 100644 --- a/app/vmalert/datasource/vm_graphite_api.go +++ b/app/vmalert/datasource/vm_graphite_api.go @@ -38,7 +38,7 @@ func (r graphiteResponse) metrics() []Metric { func parseGraphiteResponse(req *http.Request, resp *http.Response) ([]Metric, error) { r := &graphiteResponse{} if err := json.NewDecoder(resp.Body).Decode(r); err != nil { - return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL, err) + return nil, fmt.Errorf("error parsing graphite metrics for %s: %w", req.URL.Redacted(), err) } return r.metrics(), nil } diff --git a/app/vmalert/datasource/vm_prom_api.go b/app/vmalert/datasource/vm_prom_api.go index 62154c313..2faba4d27 100644 --- a/app/vmalert/datasource/vm_prom_api.go +++ b/app/vmalert/datasource/vm_prom_api.go @@ -82,10 +82,10 @@ const ( func parsePrometheusResponse(req *http.Request, resp *http.Response) ([]Metric, error) { r := &promResponse{} if err := json.NewDecoder(resp.Body).Decode(r); err != nil { - return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL, err) + return nil, fmt.Errorf("error parsing prometheus metrics for %s: %w", req.URL.Redacted(), err) } if r.Status == statusError { - return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error) + return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL.Redacted(), r.ErrorType, r.Error) } if r.Status != statusSuccess { return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status) @@ -118,7 +118,9 @@ func (s *VMStorage) setPrometheusInstantReqParams(r *http.Request, query string, if s.appendTypePrefix { 
r.URL.Path += prometheusPrefix } - r.URL.Path += prometheusInstantPath + if !s.disablePathAppend { + r.URL.Path += prometheusInstantPath + } q := r.URL.Query() if s.lookBack > 0 { timestamp = timestamp.Add(-s.lookBack) @@ -136,7 +138,9 @@ func (s *VMStorage) setPrometheusRangeReqParams(r *http.Request, query string, s if s.appendTypePrefix { r.URL.Path += prometheusPrefix } - r.URL.Path += prometheusRangePath + if !s.disablePathAppend { + r.URL.Path += prometheusRangePath + } q := r.URL.Query() q.Add("start", fmt.Sprintf("%d", start.Unix())) q.Add("end", fmt.Sprintf("%d", end.Unix())) diff --git a/app/vmalert/datasource/vm_test.go b/app/vmalert/datasource/vm_test.go index 801974d9c..ba406a46c 100644 --- a/app/vmalert/datasource/vm_test.go +++ b/app/vmalert/datasource/vm_test.go @@ -83,7 +83,7 @@ func TestVMInstantQuery(t *testing.T) { if err != nil { t.Fatalf("unexpected: %s", err) } - s := NewVMStorage(srv.URL, authCfg, time.Minute, 0, false, srv.Client()) + s := NewVMStorage(srv.URL, authCfg, time.Minute, 0, false, srv.Client(), false) p := NewPrometheusType() pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second}) @@ -193,7 +193,7 @@ func TestVMRangeQuery(t *testing.T) { if err != nil { t.Fatalf("unexpected: %s", err) } - s := NewVMStorage(srv.URL, authCfg, time.Minute, 0, false, srv.Client()) + s := NewVMStorage(srv.URL, authCfg, time.Minute, 0, false, srv.Client(), false) p := NewPrometheusType() pq := s.BuildWithParams(QuerierParams{DataSourceType: &p, EvaluationInterval: 15 * time.Second}) @@ -252,6 +252,17 @@ func TestRequestParams(t *testing.T) { checkEqualString(t, prometheusInstantPath, r.URL.Path) }, }, + { + "prometheus path with disablePathAppend", + false, + &VMStorage{ + dataSourceType: NewPrometheusType(), + disablePathAppend: true, + }, + func(t *testing.T, r *http.Request) { + checkEqualString(t, "", r.URL.Path) + }, + }, { "prometheus prefix", false, @@ -263,6 +274,18 @@ func TestRequestParams(t 
*testing.T) { checkEqualString(t, prometheusPrefix+prometheusInstantPath, r.URL.Path) }, }, + { + "prometheus prefix with disablePathAppend", + false, + &VMStorage{ + dataSourceType: NewPrometheusType(), + appendTypePrefix: true, + disablePathAppend: true, + }, + func(t *testing.T, r *http.Request) { + checkEqualString(t, prometheusPrefix, r.URL.Path) + }, + }, { "prometheus range path", true, @@ -273,6 +296,17 @@ func TestRequestParams(t *testing.T) { checkEqualString(t, prometheusRangePath, r.URL.Path) }, }, + { + "prometheus range path with disablePathAppend", + true, + &VMStorage{ + dataSourceType: NewPrometheusType(), + disablePathAppend: true, + }, + func(t *testing.T, r *http.Request) { + checkEqualString(t, "", r.URL.Path) + }, + }, { "prometheus range prefix", true, @@ -284,6 +318,18 @@ func TestRequestParams(t *testing.T) { checkEqualString(t, prometheusPrefix+prometheusRangePath, r.URL.Path) }, }, + { + "prometheus range prefix with disablePathAppend", + true, + &VMStorage{ + dataSourceType: NewPrometheusType(), + appendTypePrefix: true, + disablePathAppend: true, + }, + func(t *testing.T, r *http.Request) { + checkEqualString(t, prometheusPrefix, r.URL.Path) + }, + }, { "graphite path", false, diff --git a/app/vmalert/group.go b/app/vmalert/group.go index e80b1e6e8..47fab99d2 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -123,6 +123,9 @@ func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule { // ID return unique group ID that consists of // rules file and group name func (g *Group) ID() uint64 { + g.mu.RLock() + defer g.mu.RUnlock() + hash := fnv.New64a() hash.Write([]byte(g.File)) hash.Write([]byte("\xff")) diff --git a/app/vmalert/group_test.go b/app/vmalert/group_test.go index 629f705d8..3910f4170 100644 --- a/app/vmalert/group_test.go +++ b/app/vmalert/group_test.go @@ -192,7 +192,14 @@ func TestGroupStart(t *testing.T) { // add rule labels - see config/testdata/rules1-good.rules alert1.Labels["label"] = 
"bar" alert1.Labels["host"] = inst1 - alert1.ID = hash(m1) + // add service labels + alert1.Labels[alertNameLabel] = alert1.Name + alert1.Labels[alertGroupNameLabel] = g.Name + var labels1 []string + for k, v := range alert1.Labels { + labels1 = append(labels1, k, v) + } + alert1.ID = hash(metricWithLabels(t, labels1...)) alert2, err := r.newAlert(m2, time.Now(), nil) if err != nil { @@ -204,7 +211,14 @@ func TestGroupStart(t *testing.T) { // add rule labels - see config/testdata/rules1-good.rules alert2.Labels["label"] = "bar" alert2.Labels["host"] = inst2 - alert2.ID = hash(m2) + // add service labels + alert2.Labels[alertNameLabel] = alert2.Name + alert2.Labels[alertGroupNameLabel] = g.Name + var labels2 []string + for k, v := range alert2.Labels { + labels2 = append(labels2, k, v) + } + alert2.ID = hash(metricWithLabels(t, labels2...)) finished := make(chan struct{}) fs.add(m1) diff --git a/app/vmalert/helpers_test.go b/app/vmalert/helpers_test.go index a4f99cddc..bc0ce54cb 100644 --- a/app/vmalert/helpers_test.go +++ b/app/vmalert/helpers_test.go @@ -205,7 +205,8 @@ func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error { }*/ } if len(expTS.Labels) != len(gotTS.Labels) { - return fmt.Errorf("expected number of labels %d; got %d", len(expTS.Labels), len(gotTS.Labels)) + return fmt.Errorf("expected number of labels %d (%v); got %d (%v)", + len(expTS.Labels), expTS.Labels, len(gotTS.Labels), gotTS.Labels) } for i, exp := range expTS.Labels { got := gotTS.Labels[i] diff --git a/app/vmalert/main.go b/app/vmalert/main.go index fa8ed3753..775f756b2 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -59,6 +59,8 @@ eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{ dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. 
The `-rule` flag must be specified.") ) +var alertURLGeneratorFn notifier.AlertURLGenerator + func main() { // Write flags and help message to stdout, since it is easier to grep or pipe. flag.CommandLine.SetOutput(os.Stdout) @@ -79,15 +81,22 @@ func main() { } return } + + eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS()) + if err != nil { + logger.Fatalf("failed to init `external.url`: %s", err) + } + notifier.InitTemplateFunc(eu) + alertURLGeneratorFn, err = getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates) + if err != nil { + logger.Fatalf("failed to init `external.alert.source`: %s", err) + } + if *replayFrom != "" || *replayTo != "" { rw, err := remotewrite.Init(context.Background()) if err != nil { logger.Fatalf("failed to init remoteWrite: %s", err) } - eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS()) - if err != nil { - logger.Fatalf("failed to init `external.url`: %s", err) - } notifier.InitTemplateFunc(eu) groupsCfg, err := config.Parse(*rulePath, *validateTemplates, *validateExpressions) if err != nil { @@ -118,11 +127,16 @@ func main() { logger.Fatalf("cannot parse configuration file: %s", err) } + // Register SIGHUP handler for config re-read just before manager.start call. + // This guarantees that the config will be re-read if the signal arrives during manager.start call. 
+ // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240 + sighupCh := procutil.NewSighupChan() + if err := manager.start(ctx, groupsCfg); err != nil { logger.Fatalf("failed to start: %s", err) } - go configReload(ctx, manager, groupsCfg) + go configReload(ctx, manager, groupsCfg, sighupCh) rh := &requestHandler{m: manager} go httpserver.Serve(*httpListenAddr, rh.handler) @@ -148,20 +162,10 @@ func newManager(ctx context.Context) (*manager, error) { if err != nil { return nil, fmt.Errorf("failed to init datasource: %w", err) } - eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS()) - if err != nil { - return nil, fmt.Errorf("failed to init `external.url`: %w", err) - } - notifier.InitTemplateFunc(eu) - aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates) - if err != nil { - return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err) - } - nts, err := notifier.Init(aug) + nts, err := notifier.Init(alertURLGeneratorFn) if err != nil { return nil, fmt.Errorf("failed to init notifier: %w", err) } - manager := &manager{ groups: make(map[uint64]*Group), querierBuilder: q, @@ -246,12 +250,7 @@ See the docs at https://docs.victoriametrics.com/vmalert.html . flagutil.Usage(s) } -func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) { - // Register SIGHUP handler for config re-read just before manager.start call. - // This guarantees that the config will be re-read if the signal arrives during manager.start call. 
- // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240 - sighupCh := procutil.NewSighupChan() - +func configReload(ctx context.Context, m *manager, groupsCfg []config.Group, sighupCh <-chan os.Signal) { var configCheckCh <-chan time.Time if *rulesCheckInterval > 0 { ticker := time.NewTicker(*rulesCheckInterval) diff --git a/app/vmalert/main_test.go b/app/vmalert/main_test.go index 8a4174ce3..7b927805a 100644 --- a/app/vmalert/main_test.go +++ b/app/vmalert/main_test.go @@ -102,8 +102,9 @@ groups: } syncCh := make(chan struct{}) + sighupCh := procutil.NewSighupChan() go func() { - configReload(ctx, m, nil) + configReload(ctx, m, nil, sighupCh) close(syncCh) }() diff --git a/app/vmalert/notifier/alert.go b/app/vmalert/notifier/alert.go index d345791e5..c2889b94f 100644 --- a/app/vmalert/notifier/alert.go +++ b/app/vmalert/notifier/alert.go @@ -34,6 +34,8 @@ type Alert struct { Value float64 // ID is the unique identifer for the Alert ID uint64 + // Restored is true if Alert was restored after restart + Restored bool } // AlertState type indicates the Alert state diff --git a/app/vmalert/remoteread/init.go b/app/vmalert/remoteread/init.go index 73a2993b5..427b5def2 100644 --- a/app/vmalert/remoteread/init.go +++ b/app/vmalert/remoteread/init.go @@ -12,7 +12,7 @@ import ( var ( addr = flag.String("remoteRead.url", "", "Optional URL to VictoriaMetrics or vmselect that will be used to restore alerts "+ "state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state. "+ - "E.g. http://127.0.0.1:8428") + "E.g. http://127.0.0.1:8428. 
See also -remoteRead.disablePathAppend") basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url") basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url") basicAuthPasswordFile = flag.String("remoteRead.basicAuth.passwordFile", "", "Optional path to basic auth password to use for -remoteRead.url") @@ -26,6 +26,7 @@ var ( "By default system CA is used") tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+ "By default the server name from -remoteRead.url is used") + disablePathAppend = flag.Bool("remoteRead.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/query' path to the configured -remoteRead.url.") ) // Init creates a Querier from provided flag values. @@ -43,5 +44,5 @@ func Init() (datasource.QuerierBuilder, error) { return nil, fmt.Errorf("failed to configure auth: %w", err) } c := &http.Client{Transport: tr} - return datasource.NewVMStorage(*addr, authCfg, 0, 0, false, c), nil + return datasource.NewVMStorage(*addr, authCfg, 0, 0, false, c, *disablePathAppend), nil } diff --git a/app/vmalert/remotewrite/remotewrite.go b/app/vmalert/remotewrite/remotewrite.go index 30d948224..035f2af77 100644 --- a/app/vmalert/remotewrite/remotewrite.go +++ b/app/vmalert/remotewrite/remotewrite.go @@ -246,13 +246,13 @@ func (c *Client) send(ctx context.Context, data []byte) error { resp, err := c.c.Do(req.WithContext(ctx)) if err != nil { return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)", - req.URL, err, len(data), r.Size()) + req.URL.Redacted(), err, len(data), r.Size()) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { body, _ := ioutil.ReadAll(resp.Body) return fmt.Errorf("unexpected response code %d for %s. 
Response body %q", - resp.StatusCode, req.URL, body) + resp.StatusCode, req.URL.Redacted(), body) } return nil } diff --git a/app/vmalert/web.go b/app/vmalert/web.go index ef31ce359..e1034dab7 100644 --- a/app/vmalert/web.go +++ b/app/vmalert/web.go @@ -28,13 +28,14 @@ func initLinks() { {path.Join(pathPrefix, "api/v1/groups"), "list all loaded groups and rules"}, {path.Join(pathPrefix, "api/v1/alerts"), "list all active alerts"}, {path.Join(pathPrefix, "api/v1/groupID/alertID/status"), "get alert status by ID"}, + {path.Join(pathPrefix, "flags"), "command-line flags"}, {path.Join(pathPrefix, "metrics"), "list of application metrics"}, {path.Join(pathPrefix, "-/reload"), "reload configuration"}, } navItems = []tpl.NavItem{ - {Name: "vmalert", Url: path.Join(pathPrefix, "/")}, + {Name: "vmalert", Url: pathPrefix}, {Name: "Groups", Url: path.Join(pathPrefix, "groups")}, - {Name: "Alerts", Url: path.Join(pathPrefix, "/alerts")}, + {Name: "Alerts", Url: path.Join(pathPrefix, "alerts")}, {Name: "Docs", Url: "https://docs.victoriametrics.com/vmalert.html"}, } } diff --git a/app/vmalert/web.qtpl b/app/vmalert/web.qtpl index 7c5c6b2b4..c15fa9de8 100644 --- a/app/vmalert/web.qtpl +++ b/app/vmalert/web.qtpl @@ -51,7 +51,7 @@
{%s g.File %}
{% if len(g.ExtraFilterLabels) > 0 %} @@ -155,7 +155,9 @@ sort.Strings(labelKeys) %}{%s defaultAR.Expression %}