From 1cb7037fc889aaf392820036e10ad66c5f55de1e Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Tue, 31 Aug 2021 12:28:02 +0300 Subject: [PATCH] Vmalert metrics update (#1580) * vmalert: remove `vmalert_execution_duration_seconds` metric The summary for `vmalert_execution_duration_seconds` metric gives no additional value compared to `vmalert_iteration_duration_seconds` metric. * vmalert: update config reload success metric properly Previously, if there was an unsuccessful attempt to reload config and then a rollback to the previous version - the metric remained set to 0. * vmalert: add Grafana dashboard to overview application metrics * docker: include vmalert target into list for scraping * vmalert: extend notifier metrics with addr label The change adds an `addr` label to metrics for alerts_sent and alerts_send_errors to identify which exact address is having issues. The corresponding change was made to vmalert dashboard. * vmalert: update documentation and docker environment for vmalert's dashboard Mention Grafana's dashboard in vmalert's README in a new section #Monitoring. Update docker-compose env to automatically add vmalert's dashboard. Update docker-compose README with additional info about services. 
--- app/vmalert/Makefile | 1 + app/vmalert/README.md | 11 +++++++ app/vmalert/group.go | 40 +++++++++++++---------- app/vmalert/helpers_test.go | 1 + app/vmalert/main.go | 3 ++ app/vmalert/notifier/alertmanager.go | 9 +++-- app/vmalert/notifier/alertmanager_test.go | 8 +++++ app/vmalert/notifier/notifier.go | 7 +++- deployment/docker/README.md | 2 +- deployment/docker/prometheus.yml | 6 ++-- 10 files changed, 64 insertions(+), 24 deletions(-) diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile index 8c6f8850a4..8a3dbf24a1 100644 --- a/app/vmalert/Makefile +++ b/app/vmalert/Makefile @@ -56,6 +56,7 @@ test-vmalert: go test -v -race -cover ./app/vmalert/datasource go test -v -race -cover ./app/vmalert/notifier go test -v -race -cover ./app/vmalert/config + go test -v -race -cover ./app/vmalert/remotewrite run-vmalert: vmalert ./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \ diff --git a/app/vmalert/README.md b/app/vmalert/README.md index 920938367e..253d907f6f 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`. * `query` template function is disabled for performance reasons (might be changed in future); +## Monitoring + +`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. +We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported +metrics may be analyzed later. + +Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview. +If you have suggestions for improvements or have found a bug - please open an issue on github or add +a review to the dashboard. 
+ + ## Configuration Pass `-help` to `vmalert` in order to see the full list of supported diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 5394e4d8c8..f9f2b957a3 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error { return nil } -var ( - alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`) - alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`) - alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`) -) - func (g *Group) close() { if g.doneCh == nil { return @@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr } logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency) - e := &executor{nts, rw} + e := &executor{rw: rw} + for _, nt := range nts { + ent := eNotifier{ + Notifier: nt, + alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())), + alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())), + } + e.notifiers = append(e.notifiers, ent) + } + t := time.NewTicker(g.Interval) defer t.Stop() for { @@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr } type executor struct { - notifiers []notifier.Notifier + notifiers []eNotifier rw *remotewrite.Client } +type eNotifier struct { + notifier.Notifier + alertsSent *counter + alertsSendErrors *counter +} + func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error { res := make(chan error, len(rules)) if concurrency == 1 { @@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren } var ( - execTotal = metrics.NewCounter(`vmalert_execution_total`) - execErrors = metrics.NewCounter(`vmalert_execution_errors_total`) - execDuration = 
metrics.NewSummary(`vmalert_execution_duration_seconds`) + alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`) + + execTotal = metrics.NewCounter(`vmalert_execution_total`) + execErrors = metrics.NewCounter(`vmalert_execution_errors_total`) remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) ) func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error { execTotal.Inc() - execStart := time.Now() - defer func() { - execDuration.UpdateDuration(execStart) - }() tss, err := rule.Exec(ctx) if err != nil { @@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) return nil } - alertsSent.Add(len(alerts)) errGr := new(utils.ErrGroup) for _, nt := range e.notifiers { + nt.alertsSent.Add(len(alerts)) if err := nt.Send(ctx, alerts); err != nil { - alertsSendErrors.Inc() + nt.alertsSendErrors.Inc() errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err)) } } diff --git a/app/vmalert/helpers_test.go b/app/vmalert/helpers_test.go index 4d5433a652..a4f99cddcf 100644 --- a/app/vmalert/helpers_test.go +++ b/app/vmalert/helpers_test.go @@ -63,6 +63,7 @@ type fakeNotifier struct { alerts []notifier.Alert } +func (*fakeNotifier) Addr() string { return "" } func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error { fn.Lock() defer fn.Unlock() diff --git a/app/vmalert/main.go b/app/vmalert/main.go index 8951c31247..db21b99000 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) { continue } if configsEqual(newGroupsCfg, groupsCfg) { + // set success to 1 since previous reload + // could have been unsuccessful + configSuccess.Set(1) // config didn't change - skip it continue } diff --git a/app/vmalert/notifier/alertmanager.go b/app/vmalert/notifier/alertmanager.go index a24f5f723a..36b75cfbb3 100644 --- a/app/vmalert/notifier/alertmanager.go +++ 
b/app/vmalert/notifier/alertmanager.go @@ -12,6 +12,7 @@ import ( // AlertManager represents integration provider with Prometheus alert manager // https://github.com/prometheus/alertmanager type AlertManager struct { + addr string alertURL string basicAuthUser string basicAuthPass string @@ -19,6 +20,9 @@ type AlertManager struct { client *http.Client } +// Addr returns address where alerts are sent. +func (am AlertManager) Addr() string { return am.addr } + // Send an alert or resolve message func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error { b := &bytes.Buffer{} @@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts" // NewAlertManager is a constructor for AlertManager func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager { - addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath + url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath return &AlertManager{ - alertURL: addr, + addr: alertManagerURL, + alertURL: url, argFunc: fn, client: c, basicAuthUser: user, diff --git a/app/vmalert/notifier/alertmanager_test.go b/app/vmalert/notifier/alertmanager_test.go index f7bdb47d4a..0cf0fa1483 100644 --- a/app/vmalert/notifier/alertmanager_test.go +++ b/app/vmalert/notifier/alertmanager_test.go @@ -10,6 +10,14 @@ import ( "time" ) +func TestAlertManager_Addr(t *testing.T) { + const addr = "http://localhost" + am := NewAlertManager(addr, "", "", nil, nil) + if am.Addr() != addr { + t.Errorf("expected to have %q; got %q", addr, am.Addr()) + } +} + func TestAlertManager_Send(t *testing.T) { const baUser, baPass = "foo", "bar" mux := http.NewServeMux() diff --git a/app/vmalert/notifier/notifier.go b/app/vmalert/notifier/notifier.go index 5564e23cb4..8135a19ea6 100644 --- a/app/vmalert/notifier/notifier.go +++ b/app/vmalert/notifier/notifier.go @@ -2,7 +2,12 @@ package notifier import "context" -// Notifier is common interface for alert manager provider +// Notifier is 
a common interface for alert manager provider type Notifier interface { + // Send sends the given list of alerts. + // Returns an error if fails to send the alerts. + // Must unblock if the given ctx is cancelled. Send(ctx context.Context, alerts []Alert) error + // Addr returns address where alerts are sent. + Addr() string } diff --git a/deployment/docker/README.md b/deployment/docker/README.md index a74ad089ec..c2a413c963 100644 --- a/deployment/docker/README.md +++ b/deployment/docker/README.md @@ -55,4 +55,4 @@ Default creds: Grafana is provisioned by default with following entities: * VictoriaMetrics datasource * Prometheus datasource -* VictoriaMetrics overview dashboard \ No newline at end of file +* VictoriaMetrics overview dashboard diff --git a/deployment/docker/prometheus.yml b/deployment/docker/prometheus.yml index 158307dfea..966da54d67 100644 --- a/deployment/docker/prometheus.yml +++ b/deployment/docker/prometheus.yml @@ -6,15 +6,15 @@ scrape_configs: - job_name: 'vmagent' static_configs: - targets: ['vmagent:8429'] - + - job_name: 'vmalert' + static_configs: + - targets: ['vmalert:8880'] - job_name: 'vminsert' static_configs: - targets: ['vminsert:8480'] - - job_name: 'vmselect' static_configs: - targets: ['vmselect:8481'] - - job_name: 'vmstorage' static_configs: - targets: ['vmstorage:8482']