diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile index 8c6f8850a..8a3dbf24a 100644 --- a/app/vmalert/Makefile +++ b/app/vmalert/Makefile @@ -56,6 +56,7 @@ test-vmalert: go test -v -race -cover ./app/vmalert/datasource go test -v -race -cover ./app/vmalert/notifier go test -v -race -cover ./app/vmalert/config + go test -v -race -cover ./app/vmalert/remotewrite run-vmalert: vmalert ./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \ diff --git a/app/vmalert/README.md b/app/vmalert/README.md index 920938367..253d907f6 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`. * `query` template function is disabled for performance reasons (might be changed in future); +## Monitoring + +`vmalert` exports various metrics in Prometheus exposition format at the `http://vmalert-host:8880/metrics` page. +We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported +metrics may be analyzed later. + +Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for a `vmalert` overview. +If you have suggestions for improvements or have found a bug - please open an issue on GitHub or add +a review to the dashboard. 
+ + ## Configuration Pass `-help` to `vmalert` in order to see the full list of supported diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 5394e4d8c..f9f2b957a 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error { return nil } -var ( - alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`) - alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`) - alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`) -) - func (g *Group) close() { if g.doneCh == nil { return @@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr } logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency) - e := &executor{nts, rw} + e := &executor{rw: rw} + for _, nt := range nts { + ent := eNotifier{ + Notifier: nt, + alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())), + alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())), + } + e.notifiers = append(e.notifiers, ent) + } + t := time.NewTicker(g.Interval) defer t.Stop() for { @@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr } type executor struct { - notifiers []notifier.Notifier + notifiers []eNotifier rw *remotewrite.Client } +type eNotifier struct { + notifier.Notifier + alertsSent *counter + alertsSendErrors *counter +} + func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error { res := make(chan error, len(rules)) if concurrency == 1 { @@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren } var ( - execTotal = metrics.NewCounter(`vmalert_execution_total`) - execErrors = metrics.NewCounter(`vmalert_execution_errors_total`) - execDuration = 
metrics.NewSummary(`vmalert_execution_duration_seconds`) + alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`) + + execTotal = metrics.NewCounter(`vmalert_execution_total`) + execErrors = metrics.NewCounter(`vmalert_execution_errors_total`) remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) ) func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error { execTotal.Inc() - execStart := time.Now() - defer func() { - execDuration.UpdateDuration(execStart) - }() tss, err := rule.Exec(ctx) if err != nil { @@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) return nil } - alertsSent.Add(len(alerts)) errGr := new(utils.ErrGroup) for _, nt := range e.notifiers { + nt.alertsSent.Add(len(alerts)) if err := nt.Send(ctx, alerts); err != nil { - alertsSendErrors.Inc() + nt.alertsSendErrors.Inc() errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err)) } } diff --git a/app/vmalert/helpers_test.go b/app/vmalert/helpers_test.go index 4d5433a65..a4f99cddc 100644 --- a/app/vmalert/helpers_test.go +++ b/app/vmalert/helpers_test.go @@ -63,6 +63,7 @@ type fakeNotifier struct { alerts []notifier.Alert } +func (*fakeNotifier) Addr() string { return "" } func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error { fn.Lock() defer fn.Unlock() diff --git a/app/vmalert/main.go b/app/vmalert/main.go index 8951c3124..db21b9900 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) { continue } if configsEqual(newGroupsCfg, groupsCfg) { + // set success to 1 since previous reload + // could have been unsuccessful + configSuccess.Set(1) // config didn't change - skip it continue } diff --git a/app/vmalert/notifier/alertmanager.go b/app/vmalert/notifier/alertmanager.go index a24f5f723..36b75cfbb 100644 --- a/app/vmalert/notifier/alertmanager.go +++ 
b/app/vmalert/notifier/alertmanager.go @@ -12,6 +12,7 @@ import ( // AlertManager represents integration provider with Prometheus alert manager // https://github.com/prometheus/alertmanager type AlertManager struct { + addr string alertURL string basicAuthUser string basicAuthPass string @@ -19,6 +20,9 @@ type AlertManager struct { client *http.Client } +// Addr returns address where alerts are sent. +func (am AlertManager) Addr() string { return am.addr } + // Send an alert or resolve message func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error { b := &bytes.Buffer{} @@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts" // NewAlertManager is a constructor for AlertManager func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager { - addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath + url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath return &AlertManager{ - alertURL: addr, + addr: alertManagerURL, + alertURL: url, argFunc: fn, client: c, basicAuthUser: user, diff --git a/app/vmalert/notifier/alertmanager_test.go b/app/vmalert/notifier/alertmanager_test.go index f7bdb47d4..0cf0fa148 100644 --- a/app/vmalert/notifier/alertmanager_test.go +++ b/app/vmalert/notifier/alertmanager_test.go @@ -10,6 +10,14 @@ import ( "time" ) +func TestAlertManager_Addr(t *testing.T) { + const addr = "http://localhost" + am := NewAlertManager(addr, "", "", nil, nil) + if am.Addr() != addr { + t.Errorf("expected to have %q; got %q", addr, am.Addr()) + } +} + func TestAlertManager_Send(t *testing.T) { const baUser, baPass = "foo", "bar" mux := http.NewServeMux() diff --git a/app/vmalert/notifier/notifier.go b/app/vmalert/notifier/notifier.go index 5564e23cb..8135a19ea 100644 --- a/app/vmalert/notifier/notifier.go +++ b/app/vmalert/notifier/notifier.go @@ -2,7 +2,12 @@ package notifier import "context" -// Notifier is common interface for alert manager provider +// Notifier is a 
common interface for alert manager provider type Notifier interface { + // Send sends the given list of alerts. + // Returns an error if fails to send the alerts. + // Must unblock if the given ctx is cancelled. Send(ctx context.Context, alerts []Alert) error + // Addr returns address where alerts are sent. + Addr() string } diff --git a/deployment/docker/README.md b/deployment/docker/README.md index a74ad089e..c2a413c96 100644 --- a/deployment/docker/README.md +++ b/deployment/docker/README.md @@ -55,4 +55,4 @@ Default creds: Grafana is provisioned by default with following entities: * VictoriaMetrics datasource * Prometheus datasource -* VictoriaMetrics overview dashboard \ No newline at end of file +* VictoriaMetrics overview dashboard diff --git a/deployment/docker/prometheus.yml b/deployment/docker/prometheus.yml index 158307dfe..966da54d6 100644 --- a/deployment/docker/prometheus.yml +++ b/deployment/docker/prometheus.yml @@ -6,15 +6,15 @@ scrape_configs: - job_name: 'vmagent' static_configs: - targets: ['vmagent:8429'] - + - job_name: 'vmalert' + static_configs: + - targets: ['vmalert:8880'] - job_name: 'vminsert' static_configs: - targets: ['vminsert:8480'] - - job_name: 'vmselect' static_configs: - targets: ['vmselect:8481'] - - job_name: 'vmstorage' static_configs: - targets: ['vmstorage:8482']