Vmalert metrics update (#1580)

* vmalert: remove `vmalert_execution_duration_seconds` metric

The summary for the `vmalert_execution_duration_seconds` metric gives no additional
value compared to the `vmalert_iteration_duration_seconds` metric.

* vmalert: update config reload success metric properly

Previously, if an attempt to reload the config failed and the config was then
rolled back to the previous version, the metric remained set to 0.
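
Below is a minimal sketch of the intended behaviour (the actual change is in the `configReload` hunk further down). The `reloadConfig` helper, the file handling, and the metric name are illustrative, not vmalert's real reload code; the counter comes from the `github.com/VictoriaMetrics/metrics` package:

```go
package main

import (
	"bytes"
	"os"

	"github.com/VictoriaMetrics/metrics"
)

// Illustrative success indicator: 1 if the last reload attempt succeeded, 0 otherwise.
var configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)

// reloadConfig re-reads the rules file and reports the outcome via configSuccess.
// It returns the config that is in effect after the attempt.
func reloadConfig(path string, current []byte) []byte {
	newCfg, err := os.ReadFile(path)
	if err != nil {
		configSuccess.Set(0)
		return current
	}
	if bytes.Equal(newCfg, current) {
		// The fix: mark success here too, since a previous attempt may have
		// failed and left the metric at 0 even though the active config is fine.
		configSuccess.Set(1)
		return current
	}
	// ...validate and apply the new config here...
	configSuccess.Set(1)
	return newCfg
}

func main() {
	cfg := reloadConfig("alerts.yml", nil)
	_ = cfg
}
```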

* vmalert: add Grafana dashboard for an overview of application metrics

* docker: include vmalert target in the list of scrape targets

* vmalert: extend notifier metrics with addr label

The change adds an `addr` label to the `vmalert_alerts_sent_total` and `vmalert_alerts_send_errors_total`
metrics to identify which exact address is having issues.
The corresponding change was made to the vmalert dashboard.
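
For illustration, here is a minimal sketch of the labeling approach using the `github.com/VictoriaMetrics/metrics` package; the `notifierCounters` type and constructor are hypothetical, while the metric names and the `addr` label match the change:

```go
package main

import (
	"fmt"

	"github.com/VictoriaMetrics/metrics"
)

// notifierCounters pairs a notifier address with its own send counters so that
// delivery failures can be attributed to a concrete Alertmanager address.
type notifierCounters struct {
	addr             string
	alertsSent       *metrics.Counter
	alertsSendErrors *metrics.Counter
}

func newNotifierCounters(addr string) notifierCounters {
	return notifierCounters{
		addr:             addr,
		alertsSent:       metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_sent_total{addr=%q}`, addr)),
		alertsSendErrors: metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_send_errors_total{addr=%q}`, addr)),
	}
}

func main() {
	nc := newNotifierCounters("http://alertmanager:9093/api/v2/alerts")
	nc.alertsSent.Add(3) // e.g. three alerts were pushed to this address
	fmt.Println("registered per-address counters for", nc.addr)
}
```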

* vmalert: update documentation and docker environment for vmalert's dashboard

Mention the Grafana dashboard in vmalert's README in a new `## Monitoring` section.

Update docker-compose env to automatically add vmalert's dashboard.
Update docker-compose README with additional info about services.
Roman Khavronenko 2021-08-31 12:28:02 +03:00 committed by GitHub
parent f41b3d6118
commit eff940aa76
12 changed files with 2381 additions and 20 deletions

@ -56,6 +56,7 @@ test-vmalert:
go test -v -race -cover ./app/vmalert/datasource
go test -v -race -cover ./app/vmalert/notifier
go test -v -race -cover ./app/vmalert/config
go test -v -race -cover ./app/vmalert/remotewrite
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \

@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`.
* `query` template function is disabled for performance reasons (might be changed in future);
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at the `http://vmalert-host:8880/metrics` page.
We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported
metrics may be analyzed later.
Use the official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for a `vmalert` overview.
If you have suggestions for improvements or have found a bug, please open an issue on GitHub or add
a review to the dashboard.
## Configuration
Pass `-help` to `vmalert` in order to see the full list of supported

@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil
}
var (
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
)
func (g *Group) close() {
if g.doneCh == nil {
return
@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
}
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
e := &executor{nts, rw}
e := &executor{rw: rw}
for _, nt := range nts {
ent := eNotifier{
Notifier: nt,
alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
}
e.notifiers = append(e.notifiers, ent)
}
t := time.NewTicker(g.Interval)
defer t.Stop()
for {
@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
}
type executor struct {
notifiers []notifier.Notifier
notifiers []eNotifier
rw *remotewrite.Client
}
type eNotifier struct {
notifier.Notifier
alertsSent *counter
alertsSendErrors *counter
}
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
res := make(chan error, len(rules))
if concurrency == 1 {
@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
}
var (
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
execTotal.Inc()
execStart := time.Now()
defer func() {
execDuration.UpdateDuration(execStart)
}()
tss, err := rule.Exec(ctx)
if err != nil {
@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
return nil
}
alertsSent.Add(len(alerts))
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers {
nt.alertsSent.Add(len(alerts))
if err := nt.Send(ctx, alerts); err != nil {
alertsSendErrors.Inc()
nt.alertsSendErrors.Inc()
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
}
}

@ -63,6 +63,7 @@ type fakeNotifier struct {
alerts []notifier.Alert
}
func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
fn.Lock()
defer fn.Unlock()

@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
continue
}
if configsEqual(newGroupsCfg, groupsCfg) {
// set success to 1 since previous reload
// could have been unsuccessful
configSuccess.Set(1)
// config didn't change - skip it
continue
}

@ -12,6 +12,7 @@ import (
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
addr string
alertURL string
basicAuthUser string
basicAuthPass string
@ -19,6 +20,9 @@ type AlertManager struct {
client *http.Client
}
// Addr returns address where alerts are sent.
func (am AlertManager) Addr() string { return am.addr }
// Send an alert or resolve message
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
b := &bytes.Buffer{}
@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts"
// NewAlertManager is a constructor for AlertManager
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
return &AlertManager{
alertURL: addr,
addr: alertManagerURL,
alertURL: url,
argFunc: fn,
client: c,
basicAuthUser: user,

@ -10,6 +10,14 @@ import (
"time"
)
func TestAlertManager_Addr(t *testing.T) {
const addr = "http://localhost"
am := NewAlertManager(addr, "", "", nil, nil)
if am.Addr() != addr {
t.Errorf("expected to have %q; got %q", addr, am.Addr())
}
}
func TestAlertManager_Send(t *testing.T) {
const baUser, baPass = "foo", "bar"
mux := http.NewServeMux()

@ -2,7 +2,12 @@ package notifier
import "context"
// Notifier is common interface for alert manager provider
// Notifier is a common interface for alert manager provider
type Notifier interface {
// Send sends the given list of alerts.
// Returns an error if it fails to send the alerts.
// Must unblock if the given ctx is cancelled.
Send(ctx context.Context, alerts []Alert) error
// Addr returns address where alerts are sent.
Addr() string
}

dashboards/vmalert.json (new file, 2300 additions)

File diff suppressed because it is too large

@ -19,6 +19,23 @@ vmagent is used for scraping and pushing timeseries to
VictoriaMetrics instance. It accepts Prometheus-compatible
configuration `prometheus.yml` with listed targets for scraping.
[Web interface link](http://localhost:8429/).
##### vmalert
vmalert evaluates alerting rules (`alerts.yml`) to track VictoriaMetrics
health state. It is connected to AlertManager for firing alerts,
and to VictoriaMetrics for executing queries and storing the alerts' state.
[Web interface link](http://localhost:8880/).
##### alertmanager
AlertManager accepts notifications from `vmalert` and fires alerts.
All notifications are blackholed according to the `alertmanager.yml` config.
[Web interface link](http://localhost:9093/).
##### Grafana
To access the service, open the following [link](http://localhost:3000).

@ -49,6 +49,7 @@ services:
- ./provisioning/:/etc/grafana/provisioning/
- ./../../dashboards/victoriametrics.json:/var/lib/grafana/dashboards/vm.json
- ./../../dashboards/vmagent.json:/var/lib/grafana/dashboards/vmagent.json
- ./../../dashboards/vmalert.json:/var/lib/grafana/dashboards/vmalert.json
networks:
- vm_net
restart: always

@ -5,6 +5,9 @@ scrape_configs:
- job_name: 'vmagent'
static_configs:
- targets: ['vmagent:8429']
- job_name: 'vmalert'
static_configs:
- targets: ['vmalert:8880']
- job_name: 'victoriametrics'
static_configs:
- targets: ['victoriametrics:8428']