Vmalert metrics update (#1580)

* vmalert: remove `vmalert_execution_duration_seconds` metric

The summary `vmalert_execution_duration_seconds` gives no additional
value compared to the `vmalert_iteration_duration_seconds` metric.

* vmalert: update config reload success metric properly

Previously, if an attempt to reload the config failed and the config was then
rolled back to the previous version, the metric remained set to 0.

* vmalert: add a Grafana dashboard for an overview of application metrics

* docker: include the vmalert target in the list of scrape targets

* vmalert: extend notifier metrics with addr label

The change adds an `addr` label to the `vmalert_alerts_sent_total` and `vmalert_alerts_send_errors_total`
metrics to identify which exact address is having issues.
The corresponding change was made to the vmalert dashboard.
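
For illustration, a minimal sketch (not part of this commit) of how such per-address counters can be produced with the `github.com/VictoriaMetrics/metrics` package vmalert relies on; the `getOrCreateCounter` helper seen further down in the diff is assumed to be a thin wrapper over `metrics.GetOrCreateCounter`, and the alertmanager addresses are made up:

package main

import (
    "fmt"
    "os"

    "github.com/VictoriaMetrics/metrics"
)

func main() {
    // One counter pair per configured notifier address, mirroring the new
    // vmalert_alerts_sent_total{addr=...} and vmalert_alerts_send_errors_total{addr=...} series.
    for _, addr := range []string{"http://alertmanager-1:9093", "http://alertmanager-2:9093"} {
        sent := metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_sent_total{addr=%q}`, addr))
        sendErrors := metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_send_errors_total{addr=%q}`, addr))
        sent.Add(3)      // pretend 3 alerts were delivered to this address
        sendErrors.Inc() // and one send attempt failed
    }
    // Prints one series per address, e.g.
    // vmalert_alerts_sent_total{addr="http://alertmanager-1:9093"} 3
    metrics.WritePrometheus(os.Stdout, false)
}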

* vmalert: update documentation and docker environment for vmalert's dashboard

Mention the Grafana dashboard in vmalert's README in a new #Monitoring section.

Update docker-compose env to automatically add vmalert's dashboard.
Update docker-compose README with additional info about services.
Roman Khavronenko 2021-08-31 12:28:02 +03:00 committed by Aliaksandr Valialkin
parent 434f33d04d
commit 1cb7037fc8
10 changed files with 64 additions and 24 deletions

View file

@@ -56,6 +56,7 @@ test-vmalert:
go test -v -race -cover ./app/vmalert/datasource
go test -v -race -cover ./app/vmalert/notifier
go test -v -race -cover ./app/vmalert/config
go test -v -race -cover ./app/vmalert/remotewrite
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \

View file

@@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`.
* `query` template function is disabled for performance reasons (might be changed in future);
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported
metrics may be analyzed later.
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview.
If you have suggestions for improvements or have found a bug, please open an issue on GitHub or add
a review to the dashboard.
## Configuration
Pass `-help` to `vmalert` in order to see the full list of supported

View file

@@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil
}
var (
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
)
func (g *Group) close() {
if g.doneCh == nil {
return
@@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
}
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
e := &executor{nts, rw}
e := &executor{rw: rw}
for _, nt := range nts {
ent := eNotifier{
Notifier: nt,
alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
}
e.notifiers = append(e.notifiers, ent)
}
t := time.NewTicker(g.Interval)
defer t.Stop()
for {
@@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
}
type executor struct {
notifiers []notifier.Notifier
notifiers []eNotifier
rw *remotewrite.Client
}
type eNotifier struct {
notifier.Notifier
alertsSent *counter
alertsSendErrors *counter
}
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
res := make(chan error, len(rules))
if concurrency == 1 {
@@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
}
var (
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
execTotal.Inc()
execStart := time.Now()
defer func() {
execDuration.UpdateDuration(execStart)
}()
tss, err := rule.Exec(ctx)
if err != nil {
@@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
return nil
}
alertsSent.Add(len(alerts))
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers {
nt.alertsSent.Add(len(alerts))
if err := nt.Send(ctx, alerts); err != nil {
alertsSendErrors.Inc()
nt.alertsSendErrors.Inc()
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
}
}

View file

@@ -63,6 +63,7 @@ type fakeNotifier struct {
alerts []notifier.Alert
}
func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
fn.Lock()
defer fn.Unlock()

View file

@@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
continue
}
if configsEqual(newGroupsCfg, groupsCfg) {
// set success to 1 since previous reload
// could have been unsuccessful
configSuccess.Set(1)
// config didn't change - skip it
continue
}
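
For context, a stripped-down sketch of the reload behaviour this hunk fixes, not the actual vmalert code; it assumes `configSuccess` is the `vmalert_config_last_reload_successful` metric from the VictoriaMetrics `metrics` package, used as a 0/1 gauge via `Set`:

package main

import (
    "os"

    "github.com/VictoriaMetrics/metrics"
)

var configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)

// applyReload illustrates the fix: a failed parse sets the gauge to 0, while a
// successful parse now sets it back to 1 even when the freshly parsed config is
// identical to the one already loaded (e.g. after a bad change was rolled back).
// Previously the "config didn't change" path returned early without touching the
// gauge, so it stayed at 0.
func applyReload(current string, parse func() (string, error)) string {
    newCfg, err := parse()
    if err != nil {
        configSuccess.Set(0)
        return current // keep serving the old config
    }
    configSuccess.Set(1)
    if newCfg == current {
        return current // config didn't change - nothing to apply
    }
    return newCfg
}

func main() {
    // Simulate a reload where the config parsed fine but is identical to the current one.
    _ = applyReload("groups: []", func() (string, error) { return "groups: []", nil })
    metrics.WritePrometheus(os.Stdout, false) // vmalert_config_last_reload_successful 1
}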

View file

@@ -12,6 +12,7 @@ import (
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
addr string
alertURL string
basicAuthUser string
basicAuthPass string
@@ -19,6 +20,9 @@ type AlertManager struct {
client *http.Client
}
// Addr returns address where alerts are sent.
func (am AlertManager) Addr() string { return am.addr }
// Send an alert or resolve message
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
b := &bytes.Buffer{}
@@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts"
// NewAlertManager is a constructor for AlertManager
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
return &AlertManager{
alertURL: addr,
addr: alertManagerURL,
alertURL: url,
argFunc: fn,
client: c,
basicAuthUser: user,

View file

@@ -10,6 +10,14 @@ import (
"time"
)
func TestAlertManager_Addr(t *testing.T) {
const addr = "http://localhost"
am := NewAlertManager(addr, "", "", nil, nil)
if am.Addr() != addr {
t.Errorf("expected to have %q; got %q", addr, am.Addr())
}
}
func TestAlertManager_Send(t *testing.T) {
const baUser, baPass = "foo", "bar"
mux := http.NewServeMux()

View file

@@ -2,7 +2,12 @@ package notifier
import "context"
// Notifier is common interface for alert manager provider
// Notifier is a common interface for alert manager provider
type Notifier interface {
// Send sends the given list of alerts.
// Returns an error if it fails to send the alerts.
// Must unblock if the given ctx is cancelled.
Send(ctx context.Context, alerts []Alert) error
// Addr returns address where alerts are sent.
Addr() string
}
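
For anyone writing their own notifier (the `fakeNotifier` in the tests above does the same in miniature), a hedged sketch of an implementation that satisfies the extended interface; the webhook-style payload, the type name and its placement alongside the `Alert` type in the `notifier` package are illustrative assumptions, not part of this commit:

package notifier

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "net/http"
)

// webhookNotifier is a toy Notifier that POSTs alerts as JSON to a single URL.
type webhookNotifier struct {
    addr   string
    client *http.Client
}

// Compile-time check that the extended interface is satisfied.
var _ Notifier = &webhookNotifier{}

// Addr returns the address alerts are sent to; vmalert uses this value for the
// `addr` label on its per-notifier counters.
func (wn *webhookNotifier) Addr() string { return wn.addr }

// Send posts the alerts and unblocks when ctx is cancelled because the request
// carries the context.
func (wn *webhookNotifier) Send(ctx context.Context, alerts []Alert) error {
    body, err := json.Marshal(alerts)
    if err != nil {
        return err
    }
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, wn.addr, bytes.NewReader(body))
    if err != nil {
        return err
    }
    req.Header.Set("Content-Type", "application/json")
    resp, err := wn.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status code %d from %s", resp.StatusCode, wn.addr)
    }
    return nil
}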

View file

@@ -6,15 +6,15 @@ scrape_configs:
- job_name: 'vmagent'
static_configs:
- targets: ['vmagent:8429']
- job_name: 'vmalert'
static_configs:
- targets: ['vmalert:8880']
- job_name: 'vminsert'
static_configs:
- targets: ['vminsert:8480']
- job_name: 'vmselect'
static_configs:
- targets: ['vmselect:8481']
- job_name: 'vmstorage'
static_configs:
- targets: ['vmstorage:8482']