Vmalert metrics update (#1580)

* vmalert: remove the `vmalert_execution_duration_seconds` metric. The summary gives no additional value compared to the `vmalert_iteration_duration_seconds` metric.
* vmalert: update the config reload success metric properly. Previously, if a config reload attempt failed and vmalert rolled back to the previous version, the metric remained set to 0.
* vmalert: add a Grafana dashboard for an overview of application metrics.
* docker: include the vmalert target in the scrape target list.
* vmalert: extend notifier metrics with an `addr` label. The change adds an `addr` label to the `alerts_sent` and `alerts_send_errors` metrics to identify which exact address is having issues. The corresponding change was made to the vmalert dashboard.
* vmalert: update documentation and the docker environment for vmalert's dashboard. Mention Grafana's dashboard in a new "Monitoring" section of vmalert's README, update the docker-compose environment to automatically add vmalert's dashboard, and update the docker-compose README with additional info about the services.
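As a rough illustration of the `addr` label change (a minimal sketch, not the vmalert code itself; it assumes `github.com/VictoriaMetrics/metrics`, whose `GetOrCreateCounter` mirrors the `getOrCreateCounter` helper used in the diff below):

```go
package example

import (
	"fmt"

	"github.com/VictoriaMetrics/metrics"
)

// perAddrCounters registers one alerts_sent / alerts_send_errors counter pair
// per notifier address, so send failures can be attributed to a concrete target.
func perAddrCounters(addrs []string) map[string][2]*metrics.Counter {
	out := make(map[string][2]*metrics.Counter, len(addrs))
	for _, addr := range addrs {
		sent := metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_sent_total{addr=%q}`, addr))
		errs := metrics.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerts_send_errors_total{addr=%q}`, addr))
		out[addr] = [2]*metrics.Counter{sent, errs}
	}
	return out
}
```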
Parent: 434f33d04d
Commit: 1cb7037fc8
10 changed files with 64 additions and 24 deletions
@@ -56,6 +56,7 @@ test-vmalert:
 	go test -v -race -cover ./app/vmalert/datasource
 	go test -v -race -cover ./app/vmalert/notifier
 	go test -v -race -cover ./app/vmalert/config
+	go test -v -race -cover ./app/vmalert/remotewrite
 
 run-vmalert: vmalert
 	./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
@@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`.
 * `query` template function is disabled for performance reasons (might be changed in future);
 
+
+## Monitoring
+
+`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
+We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported
+metrics may be analyzed later.
+
+Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview.
+If you have suggestions for improvements or have found a bug - please open an issue on github or add
+a review to the dashboard.
+
 ## Configuration
 
 Pass `-help` to `vmalert` in order to see the full list of supported
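For orientation, the `/metrics` page serves plain Prometheus exposition lines. With this commit the per-notifier counters carry an `addr` label; an illustrative excerpt (metric names come from this commit, the address and the values are placeholders) might look like:

```
vmalert_alerts_sent_total{addr="http://alertmanager:9093"} 12
vmalert_alerts_send_errors_total{addr="http://alertmanager:9093"} 0
vmalert_execution_total 340
vmalert_execution_errors_total 2
vmalert_remotewrite_errors_total 0
```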
@@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error {
 	return nil
 }
 
-var (
-	alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
-	alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
-	alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
-)
-
 func (g *Group) close() {
 	if g.doneCh == nil {
 		return
@@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
 	}
 
 	logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
-	e := &executor{nts, rw}
+	e := &executor{rw: rw}
+	for _, nt := range nts {
+		ent := eNotifier{
+			Notifier: nt,
+			alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
+			alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
+		}
+		e.notifiers = append(e.notifiers, ent)
+	}
+
 	t := time.NewTicker(g.Interval)
 	defer t.Stop()
 	for {
@@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
 	}
 
 type executor struct {
-	notifiers []notifier.Notifier
+	notifiers []eNotifier
 	rw *remotewrite.Client
 }
 
+type eNotifier struct {
+	notifier.Notifier
+	alertsSent *counter
+	alertsSendErrors *counter
+}
+
 func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
 	res := make(chan error, len(rules))
 	if concurrency == 1 {
@@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
 	}
 
 var (
+	alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
+
 	execTotal = metrics.NewCounter(`vmalert_execution_total`)
 	execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
-	execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
 
 	remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
 )
 
 func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
 	execTotal.Inc()
-	execStart := time.Now()
-	defer func() {
-		execDuration.UpdateDuration(execStart)
-	}()
 
 	tss, err := rule.Exec(ctx)
 	if err != nil {
@@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
 		return nil
 	}
 
-	alertsSent.Add(len(alerts))
 	errGr := new(utils.ErrGroup)
 	for _, nt := range e.notifiers {
+		nt.alertsSent.Add(len(alerts))
 		if err := nt.Send(ctx, alerts); err != nil {
-			alertsSendErrors.Inc()
+			nt.alertsSendErrors.Inc()
 			errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
 		}
 	}
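The hunks above rely on a `getOrCreateCounter` helper whose definition is not part of the extracted diff. A plausible sketch of such a helper on top of `github.com/VictoriaMetrics/metrics` (an assumption for illustration, not necessarily the actual vmalert implementation) is:

```go
package example

import "github.com/VictoriaMetrics/metrics"

// counter mirrors the *counter field type used by eNotifier above.
type counter = metrics.Counter

// getOrCreateCounter returns the counter registered under the given name,
// creating it on first use, so per-address counters can be built lazily.
func getOrCreateCounter(name string) *counter {
	return metrics.GetOrCreateCounter(name)
}
```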
@@ -63,6 +63,7 @@ type fakeNotifier struct {
 	alerts []notifier.Alert
 }
 
+func (*fakeNotifier) Addr() string { return "" }
 func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
 	fn.Lock()
 	defer fn.Unlock()
@@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
 			continue
 		}
 		if configsEqual(newGroupsCfg, groupsCfg) {
+			// set success to 1 since previous reload
+			// could have been unsuccessful
+			configSuccess.Set(1)
 			// config didn't change - skip it
 			continue
 		}
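The comments in the hunk explain the fix; as a minimal sketch of the same pattern (hypothetical names, assuming only the `Set` method of the VictoriaMetrics `metrics.Counter` type), the reload status must be reported on every attempt, including the "config didn't change" path:

```go
package example

import "github.com/VictoriaMetrics/metrics"

// reloadSuccess is a stand-in for vmalert's config reload status metric.
var reloadSuccess = metrics.NewCounter(`example_config_last_reload_successful`)

// applyConfig sketches the reload flow: report failure, then report success
// even when the freshly loaded config equals the current one.
func applyConfig(load func() (string, error), current string) string {
	newCfg, err := load()
	if err != nil {
		reloadSuccess.Set(0)
		return current // keep the previous config on error
	}
	// Set success to 1 here as well, since the previous reload could have failed.
	reloadSuccess.Set(1)
	if newCfg == current {
		return current // config didn't change - nothing to apply
	}
	return newCfg
}
```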
@@ -12,6 +12,7 @@ import (
 // AlertManager represents integration provider with Prometheus alert manager
 // https://github.com/prometheus/alertmanager
 type AlertManager struct {
+	addr string
 	alertURL string
 	basicAuthUser string
 	basicAuthPass string
@@ -19,6 +20,9 @@ type AlertManager struct {
 	client *http.Client
 }
 
+// Addr returns address where alerts are sent.
+func (am AlertManager) Addr() string { return am.addr }
+
 // Send an alert or resolve message
 func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
 	b := &bytes.Buffer{}
@@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts"
 
 // NewAlertManager is a constructor for AlertManager
 func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
-	addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
+	url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
 	return &AlertManager{
-		alertURL: addr,
+		addr: alertManagerURL,
+		alertURL: url,
 		argFunc: fn,
 		client: c,
 		basicAuthUser: user,
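To make the constructor change concrete, here is a small usage sketch (illustrative address; the field values in the comments follow from `alertManagerPath = "/api/v2/alerts"` shown in the hunk header):

```go
am := NewAlertManager("http://alertmanager:9093", "", "", nil, nil)
// am.Addr()   == "http://alertmanager:9093" (the base URL, later used as the addr label)
// am.alertURL == "http://alertmanager:9093/api/v2/alerts" (unexported; the endpoint alerts are POSTed to)
```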
@@ -10,6 +10,14 @@ import (
 	"time"
 )
 
+func TestAlertManager_Addr(t *testing.T) {
+	const addr = "http://localhost"
+	am := NewAlertManager(addr, "", "", nil, nil)
+	if am.Addr() != addr {
+		t.Errorf("expected to have %q; got %q", addr, am.Addr())
+	}
+}
+
 func TestAlertManager_Send(t *testing.T) {
 	const baUser, baPass = "foo", "bar"
 	mux := http.NewServeMux()
@@ -2,7 +2,12 @@ package notifier
 
 import "context"
 
-// Notifier is common interface for alert manager provider
+// Notifier is a common interface for alert manager provider
 type Notifier interface {
+	// Send sends the given list of alerts.
+	// Returns an error if fails to send the alerts.
+	// Must unblock if the given ctx is cancelled.
 	Send(ctx context.Context, alerts []Alert) error
+	// Addr returns address where alerts are sent.
+	Addr() string
 }
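With the extended interface every notifier has to report where it sends alerts. The following toy implementation is only a sketch (it would live in the `notifier` package and assumes the standard `context` and `log` imports); it is not part of this commit:

```go
// logNotifier writes alerts to the log instead of sending them anywhere;
// it exists only to show what the extended Notifier interface now requires.
type logNotifier struct {
	addr string
}

func (ln *logNotifier) Addr() string { return ln.addr }

func (ln *logNotifier) Send(ctx context.Context, alerts []Alert) error {
	for _, a := range alerts {
		log.Printf("would send alert %v to %s", a, ln.addr)
	}
	return nil
}
```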
@@ -6,15 +6,15 @@ scrape_configs:
   - job_name: 'vmagent'
     static_configs:
       - targets: ['vmagent:8429']
+  - job_name: 'vmalert'
+    static_configs:
+      - targets: ['vmalert:8880']
   - job_name: 'vminsert'
     static_configs:
       - targets: ['vminsert:8480']
 
   - job_name: 'vmselect'
     static_configs:
       - targets: ['vmselect:8481']
 
   - job_name: 'vmstorage'
     static_configs:
       - targets: ['vmstorage:8482']