mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
Vmalert metrics update (#1580)
* vmalert: remove `vmalert_execution_duration_seconds` metric The summary for `vmalert_execution_duration_seconds` metric gives no additional value comparing to `vmalert_iteration_duration_seconds` metric. * vmalert: update config reload success metric properly Previously, if there was unsuccessfull attempt to reload config and then rollback to previous version - the metric remained set to 0. * vmalert: add Grafana dashboard to overview application metrics * docker: include vmalert target into list for scraping * vmalert: extend notifier metrics with addr label The change adds an `addr` label to metrics for alerts_sent and alerts_send_errors to identify which exact address is having issues. The according change was made to vmalert dashboard. * vmalert: update documentation and docker environment for vmalert's dashboard Mention Grafana's dashboard in vmalert's README in a new section #Monitoring. Update docker-compose env to automatically add vmalert's dashboard. Update docker-compose README with additional info about services.
This commit is contained in:
parent
434f33d04d
commit
1cb7037fc8
10 changed files with 64 additions and 24 deletions
|
@ -56,6 +56,7 @@ test-vmalert:
|
|||
go test -v -race -cover ./app/vmalert/datasource
|
||||
go test -v -race -cover ./app/vmalert/notifier
|
||||
go test -v -race -cover ./app/vmalert/config
|
||||
go test -v -race -cover ./app/vmalert/remotewrite
|
||||
|
||||
run-vmalert: vmalert
|
||||
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
|
||||
|
|
|
@ -340,6 +340,17 @@ See full description for these flags in `./vmalert --help`.
|
|||
* `query` template function is disabled for performance reasons (might be changed in future);
|
||||
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
We recommend setting up regular scraping of this page either through `vmagent` or by Prometheus so that the exported
|
||||
metrics may be analyzed later.
|
||||
|
||||
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/14950) for `vmalert` overview.
|
||||
If you have suggestions for improvements or have found a bug - please open an issue on github or add
|
||||
a review to the dashboard.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
Pass `-help` to `vmalert` in order to see the full list of supported
|
||||
|
|
|
@ -174,12 +174,6 @@ func (g *Group) updateWith(newGroup *Group) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
||||
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
|
||||
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
|
||||
)
|
||||
|
||||
func (g *Group) close() {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
|
@ -220,7 +214,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
|||
}
|
||||
|
||||
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
||||
e := &executor{nts, rw}
|
||||
e := &executor{rw: rw}
|
||||
for _, nt := range nts {
|
||||
ent := eNotifier{
|
||||
Notifier: nt,
|
||||
alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
|
||||
alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
|
||||
}
|
||||
e.notifiers = append(e.notifiers, ent)
|
||||
}
|
||||
|
||||
t := time.NewTicker(g.Interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
|
@ -263,10 +266,16 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr
|
|||
}
|
||||
|
||||
type executor struct {
|
||||
notifiers []notifier.Notifier
|
||||
notifiers []eNotifier
|
||||
rw *remotewrite.Client
|
||||
}
|
||||
|
||||
type eNotifier struct {
|
||||
notifier.Notifier
|
||||
alertsSent *counter
|
||||
alertsSendErrors *counter
|
||||
}
|
||||
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
if concurrency == 1 {
|
||||
|
@ -297,19 +306,16 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren
|
|||
}
|
||||
|
||||
var (
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
|
||||
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
||||
|
||||
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
||||
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
||||
|
||||
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
||||
)
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error {
|
||||
execTotal.Inc()
|
||||
execStart := time.Now()
|
||||
defer func() {
|
||||
execDuration.UpdateDuration(execStart)
|
||||
}()
|
||||
|
||||
tss, err := rule.Exec(ctx)
|
||||
if err != nil {
|
||||
|
@ -350,11 +356,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration)
|
|||
return nil
|
||||
}
|
||||
|
||||
alertsSent.Add(len(alerts))
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers {
|
||||
nt.alertsSent.Add(len(alerts))
|
||||
if err := nt.Send(ctx, alerts); err != nil {
|
||||
alertsSendErrors.Inc()
|
||||
nt.alertsSendErrors.Inc()
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,6 +63,7 @@ type fakeNotifier struct {
|
|||
alerts []notifier.Alert
|
||||
}
|
||||
|
||||
func (*fakeNotifier) Addr() string { return "" }
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
|
|
|
@ -274,6 +274,9 @@ func configReload(ctx context.Context, m *manager, groupsCfg []config.Group) {
|
|||
continue
|
||||
}
|
||||
if configsEqual(newGroupsCfg, groupsCfg) {
|
||||
// set success to 1 since previous reload
|
||||
// could have been unsuccessful
|
||||
configSuccess.Set(1)
|
||||
// config didn't change - skip it
|
||||
continue
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
// AlertManager represents integration provider with Prometheus alert manager
|
||||
// https://github.com/prometheus/alertmanager
|
||||
type AlertManager struct {
|
||||
addr string
|
||||
alertURL string
|
||||
basicAuthUser string
|
||||
basicAuthPass string
|
||||
|
@ -19,6 +20,9 @@ type AlertManager struct {
|
|||
client *http.Client
|
||||
}
|
||||
|
||||
// Addr returns address where alerts are sent.
|
||||
func (am AlertManager) Addr() string { return am.addr }
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
|
||||
b := &bytes.Buffer{}
|
||||
|
@ -57,9 +61,10 @@ const alertManagerPath = "/api/v2/alerts"
|
|||
|
||||
// NewAlertManager is a constructor for AlertManager
|
||||
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
|
||||
addr := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
url := strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
|
||||
return &AlertManager{
|
||||
alertURL: addr,
|
||||
addr: alertManagerURL,
|
||||
alertURL: url,
|
||||
argFunc: fn,
|
||||
client: c,
|
||||
basicAuthUser: user,
|
||||
|
|
|
@ -10,6 +10,14 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
func TestAlertManager_Addr(t *testing.T) {
|
||||
const addr = "http://localhost"
|
||||
am := NewAlertManager(addr, "", "", nil, nil)
|
||||
if am.Addr() != addr {
|
||||
t.Errorf("expected to have %q; got %q", addr, am.Addr())
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlertManager_Send(t *testing.T) {
|
||||
const baUser, baPass = "foo", "bar"
|
||||
mux := http.NewServeMux()
|
||||
|
|
|
@ -2,7 +2,12 @@ package notifier
|
|||
|
||||
import "context"
|
||||
|
||||
// Notifier is common interface for alert manager provider
|
||||
// Notifier is a common interface for alert manager provider
|
||||
type Notifier interface {
|
||||
// Send sends the given list of alerts.
|
||||
// Returns an error if fails to send the alerts.
|
||||
// Must unblock if the given ctx is cancelled.
|
||||
Send(ctx context.Context, alerts []Alert) error
|
||||
// Addr returns address where alerts are sent.
|
||||
Addr() string
|
||||
}
|
||||
|
|
|
@ -55,4 +55,4 @@ Default creds:
|
|||
Grafana is provisioned by default with following entities:
|
||||
* VictoriaMetrics datasource
|
||||
* Prometheus datasource
|
||||
* VictoriaMetrics overview dashboard
|
||||
* VictoriaMetrics overview dashboard
|
||||
|
|
|
@ -6,15 +6,15 @@ scrape_configs:
|
|||
- job_name: 'vmagent'
|
||||
static_configs:
|
||||
- targets: ['vmagent:8429']
|
||||
|
||||
- job_name: 'vmalert'
|
||||
static_configs:
|
||||
- targets: ['vmalert:8880']
|
||||
- job_name: 'vminsert'
|
||||
static_configs:
|
||||
- targets: ['vminsert:8480']
|
||||
|
||||
- job_name: 'vmselect'
|
||||
static_configs:
|
||||
- targets: ['vmselect:8481']
|
||||
|
||||
- job_name: 'vmstorage'
|
||||
static_configs:
|
||||
- targets: ['vmstorage:8482']
|
||||
|
|
Loading…
Reference in a new issue