2020-05-10 16:58:17 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
2021-12-02 12:45:08 +00:00
|
|
|
"net/url"
|
|
|
|
"sort"
|
2020-05-10 16:58:17 +00:00
|
|
|
"sync"
|
|
|
|
|
2020-06-01 10:46:37 +00:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
2020-05-10 16:58:17 +00:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
)
|
|
|
|
|
2020-06-01 10:46:37 +00:00
|
|
|
// manager controls group states
|
2020-05-10 16:58:17 +00:00
|
|
|
type manager struct {
|
2021-04-28 20:41:15 +00:00
|
|
|
querierBuilder datasource.QuerierBuilder
|
2022-02-02 12:11:41 +00:00
|
|
|
notifiers func() []notifier.Notifier
|
2020-05-10 16:58:17 +00:00
|
|
|
|
|
|
|
rw *remotewrite.Client
|
2021-04-28 20:41:15 +00:00
|
|
|
// remote read builder.
|
|
|
|
rr datasource.QuerierBuilder
|
2020-05-10 16:58:17 +00:00
|
|
|
|
2020-07-28 11:20:31 +00:00
|
|
|
wg sync.WaitGroup
|
|
|
|
labels map[string]string
|
2020-05-10 16:58:17 +00:00
|
|
|
|
|
|
|
groupsMu sync.RWMutex
|
|
|
|
groups map[uint64]*Group
|
|
|
|
}
|
|
|
|
|
2020-06-01 10:46:37 +00:00
|
|
|
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
2020-05-10 16:58:17 +00:00
|
|
|
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
|
|
|
m.groupsMu.RLock()
|
|
|
|
defer m.groupsMu.RUnlock()
|
|
|
|
|
|
|
|
g, ok := m.groups[gID]
|
|
|
|
if !ok {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 18:24:45 +00:00
|
|
|
return nil, fmt.Errorf("can't find group with id %d", gID)
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
for _, rule := range g.Rules {
|
2020-06-01 10:46:37 +00:00
|
|
|
ar, ok := rule.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
2020-05-10 16:58:17 +00:00
|
|
|
return apiAlert, nil
|
|
|
|
}
|
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 18:24:45 +00:00
|
|
|
return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
|
2021-05-25 13:27:22 +00:00
|
|
|
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
|
|
|
|
return m.update(ctx, groupsCfg, true)
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (m *manager) close() {
|
|
|
|
if m.rw != nil {
|
|
|
|
err := m.rw.Close()
|
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
m.wg.Wait()
|
|
|
|
}
|
|
|
|
|
2021-05-05 07:07:19 +00:00
|
|
|
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
|
2020-05-13 18:32:58 +00:00
|
|
|
if restore && m.rr != nil {
|
2020-07-28 11:20:31 +00:00
|
|
|
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
2020-05-10 16:58:17 +00:00
|
|
|
if err != nil {
|
2021-05-10 08:06:31 +00:00
|
|
|
if !*remoteReadIgnoreRestoreErrors {
|
|
|
|
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
|
|
|
}
|
|
|
|
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
m.wg.Add(1)
|
|
|
|
id := group.ID()
|
|
|
|
go func() {
|
2021-04-28 20:41:15 +00:00
|
|
|
group.start(ctx, m.notifiers, m.rw)
|
2020-05-10 16:58:17 +00:00
|
|
|
m.wg.Done()
|
|
|
|
}()
|
2020-06-01 10:46:37 +00:00
|
|
|
m.groups[id] = group
|
2021-05-05 07:07:19 +00:00
|
|
|
return nil
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
|
2021-05-25 13:27:22 +00:00
|
|
|
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
2021-11-29 23:18:48 +00:00
|
|
|
var rrPresent, arPresent bool
|
2020-06-01 10:46:37 +00:00
|
|
|
groupsRegistry := make(map[uint64]*Group)
|
|
|
|
for _, cfg := range groupsCfg {
|
2021-11-29 23:18:48 +00:00
|
|
|
for _, r := range cfg.Rules {
|
|
|
|
if rrPresent && arPresent {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if r.Record != "" {
|
|
|
|
rrPresent = true
|
|
|
|
}
|
|
|
|
if r.Alert != "" {
|
|
|
|
arPresent = true
|
|
|
|
}
|
|
|
|
}
|
2021-04-28 20:41:15 +00:00
|
|
|
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
2020-05-10 16:58:17 +00:00
|
|
|
groupsRegistry[ng.ID()] = ng
|
|
|
|
}
|
|
|
|
|
2021-11-29 23:18:48 +00:00
|
|
|
if rrPresent && m.rw == nil {
|
|
|
|
return fmt.Errorf("config contains recording rules but `-remoteWrite.url` isn't set")
|
|
|
|
}
|
|
|
|
if arPresent && m.notifiers == nil {
|
2022-02-02 12:11:41 +00:00
|
|
|
return fmt.Errorf("config contains alerting rules but neither `-notifier.url` nor `-notifier.config` aren't set")
|
2021-11-29 23:18:48 +00:00
|
|
|
}
|
|
|
|
|
2020-09-11 19:14:30 +00:00
|
|
|
type updateItem struct {
|
|
|
|
old *Group
|
|
|
|
new *Group
|
|
|
|
}
|
|
|
|
var toUpdate []updateItem
|
|
|
|
|
2020-05-10 16:58:17 +00:00
|
|
|
m.groupsMu.Lock()
|
|
|
|
for _, og := range m.groups {
|
2020-05-17 14:12:09 +00:00
|
|
|
ng, ok := groupsRegistry[og.ID()]
|
2020-05-10 16:58:17 +00:00
|
|
|
if !ok {
|
2020-09-11 19:14:30 +00:00
|
|
|
// old group is not present in new list,
|
|
|
|
// so must be stopped and deleted
|
2020-05-10 16:58:17 +00:00
|
|
|
og.close()
|
|
|
|
delete(m.groups, og.ID())
|
|
|
|
og = nil
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
delete(groupsRegistry, ng.ID())
|
2020-09-11 19:14:30 +00:00
|
|
|
if og.Checksum != ng.Checksum {
|
|
|
|
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
|
|
|
|
}
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
for _, ng := range groupsRegistry {
|
2021-05-05 07:07:19 +00:00
|
|
|
if err := m.startGroup(ctx, ng, restore); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-05-10 16:58:17 +00:00
|
|
|
}
|
|
|
|
m.groupsMu.Unlock()
|
2020-09-11 19:14:30 +00:00
|
|
|
|
|
|
|
if len(toUpdate) > 0 {
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
for _, item := range toUpdate {
|
|
|
|
wg.Add(1)
|
|
|
|
go func(old *Group, new *Group) {
|
|
|
|
old.updateCh <- new
|
|
|
|
wg.Done()
|
|
|
|
}(item.old, item.new)
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
}
|
2020-05-10 16:58:17 +00:00
|
|
|
return nil
|
|
|
|
}
|
2020-06-01 10:46:37 +00:00
|
|
|
|
|
|
|
func (g *Group) toAPI() APIGroup {
|
2020-09-11 19:14:30 +00:00
|
|
|
g.mu.RLock()
|
|
|
|
defer g.mu.RUnlock()
|
|
|
|
|
2020-06-01 10:46:37 +00:00
|
|
|
ag := APIGroup{
|
2020-09-11 19:14:30 +00:00
|
|
|
// encode as string to avoid rounding
|
2021-05-22 21:26:01 +00:00
|
|
|
ID: fmt.Sprintf("%d", g.ID()),
|
|
|
|
|
2022-03-15 11:54:53 +00:00
|
|
|
Name: g.Name,
|
|
|
|
Type: g.Type.String(),
|
|
|
|
File: g.File,
|
|
|
|
Interval: g.Interval.Seconds(),
|
|
|
|
LastEvaluation: g.LastEvaluation,
|
|
|
|
Concurrency: g.Concurrency,
|
|
|
|
Params: urlValuesToStrings(g.Params),
|
|
|
|
Labels: g.Labels,
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
for _, r := range g.Rules {
|
2022-03-15 11:54:53 +00:00
|
|
|
ag.Rules = append(ag.Rules, r.ToAPI())
|
2020-06-01 10:46:37 +00:00
|
|
|
}
|
|
|
|
return ag
|
|
|
|
}
|
2021-12-02 12:45:08 +00:00
|
|
|
|
|
|
|
// urlValuesToStrings flattens values into a list of "key=value" strings.
//
// Keys are emitted in sorted order so the output is deterministic; multiple
// values of one key keep their original order. Keys and values are written
// as-is, without escaping. It returns nil when there is nothing to emit.
func urlValuesToStrings(values url.Values) []string {
	if len(values) == 0 {
		return nil
	}

	// collect the keys and count the pairs in one pass so the result
	// can be allocated exactly once
	keys := make([]string, 0, len(values))
	total := 0
	for k, vs := range values {
		keys = append(keys, k)
		total += len(vs)
	}
	if total == 0 {
		// keys without any values produce no pairs; keep the result nil
		return nil
	}
	sort.Strings(keys)

	res := make([]string, 0, total)
	for _, k := range keys {
		for _, v := range values[k] {
			res = append(res, k+"="+v)
		}
	}
	return res
}
|