2020-04-06 11:44:03 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"hash/fnv"
|
|
|
|
"sort"
|
2020-04-11 15:49:23 +00:00
|
|
|
"strconv"
|
2020-04-06 11:44:03 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
|
2020-04-27 21:18:02 +00:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
2020-04-06 11:44:03 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Group grouping array of alert
|
|
|
|
type Group struct {
|
|
|
|
Name string
|
|
|
|
Rules []*Rule
|
|
|
|
}
|
|
|
|
|
|
|
|
// Rule is basic alert entity
|
|
|
|
type Rule struct {
|
|
|
|
Name string `yaml:"alert"`
|
|
|
|
Expr string `yaml:"expr"`
|
|
|
|
For time.Duration `yaml:"for"`
|
|
|
|
Labels map[string]string `yaml:"labels"`
|
|
|
|
Annotations map[string]string `yaml:"annotations"`
|
|
|
|
|
|
|
|
group *Group
|
|
|
|
|
|
|
|
// guard status fields
|
2020-04-11 15:49:23 +00:00
|
|
|
mu sync.RWMutex
|
2020-04-06 11:44:03 +00:00
|
|
|
// stores list of active alerts
|
|
|
|
alerts map[uint64]*notifier.Alert
|
|
|
|
// stores last moment of time Exec was called
|
|
|
|
lastExecTime time.Time
|
|
|
|
// stores last error that happened in Exec func
|
|
|
|
// resets on every successful Exec
|
|
|
|
// may be used as Health state
|
|
|
|
lastExecError error
|
|
|
|
}
|
|
|
|
|
|
|
|
// Validate validates rule
|
|
|
|
func (r *Rule) Validate() error {
|
|
|
|
if r.Name == "" {
|
|
|
|
return errors.New("rule name can not be empty")
|
|
|
|
}
|
|
|
|
if r.Expr == "" {
|
|
|
|
return fmt.Errorf("expression for rule %q can't be empty", r.Name)
|
|
|
|
}
|
|
|
|
if _, err := metricsql.Parse(r.Expr); err != nil {
|
|
|
|
return fmt.Errorf("invalid expression for rule %q: %w", r.Name, err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Exec executes Rule expression via the given Querier.
|
|
|
|
// Based on the Querier results Rule maintains notifier.Alerts
|
|
|
|
func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
|
2020-04-11 19:42:01 +00:00
|
|
|
qMetrics, err := q.Query(ctx, r.Expr)
|
2020-04-06 11:44:03 +00:00
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
|
|
|
|
r.lastExecError = err
|
|
|
|
r.lastExecTime = time.Now()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to execute query %q: %s", r.Expr, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for h, a := range r.alerts {
|
|
|
|
// cleanup inactive alerts from previous Eval
|
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
delete(r.alerts, h)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
updated := make(map[uint64]struct{})
|
|
|
|
// update list of active alerts
|
2020-04-11 19:42:01 +00:00
|
|
|
for _, m := range qMetrics {
|
2020-04-06 11:44:03 +00:00
|
|
|
h := hash(m)
|
|
|
|
updated[h] = struct{}{}
|
|
|
|
if _, ok := r.alerts[h]; ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a, err := r.newAlert(m)
|
|
|
|
if err != nil {
|
|
|
|
r.lastExecError = err
|
|
|
|
return fmt.Errorf("failed to create alert: %s", err)
|
|
|
|
}
|
2020-04-11 09:40:24 +00:00
|
|
|
a.ID = h
|
2020-04-06 11:44:03 +00:00
|
|
|
a.State = notifier.StatePending
|
|
|
|
r.alerts[h] = a
|
|
|
|
}
|
|
|
|
|
|
|
|
for h, a := range r.alerts {
|
|
|
|
// if alert wasn't updated in this iteration
|
|
|
|
// means it is resolved already
|
|
|
|
if _, ok := updated[h]; !ok {
|
|
|
|
a.State = notifier.StateInactive
|
|
|
|
// set endTime to last execution time
|
|
|
|
// so it can be sent by notifier on next step
|
|
|
|
a.End = r.lastExecTime
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
|
|
|
|
a.State = notifier.StateFiring
|
2020-04-11 19:42:01 +00:00
|
|
|
alertsFired.Inc()
|
2020-04-06 11:44:03 +00:00
|
|
|
}
|
|
|
|
if a.State == notifier.StateFiring {
|
|
|
|
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: consider hashing algorithm in VM
|
|
|
|
func hash(m datasource.Metric) uint64 {
|
|
|
|
hash := fnv.New64a()
|
|
|
|
labels := m.Labels
|
|
|
|
sort.Slice(labels, func(i, j int) bool {
|
|
|
|
return labels[i].Name < labels[j].Name
|
|
|
|
})
|
|
|
|
for _, l := range labels {
|
|
|
|
hash.Write([]byte(l.Name))
|
|
|
|
hash.Write([]byte(l.Value))
|
|
|
|
hash.Write([]byte("\xff"))
|
|
|
|
}
|
|
|
|
return hash.Sum64()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
|
|
|
|
a := ¬ifier.Alert{
|
|
|
|
Group: r.group.Name,
|
|
|
|
Name: r.Name,
|
|
|
|
Labels: map[string]string{},
|
|
|
|
Value: m.Value,
|
|
|
|
Start: time.Now(),
|
|
|
|
// TODO: support End time
|
|
|
|
}
|
2020-04-26 10:30:10 +00:00
|
|
|
|
|
|
|
// 1. use data labels
|
2020-04-06 11:44:03 +00:00
|
|
|
for _, l := range m.Labels {
|
|
|
|
a.Labels[l.Name] = l.Value
|
|
|
|
}
|
2020-04-26 10:30:10 +00:00
|
|
|
|
|
|
|
// 2. template rule labels with data labels
|
|
|
|
rLabels, err := a.ExecTemplate(r.Labels)
|
|
|
|
if err != nil {
|
|
|
|
return a, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// 3. merge data labels and rule labels
|
2020-04-06 11:44:03 +00:00
|
|
|
// metric labels may be overridden by
|
|
|
|
// rule labels
|
2020-04-26 10:30:10 +00:00
|
|
|
for k, v := range rLabels {
|
2020-04-06 11:44:03 +00:00
|
|
|
a.Labels[k] = v
|
|
|
|
}
|
2020-04-26 10:30:10 +00:00
|
|
|
|
|
|
|
// 4. template merged labels
|
|
|
|
a.Labels, err = a.ExecTemplate(a.Labels)
|
|
|
|
if err != nil {
|
|
|
|
return a, err
|
|
|
|
}
|
|
|
|
|
2020-04-06 11:44:03 +00:00
|
|
|
a.Annotations, err = a.ExecTemplate(r.Annotations)
|
|
|
|
return a, err
|
|
|
|
}
|
2020-04-11 09:40:24 +00:00
|
|
|
|
2020-04-12 12:08:11 +00:00
|
|
|
// AlertAPI generates APIAlert object from alert by its id(hash)
|
|
|
|
func (r *Rule) AlertAPI(id uint64) *APIAlert {
|
2020-04-11 15:49:23 +00:00
|
|
|
r.mu.RLock()
|
|
|
|
defer r.mu.RUnlock()
|
|
|
|
a, ok := r.alerts[id]
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return r.newAlertAPI(*a)
|
|
|
|
}
|
|
|
|
|
2020-04-12 12:08:11 +00:00
|
|
|
// AlertsAPI generates list of APIAlert objects from existing alerts
|
|
|
|
func (r *Rule) AlertsAPI() []*APIAlert {
|
|
|
|
var alerts []*APIAlert
|
2020-04-11 15:49:23 +00:00
|
|
|
r.mu.RLock()
|
2020-04-11 09:40:24 +00:00
|
|
|
for _, a := range r.alerts {
|
2020-04-11 15:49:23 +00:00
|
|
|
alerts = append(alerts, r.newAlertAPI(*a))
|
2020-04-11 09:40:24 +00:00
|
|
|
}
|
2020-04-11 15:49:23 +00:00
|
|
|
r.mu.RUnlock()
|
|
|
|
return alerts
|
2020-04-11 09:40:24 +00:00
|
|
|
}
|
|
|
|
|
2020-04-12 12:08:11 +00:00
|
|
|
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
|
|
|
|
return &APIAlert{
|
2020-04-11 15:49:23 +00:00
|
|
|
ID: a.ID,
|
|
|
|
Name: a.Name,
|
|
|
|
Group: a.Group,
|
|
|
|
Expression: r.Expr,
|
|
|
|
Labels: a.Labels,
|
|
|
|
Annotations: a.Annotations,
|
|
|
|
State: a.State.String(),
|
|
|
|
ActiveAt: a.Start,
|
|
|
|
Value: strconv.FormatFloat(a.Value, 'e', -1, 64),
|
|
|
|
}
|
2020-04-11 09:40:24 +00:00
|
|
|
}
|
2020-04-27 21:18:02 +00:00
|
|
|
|
|
|
|
const (
	// alertMetricName is the metric name for synthetic alert timeseries.
	alertMetricName = "ALERTS"
	// alertForStateMetricName is the metric name for 'for' state of alert.
	alertForStateMetricName = "ALERTS_FOR_STATE"

	// alertNameLabel is the label name indicating the name of an alert.
	alertNameLabel = "alertname"
	// alertStateLabel is the label name indicating the state of an alert.
	alertStateLabel = "alertstate"
)
|
|
|
|
|
|
|
|
func (r *Rule) AlertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
|
|
|
|
var tss []prompbmarshal.TimeSeries
|
|
|
|
tss = append(tss, alertToTimeSeries(r.Name, a, timestamp))
|
|
|
|
if r.For > 0 {
|
|
|
|
tss = append(tss, alertForToTimeSeries(r.Name, a, timestamp))
|
|
|
|
}
|
|
|
|
return tss
|
|
|
|
}
|
|
|
|
|
|
|
|
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertMetricName
|
|
|
|
labels[alertNameLabel] = name
|
|
|
|
labels[alertStateLabel] = a.State.String()
|
|
|
|
return newTimeSeries(1, labels, timestamp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
|
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertForStateMetricName
|
|
|
|
labels[alertNameLabel] = name
|
|
|
|
return newTimeSeries(float64(a.Start.Unix()), labels, timestamp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
|
|
|
|
ts := prompbmarshal.TimeSeries{}
|
|
|
|
ts.Samples = append(ts.Samples, prompbmarshal.Sample{
|
|
|
|
Value: value,
|
|
|
|
Timestamp: timestamp.UnixNano() / 1e6,
|
|
|
|
})
|
|
|
|
keys := make([]string, 0, len(labels))
|
|
|
|
for k := range labels {
|
|
|
|
keys = append(keys, k)
|
|
|
|
}
|
|
|
|
sort.Strings(keys)
|
|
|
|
for _, key := range keys {
|
|
|
|
ts.Labels = append(ts.Labels, prompbmarshal.Label{
|
|
|
|
Name: key,
|
|
|
|
Value: labels[key],
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return ts
|
|
|
|
}
|