2020-02-16 18:59:02 +00:00
package main
import (
2020-03-13 10:19:31 +00:00
"context"
2020-02-16 18:59:02 +00:00
"flag"
2020-03-13 10:19:31 +00:00
"fmt"
2020-02-16 18:59:02 +00:00
"net/http"
2020-04-01 15:17:53 +00:00
"net/url"
"os"
2020-03-13 10:19:31 +00:00
"strings"
2020-04-06 11:44:03 +00:00
"sync"
2020-03-13 10:19:31 +00:00
"time"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
2020-04-06 11:44:03 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
2020-04-27 21:18:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
2020-03-28 23:48:30 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
2020-04-11 19:42:01 +00:00
"github.com/VictoriaMetrics/metrics"
2020-02-16 18:59:02 +00:00
)
var (
2020-04-12 11:47:26 +00:00
rulePath = flagutil . NewArray ( "rule" , ` Path to the file with alert rules .
Supports patterns . Flag can be specified multiple times .
2020-03-28 23:48:30 +00:00
Examples :
2020-04-12 11:47:26 +00:00
- rule / path / to / file . Path to a single file with alerting rules
- rule dir / * . yaml - rule / * . yaml . Relative path to all . yaml files in "dir" folder ,
absolute path to all . yaml files in root . ` )
2020-04-27 21:18:02 +00:00
validateTemplates = flag . Bool ( "rule.validateTemplates" , true , "Indicates to validate annotation and label templates" )
httpListenAddr = flag . String ( "httpListenAddr" , ":8880" , "Address to listen for http connections" )
2020-05-04 21:51:22 +00:00
datasourceURL = flag . String ( "datasource.url" , "" , "Victoria Metrics or VMSelect url. Required parameter." +
" E.g. http://127.0.0.1:8428" )
basicAuthUsername = flag . String ( "datasource.basicAuth.username" , "" , "Optional basic auth username for -datasource.url" )
basicAuthPassword = flag . String ( "datasource.basicAuth.password" , "" , "Optional basic auth password for -datasource.url" )
remoteWriteURL = flag . String ( "remotewrite.url" , "" , "Optional URL to Victoria Metrics or VMInsert where to persist alerts state" +
" in form of timeseries. E.g. http://127.0.0.1:8428" )
remoteWriteUsername = flag . String ( "remotewrite.basicAuth.username" , "" , "Optional basic auth username for -remotewrite.url" )
remoteWritePassword = flag . String ( "remotewrite.basicAuth.password" , "" , "Optional basic auth password for -remotewrite.url" )
remoteReadURL = flag . String ( "remoteread.url" , "" , "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts" +
" state. This configuration makes sense only if `vmalert` was configured with `remotewrite.url` before and has been successfully persisted its state." +
" E.g. http://127.0.0.1:8428" )
remoteReadUsername = flag . String ( "remoteread.basicAuth.username" , "" , "Optional basic auth username for -remoteread.url" )
remoteReadPassword = flag . String ( "remoteread.basicAuth.password" , "" , "Optional basic auth password for -remoteread.url" )
remoteReadLookBack = flag . Duration ( "remoteread.lookback" , time . Hour , "Lookback defines how far to look into past for alerts timeseries." +
" For example, if lookback=1h then range from now() to now()-1h will be scanned." )
evaluationInterval = flag . Duration ( "evaluationInterval" , time . Minute , "How often to evaluate the rules. Default 1m" )
2020-04-26 11:15:04 +00:00
notifierURL = flag . String ( "notifier.url" , "" , "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093" )
externalURL = flag . String ( "external.url" , "" , "External URL is used as alert's source for sent alerts to the notifier" )
2020-02-16 18:59:02 +00:00
)
func main ( ) {
envflag . Parse ( )
buildinfo . Init ( )
logger . Init ( )
2020-03-28 23:48:30 +00:00
checkFlags ( )
2020-03-13 10:19:31 +00:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
2020-04-01 19:29:11 +00:00
eu , err := getExternalURL ( * externalURL , * httpListenAddr , httpserver . IsTLS ( ) )
2020-04-01 15:17:53 +00:00
if err != nil {
logger . Fatalf ( "can not get external url:%s " , err )
}
2020-04-06 11:44:03 +00:00
notifier . InitTemplateFunc ( eu )
2020-02-16 18:59:02 +00:00
2020-03-28 23:48:30 +00:00
logger . Infof ( "reading alert rules configuration file from %s" , strings . Join ( * rulePath , ";" ) )
2020-05-09 09:32:12 +00:00
groups , err := readRules ( )
2020-02-16 18:59:02 +00:00
if err != nil {
2020-04-27 21:18:02 +00:00
logger . Fatalf ( "cannot parse configuration file: %s" , err )
2020-02-16 18:59:02 +00:00
}
2020-03-28 23:48:30 +00:00
2020-03-13 10:19:31 +00:00
w := & watchdog {
storage : datasource . NewVMStorage ( * datasourceURL , * basicAuthUsername , * basicAuthPassword , & http . Client { } ) ,
2020-04-11 15:49:23 +00:00
alertProvider : notifier . NewAlertManager ( * notifierURL , func ( group , name string ) string {
2020-04-11 09:40:24 +00:00
return fmt . Sprintf ( "%s/api/v1/%s/%s/status" , eu , group , name )
2020-03-13 10:19:31 +00:00
} , & http . Client { } ) ,
}
2020-04-27 21:18:02 +00:00
if * remoteWriteURL != "" {
c , err := remotewrite . NewClient ( ctx , remotewrite . Config {
Addr : * remoteWriteURL ,
FlushInterval : * evaluationInterval ,
2020-05-04 21:51:22 +00:00
BasicAuthUser : * remoteWriteUsername ,
BasicAuthPass : * remoteWritePassword ,
2020-04-27 21:18:02 +00:00
} )
if err != nil {
logger . Fatalf ( "failed to init remotewrite client: %s" , err )
}
w . rw = c
}
2020-05-04 21:51:22 +00:00
var restoreDS * datasource . VMStorage
if * remoteReadURL != "" {
restoreDS = datasource . NewVMStorage ( * remoteReadURL , * remoteReadUsername , * remoteReadPassword , & http . Client { } )
}
2020-04-06 11:44:03 +00:00
wg := sync . WaitGroup { }
2020-05-09 09:32:12 +00:00
groupUpdateStorage := startInitGroups ( ctx , w , restoreDS , groups , & wg )
rh := & requestHandler { groups : groups , mu : sync . RWMutex { } }
//run config updater
wg . Add ( 1 )
sigHup := procutil . NewSighupChan ( )
go rh . runConfigUpdater ( ctx , sigHup , groupUpdateStorage , w , & wg )
go httpserver . Serve ( * httpListenAddr , ( rh ) . handler )
2020-04-06 11:44:03 +00:00
2020-02-16 18:59:02 +00:00
sig := procutil . WaitForSigterm ( )
logger . Infof ( "service received signal %s" , sig )
2020-02-21 21:15:05 +00:00
if err := httpserver . Stop ( * httpListenAddr ) ; err != nil {
logger . Fatalf ( "cannot stop the webservice: %s" , err )
}
2020-03-13 10:19:31 +00:00
cancel ( )
2020-04-27 21:18:02 +00:00
if w . rw != nil {
err := w . rw . Close ( )
if err != nil {
logger . Fatalf ( "cannot stop the remotewrite: %s" , err )
}
}
2020-04-06 11:44:03 +00:00
wg . Wait ( )
2020-02-16 18:59:02 +00:00
}
type watchdog struct {
2020-03-13 10:19:31 +00:00
storage * datasource . VMStorage
2020-04-06 11:44:03 +00:00
alertProvider notifier . Notifier
2020-04-27 21:18:02 +00:00
rw * remotewrite . Client
2020-03-13 10:19:31 +00:00
}
2020-04-11 19:42:01 +00:00
var (
iterationTotal = metrics . NewCounter ( ` vmalert_iteration_total ` )
iterationDuration = metrics . NewSummary ( ` vmalert_iteration_duration_seconds ` )
execTotal = metrics . NewCounter ( ` vmalert_execution_total ` )
execErrors = metrics . NewCounter ( ` vmalert_execution_errors_total ` )
execDuration = metrics . NewSummary ( ` vmalert_execution_duration_seconds ` )
2020-04-27 21:18:02 +00:00
alertsFired = metrics . NewCounter ( ` vmalert_alerts_fired_total ` )
alertsSent = metrics . NewCounter ( ` vmalert_alerts_sent_total ` )
alertsSendErrors = metrics . NewCounter ( ` vmalert_alerts_send_errors_total ` )
remoteWriteSent = metrics . NewCounter ( ` vmalert_remotewrite_sent_total ` )
remoteWriteErrors = metrics . NewCounter ( ` vmalert_remotewrite_errors_total ` )
2020-05-09 09:32:12 +00:00
configReloadTotal = metrics . NewCounter ( ` vmalert_config_reload_total ` )
configReloadOkTotal = metrics . NewCounter ( ` vmalert_config_reload_ok_total ` )
configReloadErrorTotal = metrics . NewCounter ( ` vmalert_config_reload_error_total ` )
2020-04-11 19:42:01 +00:00
)
2020-05-09 09:32:12 +00:00
func ( w * watchdog ) run ( ctx context . Context , group Group , evaluationInterval time . Duration , groupUpdate chan Group ) {
2020-04-11 19:42:01 +00:00
logger . Infof ( "watchdog for %s has been started" , group . Name )
2020-03-13 10:19:31 +00:00
t := time . NewTicker ( evaluationInterval )
defer t . Stop ( )
for {
2020-04-11 19:42:01 +00:00
2020-03-13 10:19:31 +00:00
select {
2020-05-09 09:32:12 +00:00
case newGroup := <- groupUpdate :
if newGroup . Rules == nil || len ( newGroup . Rules ) == 0 {
//empty rules for group
//need to exit
logger . Infof ( "stopping group: %s, it contains 0 rules now" , group . Name )
return
}
logger . Infof ( "new group update received, group: %s" , group . Name )
group . Update ( newGroup )
logger . Infof ( "group was reconciled, group: %s" , group . Name )
2020-03-13 10:19:31 +00:00
case <- t . C :
2020-04-11 19:42:01 +00:00
iterationTotal . Inc ( )
iterationStart := time . Now ( )
2020-04-06 11:44:03 +00:00
for _ , rule := range group . Rules {
2020-04-11 19:42:01 +00:00
execTotal . Inc ( )
execStart := time . Now ( )
err := rule . Exec ( ctx , w . storage )
execDuration . UpdateDuration ( execStart )
if err != nil {
execErrors . Inc ( )
2020-04-06 11:44:03 +00:00
logger . Errorf ( "failed to execute rule %q.%q: %s" , group . Name , rule . Name , err )
2020-03-13 10:19:31 +00:00
continue
}
2020-04-11 19:42:01 +00:00
2020-04-27 21:18:02 +00:00
var alertsToSend [ ] notifier . Alert
for _ , a := range rule . alerts {
if a . State != notifier . StatePending {
alertsToSend = append ( alertsToSend , * a )
}
if a . State == notifier . StateInactive || w . rw == nil {
continue
}
tss := rule . AlertToTimeSeries ( a , execStart )
for _ , ts := range tss {
remoteWriteSent . Inc ( )
if err := w . rw . Push ( ts ) ; err != nil {
remoteWriteErrors . Inc ( )
logger . Errorf ( "failed to push timeseries to remotewrite: %s" , err )
}
}
}
alertsSent . Add ( len ( alertsToSend ) )
if err := w . alertProvider . Send ( alertsToSend ) ; err != nil {
alertsSendErrors . Inc ( )
2020-04-06 11:44:03 +00:00
logger . Errorf ( "failed to send alert for rule %q.%q: %s" , group . Name , rule . Name , err )
2020-03-13 10:19:31 +00:00
}
}
2020-04-11 19:42:01 +00:00
iterationDuration . UpdateDuration ( iterationStart )
2020-03-13 10:19:31 +00:00
case <- ctx . Done ( ) :
2020-04-06 11:44:03 +00:00
logger . Infof ( "%s received stop signal" , group . Name )
2020-03-13 10:19:31 +00:00
return
}
}
2020-02-16 18:59:02 +00:00
}
2020-04-01 15:17:53 +00:00
// getExternalURL returns the URL used as the source of alerts sent to the
// notifier.
//
// A non-empty externalURL is parsed and returned as is. Otherwise the URL is
// assembled from the scheme (https when isSecure is true, http otherwise), the
// OS hostname and the port taken from httpListenAddr, if it has one.
//
// An error is returned when the hostname cannot be obtained or when the
// resulting string is not a valid URL.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, err
	}
	// The port is whatever follows the last colon. Splitting on the first colon
	// would break IPv6 listen addresses such as "[::1]:8880". A colon that is
	// still inside the brackets of an IPv6 literal ("[::1]") is not a port
	// separator and is ignored.
	port := ""
	if i := strings.LastIndexByte(httpListenAddr, ':'); i >= 0 && !strings.Contains(httpListenAddr[i:], "]") {
		port = httpListenAddr[i:]
	}
	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
2020-03-28 23:48:30 +00:00
func checkFlags ( ) {
2020-04-11 15:49:23 +00:00
if * notifierURL == "" {
2020-03-28 23:48:30 +00:00
flag . PrintDefaults ( )
2020-04-11 15:49:23 +00:00
logger . Fatalf ( "notifier.url is empty" )
2020-03-28 23:48:30 +00:00
}
if * datasourceURL == "" {
flag . PrintDefaults ( )
logger . Fatalf ( "datasource.url is empty" )
}
}
2020-05-09 09:32:12 +00:00
// startInitGroups launches one watchdog goroutine per group and returns the
// update channels of those goroutines keyed by group name.
//
// When restoreDS is non-nil, the state of every group is first restored from
// it using the -remoteread.lookback window; a restore failure is logged and
// the group is started anyway. Every started goroutine is registered in wg.
func startInitGroups(ctx context.Context, w *watchdog, restoreDS *datasource.VMStorage, groups []Group, wg *sync.WaitGroup) map[string]chan Group {
	updates := make(map[string]chan Group)
	for _, g := range groups {
		if restoreDS != nil {
			if err := g.Restore(ctx, restoreDS, *remoteReadLookBack); err != nil {
				logger.Errorf("error while restoring state for group %q: %s", g.Name, err)
			}
		}

		// Buffered with capacity 1, one channel per group.
		ch := make(chan Group, 1)
		updates[g.Name] = ch

		wg.Add(1)
		go func(group Group, updateCh chan Group) {
			w.run(ctx, group, *evaluationInterval, updateCh)
			wg.Done()
		}(g, ch)
	}
	return updates
}
// readRules is a thin wrapper around Parse: it passes the paths collected from
// the -rule flag together with the -rule.validateTemplates setting.
func readRules() ([]Group, error) {
	paths, validate := *rulePath, *validateTemplates
	return Parse(paths, validate)
}