2020-02-16 18:59:02 +00:00
package main
import (
2020-03-13 10:19:31 +00:00
"context"
2020-06-23 19:45:45 +00:00
"crypto/tls"
"crypto/x509"
2020-02-16 18:59:02 +00:00
"flag"
2020-03-13 10:19:31 +00:00
"fmt"
2020-06-23 19:45:45 +00:00
"io/ioutil"
2020-02-16 18:59:02 +00:00
"net/http"
2020-04-01 15:17:53 +00:00
"net/url"
"os"
2020-06-21 10:32:46 +00:00
"strconv"
2020-03-13 10:19:31 +00:00
"strings"
"time"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
2020-04-06 11:44:03 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
2020-04-27 21:18:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
2020-05-14 19:01:51 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
2020-03-28 23:48:30 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
2020-02-16 18:59:02 +00:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
2020-06-23 19:45:45 +00:00
"github.com/VictoriaMetrics/fasthttp"
2020-04-11 19:42:01 +00:00
"github.com/VictoriaMetrics/metrics"
2020-02-16 18:59:02 +00:00
)
var (
2020-04-12 11:47:26 +00:00
rulePath = flagutil . NewArray ( "rule" , ` Path to the file with alert rules .
Supports patterns . Flag can be specified multiple times .
2020-03-28 23:48:30 +00:00
Examples :
2020-04-12 11:47:26 +00:00
- rule / path / to / file . Path to a single file with alerting rules
- rule dir / * . yaml - rule / * . yaml . Relative path to all . yaml files in "dir" folder ,
absolute path to all . yaml files in root . ` )
2020-06-06 20:27:09 +00:00
validateTemplates = flag . Bool ( "rule.validateTemplates" , true , "Whether to validate annotation and label templates" )
validateExpressions = flag . Bool ( "rule.validateExpressions" , true , "Whether to validate rules expressions via MetricsQL engine" )
httpListenAddr = flag . String ( "httpListenAddr" , ":8880" , "Address to listen for http connections" )
2020-05-04 21:51:22 +00:00
datasourceURL = flag . String ( "datasource.url" , "" , "Victoria Metrics or VMSelect url. Required parameter." +
" E.g. http://127.0.0.1:8428" )
2020-06-23 19:45:45 +00:00
basicAuthUsername = flag . String ( "datasource.basicAuth.username" , "" , "Optional basic auth username for -datasource.url" )
basicAuthPassword = flag . String ( "datasource.basicAuth.password" , "" , "Optional basic auth password for -datasource.url" )
datasourceTLSInsecureSkipVerify = flag . Bool ( "datasource.tlsInsecureSkipVerify" , false , "Whether to skip tls verification when connecting to -datasource.url" )
datasourceTLSCertFile = flag . String ( "datasource.tlsCertFile" , "" , "Optional path to client-side TLS certificate file to use when connecting to -datasource.url" )
datasourceTLSKeyFile = flag . String ( "datasource.tlsKeyFile" , "" , "Optional path to client-side TLS certificate key to use when connecting to -datasource.url" )
datasourceTLSCAFile = flag . String ( "datasource.tlsCAFile" , "" , "Optional path to TLS CA file to use for verifying connections to -datasource.url. " +
"By default system CA is used" )
datasourceTLSServerName = flag . String ( "datasource.tlsServerName" , "" , "Optional TLS server name to use for connections to -datasource.url. " +
"By default the server name from -datasource.url is used" )
2020-05-04 21:51:22 +00:00
2020-05-13 17:58:56 +00:00
remoteWriteURL = flag . String ( "remoteWrite.url" , "" , "Optional URL to Victoria Metrics or VMInsert where to persist alerts state" +
2020-06-09 12:21:20 +00:00
" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428" )
2020-06-23 19:45:45 +00:00
remoteWriteUsername = flag . String ( "remoteWrite.basicAuth.username" , "" , "Optional basic auth username for -remoteWrite.url" )
remoteWritePassword = flag . String ( "remoteWrite.basicAuth.password" , "" , "Optional basic auth password for -remoteWrite.url" )
remoteWriteMaxQueueSize = flag . Int ( "remoteWrite.maxQueueSize" , 1e5 , "Defines the max number of pending datapoints to remote write endpoint" )
remoteWriteMaxBatchSize = flag . Int ( "remoteWrite.maxBatchSize" , 1e3 , "Defines defines max number of timeseries to be flushed at once" )
remoteWriteConcurrency = flag . Int ( "remoteWrite.concurrency" , 1 , "Defines number of writers for concurrent writing into remote storage" )
remoteWriteTLSInsecureSkipVerify = flag . Bool ( "remoteWrite.tlsInsecureSkipVerify" , false , "Whether to skip tls verification when connecting to -remoteWrite.url" )
remoteWriteTLSCertFile = flag . String ( "remoteWrite.tlsCertFile" , "" , "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url" )
remoteWriteTLSKeyFile = flag . String ( "remoteWrite.tlsKeyFile" , "" , "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url" )
remoteWriteTLSCAFile = flag . String ( "remoteWrite.tlsCAFile" , "" , "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. " +
"By default system CA is used" )
remoteWriteTLSServerName = flag . String ( "remoteWrite.tlsServerName" , "" , "Optional TLS server name to use for connections to -remoteWrite.url. " +
"By default the server name from -remoteWrite.url is used" )
2020-05-04 21:51:22 +00:00
2020-05-13 17:58:56 +00:00
remoteReadURL = flag . String ( "remoteRead.url" , "" , "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts" +
2020-05-13 18:32:21 +00:00
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state." +
2020-05-04 21:51:22 +00:00
" E.g. http://127.0.0.1:8428" )
2020-05-13 18:32:21 +00:00
remoteReadUsername = flag . String ( "remoteRead.basicAuth.username" , "" , "Optional basic auth username for -remoteRead.url" )
remoteReadPassword = flag . String ( "remoteRead.basicAuth.password" , "" , "Optional basic auth password for -remoteRead.url" )
2020-05-13 17:58:56 +00:00
remoteReadLookBack = flag . Duration ( "remoteRead.lookback" , time . Hour , "Lookback defines how far to look into past for alerts timeseries." +
2020-05-04 21:51:22 +00:00
" For example, if lookback=1h then range from now() to now()-1h will be scanned." )
2020-06-23 19:45:45 +00:00
remoteReadTLSInsecureSkipVerify = flag . Bool ( "remoteRead.tlsInsecureSkipVerify" , false , "Whether to skip tls verification when connecting to -remoteRead.url" )
remoteReadTLSCertFile = flag . String ( "remoteRead.tlsCertFile" , "" , "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url" )
remoteReadTLSKeyFile = flag . String ( "remoteRead.tlsKeyFile" , "" , "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url" )
remoteReadTLSCAFile = flag . String ( "remoteRead.tlsCAFile" , "" , "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. " +
"By default system CA is used" )
remoteReadTLSServerName = flag . String ( "remoteRead.tlsServerName" , "" , "Optional TLS server name to use for connections to -remoteRead.url. " +
"By default the server name from -remoteRead.url is used" )
2020-05-04 21:51:22 +00:00
2020-06-23 19:45:45 +00:00
evaluationInterval = flag . Duration ( "evaluationInterval" , time . Minute , "How often to evaluate the rules" )
notifierURL = flag . String ( "notifier.url" , "" , "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093" )
notifierTLSInsecureSkipVerify = flag . Bool ( "notifier.tlsInsecureSkipVerify" , false , "Whether to skip tls verification when connecting to -notifier.url" )
notifierTLSCertFile = flag . String ( "notifier.tlsCertFile" , "" , "Optional path to client-side TLS certificate file to use when connecting to -notifier.url" )
notifierTLSKeyFile = flag . String ( "notifier.tlsKeyFile" , "" , "Optional path to client-side TLS certificate key to use when connecting to -notifier.url" )
notifierTLSCAFile = flag . String ( "notifier.tlsCAFile" , "" , "Optional path to TLS CA file to use for verifying connections to -notifier.url. " +
"By default system CA is used" )
notifierTLSServerName = flag . String ( "notifier.tlsServerName" , "" , "Optional TLS server name to use for connections to -notifier.url. " +
"By default the server name from -notifier.url is used" )
2020-06-21 10:32:46 +00:00
externalURL = flag . String ( "external.url" , "" , "External URL is used as alert's source for sent alerts to the notifier" )
externalAlertSource = flag . String ( "external.alert.source" , "" , ` External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana , Prometheus or any other service .
eg . ' explore ? orgId = 1 & left = [ \ "now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\" ] } ] ' . If empty ' / api / v1 / : groupID / alertID / status ' is used ` )
2020-02-16 18:59:02 +00:00
)
func main ( ) {
2020-05-16 08:59:30 +00:00
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag . CommandLine . SetOutput ( os . Stdout )
2020-06-05 07:42:56 +00:00
flag . Usage = usage
2020-02-16 18:59:02 +00:00
envflag . Parse ( )
buildinfo . Init ( )
logger . Init ( )
2020-03-28 23:48:30 +00:00
checkFlags ( )
2020-03-13 10:19:31 +00:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
2020-04-01 19:29:11 +00:00
eu , err := getExternalURL ( * externalURL , * httpListenAddr , httpserver . IsTLS ( ) )
2020-04-01 15:17:53 +00:00
if err != nil {
2020-05-10 16:58:17 +00:00
logger . Fatalf ( "can not get external url: %s " , err )
2020-04-01 15:17:53 +00:00
}
2020-04-06 11:44:03 +00:00
notifier . InitTemplateFunc ( eu )
2020-06-21 10:32:46 +00:00
aug , err := getAlertURLGenerator ( eu , * externalAlertSource , * validateTemplates )
if err != nil {
logger . Fatalf ( "URL generator error: %s" , err )
}
2020-02-16 18:59:02 +00:00
2020-06-23 19:45:45 +00:00
dst , err := getTransport ( datasourceURL , datasourceTLSCertFile , datasourceTLSKeyFile , datasourceTLSCAFile , datasourceTLSServerName , datasourceTLSInsecureSkipVerify )
if err != nil {
logger . Fatalf ( "cannot create datasource transport: %s" , err )
}
nt , err := getTransport ( notifierURL , notifierTLSCertFile , notifierTLSKeyFile , notifierTLSCAFile , notifierTLSServerName , notifierTLSInsecureSkipVerify )
if err != nil {
logger . Fatalf ( "cannot create notifier transport: %s" , err )
}
2020-05-10 16:58:17 +00:00
manager := & manager {
2020-06-21 10:32:46 +00:00
groups : make ( map [ uint64 ] * Group ) ,
2020-06-23 19:45:45 +00:00
storage : datasource . NewVMStorage ( * datasourceURL , * basicAuthUsername , * basicAuthPassword , & http . Client { Transport : dst } ) ,
notifier : notifier . NewAlertManager ( * notifierURL , aug , & http . Client { Transport : nt } ) ,
2020-03-13 10:19:31 +00:00
}
2020-04-27 21:18:02 +00:00
if * remoteWriteURL != "" {
2020-06-23 19:45:45 +00:00
t , err := getTransport ( remoteWriteURL , remoteWriteTLSCertFile , remoteWriteTLSKeyFile , remoteWriteTLSCAFile , remoteWriteTLSServerName , remoteWriteTLSInsecureSkipVerify )
if err != nil {
logger . Fatalf ( "cannot create remoteWrite transport: %s" , err )
}
2020-04-27 21:18:02 +00:00
c , err := remotewrite . NewClient ( ctx , remotewrite . Config {
Addr : * remoteWriteURL ,
2020-06-01 10:46:37 +00:00
Concurrency : * remoteWriteConcurrency ,
2020-05-13 17:58:56 +00:00
MaxQueueSize : * remoteWriteMaxQueueSize ,
2020-06-01 10:46:37 +00:00
MaxBatchSize : * remoteWriteMaxBatchSize ,
2020-04-27 21:18:02 +00:00
FlushInterval : * evaluationInterval ,
2020-05-04 21:51:22 +00:00
BasicAuthUser : * remoteWriteUsername ,
BasicAuthPass : * remoteWritePassword ,
2020-06-23 19:45:45 +00:00
Transport : t ,
2020-04-27 21:18:02 +00:00
} )
if err != nil {
logger . Fatalf ( "failed to init remotewrite client: %s" , err )
}
2020-05-10 16:58:17 +00:00
manager . rw = c
2020-04-27 21:18:02 +00:00
}
2020-06-23 19:45:45 +00:00
2020-05-04 21:51:22 +00:00
if * remoteReadURL != "" {
2020-06-23 19:45:45 +00:00
t , err := getTransport ( remoteReadURL , remoteReadTLSCertFile , remoteReadTLSKeyFile , remoteReadTLSCAFile , remoteReadTLSServerName , remoteReadTLSInsecureSkipVerify )
if err != nil {
logger . Fatalf ( "cannot create remoteRead transport: %s" , err )
}
manager . rr = datasource . NewVMStorage ( * remoteReadURL , * remoteReadUsername , * remoteReadPassword , & http . Client { Transport : t } )
2020-05-04 21:51:22 +00:00
}
2020-06-06 20:27:09 +00:00
if err := manager . start ( ctx , * rulePath , * validateTemplates , * validateExpressions ) ; err != nil {
2020-05-10 16:58:17 +00:00
logger . Fatalf ( "failed to start: %s" , err )
}
2020-05-09 09:32:12 +00:00
2020-05-10 16:58:17 +00:00
go func ( ) {
// init reload metrics with positive values to improve alerting conditions
configSuccess . Set ( 1 )
2020-05-14 19:01:51 +00:00
configTimestamp . Set ( fasttime . UnixTimestamp ( ) )
2020-05-10 16:58:17 +00:00
sigHup := procutil . NewSighupChan ( )
for {
<- sigHup
configReloads . Inc ( )
logger . Infof ( "SIGHUP received. Going to reload rules %q ..." , * rulePath )
2020-06-06 20:27:09 +00:00
if err := manager . update ( ctx , * rulePath , * validateTemplates , * validateExpressions , false ) ; err != nil {
2020-05-10 16:58:17 +00:00
configReloadErrors . Inc ( )
configSuccess . Set ( 0 )
logger . Errorf ( "error while reloading rules: %s" , err )
continue
}
configSuccess . Set ( 1 )
2020-05-14 19:01:51 +00:00
configTimestamp . Set ( fasttime . UnixTimestamp ( ) )
2020-05-10 16:58:17 +00:00
logger . Infof ( "Rules reloaded successfully from %q" , * rulePath )
}
} ( )
2020-05-09 09:32:12 +00:00
2020-05-10 16:58:17 +00:00
rh := & requestHandler { m : manager }
2020-05-18 08:55:16 +00:00
go httpserver . Serve ( * httpListenAddr , rh . handler )
2020-04-06 11:44:03 +00:00
2020-02-16 18:59:02 +00:00
sig := procutil . WaitForSigterm ( )
logger . Infof ( "service received signal %s" , sig )
2020-02-21 21:15:05 +00:00
if err := httpserver . Stop ( * httpListenAddr ) ; err != nil {
logger . Fatalf ( "cannot stop the webservice: %s" , err )
}
2020-03-13 10:19:31 +00:00
cancel ( )
2020-05-10 16:58:17 +00:00
manager . close ( )
2020-03-13 10:19:31 +00:00
}
2020-04-11 19:42:01 +00:00
var (
2020-05-10 16:58:17 +00:00
configReloads = metrics . NewCounter ( ` vmalert_config_last_reload_total ` )
configReloadErrors = metrics . NewCounter ( ` vmalert_config_last_reload_errors_total ` )
configSuccess = metrics . NewCounter ( ` vmalert_config_last_reload_successful ` )
configTimestamp = metrics . NewCounter ( ` vmalert_config_last_reload_success_timestamp_seconds ` )
2020-04-11 19:42:01 +00:00
)
2020-04-01 15:17:53 +00:00
// getExternalURL returns the URL under which this vmalert instance is
// reachable from outside.
//
// If externalURL is non-empty, it is parsed and returned as is. Otherwise the
// URL is assembled from the scheme selected by isSecure ("https://" when true,
// "http://" otherwise), the OS hostname and the port part of httpListenAddr.
//
// An error is returned if the hostname cannot be determined or if the
// resulting string cannot be parsed as a URL.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, err
	}
	// The port is whatever follows the LAST colon: splitting on the first one
	// yields an empty port for IPv6 listen addresses such as "[::1]:8880".
	// A colon that is still inside the brackets of a port-less IPv6 literal
	// ("[::1]") is not a port separator, so it is skipped.
	port := ""
	if n := strings.LastIndexByte(httpListenAddr, ':'); n >= 0 && !strings.Contains(httpListenAddr[n:], "]") {
		port = httpListenAddr[n:]
	}
	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
2020-06-21 10:32:46 +00:00
// getAlertURLGenerator returns the function that builds the source link for an alert.
//
// With an empty externalAlertSource the link has the form
// "<externalURL>/api/v1/<groupID>/<alertID>/status". Otherwise
// externalAlertSource is treated as a template which is executed against the
// alert; the result is appended to externalURL after a slash.
//
// When validateTemplate is set, the template is validated up front and a
// validation failure is returned as an error. Template execution errors are
// only logged by the returned function.
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
	if len(externalAlertSource) == 0 {
		defaultGen := func(alert notifier.Alert) string {
			groupID := strconv.FormatUint(alert.GroupID, 10)
			alertID := strconv.FormatUint(alert.ID, 10)
			return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, groupID, alertID)
		}
		return defaultGen, nil
	}

	if validateTemplate {
		candidate := map[string]string{"tpl": externalAlertSource}
		if err := notifier.ValidateTemplates(candidate); err != nil {
			return nil, fmt.Errorf("error validating source template %s:%w", externalAlertSource, err)
		}
	}

	// The template map is built once and shared by all the calls of the generator.
	sourceTpl := map[string]string{"tpl": externalAlertSource}
	templatedGen := func(alert notifier.Alert) string {
		rendered, err := alert.ExecTemplate(sourceTpl)
		if err != nil {
			logger.Errorf("can not exec source template %s", err)
		}
		return fmt.Sprintf("%s/%s", externalURL, rendered["tpl"])
	}
	return templatedGen, nil
}
2020-06-23 19:45:45 +00:00
// getTLSConfig builds a client-side TLS config from the given flag values.
//
//   - certFile/keyFile: optional client certificate and its key. The pair is
//     loaded only when *certFile is non-empty.
//   - CAFile: optional PEM file with CA certificates used as root CAs. When
//     empty, RootCAs is left nil.
//   - serverName and insecureSkipVerify are copied into the config as is.
//
// An error is returned if the certificate pair cannot be loaded or if the CA
// file cannot be read or contains no parsable certificates. Underlying errors
// are wrapped with %w, so callers may inspect them with errors.Is / errors.As.
func getTLSConfig(certFile, keyFile, CAFile, serverName *string, insecureSkipVerify *bool) (*tls.Config, error) {
	var certs []tls.Certificate
	if *certFile != "" {
		cert, err := tls.LoadX509KeyPair(*certFile, *keyFile)
		if err != nil {
			return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", *certFile, *keyFile, err)
		}
		certs = []tls.Certificate{cert}
	}

	var rootCAs *x509.CertPool
	if *CAFile != "" {
		pem, err := ioutil.ReadFile(*CAFile)
		if err != nil {
			return nil, fmt.Errorf("cannot read `ca_file` %q: %w", *CAFile, err)
		}
		rootCAs = x509.NewCertPool()
		if !rootCAs.AppendCertsFromPEM(pem) {
			return nil, fmt.Errorf("cannot parse data from `ca_file` %q", *CAFile)
		}
	}

	return &tls.Config{
		Certificates:       certs,
		InsecureSkipVerify: *insecureSkipVerify,
		RootCAs:            rootCAs,
		ServerName:         *serverName,
	}, nil
}
// getTransport returns the HTTP transport to use for requests to *URL.
//
// The result is always a non-nil clone of http.DefaultTransport. For "https"
// URLs its TLSClientConfig is built by getTLSConfig from the remaining
// arguments; for any other scheme the clone is returned unchanged.
//
// A nil *http.Transport must never be returned on success: callers store the
// result in http.Client.Transport, which is an interface. A nil pointer stored
// there makes the interface non-nil, so the client would not fall back to
// http.DefaultTransport and would call RoundTrip on the nil pointer instead.
//
// An error is returned only if the TLS config cannot be built.
func getTransport(URL, certFile, keyFile, CAFile, serverName *string, insecureSkipVerify *bool) (*http.Transport, error) {
	t := http.DefaultTransport.(*http.Transport).Clone()

	var u fasthttp.URI
	u.Update(*URL)
	if string(u.Scheme()) != "https" {
		return t, nil
	}

	tlsCfg, err := getTLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		return nil, err
	}
	t.TLSClientConfig = tlsCfg
	return t, nil
}
2020-03-28 23:48:30 +00:00
func checkFlags ( ) {
2020-04-11 15:49:23 +00:00
if * notifierURL == "" {
2020-03-28 23:48:30 +00:00
flag . PrintDefaults ( )
2020-04-11 15:49:23 +00:00
logger . Fatalf ( "notifier.url is empty" )
2020-03-28 23:48:30 +00:00
}
if * datasourceURL == "" {
flag . PrintDefaults ( )
logger . Fatalf ( "datasource.url is empty" )
}
}
2020-06-05 07:42:56 +00:00
// usage prints a short description of vmalert with a link to its docs,
// followed by the defaults of all the defined flags. The description goes to
// the output configured for flag.CommandLine.
func usage() {
	// A space is kept between the URL and the closing period.
	const s = `
vmalert processes alerts and recording rules.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/README.md .
`
	f := flag.CommandLine.Output()
	fmt.Fprintf(f, "%s\n", s)
	flag.PrintDefaults()
}