vmalert-tool: add -external.label and -external.url command-line … (#6766)

…flags to perform the same as vmalert

address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6735

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Hui Wang 2024-08-20 03:29:28 +08:00 committed by GitHub
parent febba3971b
commit 0fc1130f47
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 91 additions and 50 deletions

View file

@ -41,9 +41,19 @@ Examples:
Usage: "disable adding group's Name as label to generated alerts and time series.",
Required: false,
},
&cli.StringSliceFlag{
Name: "external.label",
Usage: `Optional label in the form 'name=value' to add to all generated recording rules and alerts. Supports an array of values separated by comma or specified via multiple flags.`,
Required: false,
},
&cli.StringFlag{
Name: "external.url",
Usage: `Optional external URL to template in rule's labels or annotations.`,
Required: false,
},
},
Action: func(c *cli.Context) error {
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel"), c.StringSlice("external.label"), c.String("external.url")); failed {
return fmt.Errorf("unittest failed")
}
return nil

View file

@ -13,7 +13,7 @@ tests:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
- labels: '{__name__="suquery_interval_test", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
@ -24,20 +24,16 @@ tests:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
description: "localhost:9090 of job vmagent2 in cluster has been down for more than 5 minutes."
dashboard: "/d/dashboard?orgId=1"
- eval_time: 0
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- {}
- eval_time: 0
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -8,7 +8,8 @@ groups:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
description: "{{ $labels.instance }} of job {{ $labels.job }} in cluster {{ $externalLabels.cluster }} has been down for more than 5 minutes."
dashboard: '{{ $externalURL }}/d/dashboard?orgId=1'
- alert: AlwaysFiring
expr: 1
- alert: SameAlertNameWithDifferentGroup

View file

@ -16,7 +16,8 @@ tests:
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- exp_labels:
cluster: prod
- eval_time: 1m
groupname: group2
alertname: SameAlertNameWithDifferentGroup
@ -25,7 +26,8 @@ tests:
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- exp_labels:
cluster: prod
- eval_time: 6m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
@ -61,18 +63,18 @@ tests:
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t1", datacenter="dc-123"}'
labels: '{__name__="t1", cluster="prod"}'
- expr: t2
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t2", datacenter="dc-123"}'
labels: '{__name__="t2", cluster="prod"}'
- expr: t3
eval_time: 4m
exp_samples:
# t3 is 3 instead of 4 because rules3 is evaluated before rules1
- value: 3
labels: '{__name__="t3", datacenter="dc-123"}'
labels: '{__name__="t3", cluster="prod"}'
alert_rule_test:
- eval_time: 10m
@ -83,22 +85,21 @@ tests:
job: vmagent1
severity: page
instance: localhost:9090
datacenter: dc-123
cluster: prod
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
description: "localhost:9090 of job vmagent1 in cluster prod has been down for more than 5 minutes."
dashboard: "http://grafana:3000/d/dashboard?orgId=1"
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
cluster: prod
- eval_time: 0
groupname: alerts
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -13,7 +13,7 @@ tests:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
- labels: '{__name__="suquery_interval_test", cluster="prod", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
@ -25,22 +25,21 @@ tests:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
cluster: prod
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
description: "localhost:9090 of job vmagent2 in cluster prod has been down for more than 5 minutes."
dashboard: "http://grafana:3000/d/dashboard?orgId=1"
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
cluster: prod
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -55,7 +55,7 @@ const (
)
// UnitTest runs unittest for files
func UnitTest(files []string, disableGroupLabel bool) bool {
func UnitTest(files []string, disableGroupLabel bool, externalLabels []string, externalURL string) bool {
if err := templates.Load([]string{}, true); err != nil {
logger.Fatalf("failed to load template: %v", err)
}
@ -71,14 +71,34 @@ func UnitTest(files []string, disableGroupLabel bool) bool {
testfiles, err := config.ReadFromFS(files)
if err != nil {
fmt.Println(" FAILED")
fmt.Printf("\nfailed to read test files: \n%v", err)
logger.Fatalf("failed to load test files %q: %v", files, err)
}
if len(testfiles) == 0 {
fmt.Println("no test file found")
return false
}
labels := make(map[string]string)
for _, s := range externalLabels {
if len(s) == 0 {
continue
}
n := strings.IndexByte(s, '=')
if n < 0 {
logger.Fatalf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
}
labels[s[:n]] = s[n+1:]
}
_, err = notifier.Init(nil, labels, externalURL)
if err != nil {
logger.Fatalf("failed to init notifier: %v", err)
}
var failed bool
for fileName, file := range testfiles {
if err := ruleUnitTest(fileName, file); err != nil {
if err := ruleUnitTest(fileName, file, labels); err != nil {
fmt.Println(" FAILED")
fmt.Printf("\nfailed to run unit test for file %q: \n%v", file, err)
fmt.Printf("\nfailed to run unit test for file %q: \n%v", fileName, err)
failed = true
} else {
fmt.Println(" SUCCESS")
@ -88,7 +108,7 @@ func UnitTest(files []string, disableGroupLabel bool) bool {
return failed
}
func ruleUnitTest(filename string, content []byte) []error {
func ruleUnitTest(filename string, content []byte, externalLabels map[string]string) []error {
fmt.Println("\nUnit Testing: ", filename)
var unitTestInp unitTestFile
if err := yaml.UnmarshalStrict(content, &unitTestInp); err != nil {
@ -126,7 +146,7 @@ func ruleUnitTest(filename string, content []byte) []error {
errs = append(errs, err)
continue
}
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups, externalLabels)
errs = append(errs, testErrs...)
}
@ -163,6 +183,10 @@ func verifyTestGroup(group testGroup) error {
return fmt.Errorf("\n%s missing required field \"eval_time\"", testGroupName)
}
}
if group.ExternalLabels != nil {
fmt.Printf("\n%s warning: filed `external_labels` will be deprecated soon, please use `-external.label` cmd-line flag instead. "+
"Check https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6735 for details.\n", testGroupName)
}
return nil
}
@ -179,6 +203,7 @@ func processFlags() {
{flag: "retentionPeriod", value: "100y"},
{flag: "datasource.url", value: testDataSourcePath},
{flag: "remoteWrite.url", value: testRemoteWritePath},
{flag: "notifier.blackhole", value: "true"},
} {
// panics if flag doesn't exist
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
@ -239,7 +264,7 @@ func tearDown() {
fs.MustRemoveAll(storagePath)
}
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group, externalLabels map[string]string) (checkErrs []error) {
// set up vmstorage and http server for ingest and read queries
setUp()
// tear down vmstorage and clean the data dir
@ -288,7 +313,14 @@ func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]i
// create groups with given rule
var groups []*rule.Group
for _, group := range testGroups {
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
mergedExternalLabels := make(map[string]string)
for k, v := range tg.ExternalLabels {
mergedExternalLabels[k] = v
}
for k, v := range externalLabels {
mergedExternalLabels[k] = v
}
ng := rule.NewGroup(group, q, time.Minute, mergedExternalLabels)
groups = append(groups, ng)
}

View file

@ -18,7 +18,7 @@ func TestUnitTest_Failure(t *testing.T) {
f := func(files []string) {
t.Helper()
failed := UnitTest(files, false)
failed := UnitTest(files, false, nil, "")
if !failed {
t.Fatalf("expecting failed test")
}
@ -29,18 +29,19 @@ func TestUnitTest_Failure(t *testing.T) {
}
func TestUnitTest_Success(t *testing.T) {
f := func(disableGroupLabel bool, files []string) {
f := func(disableGroupLabel bool, files []string, externalLabels []string, externalURL string) {
t.Helper()
failed := UnitTest(files, disableGroupLabel)
failed := UnitTest(files, disableGroupLabel, externalLabels, externalURL)
if failed {
t.Fatalf("unexpected failed test")
}
}
// run multi files
f(false, []string{"./testdata/test1.yaml", "./testdata/test2.yaml"})
f(false, []string{"./testdata/test1.yaml", "./testdata/test2.yaml"}, []string{"cluster=prod"}, "http://grafana:3000")
// disable group label
f(true, []string{"./testdata/disable-group-label.yaml"})
// template with null external values
f(true, []string{"./testdata/disable-group-label.yaml"}, nil, "")
}

View file

@ -29,10 +29,13 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
## tip
**Update note 1: The `external_labels` field in vmalert-tool [test file](https://docs.victoriametrics.com/vmalert-tool/#test-file-format) will be deprecated soon. Please use `-external.label` command-line flag instead, in the same way as vmalert uses it. This change is done for the sake of consistency between vmalert and vmalert-tool configuration. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6735).**
* FEATURE: add `/influx/health` health-check handler for Influx endpoints. This is needed as some clients use the health endpoint to determine if the server is healthy and ready for data ingestion. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6653) for the details.
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl/): add `--vm-backoff-retries`, `--vm-backoff-factor`, `--vm-backoff-min-duration` and `--vm-native-backoff-retries`, `--vm-native-backoff-factor`, `--vm-native-backoff-min-duration` command-line flags. These flags allow changing the backoff policy config for import requests to VictoriaMetrics. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6622).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): allow overriding the `sample_limit` option at [scrape_configs](https://docs.victoriametrics.com/sd_configs/#scrape_configs) when the `__sample_limit__` label is specified for a target. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6665). Thanks to @zoglam for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6666).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): reduce memory usage when scraping targets with big response bodies. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6759).
* FEATURE: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): add `-external.label` and `-external.url` command-line flags, in the same way as these flags are supported by vmalert. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6735).
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmrestore](https://docs.victoriametrics.com/vmrestore/), [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): use exponential backoff for retries when uploading or downloading data from S3. This should reduce the number of failed uploads and downloads when S3 is temporarily unavailable. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6732).
* FEATURE: [stream aggregation](https://docs.victoriametrics.com/stream-aggregation/): do not allow enabling `-stream.keepInput` and `keep_metric_names` options together in [stream aggregation config](https://docs.victoriametrics.com/stream-aggregation/#stream-aggregation-config), as it may result in time series collision.

View file

@ -100,7 +100,8 @@ alert_rule_test:
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
# external_labels is not accessible for [templating](https://docs.victoriametrics.com/vmalert/#templating); use the "-external.label" command-line flag instead.
# Will be deprecated soon, check https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6735 for details.
external_labels:
[ <labelname>: <string> ... ]
@ -198,7 +199,7 @@ value: <number>
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml -external.label=cluster=prod`.
#### `test.yaml`
@ -218,7 +219,7 @@ tests:
- expr: subquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="subquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
- labels: '{__name__="subquery_interval_test", cluster="prod", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
@ -230,25 +231,22 @@ tests:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
cluster: prod
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
description: "localhost:9090 of job prometheus in cluster prod has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
cluster: prod
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `alerts.yaml`
@ -266,7 +264,7 @@ groups:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
description: "{{ $labels.instance }} of job {{ $labels.job }} in cluster {{ $externalLabels.cluster }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1