vmalert-tool: implement unittest (#4789)

1. split package rule under /app/vmalert, expose needed objects
2. add vmalert-tool with unittest subcmd

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2945
This commit is contained in:
Haleygo 2023-10-13 19:54:33 +08:00 committed by GitHub
parent 98a5007d32
commit dc28196237
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
74 changed files with 3997 additions and 1683 deletions

View file

@ -28,7 +28,8 @@ all: \
vmauth-prod \
vmbackup-prod \
vmrestore-prod \
vmctl-prod
vmctl-prod \
vmalert-tool-prod
clean:
rm -rf bin/*
@ -40,7 +41,8 @@ publish: package-base \
publish-vmauth \
publish-vmbackup \
publish-vmrestore \
publish-vmctl
publish-vmctl \
publish-vmalert-tool
package: \
package-victoria-metrics \
@ -50,7 +52,8 @@ package: \
package-vmauth \
package-vmbackup \
package-vmrestore \
package-vmctl
package-vmctl \
package-vmalert-tool
vmutils: \
vmagent \
@ -58,7 +61,8 @@ vmutils: \
vmauth \
vmbackup \
vmrestore \
vmctl
vmctl \
vmalert-tool
vmutils-pure: \
vmagent-pure \
@ -66,7 +70,8 @@ vmutils-pure: \
vmauth-pure \
vmbackup-pure \
vmrestore-pure \
vmctl-pure
vmctl-pure \
vmalert-tool-pure
vmutils-linux-amd64: \
vmagent-linux-amd64 \
@ -74,7 +79,8 @@ vmutils-linux-amd64: \
vmauth-linux-amd64 \
vmbackup-linux-amd64 \
vmrestore-linux-amd64 \
vmctl-linux-amd64
vmctl-linux-amd64 \
vmalert-tool-linux-amd64
vmutils-linux-arm64: \
vmagent-linux-arm64 \
@ -82,7 +88,8 @@ vmutils-linux-arm64: \
vmauth-linux-arm64 \
vmbackup-linux-arm64 \
vmrestore-linux-arm64 \
vmctl-linux-arm64
vmctl-linux-arm64 \
vmalert-tool-linux-arm64
vmutils-linux-arm: \
vmagent-linux-arm \
@ -90,7 +97,8 @@ vmutils-linux-arm: \
vmauth-linux-arm \
vmbackup-linux-arm \
vmrestore-linux-arm \
vmctl-linux-arm
vmctl-linux-arm \
vmalert-tool-linux-arm
vmutils-linux-386: \
vmagent-linux-386 \
@ -98,7 +106,8 @@ vmutils-linux-386: \
vmauth-linux-386 \
vmbackup-linux-386 \
vmrestore-linux-386 \
vmctl-linux-386
vmctl-linux-386 \
vmalert-tool-linux-386
vmutils-linux-ppc64le: \
vmagent-linux-ppc64le \
@ -106,7 +115,8 @@ vmutils-linux-ppc64le: \
vmauth-linux-ppc64le \
vmbackup-linux-ppc64le \
vmrestore-linux-ppc64le \
vmctl-linux-ppc64le
vmctl-linux-ppc64le \
vmalert-tool-linux-ppc64le
vmutils-darwin-amd64: \
vmagent-darwin-amd64 \
@ -114,7 +124,8 @@ vmutils-darwin-amd64: \
vmauth-darwin-amd64 \
vmbackup-darwin-amd64 \
vmrestore-darwin-amd64 \
vmctl-darwin-amd64
vmctl-darwin-amd64 \
vmalert-tool-darwin-amd64
vmutils-darwin-arm64: \
vmagent-darwin-arm64 \
@ -122,7 +133,8 @@ vmutils-darwin-arm64: \
vmauth-darwin-arm64 \
vmbackup-darwin-arm64 \
vmrestore-darwin-arm64 \
vmctl-darwin-arm64
vmctl-darwin-arm64 \
vmalert-tool-darwin-arm64
vmutils-freebsd-amd64: \
vmagent-freebsd-amd64 \
@ -130,7 +142,8 @@ vmutils-freebsd-amd64: \
vmauth-freebsd-amd64 \
vmbackup-freebsd-amd64 \
vmrestore-freebsd-amd64 \
vmctl-freebsd-amd64
vmctl-freebsd-amd64 \
vmalert-tool-freebsd-amd64
vmutils-openbsd-amd64: \
vmagent-openbsd-amd64 \
@ -138,7 +151,8 @@ vmutils-openbsd-amd64: \
vmauth-openbsd-amd64 \
vmbackup-openbsd-amd64 \
vmrestore-openbsd-amd64 \
vmctl-openbsd-amd64
vmctl-openbsd-amd64 \
vmalert-tool-openbsd-amd64
vmutils-windows-amd64: \
vmagent-windows-amd64 \
@ -146,7 +160,8 @@ vmutils-windows-amd64: \
vmauth-windows-amd64 \
vmbackup-windows-amd64 \
vmrestore-windows-amd64 \
vmctl-windows-amd64
vmctl-windows-amd64 \
vmalert-tool-windows-amd64
victoria-metrics-crossbuild: \
victoria-metrics-linux-386 \
@ -342,7 +357,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
cd bin && \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
@ -351,6 +367,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \
@ -358,6 +375,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
vmagent-$(GOOS)-$(GOARCH)-prod \
@ -365,7 +383,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod
vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
release-vmutils-windows-goarch: \
vmagent-windows-$(GOARCH)-prod \
@ -373,7 +392,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod \
vmbackup-windows-$(GOARCH)-prod \
vmrestore-windows-$(GOARCH)-prod \
vmctl-windows-$(GOARCH)-prod
vmctl-windows-$(GOARCH)-prod \
vmalert-tool-windows-$(GOARCH)-prod
cd bin && \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
@ -382,6 +402,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \
@ -389,6 +410,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \
vmagent-windows-$(GOARCH)-prod.exe \
@ -396,7 +418,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe
vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@ -514,3 +537,4 @@ docs-sync:
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs

103
app/vmalert-tool/Makefile Normal file
View file

@ -0,0 +1,103 @@
# All these commands must run from repository root.
vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) app-local
vmalert-tool-race:
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
vmalert-tool-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker
vmalert-tool-pure-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
vmalert-tool-linux-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
vmalert-tool-linux-arm-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
vmalert-tool-linux-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
vmalert-tool-linux-ppc64le-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
vmalert-tool-linux-386-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
vmalert-tool-darwin-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
vmalert-tool-darwin-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
vmalert-tool-freebsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
vmalert-tool-openbsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
vmalert-tool-windows-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
package-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) package-via-docker
package-vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
package-vmalert-tool-amd64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
package-vmalert-tool-arm:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
package-vmalert-tool-arm64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
package-vmalert-tool-ppc64le:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
package-vmalert-tool-386:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
publish-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
vmalert-tool-linux-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-ppc64le:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
vmalert-tool-linux-s390x:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
vmalert-tool-linux-386:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-freebsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-openbsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-windows-amd64:
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) app-local-pure

244
app/vmalert-tool/README.md Normal file
View file

@ -0,0 +1,244 @@
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated by order.
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test files to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for Metricsql expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group name to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the Metricsql expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `alerts.yaml`
```
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

54
app/vmalert-tool/main.go Normal file
View file

@ -0,0 +1,54 @@
package main
import (
"fmt"
"log"
"os"
"time"
"github.com/urfave/cli/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
)
func main() {
start := time.Now()
app := &cli.App{
Name: "vmalert-tool",
Usage: "VMAlert command-line tool",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
Version: buildinfo.Version,
Commands: []*cli.Command{
{
Name: "unittest",
Usage: "Run unittest for alerting and recording rules.",
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
Flags: []cli.Flag{
&cli.StringSliceFlag{
Name: "files",
Usage: "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
Required: true,
},
&cli.BoolFlag{
Name: "disableAlertgroupLabel",
Usage: "disable adding group's Name as label to generated alerts and time series.",
Required: false,
},
},
Action: func(c *cli.Context) error {
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
return fmt.Errorf("unittest failed")
}
return nil
},
},
},
}
err := app.Run(os.Args)
if err != nil {
log.Fatalln(err)
}
log.Printf("Total time: %v", time.Since(start))
}

View file

@ -0,0 +1,19 @@
package unittest
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
// alertTestCase holds alert_rule_test cases defined in test file
type alertTestCase struct {
EvalTime *promutils.Duration `yaml:"eval_time"`
GroupName string `yaml:"groupname"`
Alertname string `yaml:"alertname"`
ExpAlerts []expAlert `yaml:"exp_alerts"`
}
// expAlert holds exp_alerts defined in test file
type expAlert struct {
ExpLabels map[string]string `yaml:"exp_labels"`
ExpAnnotations map[string]string `yaml:"exp_annotations"`
}

View file

@ -0,0 +1,182 @@
package unittest
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strconv"
"strings"
"time"
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// series holds input_series defined in the test file
type series struct {
Series string `yaml:"series"`
Values string `yaml:"values"`
}
// sequenceValue is an omittable value in a sequence of time series values.
type sequenceValue struct {
Value float64
Omitted bool
}
func httpWrite(address string, r io.Reader) {
resp, err := http.Post(address, "", r)
if err != nil {
logger.Fatalf("failed to send to storage: %v", err)
}
resp.Body.Close()
}
// writeInputSeries send input series to vmstorage and flush them
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
r := testutil.WriteRequest{}
for _, data := range input {
expr, err := metricsql.Parse(data.Series)
if err != nil {
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
}
promvals, err := parseInputValue(data.Values, true)
if err != nil {
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
}
metricExpr, ok := expr.(*metricsql.MetricExpr)
if !ok {
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
}
samples := make([]testutil.Sample, 0, len(promvals))
ts := startStamp
for _, v := range promvals {
if !v.Omitted {
samples = append(samples, testutil.Sample{
Timestamp: ts.UnixMilli(),
Value: v.Value,
})
}
ts = ts.Add(interval.Duration())
}
var ls []testutil.Label
for _, filter := range metricExpr.LabelFilterss[0] {
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
}
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
}
data, err := testutil.Compress(r)
if err != nil {
return fmt.Errorf("failed to compress data: %v", err)
}
// write input series to vm
httpWrite(dst, bytes.NewBuffer(data))
vmstorage.Storage.DebugFlush()
return nil
}
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
var res []sequenceValue
items := strings.Split(input, " ")
reg := regexp.MustCompile(`\D?\d*\D?`)
for _, item := range items {
if item == "stale" {
res = append(res, sequenceValue{Value: decimal.StaleNaN})
continue
}
vals := reg.FindAllString(item, -1)
switch len(vals) {
case 1:
if vals[0] == "_" {
res = append(res, sequenceValue{Omitted: true})
continue
}
v, err := strconv.ParseFloat(vals[0], 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v})
continue
case 2:
p1 := vals[0][:len(vals[0])-1]
v2, err := strconv.ParseInt(vals[1], 10, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
switch option {
case '+':
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
res = append(res, sequenceValue{Value: v1 + float64(v2)})
case 'x':
for i := int64(0); i <= v2; i++ {
if p1 == "_" {
if i == 0 {
i = 1
}
res = append(res, sequenceValue{Omitted: true})
continue
}
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
if !origin || v1 == 0 {
res = append(res, sequenceValue{Value: v1 * float64(i)})
continue
}
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
newRes, err := parseInputValue(newVal, false)
if err != nil {
return nil, err
}
res = append(res, newRes...)
break
}
default:
return nil, fmt.Errorf("got invalid operation %b", option)
}
case 3:
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
if err != nil {
return nil, err
}
p1 := vals[0][:len(vals[0])-1]
v1, err := strconv.ParseFloat(p1, 64)
if err != nil {
return nil, err
}
option := vals[0][len(vals[0])-1]
var isAdd bool
if option == '+' {
isAdd = true
}
for _, r := range r1 {
if isAdd {
res = append(res, sequenceValue{
Value: r.Value + v1,
})
} else {
res = append(res, sequenceValue{
Value: v1 - r.Value,
})
}
}
default:
return nil, fmt.Errorf("unsupported input %s", input)
}
}
return res, nil
}

View file

@ -0,0 +1,93 @@
package unittest
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
)
func TestParseInputValue(t *testing.T) {
testCases := []struct {
input string
exp []sequenceValue
failed bool
}{
{
"",
nil,
true,
},
{
"testfailed",
nil,
true,
},
// stale doesn't support operations
{
"stalex3",
nil,
true,
},
{
"-4",
[]sequenceValue{{Value: -4}},
false,
},
{
"_",
[]sequenceValue{{Omitted: true}},
false,
},
{
"stale",
[]sequenceValue{{Value: decimal.StaleNaN}},
false,
},
{
"-4x1",
[]sequenceValue{{Value: -4}, {Value: -4}},
false,
},
{
"_x1",
[]sequenceValue{{Omitted: true}},
false,
},
{
"1+1x4",
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
false,
},
{
"2-1x4",
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
false,
},
{
"1+1x1 _ -4 stale 3+20x1",
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
false,
},
}
for _, tc := range testCases {
output, err := parseInputValue(tc.input, true)
if err != nil != tc.failed {
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
}
if len(tc.exp) != len(output) {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
for i := 0; i < len(tc.exp); i++ {
if tc.exp[i].Omitted != output[i].Omitted {
t.Fatalf("expect %v, got %v", tc.exp, output)
}
if tc.exp[i].Value != output[i].Value {
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
continue
}
t.Fatalf("expect %v, got %v", tc.exp, output)
}
}
}
}

View file

@ -0,0 +1,92 @@
package unittest
import (
"context"
"fmt"
"net/url"
"reflect"
"sort"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
type metricsqlTestCase struct {
Expr string `yaml:"expr"`
EvalTime *promutils.Duration `yaml:"eval_time"`
ExpSamples []expSample `yaml:"exp_samples"`
}
type expSample struct {
Labels string `yaml:"labels"`
Value float64 `yaml:"value"`
}
// checkMetricsqlCase will check metricsql_expr_test cases
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
Outer:
for _, mt := range cases {
result, _, err := queries.Query(context.Background(), mt.Expr, mt.EvalTime.ParseTime())
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
mt.EvalTime.Duration().String(), err))
continue
}
var gotSamples []parsedSample
for _, s := range result.Data {
sort.Slice(s.Labels, func(i, j int) bool {
return s.Labels[i].Name < s.Labels[j].Name
})
gotSamples = append(gotSamples, parsedSample{
Labels: s.Labels,
Value: s.Values[0],
})
}
var expSamples []parsedSample
for _, s := range mt.ExpSamples {
expLb := datasource.Labels{}
if s.Labels != "" {
metricsqlExpr, err := metricsql.Parse(s.Labels)
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
continue Outer
}
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
if !ok {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
continue Outer
}
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
expLb = append(expLb, datasource.Label{
Name: l.Label,
Value: l.Value,
})
}
}
sort.Slice(expLb, func(i, j int) bool {
return expLb[i].Name < expLb[j].Name
})
expSamples = append(expSamples, parsedSample{
Labels: expLb,
Value: s.Value,
})
}
sort.Slice(expSamples, func(i, j int) bool {
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
})
sort.Slice(gotSamples, func(i, j int) bool {
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
})
if !reflect.DeepEqual(expSamples, gotSamples) {
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
}
}
return
}

View file

@ -0,0 +1,43 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,49 @@
rule_files:
- rules.yaml
tests:
- interval: 1m
name: "Failing test"
input_series:
- series: test
values: "0"
metricsql_expr_test:
- expr: test
eval_time: 0m
exp_samples:
- value: 0
labels: test
# will failed cause there is no "Test" group and rule defined
alert_rule_test:
- eval_time: 0m
groupname: Test
alertname: Test
exp_alerts:
- exp_labels: {}
- interval: 1m
name: Failing alert test
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause rule is firing
- eval_time: 5m
groupname: group1
alertname: InstanceDown
exp_alerts: []
- interval: 1m
name: Failing alert test with missing groupname
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will failed cause missing groupname
- eval_time: 5m
alertname: AlwaysFiring
exp_alerts: []

View file

@ -0,0 +1,30 @@
# can be executed successfully but will take more than 1 minute
# not included in unit test now
evaluation_interval: 100d
rule_files:
- rules.yaml
tests:
- interval: 1d
input_series:
- series: test
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
# We input series to 2024-01-01T00:00:00 here.
values: "0+1x19723"
metricsql_expr_test:
- expr: timestamp(test)
eval_time: 0m
exp_samples:
- value: 0
- expr: test
eval_time: 100d
exp_samples:
- labels: test
value: 100
- expr: timestamp(test)
eval_time: 19000d
exp_samples:
- value: 1641600000 # 19000d -> seconds.

View file

@ -0,0 +1,39 @@
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 1m
- name: group2
rules:
- record: t1
expr: test
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 5m
- name: group3
rules:
- record: t2
expr: t1
- name: group4
rules:
- record: t3
expr: t1

View file

@ -0,0 +1,99 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
group_eval_order: ["group4", "group2", "group3"]
tests:
- interval: 1m
name: "basic test"
input_series:
- series: "test"
values: "_x5 1x5 _ stale"
alert_rule_test:
- eval_time: 1m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- eval_time: 1m
groupname: group2
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
- eval_time: 6m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
metricsql_expr_test:
- expr: test
eval_time: 11m
exp_samples:
- labels: '{__name__="test"}'
value: 1
- expr: test
eval_time: 12m
exp_samples: []
- interval: 1m
name: "basic test2"
input_series:
- series: 'up{job="vmagent1", instance="localhost:9090"}'
values: "0+0x1440"
- series: "test"
values: "0+1x1440"
metricsql_expr_test:
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
eval_time: 4m
exp_samples:
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
value: 1
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
value: 1
- expr: t1
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t1", datacenter="dc-123"}'
- expr: t2
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t2", datacenter="dc-123"}'
- expr: t3
eval_time: 4m
exp_samples:
# t3 is 3 instead of 4 cause it's rules3 is evaluated before rules1
- value: 3
labels: '{__name__="t3", datacenter="dc-123"}'
alert_rule_test:
- eval_time: 10m
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent1
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: alerts
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,46 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,83 @@
package unittest
import (
"fmt"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// parsedSample is a sample with parsed Labels
type parsedSample struct {
Labels datasource.Labels
Value float64
}
func (ps *parsedSample) String() string {
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
}
func parsedSamplesString(pss []parsedSample) string {
if len(pss) == 0 {
return "nil"
}
s := pss[0].String()
for _, ps := range pss[1:] {
s += ", " + ps.String()
}
return s
}
// labelAndAnnotation holds labels and annotations
type labelAndAnnotation struct {
Labels datasource.Labels
Annotations datasource.Labels
}
func (la *labelAndAnnotation) String() string {
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
}
// labelsAndAnnotations is collection of LabelAndAnnotation
type labelsAndAnnotations []labelAndAnnotation
func (la labelsAndAnnotations) Len() int { return len(la) }
func (la labelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
func (la labelsAndAnnotations) Less(i, j int) bool {
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
if diff != 0 {
return diff < 0
}
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
}
func (la labelsAndAnnotations) String() string {
if len(la) == 0 {
return "[]"
}
s := "[\n0:" + indentLines("\n"+la[0].String(), " ")
for i, l := range la[1:] {
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + indentLines("\n"+l.String(), " ")
}
s += "\n]"
return s
}
// indentLines prefixes each line in the supplied string with the given "indent" string.
func indentLines(lines, indent string) string {
sb := strings.Builder{}
n := strings.Split(lines, "\n")
for i, l := range n {
if i > 0 {
sb.WriteString(indent)
}
sb.WriteString(l)
if i != len(n)-1 {
sb.WriteRune('\n')
}
}
return sb.String()
}

View file

@ -0,0 +1,443 @@
package unittest
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"reflect"
"sort"
"time"
"gopkg.in/yaml.v2"
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metrics"
)
var (
storagePath string
httpListenAddr = ":8880"
// insert series from 1970-01-01T00:00:00
testStartTime = time.Unix(0, 0).UTC()
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
disableAlertgroupLabel bool
)
const (
testStoragePath = "vmalert-unittest"
testLogLevel = "ERROR"
)
// UnitTest runs unittest for files
func UnitTest(files []string, disableGroupLabel bool) bool {
if err := templates.Load([]string{}, true); err != nil {
logger.Fatalf("failed to load template: %v", err)
}
storagePath = filepath.Join(os.TempDir(), testStoragePath)
processFlags()
vminsert.Init()
vmselect.Init()
// storagePath will be created again when closing vmselect, so remove it again.
defer fs.MustRemoveAll(storagePath)
defer vminsert.Stop()
defer vmselect.Stop()
disableAlertgroupLabel = disableGroupLabel
return rulesUnitTest(files)
}
func rulesUnitTest(files []string) bool {
var failed bool
for _, f := range files {
if err := ruleUnitTest(f); err != nil {
fmt.Println(" FAILED")
fmt.Printf("\nfailed to run unit test for file %q: \n%v", f, err)
failed = true
} else {
fmt.Println(" SUCCESS")
}
}
return failed
}
func ruleUnitTest(filename string) []error {
fmt.Println("\nUnit Testing: ", filename)
b, err := os.ReadFile(filename)
if err != nil {
return []error{fmt.Errorf("failed to read file: %w", err)}
}
var unitTestInp unitTestFile
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
}
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
}
if unitTestInp.EvaluationInterval.Duration() == 0 {
fmt.Println("evaluation_interval set to 1m by default")
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
}
groupOrderMap := make(map[string]int)
for i, gn := range unitTestInp.GroupEvalOrder {
if _, ok := groupOrderMap[gn]; ok {
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
}
groupOrderMap[gn] = i
}
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
if err != nil {
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
}
var errs []error
for _, t := range unitTestInp.Tests {
if err := verifyTestGroup(t); err != nil {
errs = append(errs, err)
continue
}
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
errs = append(errs, testErrs...)
}
if len(errs) > 0 {
return errs
}
return nil
}
func verifyTestGroup(group testGroup) error {
var testGroupName string
if group.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
}
for _, at := range group.AlertRuleTests {
if at.Alertname == "" {
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
}
if !disableAlertgroupLabel && at.GroupName == "" {
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
}
if disableAlertgroupLabel && at.GroupName != "" {
return fmt.Errorf("\n%s shouldn't set filed \"groupname\" when flag \"disableAlertGroupLabel\" is true", testGroupName)
}
if at.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
for _, et := range group.MetricsqlExprTests {
if et.Expr == "" {
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
}
if et.EvalTime == nil {
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
}
}
return nil
}
func processFlags() {
flag.Parse()
for _, fv := range []struct {
flag string
value string
}{
{flag: "storageDataPath", value: storagePath},
{flag: "loggerLevel", value: testLogLevel},
{flag: "search.disableCache", value: "true"},
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
{flag: "retentionPeriod", value: "100y"},
{flag: "datasource.url", value: testDataSourcePath},
{flag: "remoteWrite.url", value: testRemoteWritePath},
} {
// panics if flag doesn't exist
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
}
}
}
func setUp() {
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
switch r.URL.Path {
case "/prometheus/api/v1/query":
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
case "/prometheus/api/v1/write", "/api/v1/write":
if err := promremotewrite.InsertHandler(r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
default:
}
return false
})
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
readyCheckFunc := func() bool {
resp, err := http.Get(testHealthHTTPPath)
if err != nil {
return false
}
_ = resp.Body.Close()
return resp.StatusCode == 200
}
checkCheck:
for {
select {
case <-ctx.Done():
logger.Fatalf("http server can't be ready in 30s")
default:
if readyCheckFunc() {
break checkCheck
}
time.Sleep(3 * time.Second)
}
}
}
func tearDown() {
if err := httpserver.Stop(httpListenAddr); err != nil {
logger.Errorf("cannot stop the webservice: %s", err)
}
vmstorage.Stop()
metrics.UnregisterAllMetrics()
fs.MustRemoveAll(storagePath)
}
// resolveAndGlobFilepaths joins all relative paths in a configuration
// with a given base directory and replaces all globs with matching files.
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
for i, rf := range utf.RuleFiles {
if rf != "" && !filepath.IsAbs(rf) {
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
}
}
var globbedFiles []string
for _, rf := range utf.RuleFiles {
m, err := filepath.Glob(rf)
if err != nil {
return err
}
if len(m) == 0 {
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
}
globbedFiles = append(globbedFiles, m...)
}
utf.RuleFiles = globbedFiles
return nil
}
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
// set up vmstorage and http server for ingest and read queries
setUp()
// tear down vmstorage and clean the data dir
defer tearDown()
err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
if err != nil {
return []error{err}
}
q, err := datasource.Init(nil)
if err != nil {
return []error{fmt.Errorf("failed to init datasource: %v", err)}
}
rw, err := remotewrite.NewDebugClient()
if err != nil {
return []error{fmt.Errorf("failed to init wr: %v", err)}
}
alertEvalTimesMap := map[time.Duration]struct{}{}
alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
for _, at := range tg.AlertRuleTests {
et := at.EvalTime.Duration()
alertEvalTimesMap[et] = struct{}{}
if _, ok := alertExpResultMap[et]; !ok {
alertExpResultMap[et] = make(map[string]map[string][]expAlert)
}
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
}
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
}
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
for k := range alertEvalTimesMap {
alertEvalTimes = append(alertEvalTimes, k)
}
sort.Slice(alertEvalTimes, func(i, j int) bool {
return alertEvalTimes[i] < alertEvalTimes[j]
})
// sort group eval order according to the given "group_eval_order".
sort.Slice(testGroups, func(i, j int) bool {
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
})
// create groups with given rule
var groups []*rule.Group
for _, group := range testGroups {
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
groups = append(groups, ng)
}
evalIndex := 0
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
for _, g := range groups {
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
for err := range errs {
if err != nil {
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
ts, err))
}
}
// flush series after each group evaluation
vmstorage.Storage.DebugFlush()
}
// check alert_rule_test case at every eval time
for evalIndex < len(alertEvalTimes) {
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
break
}
gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
for _, g := range groups {
if disableAlertgroupLabel {
g.Name = ""
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
continue
}
if _, ok := gotAlertsMap[g.Name]; !ok {
gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
}
for _, r := range g.Rules {
ar, isAlertRule := r.(*rule.AlertingRule)
if !isAlertRule {
continue
}
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
for _, got := range ar.GetAlerts() {
if got.State != notifier.StateFiring {
continue
}
if disableAlertgroupLabel {
delete(got.Labels, "alertgroup")
}
laa := labelAndAnnotation{
Labels: datasource.ConvertToLabels(got.Labels),
Annotations: datasource.ConvertToLabels(got.Annotations),
}
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
}
}
}
}
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
for alertname, res := range gres {
var expAlerts labelsAndAnnotations
for _, expAlert := range res {
if expAlert.ExpLabels == nil {
expAlert.ExpLabels = make(map[string]string)
}
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
if !disableAlertgroupLabel {
expAlert.ExpLabels["alertgroup"] = groupname
}
// alertNameLabel is added as additional labels in vmalert.
expAlert.ExpLabels["alertname"] = alertname
expAlerts = append(expAlerts, labelAndAnnotation{
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
})
}
sort.Sort(expAlerts)
gotAlerts := gotAlertsMap[groupname][alertname]
sort.Sort(gotAlerts)
if !reflect.DeepEqual(expAlerts, gotAlerts) {
var testGroupName string
if tg.TestGroupName != "" {
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
}
expString := indentLines(expAlerts.String(), " ")
gotString := indentLines(gotAlerts.String(), " ")
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
}
}
}
evalIndex++
}
}
checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
return checkErrs
}
// unitTestFile holds the contents of a single unit test file
type unitTestFile struct {
RuleFiles []string `yaml:"rule_files"`
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
GroupEvalOrder []string `yaml:"group_eval_order"`
Tests []testGroup `yaml:"tests"`
}
// testGroup is a group of input series and test cases associated with it
type testGroup struct {
Interval *promutils.Duration `yaml:"interval"`
InputSeries []series `yaml:"input_series"`
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
ExternalLabels map[string]string `yaml:"external_labels"`
TestGroupName string `yaml:"name"`
}
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
func (tg *testGroup) maxEvalTime() time.Duration {
var maxd time.Duration
for _, alert := range tg.AlertRuleTests {
if alert.EvalTime.Duration() > maxd {
maxd = alert.EvalTime.Duration()
}
}
for _, met := range tg.MetricsqlExprTests {
if met.EvalTime.Duration() > maxd {
maxd = met.EvalTime.Duration()
}
}
return maxd
}

View file

@ -0,0 +1,47 @@
package unittest
import (
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUnitRule(t *testing.T) {
testCases := []struct {
name string
disableGroupLabel bool
files []string
failed bool
}{
{
name: "run multi files",
files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
failed: false,
},
{
name: "disable group label",
disableGroupLabel: true,
files: []string{"./testdata/disable-group-label.yaml"},
failed: false,
},
{
name: "failing test",
files: []string{"./testdata/failed-test.yaml"},
failed: true,
},
}
for _, tc := range testCases {
fail := UnitTest(tc.files, tc.disableGroupLabel)
if fail != tc.failed {
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
}
}
}

View file

@ -754,6 +754,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.

View file

@ -0,0 +1,131 @@
package datasource
import (
"context"
"net/http"
"sync"
"time"
)
// FakeQuerier is a mock querier that return predefined results and error message
type FakeQuerier struct {
sync.Mutex
metrics []Metric
err error
}
// SetErr sets query error message
func (fq *FakeQuerier) SetErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
// Reset reset querier's error message and results
func (fq *FakeQuerier) Reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
// Add appends metrics to querier result metrics
func (fq *FakeQuerier) Add(metrics ...Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
// BuildWithParams return FakeQuerier itself
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
return fq
}
// QueryRange performs query
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return Result{}, nil, fq.err
}
cp := make([]Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithRegistry can store different results for different query expr
type FakeQuerierWithRegistry struct {
sync.Mutex
registry map[string][]Metric
}
// Set stores query result for given key
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
fqr.Lock()
if fqr.registry == nil {
fqr.registry = make(map[string][]Metric)
}
fqr.registry[key] = metrics
fqr.Unlock()
}
// Reset clean querier's results registry
func (fqr *FakeQuerierWithRegistry) Reset() {
fqr.Lock()
fqr.registry = nil
fqr.Unlock()
}
// BuildWithParams returns itself
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
return fqr
}
// QueryRange performs query
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
// Query returns metrics restored in querier registry
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return Result{}, req, nil
}
cp := make([]Metric, len(metrics))
copy(cp, metrics)
return Result{Data: cp}, req, nil
}
// FakeQuerierWithDelay mock querier with given delay duration
type FakeQuerierWithDelay struct {
FakeQuerier
Delay time.Duration
}
// Query returns query result after delay duration
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
timer := time.NewTimer(fqd.Delay)
select {
case <-ctx.Done():
case <-timer.C:
}
return fqd.FakeQuerier.Query(ctx, expr, ts)
}
// BuildWithParams returns itself
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
return fqd
}

View file

@ -18,6 +18,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent group.")
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
)
@ -229,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
return nil, fmt.Errorf("failed to init notifier: %w", err)
}
manager := &manager{
groups: make(map[uint64]*Group),
groups: make(map[uint64]*rule.Group),
querierBuilder: q,
notifiers: nts,
labels: labels,

View file

@ -8,11 +8,19 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
rule.SkipRandSleepOnGroupStart = true
}
func TestGetExternalURL(t *testing.T) {
expURL := "https://vicotriametrics.com/path"
u, err := getExternalURL(expURL, "", false)
@ -98,10 +106,10 @@ groups:
ctx, cancel := context.WithCancel(context.Background())
m := &manager{
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*Group),
querierBuilder: &datasource.FakeQuerier{},
groups: make(map[uint64]*rule.Group),
labels: map[string]string{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
rw: &remotewrite.Client{},
}

View file

@ -3,14 +3,13 @@ package main
import (
"context"
"fmt"
"net/url"
"sort"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
@ -19,7 +18,7 @@ type manager struct {
querierBuilder datasource.QuerierBuilder
notifiers func() []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
// remote read builder.
rr datasource.QuerierBuilder
@ -27,28 +26,28 @@ type manager struct {
labels map[string]string
groupsMu sync.RWMutex
groups map[uint64]*Group
groups map[uint64]*rule.Group
}
// RuleAPI generates APIRule object from alert by its ID(hash)
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
// ruleAPI generates apiRule object from alert by its ID(hash)
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
g, ok := m.groups[gID]
if !ok {
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
if rule.ID() == rID {
return rule.ToAPI(), nil
return ruleToAPI(rule), nil
}
}
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
}
// AlertAPI generates APIAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
// alertAPI generates apiAlert object from alert by its ID(hash)
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
if !ok {
return nil, fmt.Errorf("can't find group with id %d", gID)
}
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
for _, r := range g.Rules {
ar, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
return apiAlert, nil
}
}
@ -82,15 +81,15 @@ func (m *manager) close() {
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error {
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
m.wg.Add(1)
id := g.ID()
go func() {
defer m.wg.Done()
if restore {
g.start(ctx, m.notifiers, m.rw, m.rr)
g.Start(ctx, m.notifiers, m.rw, m.rr)
} else {
g.start(ctx, m.notifiers, m.rw, nil)
g.Start(ctx, m.notifiers, m.rw, nil)
}
}()
m.groups[id] = g
@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
var rrPresent, arPresent bool
groupsRegistry := make(map[uint64]*Group)
groupsRegistry := make(map[uint64]*rule.Group)
for _, cfg := range groupsCfg {
for _, r := range cfg.Rules {
if rrPresent && arPresent {
@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
arPresent = true
}
}
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
}
type updateItem struct {
old *Group
new *Group
old *rule.Group
new *rule.Group
}
var toUpdate []updateItem
@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
if !ok {
// old group is not present in new list,
// so must be stopped and deleted
og.close()
og.Close()
delete(m.groups, og.ID())
og = nil
continue
@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
go func(old *rule.Group, new *rule.Group) {
old.UpdateWith(new)
wg.Done()
}(item.old, item.new)
item.old.interruptEval()
item.old.InterruptEval()
}
wg.Wait()
}
return nil
}
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]APIRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, r.ToAPI())
}
return ag
}
func urlValuesToStrings(values url.Values) []string {
if len(values) < 1 {
return nil
}
keys := make([]string, 0, len(values))
for k := range values {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
params := values[k]
for _, v := range params {
res = append(res, fmt.Sprintf("%s=%s", k, v))
}
}
return res
}
func headersToStrings(headers map[string]string) []string {
if len(headers) < 1 {
return nil
}
keys := make([]string, 0, len(headers))
for k := range headers {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
v := headers[k]
res = append(res, fmt.Sprintf("%s: %s", k, v))
}
return res
}

View file

@ -10,8 +10,10 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
// successful cases of
// starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
m := &manager{groups: make(map[uint64]*rule.Group)}
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
if err := m.update(context.Background(), cfg, false); err != nil {
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
paths := []string{
"config/testdata/dir/rules0-good.rules",
@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
}()
var (
VMRows = &AlertingRule{
VMRows = &rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 10 * time.Second,
@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "{{$labels}}",
},
}
Conns = &AlertingRule{
Conns = &rule.AlertingRule{
Name: "Conns",
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
Annotations: map[string]string{
@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "It is {{ $value }} connections for {{$labels.instance}}",
},
}
ExampleAlertAlwaysFiring = &AlertingRule{
ExampleAlertAlwaysFiring = &rule.AlertingRule{
Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)",
}
@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
name string
initPath string
updatePath string
want []*Group
want []*rule.Group
}{
{
name: "update good rules",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules1-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{
&AlertingRule{
Rules: []rule.Rule{
&rule.AlertingRule{
Name: "VMRows",
Expr: "vm_rows > 0",
For: 5 * time.Minute,
@ -153,19 +155,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update good rules from 1 to 2 groups",
initPath: "config/testdata/dir/rules/rules1-good.rules",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Rules: []Rule{VMRows},
Interval: defaultEvalInterval,
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -176,20 +179,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update with one bad rule file",
initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules2-bad.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Name: "TestGroup",
Type: config.NewPrometheusType(),
Rules: []Rule{
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -200,19 +203,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update empty dir rules from 0 to 2 groups",
initPath: "config/testdata/empty/*",
updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{
want: []*rule.Group{
{
File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(),
Interval: defaultEvalInterval,
Rules: []Rule{VMRows},
Rules: []rule.Rule{VMRows},
},
{
File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval,
Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{
Name: "TestGroup",
Rules: []rule.Rule{
Conns,
ExampleAlertAlwaysFiring,
},
@ -224,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
}
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
@ -255,11 +259,36 @@ func TestManagerUpdate(t *testing.T) {
})
}
}
func compareGroups(t *testing.T, a, b *rule.Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := rule.CompareRules(t, want, got); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
}
func TestManagerUpdateNegative(t *testing.T) {
testCases := []struct {
notifiers []notifier.Notifier
rw *remotewrite.Client
rw remotewrite.RWClient
cfg config.Group
expErr string
}{
@ -286,7 +315,7 @@ func TestManagerUpdateNegative(t *testing.T) {
"contains alerting rules",
},
{
[]notifier.Notifier{&fakeNotifier{}},
[]notifier.Notifier{&notifier.FakeNotifier{}},
nil,
config.Group{
Name: "Recording and alerting rules",
@ -316,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.cfg.Name, func(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
querierBuilder: &fakeQuerier{},
groups: make(map[uint64]*rule.Group),
querierBuilder: &datasource.FakeQuerier{},
rw: tc.rw,
}
if tc.notifiers != nil {
@ -346,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
}
return cfg
}
func TestUrlValuesToStrings(t *testing.T) {
mapQueryParams := map[string][]string{
"param1": {"param1"},
"param2": {"anotherparam"},
}
expectedRes := []string{"param1=param1", "param2=anotherparam"}
res := urlValuesToStrings(mapQueryParams)
if len(res) != len(expectedRes) {
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
}
for ind, val := range expectedRes {
if val != res[ind] {
t.Errorf("Expected %v; but got %v", val, res[ind])
}
}
}

View file

@ -0,0 +1,59 @@
package notifier
import (
"context"
"fmt"
"sync"
"time"
)
// FakeNotifier is a mock notifier
type FakeNotifier struct {
sync.Mutex
alerts []Alert
// records number of received alerts in total
counter int
}
// Close does nothing
func (*FakeNotifier) Close() {}
// Addr returns ""
func (*FakeNotifier) Addr() string { return "" }
// Send sets alerts and increases counter
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
fn.alerts = alerts
return nil
}
// GetCounter returns received alerts count
func (fn *FakeNotifier) GetCounter() int {
fn.Lock()
defer fn.Unlock()
return fn.counter
}
// GetAlerts returns stored alerts
func (fn *FakeNotifier) GetAlerts() []Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
// FaultyNotifier is a mock notifier that Send() will return failed response
type FaultyNotifier struct {
FakeNotifier
}
// Send returns failed response
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))
}
return fmt.Errorf("send failed")
}

View file

@ -0,0 +1,322 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write client.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
select {
case <-c.doneCh:
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
c.authCfg.SetHeaders(req, true)
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
}

View file

@ -0,0 +1,97 @@
package remotewrite
import (
"bytes"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// DebugClient won't push series periodically, but will write data to remote endpoint
// immediately when Push() is called
type DebugClient struct {
addr string
c *http.Client
wg sync.WaitGroup
}
// NewDebugClient initiates and returns a new DebugClient
func NewDebugClient() (*DebugClient, error) {
if *addr == "" {
return nil, nil
}
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
if err != nil {
return nil, fmt.Errorf("failed to create transport: %w", err)
}
c := &DebugClient{
c: &http.Client{
Timeout: *sendTimeout,
Transport: t,
},
addr: strings.TrimSuffix(*addr, "/"),
}
return c, nil
}
// Push sends the given timeseries to the remote storage.
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
c.wg.Add(1)
defer c.wg.Done()
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
data, err := wr.Marshal()
if err != nil {
return fmt.Errorf("failed to marshal the given time series: %w", err)
}
return c.send(data)
}
// Close stops the DebugClient
func (c *DebugClient) Close() error {
c.wg.Wait()
return nil
}
func (c *DebugClient) send(data []byte) error {
b := snappy.Encode(nil, data)
r := bytes.NewReader(b)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req)
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode/100 == 2 {
return nil
}
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}

View file

@ -0,0 +1,50 @@
package remotewrite
import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
func TestDebugClient_Push(t *testing.T) {
testSrv := newRWServer()
oldAddr := *addr
*addr = testSrv.URL
defer func() {
*addr = oldAddr
}()
client, err := NewDebugClient()
if err != nil {
t.Fatalf("failed to create debug client: %s", err)
}
const rowsN = 100
var sent int
for i := 0; i < rowsN; i++ {
s := prompbmarshal.TimeSeries{
Samples: []prompbmarshal.Sample{{
Value: float64(i),
Timestamp: time.Now().Unix(),
}},
}
err := client.Push(s)
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
if err == nil {
sent++
}
}
if sent == 0 {
t.Fatalf("0 series sent")
}
if err := client.Close(); err != nil {
t.Fatalf("failed to close client: %s", err)
}
got := testSrv.accepted()
if got != sent {
t.Fatalf("expected to have %d series; got %d", sent, got)
}
}

View file

@ -1,322 +1,13 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
addr string
c *http.Client
authCfg *promauth.Config
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
wg sync.WaitGroup
doneCh chan struct{}
}
// Config is config for remote write.
type Config struct {
// Addr of remote storage
Addr string
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
defaultWriteTimeout = 30 * time.Second
)
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
if cfg.Addr == "" {
return nil, fmt.Errorf("config.Addr can't be empty")
}
if cfg.MaxBatchSize == 0 {
cfg.MaxBatchSize = defaultMaxBatchSize
}
if cfg.MaxQueueSize == 0 {
cfg.MaxQueueSize = defaultMaxQueueSize
}
if cfg.FlushInterval == 0 {
cfg.FlushInterval = defaultFlushInterval
}
if cfg.Transport == nil {
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
}
cc := defaultConcurrency
if cfg.Concurrency > 0 {
cc = cfg.Concurrency
}
c := &Client{
c: &http.Client{
Timeout: *sendTimeout,
Transport: cfg.Transport,
},
addr: strings.TrimSuffix(cfg.Addr, "/"),
authCfg: cfg.AuthCfg,
flushInterval: cfg.FlushInterval,
maxBatchSize: cfg.MaxBatchSize,
maxQueueSize: cfg.MaxQueueSize,
doneCh: make(chan struct{}),
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
}
for i := 0; i < cc; i++ {
c.run(ctx)
}
return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
select {
case <-c.doneCh:
return fmt.Errorf("client is closed")
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}
// Close stops the client and waits for all goroutines
// to exit.
func (c *Client) Close() error {
if c.doneCh == nil {
return fmt.Errorf("client is already closed")
}
close(c.input)
close(c.doneCh)
c.wg.Wait()
return nil
}
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
var (
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
func (c *Client) send(ctx context.Context, data []byte) error {
r := bytes.NewReader(data)
req, err := http.NewRequest(http.MethodPost, c.addr, r)
if err != nil {
return fmt.Errorf("failed to create new HTTP request: %w", err)
}
// RFC standard compliant headers
req.Header.Set("Content-Encoding", "snappy")
req.Header.Set("Content-Type", "application/x-protobuf")
// Prometheus compliant headers
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authCfg != nil {
c.authCfg.SetHeaders(req, true)
}
if !*disablePathAppend {
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
}
resp, err := c.c.Do(req.WithContext(ctx))
if err != nil {
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
req.URL.Redacted(), err, len(data), r.Size())
}
defer func() { _ = resp.Body.Close() }()
body, _ := io.ReadAll(resp.Body)
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp.StatusCode / 100 {
case 2:
// respond with a HTTP 2xx status code when the write is successful.
return nil
case 4:
if resp.StatusCode != http.StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)}
}
fallthrough
default:
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
resp.StatusCode, req.URL.Redacted(), body)
}
}
type nonRetriableError struct {
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
// RWClient represents an HTTP client for pushing data via remote write protocol
type RWClient interface {
// Push pushes the give time series to remote storage
Push(s prompbmarshal.TimeSeries) error
// Close stops the client. Client can't be reused after Close call.
Close() error
}

View file

@ -1,19 +1,16 @@
package main
import (
"context"
"flag"
"fmt"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
var (
@ -33,7 +30,7 @@ var (
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
)
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
if *replayMaxDatapoints < 1 {
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
}
@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
var total int
for _, cfg := range groupsCfg {
ng := newGroup(cfg, qb, *evaluationInterval, labels)
total += ng.replay(tFrom, tTo, rw)
ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
}
logger.Infof("replay finished! Imported %d samples", total)
if rw != nil {
@ -77,99 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
}
return nil
}
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
var total int
step := g.Interval * time.Duration(*replayMaxDatapoints)
start = g.adjustReqTimestamp(start)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\neval_offset: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, g.EvalOffset, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !*disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(*replayRulesDelay)
}
return total
}
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < *replayRuleRetryAttempts; i++ {
tss, err = rule.ExecRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}

View file

@ -12,7 +12,7 @@ import (
)
type fakeReplayQuerier struct {
fakeQuerier
datasource.FakeQuerier
registry map[string]map[string]struct{}
}
@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
})
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,118 +0,0 @@
package main
import (
"context"
"errors"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// ExecRange executes the rule on the given time range.
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// ToAPI converts Rule into APIRule
ToAPI() APIRule
// Close performs the shutdown procedures for rule
// such as metrics unregister
Close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []ruleStateEntry
cur int
}
type ruleStateEntry struct {
// stores last moment of time rule.Exec was called
time time.Time
// stores the timestamp rule.Exec was called with
at time.Time
// stores the duration of the last rule.Exec call
duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
err error
// stores the number of samples returned during
// the last evaluation
samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
seriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
curl string
}
func newRuleState(size int) *ruleState {
if size < 1 {
size = 1
}
return &ruleState{
entries: make([]ruleStateEntry, size),
}
}
func (s *ruleState) getLast() ruleStateEntry {
s.RLock()
defer s.RUnlock()
return s.entries[s.cur]
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getAll() []ruleStateEntry {
entries := make([]ruleStateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.time.IsZero() || !e.at.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e ruleStateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}

View file

@ -1,11 +1,10 @@
package main
package rule
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"time"
@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
seriesFetched *utils.Gauge
}
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
// NewAlertingRule creates a new AlertingRule
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{
Type: group.Type,
RuleID: cfg.ID,
@ -80,10 +80,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
metrics: &alertingRuleMetrics{},
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
ar.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
ar.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
ar.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := ar.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
func() float64 {
e := ar.state.getLast()
if e.seriesFetched == nil {
if e.SeriesFetched == nil {
// means seriesFetched is unsupported
return -1
}
seriesFetched := float64(*e.seriesFetched)
if seriesFetched == 0 && e.samples > 0 {
seriesFetched := float64(*e.SeriesFetched)
if seriesFetched == 0 && e.Samples > 0 {
// `alert: 0.95` will fetch no series
// but will get one time series in response.
seriesFetched = float64(e.samples)
seriesFetched = float64(e.Samples)
}
return seriesFetched
})
return ar
}
// Close unregisters rule metrics
func (ar *AlertingRule) Close() {
// close unregisters rule metrics
func (ar *AlertingRule) close() {
ar.metrics.active.Unregister()
ar.metrics.pending.Unregister()
ar.metrics.errors.Unregister()
@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
// GetAlerts returns active alerts of rule
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
var alerts []*notifier.Alert
for _, a := range ar.alerts {
alerts = append(alerts, a)
}
return alerts
}
// GetAlert returns alert if id exists
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
if ar.alerts == nil {
return nil
}
return ar.alerts[id]
}
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
if !ar.Debug {
return
@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
logger.Infof("%s", prefix+msg)
}
// updateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) updateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
type labelSet struct {
// origin labels extracted from received time series
// plus extra labels (group labels, service labels like alertNameLabel).
@ -248,11 +294,11 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
return ls, nil
}
// ExecRange executes alerting rule on the given time range similarly to Exec.
// execRange executes alerting rule on the given time range similarly to exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
@ -297,19 +343,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute
// Exec executes AlertingRule expression via the given Querier.
// exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
err: err,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Err: err,
Curl: requestToCurl(req),
}
defer func() {
@ -323,7 +369,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
@ -342,15 +388,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
for _, m := range res.Data {
ls, err := ar.toLabels(m, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.Err
}
h := hash(ls.processed)
if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels
// conflicting with the metric labels
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.Err
}
updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok {
@ -373,8 +419,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
a, err := ar.newAlert(m, ls, start, qFn)
if err != nil {
curState.err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.Err
}
a.ID = h
a.State = notifier.StatePending
@ -423,8 +469,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
}
if limit > 0 && numActivePending > limit {
ar.alerts = map[uint64]*notifier.Alert{}
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.Err
}
return ar.toTimeSeries(ts.Unix()), nil
}
@ -441,26 +487,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
return tss
}
// UpdateWith copies all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
func (ar *AlertingRule) UpdateWith(r Rule) error {
nr, ok := r.(*AlertingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
}
ar.Expr = nr.Expr
ar.For = nr.For
ar.KeepFiringFor = nr.KeepFiringFor
ar.Labels = nr.Labels
ar.Annotations = nr.Annotations
ar.EvalInterval = nr.EvalInterval
ar.Debug = nr.Debug
ar.q = nr.q
ar.state = nr.state
return nil
}
// TODO: consider hashing algorithm in VM
func hash(labels map[string]string) uint64 {
hash := fnv.New64a()
@ -503,102 +529,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
return a, err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
ar.alertsMu.RLock()
defer ar.alertsMu.RUnlock()
a, ok := ar.alerts[id]
if !ok {
return nil
}
return ar.newAlertAPI(*a)
}
// ToAPI returns Rule representation in form of APIRule
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
func (ar *AlertingRule) ToAPI() APIRule {
lastState := ar.state.getLast()
r := APIRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
KeepFiringFor: ar.KeepFiringFor.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ar.AlertsToAPI(),
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: ar.state.size(),
Updates: ar.state.getAll(),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
// satisfy APIRule.State logic
if len(r.Alerts) > 0 {
r.State = notifier.StatePending.String()
stateFiring := notifier.StateFiring.String()
for _, a := range r.Alerts {
if a.State == stateFiring {
r.State = stateFiring
break
}
}
}
return r
}
// AlertsToAPI generates list of APIAlert objects from existing alerts
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
var alerts []*APIAlert
ar.alertsMu.RLock()
for _, a := range ar.alerts {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, ar.newAlertAPI(*a))
}
ar.alertsMu.RUnlock()
return alerts
}
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
aa := &APIAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
RuleID: fmt.Sprintf("%d", ar.RuleID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.ActiveAt,
Restored: a.Restored,
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
}
if alertURLGeneratorFn != nil {
aa.SourceLink = alertURLGeneratorFn(a)
}
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
aa.Stabilizing = true
}
return aa
}
const (
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
@ -646,10 +576,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
}
// Restore restores the value of ActiveAt field for active alerts,
// restore restores the value of ActiveAt field for active alerts,
// based on previously written time series `alertForStateMetricName`.
// Only rules with For > 0 can be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
if ar.For < 1 {
return nil
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
for i, step := range tc.steps {
fq.reset()
fq.add(step...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
fq.Reset()
fq.Add(step...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
@ -482,11 +482,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.data...)
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
fq.Add(tc.data...)
gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
@ -518,24 +518,24 @@ func TestAlertingRule_ExecRange(t *testing.T) {
func TestGroup_Restore(t *testing.T) {
defaultTS := time.Now()
fqr := &fakeQuerierWithRegistry{}
fqr := &datasource.FakeQuerierWithRegistry{}
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
t.Helper()
defer fqr.reset()
defer fqr.Reset()
for _, r := range rules {
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
}
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }
fg.start(context.Background(), nts, nil, fqr)
nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
fg.Start(context.Background(), nts, nil, fqr)
wg.Done()
}()
fg.close()
fg.Close()
wg.Wait()
gotAlerts := make(map[uint64]*notifier.Alert)
@ -582,11 +582,11 @@ func TestGroup_Restore(t *testing.T) {
ActiveAt: defaultTS,
},
})
fqr.reset()
fqr.Reset()
// one active alert with state restore
ts := time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -598,7 +598,7 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, one with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{
@ -616,9 +616,9 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
@ -636,7 +636,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert but wrong state restore
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
stateMetric("wrong alert", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -648,7 +648,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with labels
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -660,7 +660,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with restore labels missmatch
ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev", "team", "foo"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -672,30 +672,30 @@ func TestGroup_Restore(t *testing.T) {
}
func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
// successful attempt
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatal(err)
}
// label `job` will collide with rule extra label and will make both time series equal
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
}
fq.reset()
fq.Reset()
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
_, err = ar.Exec(context.TODO(), time.Now(), 0)
fq.SetErr(errors.New(expErr))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@ -705,7 +705,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
}
func TestAlertingRuleLimit(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
@ -737,15 +737,15 @@ func TestAlertingRuleLimit(t *testing.T) {
err error
timestamp = time.Now()
)
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
for _, testCase := range testCases {
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit)
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
}
fq.reset()
fq.Reset()
}
func TestAlertingRule_Template(t *testing.T) {
@ -870,12 +870,12 @@ func TestAlertingRule_Template(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
tc.rule.state = newRuleState(10)
fq.add(tc.metrics...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
fq.Add(tc.metrics...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
for hash, expAlert := range tc.expAlerts {
@ -989,7 +989,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
For: waitFor,
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(10),
state: &ruleState{entries: make([]StateEntry, 10)},
}
return &rule
}

View file

@ -1,8 +1,10 @@
package main
package rule
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/fnv"
"net/url"
@ -11,7 +13,7 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/metrics"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@ -21,6 +23,18 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent ")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
)
// Group is an entity for grouping rules
@ -96,7 +110,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
return r
}
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
// NewGroup returns a new group
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Type: cfg.Type,
Name: cfg.Name,
@ -153,11 +168,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
return g
}
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
if rule.Alert != "" {
return newAlertingRule(qb, g, rule)
func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
if r.Alert != "" {
return NewAlertingRule(qb, g, r)
}
return newRecordingRule(qb, g, rule)
return NewRecordingRule(qb, g, r)
}
// ID return unique group ID that consists of
@ -178,8 +193,8 @@ func (g *Group) ID() uint64 {
return hash.Sum64()
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
// restore restores alerts state for group rules
func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule)
if !ok {
@ -195,7 +210,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
Headers: g.Headers,
Debug: ar.Debug,
})
if err := ar.Restore(ctx, q, ts, lookback); err != nil {
if err := ar.restore(ctx, q, ts, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}
@ -205,7 +220,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
// updateWith updates existing group with
// passed group object. This function ignores group
// evaluation interval change. It supposed to be updated
// in group.start function.
// in group.Start function.
// Not thread-safe.
func (g *Group) updateWith(newGroup *Group) error {
rulesRegistry := make(map[uint64]Rule)
@ -218,11 +233,11 @@ func (g *Group) updateWith(newGroup *Group) error {
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i].Close()
g.Rules[i].close()
g.Rules[i] = nil
continue
}
if err := or.UpdateWith(nr); err != nil {
if err := or.updateWith(nr); err != nil {
return err
}
delete(rulesRegistry, nr.ID())
@ -255,10 +270,10 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil
}
// interruptEval interrupts in-flight rules evaluations
// InterruptEval interrupts in-flight rules evaluations
// within the group. It is expected that g.evalCancel
// will be repopulated after the call.
func (g *Group) interruptEval() {
func (g *Group) InterruptEval() {
g.mu.RLock()
defer g.mu.RUnlock()
@ -267,12 +282,13 @@ func (g *Group) interruptEval() {
}
}
func (g *Group) close() {
// Close stops the group and it's rules, unregisters group metrics
func (g *Group) Close() {
if g.doneCh == nil {
return
}
close(g.doneCh)
g.interruptEval()
g.InterruptEval()
<-g.finishedCh
g.metrics.iterationDuration.Unregister()
@ -280,19 +296,21 @@ func (g *Group) close() {
g.metrics.iterationMissed.Unregister()
g.metrics.iterationInterval.Unregister()
for _, rule := range g.Rules {
rule.Close()
rule.close()
}
}
var skipRandSleepOnGroupStart bool
// SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
var SkipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
// Start starts group's evaluation
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
defer func() { close(g.finishedCh) }()
evalTS := time.Now()
// sleep random duration to spread group rules evaluation
// over time in order to reduce load on datasource.
if !skipRandSleepOnGroupStart {
if !SkipRandSleepOnGroupStart {
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
g.infof("will start in %v", sleepBeforeStart)
@ -310,10 +328,10 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
e := &executor{
rw: rw,
notifiers: nts,
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
g.infof("started")
@ -355,7 +373,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
// restore the rules state after the first evaluation
// so only active alerts can be restored.
if rr != nil {
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack)
err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
}
@ -409,6 +427,22 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
}
}
// UpdateWith inserts new group to updateCh
func (g *Group) UpdateWith(new *Group) {
g.updateCh <- new
}
// DeepCopy returns a deep copy of group
func (g *Group) DeepCopy() *Group {
g.mu.RLock()
data, _ := json.Marshal(g)
g.mu.RUnlock()
newG := Group{}
_ = json.Unmarshal(data, &newG)
newG.Rules = g.Rules
return &newG
}
// delayBeforeStart returns a duration on the interval between [ts..ts+interval].
// delayBeforeStart accounts for `offset`, so returned duration should be always
// bigger than the `offset`.
@ -438,6 +472,89 @@ func (g *Group) infof(format string, args ...interface{}) {
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
}
// Replay performs group replay
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
var total int
step := g.Interval * time.Duration(maxDataPoint)
ri := rangeIterator{start: start, end: end, step: step}
iterations := int(end.Sub(start)/step) + 1
fmt.Printf("\nGroup %q"+
"\ninterval: \t%v"+
"\nrequests to make: \t%d"+
"\nmax range per request: \t%v\n",
g.Name, g.Interval, iterations, step)
if g.Limit > 0 {
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
g.Limit)
}
for _, rule := range g.Rules {
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
var bar *pb.ProgressBar
if !disableProgressBar {
bar = pb.StartNew(iterations)
}
ri.reset()
for ri.next() {
n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
if err != nil {
logger.Fatalf("rule %q: %s", rule, err)
}
total += n
if bar != nil {
bar.Increment()
}
}
if bar != nil {
bar.Finish()
}
// sleep to let remote storage to flush data on-disk
// so chained rules could be calculated correctly
time.Sleep(replayDelay)
}
return total
}
// ExecOnce evaluates all the rules under group for once with given timestamp.
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
e := &executor{
Rw: rw,
Notifiers: nts,
notifierHeaders: g.NotifierHeaders,
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
if len(g.Rules) < 1 {
return nil
}
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
return e.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveDuration, g.Limit)
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}
// getResolveDuration returns the duration after which firing alert
// can be considered as resolved.
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
@ -477,20 +594,22 @@ func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
return timestamp
}
// executor contains group's notify and rw configs
type executor struct {
notifiers func() []notifier.Notifier
Notifiers func() []notifier.Notifier
notifierHeaders map[string]string
rw *remotewrite.Client
Rw remotewrite.RWClient
previouslySentSeriesToRWMu sync.Mutex
// previouslySentSeriesToRW stores series sent to RW on previous iteration
// PreviouslySentSeriesToRW stores series sent to RW on previous iteration
// map[ruleID]map[ruleLabels][]prompb.Label
// where `ruleID` is ID of the Rule within a Group
// and `ruleLabels` is []prompb.Label marshalled to a string
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
PreviouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
}
// execConcurrently executes rules concurrently if concurrency>1
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
res := make(chan error, len(rules))
if concurrency == 1 {
@ -505,14 +624,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
sem := make(chan struct{}, concurrency)
go func() {
wg := sync.WaitGroup{}
for _, rule := range rules {
for _, r := range rules {
sem <- struct{}{}
wg.Add(1)
go func(r Rule) {
res <- e.exec(ctx, r, ts, resolveDuration, limit)
<-sem
wg.Done()
}(rule)
}(r)
}
wg.Wait()
close(res)
@ -530,10 +649,10 @@ var (
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
)
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
execTotal.Inc()
tss, err := rule.Exec(ctx, ts, limit)
tss, err := r.exec(ctx, ts, limit)
if err != nil {
if errors.Is(err, context.Canceled) {
// the context can be cancelled on graceful shutdown
@ -541,17 +660,17 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return nil
}
execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
return fmt.Errorf("rule %q: failed to execute: %w", r, err)
}
if e.rw != nil {
if e.Rw != nil {
pushToRW := func(tss []prompbmarshal.TimeSeries) error {
var lastErr error
for _, ts := range tss {
remoteWriteTotal.Inc()
if err := e.rw.Push(ts); err != nil {
if err := e.Rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err)
lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
}
}
return lastErr
@ -560,13 +679,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return err
}
staleSeries := e.getStaleSeries(rule, tss, ts)
staleSeries := e.getStaleSeries(r, tss, ts)
if err := pushToRW(staleSeries); err != nil {
return err
}
}
ar, ok := rule.(*AlertingRule)
ar, ok := r.(*AlertingRule)
if !ok {
return nil
}
@ -578,11 +697,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
wg := sync.WaitGroup{}
errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers() {
for _, nt := range e.Notifiers() {
wg.Add(1)
go func(nt notifier.Notifier) {
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
}
wg.Done()
}(nt)
@ -592,7 +711,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
}
// getStaledSeries checks whether there are stale series from previously sent ones.
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
for _, ts := range tss {
// convert labels to strings so we can compare with previously sent series
@ -600,11 +719,11 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
ruleLabels[key] = ts.Labels
}
rID := rule.ID()
rID := r.ID()
var staleS []prompbmarshal.TimeSeries
// check whether there are series which disappeared and need to be marked as stale
e.previouslySentSeriesToRWMu.Lock()
for key, labels := range e.previouslySentSeriesToRW[rID] {
for key, labels := range e.PreviouslySentSeriesToRW[rID] {
if _, ok := ruleLabels[key]; ok {
continue
}
@ -613,7 +732,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
staleS = append(staleS, ss)
}
// set previous series to current
e.previouslySentSeriesToRW[rID] = ruleLabels
e.PreviouslySentSeriesToRW[rID] = ruleLabels
e.previouslySentSeriesToRWMu.Unlock()
return staleS
@ -631,14 +750,14 @@ func (e *executor) purgeStaleSeries(activeRules []Rule) {
for _, rule := range activeRules {
id := rule.ID()
prev, ok := e.previouslySentSeriesToRW[id]
prev, ok := e.PreviouslySentSeriesToRW[id]
if ok {
// keep previous series for staleness detection
newPreviouslySentSeriesToRW[id] = prev
}
}
e.previouslySentSeriesToRW = nil
e.previouslySentSeriesToRW = newPreviouslySentSeriesToRW
e.PreviouslySentSeriesToRW = nil
e.PreviouslySentSeriesToRW = newPreviouslySentSeriesToRW
e.previouslySentSeriesToRWMu.Unlock()
}

View file

@ -1,17 +1,22 @@
package main
package rule
import (
"context"
"fmt"
"math"
"os"
"reflect"
"sort"
"testing"
"time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
@ -20,7 +25,15 @@ import (
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true
SkipRandSleepOnGroupStart = true
}
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
fmt.Println("failed to load template for test")
os.Exit(1)
}
os.Exit(m.Run())
}
func TestUpdateWith(t *testing.T) {
@ -138,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Name: "test"}
qb := &fakeQuerier{}
qb := &datasource.FakeQuerier{}
for _, r := range tc.currentRules {
r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(qb, r))
@ -170,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
if got.ID() != want.ID() {
t.Fatalf("expected to have rule %q; got %q", want, got)
}
if err := compareRules(t, got, want); err != nil {
if err := CompareRules(t, got, want); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
@ -179,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
}
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
fs := &fakeQuerier{}
fn := &fakeNotifier{}
fs := &datasource.FakeQuerier{}
fn := &notifier.FakeNotifier{}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
const inst1, inst2, job = "foo", "bar", "baz"
@ -204,7 +231,7 @@ func TestGroupStart(t *testing.T) {
alert1.State = notifier.StateFiring
// add external label
alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1
// add service labels
@ -219,7 +246,7 @@ func TestGroupStart(t *testing.T) {
alert2.State = notifier.StateFiring
// add external label
alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
// add rule labels
alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2
// add service labels
@ -228,40 +255,40 @@ func TestGroupStart(t *testing.T) {
alert2.ID = hash(alert2.Labels)
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
fs.Add(m1)
fs.Add(m2)
go func() {
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
close(finished)
}()
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts()
gotAlerts := fn.GetAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
gotAlertsNum := fn.getCounter()
gotAlertsNum := fn.GetCounter()
if gotAlertsNum < len(expectedAlerts)*2 {
t.Fatalf("expected to receive at least %d alerts; got %d instead",
len(expectedAlerts)*2, gotAlertsNum)
}
// reset previous data
fs.reset()
fs.Reset()
// and set only one datapoint for response
fs.add(m1)
fs.Add(m1)
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts()
gotAlerts = fn.GetAlerts()
alert2.State = notifier.StateInactive
expectedAlerts = []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
g.close()
g.Close()
<-finished
}
@ -294,15 +321,15 @@ func TestResolveDuration(t *testing.T) {
func TestGetStaleSeries(t *testing.T) {
ts := time.Now()
e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
t.Helper()
var tss []prompbmarshal.TimeSeries
for _, l := range labels {
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
}
staleS := e.getStaleSeries(rule, tss, ts)
staleS := e.getStaleSeries(r, tss, ts)
if staleS == nil && expLabels == nil {
return
}
@ -387,7 +414,7 @@ func TestPurgeStaleSeries(t *testing.T) {
f := func(curRules, newRules, expStaleRules []Rule) {
t.Helper()
e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
// seed executor with series for
// current rules
@ -397,13 +424,13 @@ func TestPurgeStaleSeries(t *testing.T) {
e.purgeStaleSeries(newRules)
if len(e.previouslySentSeriesToRW) != len(expStaleRules) {
if len(e.PreviouslySentSeriesToRW) != len(expStaleRules) {
t.Fatalf("expected to get %d stale series, got %d",
len(expStaleRules), len(e.previouslySentSeriesToRW))
len(expStaleRules), len(e.PreviouslySentSeriesToRW))
}
for _, exp := range expStaleRules {
if _, ok := e.previouslySentSeriesToRW[exp.ID()]; !ok {
if _, ok := e.PreviouslySentSeriesToRW[exp.ID()]; !ok {
t.Fatalf("expected to have rule %d; got nil instead", exp.ID())
}
}
@ -438,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
}
func TestFaultyNotifier(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := newTestAlertingRule("instant", 0)
r.q = fq
fn := &fakeNotifier{}
fn := &notifier.FakeNotifier{}
e := &executor{
notifiers: func() []notifier.Notifier {
Notifiers: func() []notifier.Notifier {
return []notifier.Notifier{
&faultyNotifier{},
&notifier.FaultyNotifier{},
fn,
}
},
@ -464,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
tn := time.Now()
deadline := tn.Add(delay / 2)
for {
if fn.getCounter() > 0 {
if fn.GetCounter() > 0 {
return
}
if tn.After(deadline) {
@ -477,18 +504,18 @@ func TestFaultyNotifier(t *testing.T) {
}
func TestFaultyRW(t *testing.T) {
fq := &fakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq := &datasource.FakeQuerier{}
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := &RecordingRule{
Name: "test",
state: newRuleState(10),
q: fq,
state: &ruleState{entries: make([]StateEntry, 10)},
}
e := &executor{
rw: &remotewrite.Client{},
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
Rw: &remotewrite.Client{},
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
}
err := e.exec(context.Background(), r, time.Now(), 0, 10)
@ -498,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
}
func TestCloseWithEvalInterruption(t *testing.T) {
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
const delay = time.Second * 2
fq := &fakeQuerierWithDelay{delay: delay}
fq := &datasource.FakeQuerierWithDelay{Delay: delay}
const evalInterval = time.Millisecond
g := newGroup(groups[0], fq, evalInterval, nil)
g := NewGroup(groups[0], fq, evalInterval, nil)
go g.start(context.Background(), nil, nil, nil)
go g.Start(context.Background(), nil, nil, nil)
time.Sleep(evalInterval * 20)
go func() {
g.close()
g.Close()
}()
deadline := time.Tick(delay / 2)
@ -637,3 +679,81 @@ func TestGetPrometheusReqTimestamp(t *testing.T) {
}
}
}
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
// NewRecordingRule creates a new RecordingRule
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{
Type: group.Type,
RuleID: cfg.ID,
@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
}),
}
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil {
rr.state = newRuleState(*cfg.UpdateEntriesLimit)
} else {
rr.state = newRuleState(*ruleUpdateEntriesLimit)
entrySize = *cfg.UpdateEntriesLimit
}
if entrySize < 1 {
entrySize = 1
}
rr.state = &ruleState{
entries: make([]StateEntry, entrySize),
}
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 {
e := rr.state.getLast()
if e.err == nil {
if e.Err == nil {
return 0
}
return 1
@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 {
e := rr.state.getLast()
return float64(e.samples)
return float64(e.Samples)
})
return rr
}
// Close unregisters rule metrics
func (rr *RecordingRule) Close() {
// close unregisters rule metrics
func (rr *RecordingRule) close() {
rr.metrics.errors.Unregister()
rr.metrics.samples.Unregister()
}
// ExecRange executes recording rule on the given time range similarly to Exec.
// execRange executes recording rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil {
return nil, err
@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
return tss, nil
}
// Exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
// exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now()
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
curState := ruleStateEntry{
time: start,
at: ts,
duration: time.Since(start),
samples: len(res.Data),
seriesFetched: res.SeriesFetched,
curl: requestToCurl(req),
curState := StateEntry{
Time: start,
At: ts,
Duration: time.Since(start),
Samples: len(res.Data),
SeriesFetched: res.SeriesFetched,
Curl: requestToCurl(req),
}
defer func() {
@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
}()
if err != nil {
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.err
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.Err
}
qMetrics := res.Data
numSeries := len(qMetrics)
if limit > 0 && numSeries > limit {
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.err
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.Err
}
duplicates := make(map[string]struct{}, len(qMetrics))
@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
ts := rr.toTimeSeries(r)
key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok {
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.err
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.Err
}
duplicates[key] = struct{}{}
tss = append(tss, ts)
@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
return newTimeSeries(m.Values, m.Timestamps, labels)
}
// UpdateWith copies all significant fields.
func (rr *RecordingRule) UpdateWith(r Rule) error {
// updateWith copies all significant fields.
func (rr *RecordingRule) updateWith(r Rule) error {
nr, ok := r.(*RecordingRule)
if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
rr.q = nr.q
return nil
}
// ToAPI returns Rule's representation in form
// of APIRule
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
return r
}

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"context"
@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
Name: "job:foo",
Labels: map[string]string{
"source": "test",
}},
},
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tc.rule.state = newRuleState(10)
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0)
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo",
@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
fq.add(tc.metrics...)
fq := &datasource.FakeQuerier{}
fq.Add(tc.metrics...)
tc.rule.q = fq
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil {
t.Fatalf("unexpected Exec err: %s", err)
}
@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
}
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
"source": "test_limit",
}}
var err error
for _, testCase := range testCases {
fq := &fakeQuerier{}
fq.add(testMetrics...)
fq := &datasource.FakeQuerier{}
fq.Add(testMetrics...)
rule.q = fq
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit)
_, err = rule.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
@ -216,17 +219,16 @@ func TestRecordingRuleLimit(t *testing.T) {
func TestRecordingRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{
Name: "job:foo",
state: newRuleState(10),
Labels: map[string]string{
"job": "test",
},
state: &ruleState{entries: make([]StateEntry, 10)},
}
fq := &fakeQuerier{}
fq := &datasource.FakeQuerier{}
expErr := "connection reset by peer"
fq.setErr(errors.New(expErr))
fq.SetErr(errors.New(expErr))
rr.q = fq
_, err := rr.Exec(context.TODO(), time.Now(), 0)
_, err := rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
}
fq.reset()
fq.Reset()
// add metrics which differs only by `job` label
// which will be overridden by rule
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), time.Now(), 0)
_, err = rr.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}

174
app/vmalert/rule/rule.go Normal file
View file

@ -0,0 +1,174 @@
package rule
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// execRange executes the rule on the given time range.
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// updateWith performs modification of current Rule
// with fields of the given Rule.
updateWith(Rule) error
// close performs the shutdown procedures for rule
// such as metrics unregister
close()
}
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []StateEntry
cur int
}
// StateEntry stores rule's execution states
type StateEntry struct {
// stores last moment of time rule.Exec was called
Time time.Time
// stores the timesteamp with which rule.Exec was called
At time.Time
// stores the duration of the last rule.Exec call
Duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
Err error
// stores the number of samples returned during
// the last evaluation
Samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
SeriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
Curl string
}
// GetLastEntry returns latest stateEntry of rule
func GetLastEntry(r Rule) StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getLast()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getLast()
}
return StateEntry{}
}
// GetRuleStateSize returns size of rule stateEntry
func GetRuleStateSize(r Rule) int {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.size()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.size()
}
return 0
}
// GetAllRuleState returns rule entire stateEntries
func GetAllRuleState(r Rule) []StateEntry {
if rule, ok := r.(*AlertingRule); ok {
return rule.state.getAll()
}
if rule, ok := r.(*RecordingRule); ok {
return rule.state.getAll()
}
return []StateEntry{}
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getLast() StateEntry {
s.RLock()
defer s.RUnlock()
if len(s.entries) == 0 {
return StateEntry{}
}
return s.entries[s.cur]
}
func (s *ruleState) getAll() []StateEntry {
entries := make([]StateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.Time.IsZero() || !e.At.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e StateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
var err error
var tss []prompbmarshal.TimeSeries
for i := 0; i < replayRuleRetryAttempts; i++ {
tss, err = r.execRange(context.Background(), start, end)
if err == nil {
break
}
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
time.Sleep(time.Second)
}
if err != nil { // means all attempts failed
return 0, err
}
if len(tss) < 1 {
return 0, nil
}
var n int
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
return n, fmt.Errorf("remote write failure: %s", err)
}
n += len(ts.Samples)
}
return n, nil
}

View file

@ -0,0 +1,81 @@
package rule
import (
"sync"
"testing"
"time"
)
func TestRule_state(t *testing.T) {
stateEntriesN := 20
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, stateEntriesN)}}
e := r.state.getLast()
if !e.At.IsZero() {
t.Fatalf("expected entry to be zero")
}
now := time.Now()
r.state.add(StateEntry{At: now})
e = r.state.getLast()
if e.At != now {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, now)
}
time.Sleep(time.Millisecond)
now2 := time.Now()
r.state.add(StateEntry{At: now2})
e = r.state.getLast()
if e.At != now2 {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, now2)
}
if len(r.state.getAll()) != 2 {
t.Fatalf("expected for state to have 2 entries only; got %d",
len(r.state.getAll()),
)
}
var last time.Time
for i := 0; i < stateEntriesN*2; i++ {
last = time.Now()
r.state.add(StateEntry{At: last})
}
e = r.state.getLast()
if e.At != last {
t.Fatalf("expected entry at %v to be equal to %v",
e.At, last)
}
if len(r.state.getAll()) != stateEntriesN {
t.Fatalf("expected for state to have %d entries only; got %d",
stateEntriesN, len(r.state.getAll()),
)
}
}
// TestRule_stateConcurrent supposed to test concurrent
// execution of state updates.
// Should be executed with -race flag
func TestRule_stateConcurrent(_ *testing.T) {
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, 20)}}
const workers = 50
const iterations = 100
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
r.state.add(StateEntry{At: time.Now()})
r.state.getAll()
r.state.getLast()
}
}()
}
wg.Wait()
}

View file

@ -1,239 +1,18 @@
package main
package rule
import (
"context"
"fmt"
"net/http"
"reflect"
"sort"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
type fakeQuerier struct {
sync.Mutex
metrics []datasource.Metric
err error
}
func (fq *fakeQuerier) setErr(err error) {
fq.Lock()
fq.err = err
fq.Unlock()
}
func (fq *fakeQuerier) reset() {
fq.Lock()
fq.err = nil
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fq
}
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fq.Query(ctx, q, time.Now())
return req, err
}
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) (datasource.Result, *http.Request, error) {
fq.Lock()
defer fq.Unlock()
if fq.err != nil {
return datasource.Result{}, nil, fq.err
}
cp := make([]datasource.Metric, len(fq.metrics))
copy(cp, fq.metrics)
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithRegistry struct {
sync.Mutex
registry map[string][]datasource.Metric
}
func (fqr *fakeQuerierWithRegistry) set(key string, metrics ...datasource.Metric) {
fqr.Lock()
if fqr.registry == nil {
fqr.registry = make(map[string][]datasource.Metric)
}
fqr.registry[key] = metrics
fqr.Unlock()
}
func (fqr *fakeQuerierWithRegistry) reset() {
fqr.Lock()
fqr.registry = nil
fqr.Unlock()
}
func (fqr *fakeQuerierWithRegistry) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqr
}
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
req, _, err := fqr.Query(ctx, q, time.Now())
return req, err
}
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (datasource.Result, *http.Request, error) {
fqr.Lock()
defer fqr.Unlock()
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
metrics, ok := fqr.registry[expr]
if !ok {
return datasource.Result{}, req, nil
}
cp := make([]datasource.Metric, len(metrics))
copy(cp, metrics)
return datasource.Result{Data: cp}, req, nil
}
type fakeQuerierWithDelay struct {
fakeQuerier
delay time.Duration
}
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (datasource.Result, *http.Request, error) {
timer := time.NewTimer(fqd.delay)
select {
case <-ctx.Done():
case <-timer.C:
}
return fqd.fakeQuerier.Query(ctx, expr, ts)
}
func (fqd *fakeQuerierWithDelay) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqd
}
type fakeNotifier struct {
sync.Mutex
alerts []notifier.Alert
// records number of received alerts in total
counter int
}
func (*fakeNotifier) Close() {}
func (*fakeNotifier) Addr() string { return "" }
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert, _ map[string]string) error {
fn.Lock()
defer fn.Unlock()
fn.counter += len(alerts)
fn.alerts = alerts
return nil
}
func (fn *fakeNotifier) getCounter() int {
fn.Lock()
defer fn.Unlock()
return fn.counter
}
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}
type faultyNotifier struct {
fakeNotifier
}
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert, _ map[string]string) error {
d, ok := ctx.Deadline()
if ok {
time.Sleep(time.Until(d))
}
return fmt.Errorf("send failed")
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Values = values
for i := range values {
m.Timestamps = append(m.Timestamps, int64(i))
}
return m
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
var ls []prompbmarshal.Label
for i := 0; i < len(labels); i += 2 {
ls = append(ls, prompbmarshal.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return ls
}
func compareGroups(t *testing.T, a, b *Group) {
t.Helper()
if a.Name != b.Name {
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
}
if a.File != b.File {
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
}
if a.Interval != b.Interval {
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
}
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if a.ID() != b.ID() {
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
}
if err := compareRules(t, want, got); err != nil {
t.Fatalf("comparison error: %s", err)
}
}
}
func compareRules(t *testing.T, a, b Rule) error {
// CompareRules is a test helper func for other tests
func CompareRules(t *testing.T, a, b Rule) error {
t.Helper()
switch v := a.(type) {
case *AlertingRule:
@ -287,6 +66,50 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
return nil
}
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
t.Helper()
m := metricWithLabels(t, labels...)
m.Values = values
for i := range values {
m.Timestamps = append(m.Timestamps, int64(i))
}
return m
}
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
for i := 0; i < len(labels); i += 2 {
m.Labels = append(m.Labels, datasource.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return m
}
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
t.Helper()
if len(labels) == 0 || len(labels)%2 != 0 {
t.Fatalf("expected to get even number of labels")
}
var ls []prompbmarshal.Label
for i := 0; i < len(labels); i += 2 {
ls = append(ls, prompbmarshal.Label{
Name: labels[i],
Value: labels[i+1],
})
}
return ls
}
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
t.Helper()
if len(a) != len(b) {

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"fmt"

View file

@ -1,4 +1,4 @@
package main
package rule
import (
"net/http"

View file

@ -1,100 +0,0 @@
package main
import (
"sync"
"testing"
"time"
)
func TestRule_stateDisabled(t *testing.T) {
state := newRuleState(-1)
e := state.getLast()
if !e.at.IsZero() {
t.Fatalf("expected entry to be zero")
}
state.add(ruleStateEntry{at: time.Now()})
state.add(ruleStateEntry{at: time.Now()})
state.add(ruleStateEntry{at: time.Now()})
if len(state.getAll()) != 1 {
// state should store at least one update at any circumstances
t.Fatalf("expected for state to have %d entries; got %d",
1, len(state.getAll()),
)
}
}
func TestRule_state(t *testing.T) {
stateEntriesN := 20
state := newRuleState(stateEntriesN)
e := state.getLast()
if !e.at.IsZero() {
t.Fatalf("expected entry to be zero")
}
now := time.Now()
state.add(ruleStateEntry{at: now})
e = state.getLast()
if e.at != now {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now)
}
time.Sleep(time.Millisecond)
now2 := time.Now()
state.add(ruleStateEntry{at: now2})
e = state.getLast()
if e.at != now2 {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, now2)
}
if len(state.getAll()) != 2 {
t.Fatalf("expected for state to have 2 entries only; got %d",
len(state.getAll()),
)
}
var last time.Time
for i := 0; i < stateEntriesN*2; i++ {
last = time.Now()
state.add(ruleStateEntry{at: last})
}
e = state.getLast()
if e.at != last {
t.Fatalf("expected entry at %v to be equal to %v",
e.at, last)
}
if len(state.getAll()) != stateEntriesN {
t.Fatalf("expected for state to have %d entries only; got %d",
stateEntriesN, len(state.getAll()),
)
}
}
// TestRule_stateConcurrent supposed to test concurrent
// execution of state updates.
// Should be executed with -race flag
func TestRule_stateConcurrent(_ *testing.T) {
state := newRuleState(20)
const workers = 50
const iterations = 100
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
state.add(ruleStateEntry{at: time.Now()})
state.getAll()
state.getLast()
}
}()
}
wg.Wait()
}

View file

@ -10,6 +10,7 @@ import (
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@ -143,38 +144,32 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
}
}
const (
paramGroupID = "group_id"
paramAlertID = "alert_id"
paramRuleID = "rule_id"
)
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
func (rh *requestHandler) getRule(r *http.Request) (apiRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
}
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 64)
if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
}
rule, err := rh.m.RuleAPI(groupID, ruleID)
obj, err := rh.m.ruleAPI(groupID, ruleID)
if err != nil {
return APIRule{}, errResponse(err, http.StatusNotFound)
return apiRule{}, errResponse(err, http.StatusNotFound)
}
return rule, nil
return obj, nil
}
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
func (rh *requestHandler) getAlert(r *http.Request) (*apiAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
}
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 0)
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramAlertID, err)
}
a, err := rh.m.AlertAPI(groupID, alertID)
a, err := rh.m.alertAPI(groupID, alertID)
if err != nil {
return nil, errResponse(err, http.StatusNotFound)
}
@ -184,17 +179,17 @@ func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
type listGroupsResponse struct {
Status string `json:"status"`
Data struct {
Groups []APIGroup `json:"groups"`
Groups []apiGroup `json:"groups"`
} `json:"data"`
}
func (rh *requestHandler) groups() []APIGroup {
func (rh *requestHandler) groups() []apiGroup {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
groups := make([]APIGroup, 0)
groups := make([]apiGroup, 0)
for _, g := range rh.m.groups {
groups = append(groups, g.toAPI())
groups = append(groups, groupToAPI(g))
}
// sort list of alerts for deterministic output
@ -221,35 +216,35 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
type listAlertsResponse struct {
Status string `json:"status"`
Data struct {
Alerts []*APIAlert `json:"alerts"`
Alerts []*apiAlert `json:"alerts"`
} `json:"data"`
}
func (rh *requestHandler) groupAlerts() []GroupAlerts {
func (rh *requestHandler) groupAlerts() []groupAlerts {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
var groupAlerts []GroupAlerts
var gAlerts []groupAlerts
for _, g := range rh.m.groups {
var alerts []*APIAlert
var alerts []*apiAlert
for _, r := range g.Rules {
a, ok := r.(*AlertingRule)
a, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
alerts = append(alerts, a.AlertsToAPI()...)
alerts = append(alerts, ruleToAPIAlert(a)...)
}
if len(alerts) > 0 {
groupAlerts = append(groupAlerts, GroupAlerts{
Group: g.toAPI(),
gAlerts = append(gAlerts, groupAlerts{
Group: groupToAPI(g),
Alerts: alerts,
})
}
}
sort.Slice(groupAlerts, func(i, j int) bool {
return groupAlerts[i].Group.Name < groupAlerts[j].Group.Name
sort.Slice(gAlerts, func(i, j int) bool {
return gAlerts[i].Group.Name < gAlerts[j].Group.Name
})
return groupAlerts
return gAlerts
}
func (rh *requestHandler) listAlerts() ([]byte, error) {
@ -257,14 +252,14 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"}
lr.Data.Alerts = make([]*APIAlert, 0)
lr.Data.Alerts = make([]*apiAlert, 0)
for _, g := range rh.m.groups {
for _, r := range g.Rules {
a, ok := r.(*AlertingRule)
a, ok := r.(*rule.AlertingRule)
if !ok {
continue
}
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsToAPI()...)
lr.Data.Alerts = append(lr.Data.Alerts, ruleToAPIAlert(a)...)
}
}

View file

@ -38,7 +38,7 @@ btn-primary
{% endif %}
{% endfunc %}
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
{% func ListGroups(r *http.Request, originGroups []apiGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
{%code
@ -46,9 +46,9 @@ btn-primary
rOk := make(map[string]int)
rNotOk := make(map[string]int)
rNoMatch := make(map[string]int)
var groups []APIGroup
var groups []apiGroup
for _, g := range originGroups {
var rules []APIRule
var rules []apiRule
for _, r := range g.Rules {
if r.LastError != "" {
rNotOk[g.ID]++
@ -166,7 +166,7 @@ btn-primary
{% endfunc %}
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
{% func ListAlerts(r *http.Request, groupAlerts []groupAlerts) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
{% if len(groupAlerts) > 0 %}
@ -183,7 +183,7 @@ btn-primary
</div>
{%code
var keys []string
alertsByRule := make(map[string][]*APIAlert)
alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID)
@ -310,7 +310,7 @@ btn-primary
{% endfunc %}
{% func Alert(r *http.Request, alert *APIAlert) %}
{% func Alert(r *http.Request, alert *apiAlert) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
@ -397,7 +397,7 @@ btn-primary
{% endfunc %}
{% func RuleDetails(r *http.Request, rule APIRule) %}
{% func RuleDetails(r *http.Request, rule apiRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code
@ -416,9 +416,9 @@ btn-primary
var seriesFetchedEnabled bool
var seriesFetchedWarning bool
for _, u := range rule.Updates {
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0{
if *u.SeriesFetched == 0 && u.Samples == 0{
seriesFetchedWarning = true
}
}
@ -537,23 +537,23 @@ btn-primary
<tbody>
{% for _, u := range rule.Updates %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.Time.Format(time.RFC3339) %}</span>
</td>
<td class="text-center">{%d u.samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.seriesFetched != nil %}{%d *u.seriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
<td class="text-center">{%d u.Samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.SeriesFetched != nil %}{%d *u.SeriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.Duration.Seconds() %}s</td>
<td class="text-center">{%s u.At.Format(time.RFC3339) %}</td>
<td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.curl %}</textarea>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.Curl %}</textarea>
</td>
</tr>
</li>
{% if u.err != nil %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
{% if u.Err != nil %}
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}">
<span class="alert-danger">{%v u.err %}</span>
<span class="alert-danger">{%v u.Err %}</span>
</td>
</tr>
{% endif %}
@ -582,7 +582,7 @@ btn-primary
<span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span>
{% endfunc %}
{% func seriesFetchedWarn(r APIRule) %}
{% func seriesFetchedWarn(r apiRule) %}
{% if isNoMatch(r) %}
<svg xmlns="http://www.w3.org/2000/svg"
data-bs-toggle="tooltip"
@ -596,7 +596,7 @@ btn-primary
{% endfunc %}
{%code
func isNoMatch (r APIRule) bool {
func isNoMatch (r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
}
%}

View file

@ -196,7 +196,7 @@ func buttonActive(filter, expValue string) string {
}
//line app/vmalert/web.qtpl:41
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []APIGroup) {
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:41
qw422016.N().S(`
`)
@ -216,9 +216,9 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
rOk := make(map[string]int)
rNotOk := make(map[string]int)
rNoMatch := make(map[string]int)
var groups []APIGroup
var groups []apiGroup
for _, g := range originGroups {
var rules []APIRule
var rules []apiRule
for _, r := range g.Rules {
if r.LastError != "" {
rNotOk[g.ID]++
@ -610,7 +610,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
}
//line app/vmalert/web.qtpl:166
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []APIGroup) {
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:166
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:166
@ -621,7 +621,7 @@ func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups [
}
//line app/vmalert/web.qtpl:166
func ListGroups(r *http.Request, originGroups []APIGroup) string {
func ListGroups(r *http.Request, originGroups []apiGroup) string {
//line app/vmalert/web.qtpl:166
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:166
@ -636,7 +636,7 @@ func ListGroups(r *http.Request, originGroups []APIGroup) string {
}
//line app/vmalert/web.qtpl:169
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:169
qw422016.N().S(`
`)
@ -712,7 +712,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
`)
//line app/vmalert/web.qtpl:185
var keys []string
alertsByRule := make(map[string][]*APIAlert)
alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID)
@ -891,7 +891,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
}
//line app/vmalert/web.qtpl:255
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:255
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:255
@ -902,7 +902,7 @@ func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []
}
//line app/vmalert/web.qtpl:255
func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) string {
func ListAlerts(r *http.Request, groupAlerts []groupAlerts) string {
//line app/vmalert/web.qtpl:255
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:255
@ -1091,7 +1091,7 @@ func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Tar
}
//line app/vmalert/web.qtpl:313
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:313
qw422016.N().S(`
`)
@ -1274,7 +1274,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
}
//line app/vmalert/web.qtpl:397
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:397
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:397
@ -1285,7 +1285,7 @@ func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
}
//line app/vmalert/web.qtpl:397
func Alert(r *http.Request, alert *APIAlert) string {
func Alert(r *http.Request, alert *apiAlert) string {
//line app/vmalert/web.qtpl:397
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:397
@ -1300,7 +1300,7 @@ func Alert(r *http.Request, alert *APIAlert) string {
}
//line app/vmalert/web.qtpl:400
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule) {
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:400
qw422016.N().S(`
`)
@ -1331,9 +1331,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
var seriesFetchedEnabled bool
var seriesFetchedWarning bool
for _, u := range rule.Updates {
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0 {
if *u.SeriesFetched == 0 && u.Samples == 0 {
seriesFetchedWarning = true
}
}
@ -1587,7 +1587,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`
<tr`)
//line app/vmalert/web.qtpl:540
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:540
qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:540
@ -1597,13 +1597,13 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
<td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">`)
//line app/vmalert/web.qtpl:542
qw422016.E().S(u.time.Format(time.RFC3339))
qw422016.E().S(u.Time.Format(time.RFC3339))
//line app/vmalert/web.qtpl:542
qw422016.N().S(`</span>
</td>
<td class="text-center">`)
//line app/vmalert/web.qtpl:544
qw422016.N().D(u.samples)
qw422016.N().D(u.Samples)
//line app/vmalert/web.qtpl:544
qw422016.N().S(`</td>
`)
@ -1612,9 +1612,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
//line app/vmalert/web.qtpl:545
qw422016.N().S(`<td class="text-center">`)
//line app/vmalert/web.qtpl:545
if u.seriesFetched != nil {
if u.SeriesFetched != nil {
//line app/vmalert/web.qtpl:545
qw422016.N().D(*u.seriesFetched)
qw422016.N().D(*u.SeriesFetched)
//line app/vmalert/web.qtpl:545
}
//line app/vmalert/web.qtpl:545
@ -1625,18 +1625,18 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`
<td class="text-center">`)
//line app/vmalert/web.qtpl:546
qw422016.N().FPrec(u.duration.Seconds(), 3)
qw422016.N().FPrec(u.Duration.Seconds(), 3)
//line app/vmalert/web.qtpl:546
qw422016.N().S(`s</td>
<td class="text-center">`)
//line app/vmalert/web.qtpl:547
qw422016.E().S(u.at.Format(time.RFC3339))
qw422016.E().S(u.At.Format(time.RFC3339))
//line app/vmalert/web.qtpl:547
qw422016.N().S(`</td>
<td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">`)
//line app/vmalert/web.qtpl:549
qw422016.E().S(u.curl)
qw422016.E().S(u.Curl)
//line app/vmalert/web.qtpl:549
qw422016.N().S(`</textarea>
</td>
@ -1644,12 +1644,12 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
</li>
`)
//line app/vmalert/web.qtpl:553
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:553
qw422016.N().S(`
<tr`)
//line app/vmalert/web.qtpl:554
if u.err != nil {
if u.Err != nil {
//line app/vmalert/web.qtpl:554
qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:554
@ -1671,7 +1671,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`">
<span class="alert-danger">`)
//line app/vmalert/web.qtpl:556
qw422016.E().V(u.err)
qw422016.E().V(u.Err)
//line app/vmalert/web.qtpl:556
qw422016.N().S(`</span>
</td>
@ -1697,7 +1697,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
}
//line app/vmalert/web.qtpl:563
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule) {
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:563
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:563
@ -1708,7 +1708,7 @@ func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule)
}
//line app/vmalert/web.qtpl:563
func RuleDetails(r *http.Request, rule APIRule) string {
func RuleDetails(r *http.Request, rule apiRule) string {
//line app/vmalert/web.qtpl:563
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:563
@ -1853,7 +1853,7 @@ func badgeStabilizing() string {
}
//line app/vmalert/web.qtpl:585
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:585
qw422016.N().S(`
`)
@ -1879,7 +1879,7 @@ func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
}
//line app/vmalert/web.qtpl:596
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:596
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:596
@ -1890,7 +1890,7 @@ func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
}
//line app/vmalert/web.qtpl:596
func seriesFetchedWarn(r APIRule) string {
func seriesFetchedWarn(r apiRule) string {
//line app/vmalert/web.qtpl:596
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:596
@ -1905,6 +1905,6 @@ func seriesFetchedWarn(r APIRule) string {
}
//line app/vmalert/web.qtpl:599
func isNoMatch(r APIRule) bool {
func isNoMatch(r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
}

View file

@ -1,6 +1,7 @@
package main
import (
"context"
"encoding/json"
"fmt"
"net/http"
@ -9,32 +10,29 @@ import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
)
func TestHandler(t *testing.T) {
ar := &AlertingRule{
Name: "alert",
alerts: map[uint64]*notifier.Alert{
0: {State: notifier.StateFiring},
},
state: newRuleState(10),
}
ar.state.add(ruleStateEntry{
time: time.Now(),
at: time.Now(),
samples: 10,
fq := &datasource.FakeQuerier{}
fq.Add(datasource.Metric{
Values: []float64{1}, Timestamps: []int64{0},
})
rr := &RecordingRule{
Name: "record",
state: newRuleState(10),
}
g := &Group{
g := &rule.Group{
Name: "group",
Rules: []Rule{ar, rr},
Concurrency: 1,
}
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g
ar := rule.NewAlertingRule(fq, g, config.Rule{ID: 0, Alert: "alert"})
rr := rule.NewRecordingRule(fq, g, config.Rule{ID: 1, Record: "record"})
g.Rules = []rule.Rule{ar, rr}
g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, nil, time.Time{})
m := &manager{groups: map[uint64]*rule.Group{
g.ID(): g,
}}
rh := &requestHandler{m: m}
getResp := func(url string, to interface{}, code int) {
@ -70,13 +68,13 @@ func TestHandler(t *testing.T) {
})
t.Run("/vmalert/rule", func(t *testing.T) {
a := ar.ToAPI()
a := ruleToAPI(ar)
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
r := rr.ToAPI()
r := ruleToAPI(rr)
getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200)
})
t.Run("/vmalert/alert", func(t *testing.T) {
alerts := ar.AlertsToAPI()
alerts := ruleToAPIAlert(ar)
for _, a := range alerts {
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
}
@ -103,14 +101,14 @@ func TestHandler(t *testing.T) {
}
})
t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) {
expAlert := ar.newAlertAPI(*ar.alerts[0])
alert := &APIAlert{}
expAlert := newAlertAPI(ar, ar.GetAlerts()[0])
alert := &apiAlert{}
getResp(ts.URL+"/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
}
alert = &APIAlert{}
alert = &apiAlert{}
getResp(ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
@ -148,7 +146,7 @@ func TestHandler(t *testing.T) {
}
func TestEmptyResponse(t *testing.T) {
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*Group)}}
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*rule.Group)}}
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) }))
defer ts.Close()
@ -201,7 +199,7 @@ func TestEmptyResponse(t *testing.T) {
}
})
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*Group{0: {Name: "test"}}}}
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*rule.Group{0: {Name: "test"}}}}
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) })
t.Run("empty group /api/v1/rules", func(t *testing.T) {

View file

@ -2,13 +2,28 @@ package main
import (
"fmt"
"net/url"
"sort"
"strconv"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
)
// APIAlert represents a notifier.AlertingRule state
const (
// ParamGroupID is group id key in url parameter
paramGroupID = "group_id"
// ParamAlertID is alert id key in url parameter
paramAlertID = "alert_id"
// ParamRuleID is rule id key in url parameter
paramRuleID = "rule_id"
)
// apiAlert represents a notifier.AlertingRule state
// for WEB view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIAlert struct {
type apiAlert struct {
State string `json:"state"`
Name string `json:"name"`
Value string `json:"value"`
@ -38,24 +53,24 @@ type APIAlert struct {
}
// WebLink returns a link to the alert which can be used in UI.
func (aa *APIAlert) WebLink() string {
func (aa *apiAlert) WebLink() string {
return fmt.Sprintf("alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
}
// APILink returns a link to the alert's JSON representation.
func (aa *APIAlert) APILink() string {
func (aa *apiAlert) APILink() string {
return fmt.Sprintf("api/v1/alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
}
// APIGroup represents Group for WEB view
// apiGroup represents Group for web view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIGroup struct {
type apiGroup struct {
// Name is the group name as present in the config
Name string `json:"name"`
// Rules contains both recording and alerting rules
Rules []APIRule `json:"rules"`
Rules []apiRule `json:"rules"`
// Interval is the Group's evaluation interval in float seconds as present in the file.
Interval float64 `json:"interval"`
// LastEvaluation is the timestamp of the last time the Group was executed
@ -81,15 +96,15 @@ type APIGroup struct {
Labels map[string]string `json:"labels,omitempty"`
}
// GroupAlerts represents a group of alerts for WEB view
type GroupAlerts struct {
Group APIGroup
Alerts []*APIAlert
// groupAlerts represents a group of alerts for WEB view
type groupAlerts struct {
Group apiGroup
Alerts []*apiAlert
}
// APIRule represents a Rule for WEB view
// apiRule represents a Rule for web view
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIRule struct {
type apiRule struct {
// State must be one of these under following scenarios
// "pending": at least 1 alert in the rule in pending state and no other alert in firing ruleState.
// "firing": at least 1 alert in the rule in firing state.
@ -111,7 +126,7 @@ type APIRule struct {
// LastEvaluation is the timestamp of the last time the rule was executed
LastEvaluation time.Time `json:"lastEvaluation"`
// Alerts is the list of all the alerts in this rule that are currently pending or firing
Alerts []*APIAlert `json:"alerts,omitempty"`
Alerts []*apiAlert `json:"alerts,omitempty"`
// Health is the health of rule evaluation.
// It MUST be one of "ok", "err", "unknown"
Health string `json:"health"`
@ -138,11 +153,206 @@ type APIRule struct {
// MaxUpdates is the max number of recorded ruleStateEntry objects
MaxUpdates int `json:"max_updates_entries"`
// Updates contains the ordered list of recorded ruleStateEntry objects
Updates []ruleStateEntry `json:"-"`
Updates []rule.StateEntry `json:"-"`
}
// WebLink returns a link to the alert which can be used in UI.
func (ar APIRule) WebLink() string {
func (ar apiRule) WebLink() string {
return fmt.Sprintf("rule?%s=%s&%s=%s",
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
}
func ruleToAPI(r interface{}) apiRule {
if ar, ok := r.(*rule.AlertingRule); ok {
return alertingToAPI(ar)
}
if rr, ok := r.(*rule.RecordingRule); ok {
return recordingToAPI(rr)
}
return apiRule{}
}
func recordingToAPI(rr *rule.RecordingRule) apiRule {
lastState := rule.GetLastEntry(rr)
r := apiRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.Time,
EvaluationTime: lastState.Duration.Seconds(),
Health: "ok",
LastSamples: lastState.Samples,
LastSeriesFetched: lastState.SeriesFetched,
MaxUpdates: rule.GetRuleStateSize(rr),
Updates: rule.GetAllRuleState(rr),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.Err != nil {
r.LastError = lastState.Err.Error()
r.Health = "err"
}
return r
}
// alertingToAPI returns Rule representation in form of apiRule
func alertingToAPI(ar *rule.AlertingRule) apiRule {
lastState := rule.GetLastEntry(ar)
r := apiRule{
Type: "alerting",
DatasourceType: ar.Type.String(),
Name: ar.Name,
Query: ar.Expr,
Duration: ar.For.Seconds(),
KeepFiringFor: ar.KeepFiringFor.Seconds(),
Labels: ar.Labels,
Annotations: ar.Annotations,
LastEvaluation: lastState.Time,
EvaluationTime: lastState.Duration.Seconds(),
Health: "ok",
State: "inactive",
Alerts: ruleToAPIAlert(ar),
LastSamples: lastState.Samples,
LastSeriesFetched: lastState.SeriesFetched,
MaxUpdates: rule.GetRuleStateSize(ar),
Updates: rule.GetAllRuleState(ar),
Debug: ar.Debug,
// encode as strings to avoid rounding in JSON
ID: fmt.Sprintf("%d", ar.ID()),
GroupID: fmt.Sprintf("%d", ar.GroupID),
}
if lastState.Err != nil {
r.LastError = lastState.Err.Error()
r.Health = "err"
}
// satisfy apiRule.State logic
if len(r.Alerts) > 0 {
r.State = notifier.StatePending.String()
stateFiring := notifier.StateFiring.String()
for _, a := range r.Alerts {
if a.State == stateFiring {
r.State = stateFiring
break
}
}
}
return r
}
// ruleToAPIAlert generates list of apiAlert objects from existing alerts
func ruleToAPIAlert(ar *rule.AlertingRule) []*apiAlert {
var alerts []*apiAlert
for _, a := range ar.GetAlerts() {
if a.State == notifier.StateInactive {
continue
}
alerts = append(alerts, newAlertAPI(ar, a))
}
return alerts
}
// alertToAPI generates apiAlert object from alert by its id(hash)
func alertToAPI(ar *rule.AlertingRule, id uint64) *apiAlert {
a := ar.GetAlert(id)
if a == nil {
return nil
}
return newAlertAPI(ar, a)
}
// NewAlertAPI creates apiAlert for notifier.Alert
func newAlertAPI(ar *rule.AlertingRule, a *notifier.Alert) *apiAlert {
aa := &apiAlert{
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
RuleID: fmt.Sprintf("%d", ar.RuleID),
Name: a.Name,
Expression: ar.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
State: a.State.String(),
ActiveAt: a.ActiveAt,
Restored: a.Restored,
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
}
if alertURLGeneratorFn != nil {
aa.SourceLink = alertURLGeneratorFn(*a)
}
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
aa.Stabilizing = true
}
return aa
}
func groupToAPI(g *rule.Group) apiGroup {
g = g.DeepCopy()
ag := apiGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
ag.Rules = make([]apiRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, ruleToAPI(r))
}
return ag
}
func urlValuesToStrings(values url.Values) []string {
if len(values) < 1 {
return nil
}
keys := make([]string, 0, len(values))
for k := range values {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
params := values[k]
for _, v := range params {
res = append(res, fmt.Sprintf("%s=%s", k, v))
}
}
return res
}
func headersToStrings(headers map[string]string) []string {
if len(headers) < 1 {
return nil
}
keys := make([]string, 0, len(headers))
for k := range headers {
keys = append(keys, k)
}
sort.Strings(keys)
var res []string
for _, k := range keys {
v := headers[k]
res = append(res, fmt.Sprintf("%s: %s", k, v))
}
return res
}

View file

@ -0,0 +1,23 @@
package main
import (
"testing"
)
func TestUrlValuesToStrings(t *testing.T) {
mapQueryParams := map[string][]string{
"param1": {"param1"},
"param2": {"anotherparam"},
}
expectedRes := []string{"param1=param1", "param2=anotherparam"}
res := urlValuesToStrings(mapQueryParams)
if len(res) != len(expectedRes) {
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
}
for ind, val := range expectedRes {
if val != res[ind] {
t.Errorf("Expected %v; but got %v", val, res[ind])
}
}
}

View file

@ -33,7 +33,7 @@ var (
)
var (
saCfgReloaderStopCh = make(chan struct{})
saCfgReloaderStopCh chan struct{}
saCfgReloaderWG sync.WaitGroup
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
@ -62,6 +62,8 @@ func CheckStreamAggrConfig() error {
//
// MustStopStreamAggr must be called when stream aggr is no longer needed.
func InitStreamAggr() {
saCfgReloaderStopCh = make(chan struct{})
if *streamAggrConfig == "" {
return
}

View file

@ -1,11 +1,11 @@
---
sort: 20
weight: 20
sort: 29
weight: 29
title: Articles
menu:
docs:
parent: "victoriametrics"
weight: 20
parent: 'victoriametrics'
weight: 29
aliases:
- /Articles.html
---

View file

@ -1,11 +1,11 @@
---
sort: 23
weight: 23
sort: 32
weight: 32
title: VictoriaMetrics best practices
menu:
docs:
parent: "victoriametrics"
weight: 23
parent: 'victoriametrics'
weight: 32
aliases:
- /BestPractices.html
---

View file

@ -1,11 +1,11 @@
---
sort: 16
weight: 16
sort: 25
weight: 25
title: CHANGELOG
menu:
docs:
parent: "victoriametrics"
weight: 16
parent: 'victoriametrics'
weight: 25
aliases:
- /CHANGELOG.html
---
@ -44,8 +44,10 @@ The sandbox cluster installation is running under the constant load generated by
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve repeated VMUI page load times by enabling caching of static js and css at web browser side according to [these recommendations](https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show information about lines with bigger values at the top of the legend under the graph in order to simplify graph analysis.
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reduce vertical space usage, so more information is visible on the screen without scrolling.
* FEATURE: [vmalert-tool](https://docs.victoriametrics.com/#vmalert-tool): add `unittest` command to run unittest for alerting and recording rules. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4789) for details.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087) for details.
* BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.

View file

@ -1,11 +1,11 @@
---
sort: 19
weight: 19
sort: 28
weight: 28
title: CHANGELOG for the year 2020
menu:
docs:
parent: "victoriametrics"
weight: 19
parent: 'victoriametrics'
weight: 28
aliases:
- /CHANGELOG.html
---

View file

@ -1,11 +1,11 @@
---
sort: 18
weight: 18
sort: 27
weight: 27
title: CHANGELOG for the year 2021
menu:
docs:
parent: "victoriametrics"
weight: 18
parent: 'victoriametrics'
weight: 27
aliases:
- /CHANGELOG.html
---

View file

@ -1,11 +1,11 @@
---
sort: 17
weight: 17
sort: 26
weight: 26
title: CHANGELOG for the year 2022
menu:
docs:
parent: "victoriametrics"
weight: 17
parent: 'victoriametrics'
weight: 26
aliases:
- /CHANGELOG.html
---

View file

@ -1,11 +1,11 @@
---
sort: 12
weight: 12
sort: 21
weight: 21
title: Case studies and talks
menu:
docs:
parent: "victoriametrics"
weight: 12
parent: 'victoriametrics'
weight: 21
aliases:
- /CaseStudies.html
---

View file

@ -1,11 +1,11 @@
---
sort: 15
weight: 15
sort: 24
weight: 24
title: FAQ
menu:
docs:
parent: "victoriametrics"
weight: 15
parent: 'victoriametrics'
weight: 24
aliases:
- /FAQ.html
---

View file

@ -1,11 +1,11 @@
---
sort: 14
weight: 14
sort: 23
weight: 23
title: MetricsQL
menu:
docs:
parent: "victoriametrics"
weight: 14
parent: 'victoriametrics'
weight: 23
aliases:
- /ExtendedPromQL.html
- /MetricsQL.html

View file

@ -1,11 +1,11 @@
---
sort: 22
weight: 22
sort: 31
weight: 31
title: VictoriaMetrics Cluster Per Tenant Statistic
menu:
docs:
parent: "victoriametrics"
weight: 22
parent: 'victoriametrics'
weight: 31
aliases:
- /PerTenantStatistic.html
---

View file

@ -1,11 +1,11 @@
---
sort: 13
weight: 13
sort: 22
weight: 22
title: Quick start
menu:
docs:
parent: "victoriametrics"
weight: 13
parent: 'victoriametrics'
weight: 22
aliases:
- /Quick-Start.html
---

View file

@ -1,11 +1,11 @@
---
sort: 21
weight: 21
sort: 30
weight: 30
title: Release process guidance
menu:
docs:
parent: "victoriametrics"
weight: 21
parent: 'victoriametrics'
weight: 30
aliases:
- /Release-Guide.html
---

View file

@ -1,11 +1,11 @@
---
sort: 26
weight: 26
sort: 35
weight: 35
title: Troubleshooting
menu:
docs:
parent: "victoriametrics"
weight: 26
parent: 'victoriametrics'
weight: 35
aliases:
- /Troubleshooting.html
---

View file

@ -4,7 +4,7 @@ weight: 99
title: VictoriaMetrics Enterprise
menu:
docs:
parent: "victoriametrics"
parent: 'victoriametrics'
weight: 99
aliases:
- /enterprise.html

View file

@ -1,11 +1,11 @@
---
sort: 25
weight: 25
sort: 34
weight: 34
title: Key concepts
menu:
docs:
parent: "victoriametrics"
weight: 25
parent: 'victoriametrics'
weight: 34
aliases:
- /keyConcepts.html
---

View file

@ -1,11 +1,11 @@
---
sort: 28
weight: 28
sort: 37
weight: 37
title: Relabeling cookbook
menu:
docs:
parent: "victoriametrics"
weight: 25
parent: 'victoriametrics'
weight: 37
aliases:
- /relabeling.html
---

View file

@ -1,11 +1,11 @@
---
sort: 27
weight: 27
sort: 36
weight: 36
title: Prometheus service discovery
menu:
docs:
parent: "victoriametrics"
weight: 27
parent: 'victoriametrics'
weight: 36
aliases:
- /sd_configs.html
---

View file

@ -4,7 +4,7 @@ weight: 98
title: Streaming aggregation
menu:
docs:
parent: "victoriametrics"
parent: 'victoriametrics'
weight: 98
aliases:
- /stream-aggregation.html

View file

@ -1,11 +1,11 @@
---
sort: 24
weight: 24
sort: 33
weight: 33
title: VictoriaMetrics API examples
menu:
docs:
parent: "victoriametrics"
weight: 24
parent: 'victoriametrics'
weight: 33
---
# VictoriaMetrics API examples

253
docs/vmalert-tool.md Normal file
View file

@ -0,0 +1,253 @@
---
sort: 12
weight: 12
menu:
docs:
parent: 'victoriametrics'
weight: 12
title: vmalert-tool
---
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated by order.
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test files to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for Metricsql expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group name to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the Metricsql expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `alerts.yaml`
```
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

View file

@ -765,6 +765,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.

View file

@ -4,7 +4,7 @@ weight: 11
title: vmanomaly
menu:
docs:
parent: "victoriametrics"
parent: 'victoriametrics'
weight: 11
aliases:
- /vmanomaly.html

View file

@ -53,3 +53,11 @@ func ParseDuration(s string) (time.Duration, error) {
}
return time.Duration(ms) * time.Millisecond, nil
}
// ParseTime returns time for pd.
func (pd *Duration) ParseTime() time.Time {
if pd == nil {
return time.Time{}
}
return time.UnixMilli(pd.Duration().Milliseconds())
}