vmalert-tool: implement unittest (#4789)

1. split package rule under /app/vmalert, expose needed objects
2. add vmalert-tool with unittest subcmd

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2945
This commit is contained in:
Haleygo 2023-10-13 19:54:33 +08:00 committed by GitHub
parent 98a5007d32
commit dc28196237
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
74 changed files with 3997 additions and 1683 deletions

View file

@ -28,7 +28,8 @@ all: \
vmauth-prod \ vmauth-prod \
vmbackup-prod \ vmbackup-prod \
vmrestore-prod \ vmrestore-prod \
vmctl-prod vmctl-prod \
vmalert-tool-prod
clean: clean:
rm -rf bin/* rm -rf bin/*
@ -40,7 +41,8 @@ publish: package-base \
publish-vmauth \ publish-vmauth \
publish-vmbackup \ publish-vmbackup \
publish-vmrestore \ publish-vmrestore \
publish-vmctl publish-vmctl \
publish-vmalert-tool
package: \ package: \
package-victoria-metrics \ package-victoria-metrics \
@ -50,7 +52,8 @@ package: \
package-vmauth \ package-vmauth \
package-vmbackup \ package-vmbackup \
package-vmrestore \ package-vmrestore \
package-vmctl package-vmctl \
package-vmalert-tool
vmutils: \ vmutils: \
vmagent \ vmagent \
@ -58,7 +61,8 @@ vmutils: \
vmauth \ vmauth \
vmbackup \ vmbackup \
vmrestore \ vmrestore \
vmctl vmctl \
vmalert-tool
vmutils-pure: \ vmutils-pure: \
vmagent-pure \ vmagent-pure \
@ -66,7 +70,8 @@ vmutils-pure: \
vmauth-pure \ vmauth-pure \
vmbackup-pure \ vmbackup-pure \
vmrestore-pure \ vmrestore-pure \
vmctl-pure vmctl-pure \
vmalert-tool-pure
vmutils-linux-amd64: \ vmutils-linux-amd64: \
vmagent-linux-amd64 \ vmagent-linux-amd64 \
@ -74,7 +79,8 @@ vmutils-linux-amd64: \
vmauth-linux-amd64 \ vmauth-linux-amd64 \
vmbackup-linux-amd64 \ vmbackup-linux-amd64 \
vmrestore-linux-amd64 \ vmrestore-linux-amd64 \
vmctl-linux-amd64 vmctl-linux-amd64 \
vmalert-tool-linux-amd64
vmutils-linux-arm64: \ vmutils-linux-arm64: \
vmagent-linux-arm64 \ vmagent-linux-arm64 \
@ -82,7 +88,8 @@ vmutils-linux-arm64: \
vmauth-linux-arm64 \ vmauth-linux-arm64 \
vmbackup-linux-arm64 \ vmbackup-linux-arm64 \
vmrestore-linux-arm64 \ vmrestore-linux-arm64 \
vmctl-linux-arm64 vmctl-linux-arm64 \
vmalert-tool-linux-arm64
vmutils-linux-arm: \ vmutils-linux-arm: \
vmagent-linux-arm \ vmagent-linux-arm \
@ -90,7 +97,8 @@ vmutils-linux-arm: \
vmauth-linux-arm \ vmauth-linux-arm \
vmbackup-linux-arm \ vmbackup-linux-arm \
vmrestore-linux-arm \ vmrestore-linux-arm \
vmctl-linux-arm vmctl-linux-arm \
vmalert-tool-linux-arm
vmutils-linux-386: \ vmutils-linux-386: \
vmagent-linux-386 \ vmagent-linux-386 \
@ -98,7 +106,8 @@ vmutils-linux-386: \
vmauth-linux-386 \ vmauth-linux-386 \
vmbackup-linux-386 \ vmbackup-linux-386 \
vmrestore-linux-386 \ vmrestore-linux-386 \
vmctl-linux-386 vmctl-linux-386 \
vmalert-tool-linux-386
vmutils-linux-ppc64le: \ vmutils-linux-ppc64le: \
vmagent-linux-ppc64le \ vmagent-linux-ppc64le \
@ -106,7 +115,8 @@ vmutils-linux-ppc64le: \
vmauth-linux-ppc64le \ vmauth-linux-ppc64le \
vmbackup-linux-ppc64le \ vmbackup-linux-ppc64le \
vmrestore-linux-ppc64le \ vmrestore-linux-ppc64le \
vmctl-linux-ppc64le vmctl-linux-ppc64le \
vmalert-tool-linux-ppc64le
vmutils-darwin-amd64: \ vmutils-darwin-amd64: \
vmagent-darwin-amd64 \ vmagent-darwin-amd64 \
@ -114,7 +124,8 @@ vmutils-darwin-amd64: \
vmauth-darwin-amd64 \ vmauth-darwin-amd64 \
vmbackup-darwin-amd64 \ vmbackup-darwin-amd64 \
vmrestore-darwin-amd64 \ vmrestore-darwin-amd64 \
vmctl-darwin-amd64 vmctl-darwin-amd64 \
vmalert-tool-darwin-amd64
vmutils-darwin-arm64: \ vmutils-darwin-arm64: \
vmagent-darwin-arm64 \ vmagent-darwin-arm64 \
@ -122,7 +133,8 @@ vmutils-darwin-arm64: \
vmauth-darwin-arm64 \ vmauth-darwin-arm64 \
vmbackup-darwin-arm64 \ vmbackup-darwin-arm64 \
vmrestore-darwin-arm64 \ vmrestore-darwin-arm64 \
vmctl-darwin-arm64 vmctl-darwin-arm64 \
vmalert-tool-darwin-arm64
vmutils-freebsd-amd64: \ vmutils-freebsd-amd64: \
vmagent-freebsd-amd64 \ vmagent-freebsd-amd64 \
@ -130,7 +142,8 @@ vmutils-freebsd-amd64: \
vmauth-freebsd-amd64 \ vmauth-freebsd-amd64 \
vmbackup-freebsd-amd64 \ vmbackup-freebsd-amd64 \
vmrestore-freebsd-amd64 \ vmrestore-freebsd-amd64 \
vmctl-freebsd-amd64 vmctl-freebsd-amd64 \
vmalert-tool-freebsd-amd64
vmutils-openbsd-amd64: \ vmutils-openbsd-amd64: \
vmagent-openbsd-amd64 \ vmagent-openbsd-amd64 \
@ -138,7 +151,8 @@ vmutils-openbsd-amd64: \
vmauth-openbsd-amd64 \ vmauth-openbsd-amd64 \
vmbackup-openbsd-amd64 \ vmbackup-openbsd-amd64 \
vmrestore-openbsd-amd64 \ vmrestore-openbsd-amd64 \
vmctl-openbsd-amd64 vmctl-openbsd-amd64 \
vmalert-tool-openbsd-amd64
vmutils-windows-amd64: \ vmutils-windows-amd64: \
vmagent-windows-amd64 \ vmagent-windows-amd64 \
@ -146,7 +160,8 @@ vmutils-windows-amd64: \
vmauth-windows-amd64 \ vmauth-windows-amd64 \
vmbackup-windows-amd64 \ vmbackup-windows-amd64 \
vmrestore-windows-amd64 \ vmrestore-windows-amd64 \
vmctl-windows-amd64 vmctl-windows-amd64 \
vmalert-tool-windows-amd64
victoria-metrics-crossbuild: \ victoria-metrics-crossbuild: \
victoria-metrics-linux-386 \ victoria-metrics-linux-386 \
@ -342,7 +357,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \ vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \ vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \ vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
cd bin && \ cd bin && \
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \ tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \ vmagent-$(GOOS)-$(GOARCH)-prod \
@ -351,6 +367,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \ vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \ vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \ vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \ && sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
vmagent-$(GOOS)-$(GOARCH)-prod \ vmagent-$(GOOS)-$(GOARCH)-prod \
vmalert-$(GOOS)-$(GOARCH)-prod \ vmalert-$(GOOS)-$(GOARCH)-prod \
@ -358,6 +375,7 @@ release-vmutils-goos-goarch: \
vmbackup-$(GOOS)-$(GOARCH)-prod \ vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \ vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod \ vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt | sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \ cd bin && rm -rf \
vmagent-$(GOOS)-$(GOARCH)-prod \ vmagent-$(GOOS)-$(GOARCH)-prod \
@ -365,7 +383,8 @@ release-vmutils-goos-goarch: \
vmauth-$(GOOS)-$(GOARCH)-prod \ vmauth-$(GOOS)-$(GOARCH)-prod \
vmbackup-$(GOOS)-$(GOARCH)-prod \ vmbackup-$(GOOS)-$(GOARCH)-prod \
vmrestore-$(GOOS)-$(GOARCH)-prod \ vmrestore-$(GOOS)-$(GOARCH)-prod \
vmctl-$(GOOS)-$(GOARCH)-prod vmctl-$(GOOS)-$(GOARCH)-prod \
vmalert-tool-$(GOOS)-$(GOARCH)-prod
release-vmutils-windows-goarch: \ release-vmutils-windows-goarch: \
vmagent-windows-$(GOARCH)-prod \ vmagent-windows-$(GOARCH)-prod \
@ -373,7 +392,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod \ vmauth-windows-$(GOARCH)-prod \
vmbackup-windows-$(GOARCH)-prod \ vmbackup-windows-$(GOARCH)-prod \
vmrestore-windows-$(GOARCH)-prod \ vmrestore-windows-$(GOARCH)-prod \
vmctl-windows-$(GOARCH)-prod vmctl-windows-$(GOARCH)-prod \
vmalert-tool-windows-$(GOARCH)-prod
cd bin && \ cd bin && \
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \ zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \ vmagent-windows-$(GOARCH)-prod.exe \
@ -382,6 +402,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \ vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \ vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \ vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \ && sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
vmagent-windows-$(GOARCH)-prod.exe \ vmagent-windows-$(GOARCH)-prod.exe \
vmalert-windows-$(GOARCH)-prod.exe \ vmalert-windows-$(GOARCH)-prod.exe \
@ -389,6 +410,7 @@ release-vmutils-windows-goarch: \
vmbackup-windows-$(GOARCH)-prod.exe \ vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \ vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe \ vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe \
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt > vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
cd bin && rm -rf \ cd bin && rm -rf \
vmagent-windows-$(GOARCH)-prod.exe \ vmagent-windows-$(GOARCH)-prod.exe \
@ -396,7 +418,8 @@ release-vmutils-windows-goarch: \
vmauth-windows-$(GOARCH)-prod.exe \ vmauth-windows-$(GOARCH)-prod.exe \
vmbackup-windows-$(GOARCH)-prod.exe \ vmbackup-windows-$(GOARCH)-prod.exe \
vmrestore-windows-$(GOARCH)-prod.exe \ vmrestore-windows-$(GOARCH)-prod.exe \
vmctl-windows-$(GOARCH)-prod.exe vmctl-windows-$(GOARCH)-prod.exe \
vmalert-tool-windows-$(GOARCH)-prod.exe
pprof-cpu: pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE) go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
@ -514,3 +537,4 @@ docs-sync:
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs

103
app/vmalert-tool/Makefile Normal file
View file

@ -0,0 +1,103 @@
# All these commands must run from repository root.
#
# Every rule below is a thin wrapper: it sets APP_NAME=vmalert-tool (plus
# target-specific variables) and re-invokes $(MAKE) on a shared rule that is
# not defined in this file.
# Local builds via the shared app-local rule; the -race variant additionally
# passes RACE=-race.
vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) app-local
vmalert-tool-race:
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
# *-prod targets delegate to the shared app-via-docker* rules, one per
# supported OS/architecture pair.
vmalert-tool-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker
vmalert-tool-pure-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
vmalert-tool-linux-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
vmalert-tool-linux-arm-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
vmalert-tool-linux-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
vmalert-tool-linux-ppc64le-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
vmalert-tool-linux-386-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
vmalert-tool-darwin-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
vmalert-tool-darwin-arm64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
vmalert-tool-freebsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
vmalert-tool-openbsd-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
vmalert-tool-windows-amd64-prod:
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
# package-* targets delegate to the shared package-via-docker* rules.
package-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) package-via-docker
package-vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
package-vmalert-tool-amd64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
package-vmalert-tool-arm:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
package-vmalert-tool-arm64:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
package-vmalert-tool-ppc64le:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
package-vmalert-tool-386:
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
# Delegates to the shared publish-via-docker rule.
publish-vmalert-tool:
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
# Targets without the -prod suffix set GOOS/GOARCH and CGO_ENABLED explicitly
# and delegate to app-local-goos-goarch. Only linux/amd64 sets CGO_ENABLED=1;
# every other platform sets CGO_ENABLED=0.
# NOTE(review): linux-s390x has no matching -prod target in this file.
vmalert-tool-linux-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
vmalert-tool-linux-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-linux-ppc64le:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
vmalert-tool-linux-s390x:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
vmalert-tool-linux-386:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-darwin-arm64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
vmalert-tool-freebsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
vmalert-tool-openbsd-amd64:
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
# Windows goes through the dedicated app-local-windows-goarch rule; only GOARCH
# is passed here.
vmalert-tool-windows-amd64:
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
# Delegates to the shared app-local-pure rule.
vmalert-tool-pure:
APP_NAME=vmalert-tool $(MAKE) app-local-pure

244
app/vmalert-tool/README.md Normal file
View file

@ -0,0 +1,244 @@
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated by order.
# Not all groups need to be listed here. Groups that are not listed are evaluated in the order they are defined in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test files to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for MetricsQL expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression should be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the MetricsQL expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `rules.yaml` contains the alerting and recording rules.
With `rules.yaml` in the same directory as `test.yaml`, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `rules.yaml`
```
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

54
app/vmalert-tool/main.go Normal file
View file

@ -0,0 +1,54 @@
package main
import (
"fmt"
"log"
"os"
"time"
"github.com/urfave/cli/v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
)
// main builds the vmalert-tool command-line application, runs it against the
// process arguments and logs the total run time on success.
// Any error returned by the CLI (including a failed unittest run) is fatal.
func main() {
	startedAt := time.Now()

	// Flags accepted by the `unittest` sub-command.
	unittestFlags := []cli.Flag{
		&cli.StringSliceFlag{
			Name:     "files",
			Usage:    "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
			Required: true,
		},
		&cli.BoolFlag{
			Name:     "disableAlertgroupLabel",
			Usage:    "disable adding group's Name as label to generated alerts and time series.",
			Required: false,
		},
	}

	// runUnittest converts the boolean "failed" result into an error for the CLI.
	runUnittest := func(c *cli.Context) error {
		files := c.StringSlice("files")
		disableGroupLabel := c.Bool("disableAlertgroupLabel")
		if unittest.UnitTest(files, disableGroupLabel) {
			return fmt.Errorf("unittest failed")
		}
		return nil
	}

	unittestCmd := &cli.Command{
		Name:      "unittest",
		Usage:     "Run unittest for alerting and recording rules.",
		UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
		Flags:     unittestFlags,
		Action:    runUnittest,
	}

	tool := &cli.App{
		Name:      "vmalert-tool",
		Usage:     "VMAlert command-line tool",
		UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
		Version:   buildinfo.Version,
		Commands:  []*cli.Command{unittestCmd},
	}

	if err := tool.Run(os.Args); err != nil {
		log.Fatalln(err)
	}
	log.Printf("Total time: %v", time.Since(startedAt))
}

View file

@ -0,0 +1,19 @@
package unittest
import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
// alertTestCase holds alert_rule_test cases defined in test file
type alertTestCase struct {
// EvalTime is read from the `eval_time` key of the test file.
EvalTime *promutils.Duration `yaml:"eval_time"`
// GroupName is read from the `groupname` key: the rule group under test.
GroupName string `yaml:"groupname"`
// Alertname is read from the `alertname` key: the alert under test.
Alertname string `yaml:"alertname"`
// ExpAlerts is read from the `exp_alerts` key: the alerts expected for Alertname.
ExpAlerts []expAlert `yaml:"exp_alerts"`
}
// expAlert holds exp_alerts defined in test file
type expAlert struct {
// ExpLabels is read from the `exp_labels` key (label name -> value).
ExpLabels map[string]string `yaml:"exp_labels"`
// ExpAnnotations is read from the `exp_annotations` key (annotation name -> value).
ExpAnnotations map[string]string `yaml:"exp_annotations"`
}

View file

@ -0,0 +1,182 @@
package unittest
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strconv"
"strings"
"time"
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// series holds input_series defined in the test file
type series struct {
// Series is the series selector text; writeInputSeries parses it with metricsql.Parse.
Series string `yaml:"series"`
// Values is the space-separated values expression; it is expanded by parseInputValue.
Values string `yaml:"values"`
}
// sequenceValue is an omittable value in a sequence of time series values.
type sequenceValue struct {
// Value is the sample value; it is set to decimal.StaleNaN for a `stale` item.
Value float64
// Omitted marks a missing sample (`_`): writeInputSeries writes no sample for it
// but still advances the timestamp by one interval.
Omitted bool
}
// httpWrite POSTs the data read from r to address with an empty Content-Type.
// A transport-level failure is reported via logger.Fatalf; the response status
// and payload are not inspected.
func httpWrite(address string, r io.Reader) {
	response, postErr := http.Post(address, "", r)
	if postErr != nil {
		logger.Fatalf("failed to send to storage: %v", postErr)
	}
	// Nothing is read from the response; the body is closed right away.
	response.Body.Close()
}
// writeInputSeries sends the input series to dst and flushes vmstorage.
//
// Each entry of input is parsed as a MetricsQL series selector and its `values`
// expression is expanded with parseInputValue. The n-th expanded value gets the
// timestamp startStamp + n*interval; omitted values ("_") produce no sample but
// still advance the timestamp. All series are packed into a single write request,
// compressed, posted to dst via httpWrite and then flushed with
// vmstorage.Storage.DebugFlush so they are visible to subsequent queries.
//
// An error is returned when a series or its values cannot be parsed, when the
// series is not a plain metric selector, or when the request cannot be compressed.
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
	r := testutil.WriteRequest{}
	for _, data := range input {
		expr, err := metricsql.Parse(data.Series)
		if err != nil {
			return fmt.Errorf("failed to parse series %s: %w", data.Series, err)
		}
		promvals, err := parseInputValue(data.Values, true)
		if err != nil {
			return fmt.Errorf("failed to parse input series value %s: %w", data.Values, err)
		}
		metricExpr, ok := expr.(*metricsql.MetricExpr)
		if !ok {
			// err is nil at this point, so report the actual expression type instead.
			return fmt.Errorf("failed to parse series %s to metric expr: got unsupported expression type %T", data.Series, expr)
		}
		if len(metricExpr.LabelFilterss) == 0 {
			// Guard the LabelFilterss[0] access below.
			return fmt.Errorf("failed to parse series %s to metric expr: no label filters found", data.Series)
		}

		samples := make([]testutil.Sample, 0, len(promvals))
		ts := startStamp
		for _, v := range promvals {
			if !v.Omitted {
				samples = append(samples, testutil.Sample{
					Timestamp: ts.UnixMilli(),
					Value:     v.Value,
				})
			}
			// The timestamp advances for omitted values too, leaving a gap.
			ts = ts.Add(interval.Duration())
		}

		filters := metricExpr.LabelFilterss[0]
		ls := make([]testutil.Label, 0, len(filters))
		for _, filter := range filters {
			ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
		}
		r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
	}

	data, err := testutil.Compress(r)
	if err != nil {
		return fmt.Errorf("failed to compress data: %w", err)
	}
	// write input series to vm
	httpWrite(dst, bytes.NewBuffer(data))
	vmstorage.Storage.DebugFlush()
	return nil
}
// inputValueChunkRe splits a single `values` item into chunks of the form
// "<optional non-digit><digits><optional non-digit>", e.g. "1+1x4" becomes
// "1+", "1x", "4". It is compiled once instead of on every (recursive) call.
var inputValueChunkRe = regexp.MustCompile(`\D?\d*\D?`)

// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
//
// input is a whitespace-separated list of items. Leading, trailing and repeated
// whitespace is ignored; an input without any item is an error. Supported items:
//
//   - "stale"  - a single stale sample (decimal.StaleNaN);
//   - "_"      - a single omitted sample;
//   - "a"      - a single integer value;
//   - "_xn"    - n omitted samples;
//   - "axn"    - shorthand for "a+0xn" when origin is true;
//   - "a+bxn"  - a, a+b, ..., a+n*b;
//   - "a-bxn"  - a, a-b, ..., a-n*b.
//
// origin must be true for user-provided input. The function calls itself with
// origin=false to expand the "bxn" part of an item into 0, b, ..., n*b.
//
// The chunking regexp only recognises runs of decimal digits, so items with a
// decimal point such as "1.5" are rejected with an error.
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
	items := strings.Fields(input)
	if len(items) == 0 {
		return nil, fmt.Errorf("unsupported input %s", input)
	}

	var res []sequenceValue
	for _, item := range items {
		if item == "stale" {
			res = append(res, sequenceValue{Value: decimal.StaleNaN})
			continue
		}
		vals := inputValueChunkRe.FindAllString(item, -1)
		switch len(vals) {
		case 1:
			// A bare value: either a gap or a single number.
			if vals[0] == "_" {
				res = append(res, sequenceValue{Omitted: true})
				continue
			}
			v, err := strconv.ParseFloat(vals[0], 64)
			if err != nil {
				return nil, err
			}
			res = append(res, sequenceValue{Value: v})
		case 2:
			// "<p1><option><v2>", e.g. "1x4" or "_x3".
			p1 := vals[0][:len(vals[0])-1]
			v2, err := strconv.ParseInt(vals[1], 10, 64)
			if err != nil {
				return nil, err
			}
			option := vals[0][len(vals[0])-1]
			switch option {
			case '+':
				v1, err := strconv.ParseFloat(p1, 64)
				if err != nil {
					return nil, err
				}
				res = append(res, sequenceValue{Value: v1 + float64(v2)})
			case 'x':
				switch {
				case p1 == "_":
					// "_xn" yields n gaps; "_x0" keeps yielding a single gap.
					count := v2
					if count == 0 {
						count = 1
					}
					for i := int64(0); i < count; i++ {
						res = append(res, sequenceValue{Omitted: true})
					}
				case v2 >= 0:
					v1, err := strconv.ParseFloat(p1, 64)
					if err != nil {
						return nil, err
					}
					if origin && v1 != 0 {
						// User-level "axn" repeats a, i.e. it is shorthand for "a+0xn".
						expanded, err := parseInputValue(fmt.Sprintf("%s+0x%s", p1, vals[1]), false)
						if err != nil {
							return nil, err
						}
						res = append(res, expanded...)
						break
					}
					// Internal expansion of the increment part: 0, v1, ..., v2*v1.
					for i := int64(0); i <= v2; i++ {
						res = append(res, sequenceValue{Value: v1 * float64(i)})
					}
				}
			default:
				// %q prints the operator as a character rather than as binary digits.
				return nil, fmt.Errorf("got invalid operation %q", option)
			}
		case 3:
			// "<p1><option><b>x<n>", e.g. "1+1x4": expand "bxn" first, then offset by p1.
			r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
			if err != nil {
				return nil, err
			}
			p1 := vals[0][:len(vals[0])-1]
			v1, err := strconv.ParseFloat(p1, 64)
			if err != nil {
				return nil, err
			}
			option := vals[0][len(vals[0])-1]
			switch option {
			case '+':
				for _, r := range r1 {
					res = append(res, sequenceValue{Value: r.Value + v1})
				}
			case '-':
				for _, r := range r1 {
					res = append(res, sequenceValue{Value: v1 - r.Value})
				}
			default:
				// Any operator other than '+' used to be silently treated as '-'.
				return nil, fmt.Errorf("got invalid operation %q", option)
			}
		default:
			return nil, fmt.Errorf("unsupported input %s", input)
		}
	}
	return res, nil
}

View file

@ -0,0 +1,93 @@
package unittest
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
)
// TestParseInputValue feeds a table of `values` expressions to parseInputValue
// and verifies both the error outcome and the expanded sequence.
// Stale markers are compared with decimal.IsStaleNaN because NaN != NaN.
func TestParseInputValue(t *testing.T) {
	type testCase struct {
		input  string
		exp    []sequenceValue
		failed bool
	}
	table := []testCase{
		{input: "", failed: true},
		{input: "testfailed", failed: true},
		// stale doesn't support operations
		{input: "stalex3", failed: true},
		{input: "-4", exp: []sequenceValue{{Value: -4}}},
		{input: "_", exp: []sequenceValue{{Omitted: true}}},
		{input: "stale", exp: []sequenceValue{{Value: decimal.StaleNaN}}},
		{input: "-4x1", exp: []sequenceValue{{Value: -4}, {Value: -4}}},
		{input: "_x1", exp: []sequenceValue{{Omitted: true}}},
		{
			input: "1+1x4",
			exp:   []sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
		},
		{
			input: "2-1x4",
			exp:   []sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
		},
		{
			input: "1+1x1 _ -4 stale 3+20x1",
			exp: []sequenceValue{
				{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4},
				{Value: decimal.StaleNaN}, {Value: 3}, {Value: 23},
			},
		},
	}

	for _, tc := range table {
		got, err := parseInputValue(tc.input, true)
		gotFailed := err != nil
		if gotFailed != tc.failed {
			t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, gotFailed)
		}
		if len(got) != len(tc.exp) {
			t.Fatalf("expect %v, got %v", tc.exp, got)
		}
		for idx, want := range tc.exp {
			have := got[idx]
			if want.Omitted != have.Omitted {
				t.Fatalf("expect %v, got %v", tc.exp, got)
			}
			if want.Value == have.Value {
				continue
			}
			// Two stale markers are equal even though NaN never compares equal.
			bothStale := decimal.IsStaleNaN(want.Value) && decimal.IsStaleNaN(have.Value)
			if !bothStale {
				t.Fatalf("expect %v, got %v", tc.exp, got)
			}
		}
	}
}

View file

@ -0,0 +1,92 @@
package unittest
import (
"context"
"fmt"
"net/url"
"reflect"
"sort"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metricsql"
)
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
type metricsqlTestCase struct {
// Expr is read from the `expr` key: the expression passed to the querier.
Expr string `yaml:"expr"`
// EvalTime is read from the `eval_time` key; it determines the query timestamp.
EvalTime *promutils.Duration `yaml:"eval_time"`
// ExpSamples is read from the `exp_samples` key: the samples the query must return.
ExpSamples []expSample `yaml:"exp_samples"`
}
// expSample holds a single entry of exp_samples defined in test file
type expSample struct {
// Labels is the expected series in selector notation; it is parsed with
// metricsql.Parse when non-empty. An empty string means no expected labels.
Labels string `yaml:"labels"`
// Value is the expected sample value.
Value float64 `yaml:"value"`
}
// checkMetricsqlCase will check metricsql_expr_test cases
//
// For every case it runs mt.Expr at mt.EvalTime through a querier built from q
// (with `nocache=1` and `latency_offset=1ms` query params), converts the response
// and the expected samples into parsedSample lists with labels sorted by name,
// sorts both lists by labels and compares them with reflect.DeepEqual.
//
// It returns one error per failing case; a nil result means every case matched.
// A case is abandoned on the first problem found: query failure, a returned
// series without values, or an unparsable `labels` selector.
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
	queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
Outer:
	for _, mt := range cases {
		result, _, err := queries.Query(context.Background(), mt.Expr, mt.EvalTime.ParseTime())
		if err != nil {
			checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
				mt.EvalTime.Duration().String(), err))
			continue
		}
		var gotSamples []parsedSample
		for _, s := range result.Data {
			if len(s.Values) == 0 {
				// Guard the s.Values[0] access below instead of panicking.
				checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
					mt.EvalTime.Duration().String(), fmt.Errorf("got series without values in response")))
				continue Outer
			}
			sort.Slice(s.Labels, func(i, j int) bool {
				return s.Labels[i].Name < s.Labels[j].Name
			})
			gotSamples = append(gotSamples, parsedSample{
				Labels: s.Labels,
				Value:  s.Values[0],
			})
		}
		var expSamples []parsedSample
		for _, s := range mt.ExpSamples {
			expLb := datasource.Labels{}
			if s.Labels != "" {
				metricsqlExpr, err := metricsql.Parse(s.Labels)
				if err != nil {
					checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
						mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
					continue Outer
				}
				metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
				if !ok || len(metricsqlMetricExpr.LabelFilterss) == 0 {
					// The length check guards the LabelFilterss[0] access below.
					checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
						mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
					continue Outer
				}
				for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
					expLb = append(expLb, datasource.Label{
						Name:  l.Label,
						Value: l.Value,
					})
				}
			}
			sort.Slice(expLb, func(i, j int) bool {
				return expLb[i].Name < expLb[j].Name
			})
			expSamples = append(expSamples, parsedSample{
				Labels: expLb,
				Value:  s.Value,
			})
		}
		// sort.Slice requires a strict "less" function: it must return false for
		// equal elements, hence `< 0` rather than `<= 0`.
		sort.Slice(expSamples, func(i, j int) bool {
			return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) < 0
		})
		sort.Slice(gotSamples, func(i, j int) bool {
			return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) < 0
		})
		if !reflect.DeepEqual(expSamples, gotSamples) {
			checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
				mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
		}
	}
	return
}

View file

@ -0,0 +1,43 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,49 @@
rule_files:
- rules.yaml
tests:
- interval: 1m
name: "Failing test"
input_series:
- series: test
values: "0"
metricsql_expr_test:
- expr: test
eval_time: 0m
exp_samples:
- value: 0
labels: test
# will fail because there is no "Test" group or rule defined
alert_rule_test:
- eval_time: 0m
groupname: Test
alertname: Test
exp_alerts:
- exp_labels: {}
- interval: 1m
name: Failing alert test
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will fail because the rule is firing
- eval_time: 5m
groupname: group1
alertname: InstanceDown
exp_alerts: []
- interval: 1m
name: Failing alert test with missing groupname
input_series:
- series: 'up{job="test"}'
values: 0x10
alert_rule_test:
# will fail because groupname is missing
- eval_time: 5m
alertname: AlwaysFiring
exp_alerts: []

View file

@ -0,0 +1,30 @@
# can be executed successfully but will take more than 1 minute
# not included in unit test now
evaluation_interval: 100d
rule_files:
- rules.yaml
tests:
- interval: 1d
input_series:
- series: test
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
# We input series to 2024-01-01T00:00:00 here.
values: "0+1x19723"
metricsql_expr_test:
- expr: timestamp(test)
eval_time: 0m
exp_samples:
- value: 0
- expr: test
eval_time: 100d
exp_samples:
- labels: test
value: 100
- expr: timestamp(test)
eval_time: 19000d
exp_samples:
- value: 1641600000 # 19000d -> seconds.

View file

@ -0,0 +1,39 @@
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 1m
- name: group2
rules:
- record: t1
expr: test
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
- alert: SameAlertNameWithDifferentGroup
expr: absent(test)
for: 5m
- name: group3
rules:
- record: t2
expr: t1
- name: group4
rules:
- record: t3
expr: t1

View file

@ -0,0 +1,99 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
group_eval_order: ["group4", "group2", "group3"]
tests:
- interval: 1m
name: "basic test"
input_series:
- series: "test"
values: "_x5 1x5 _ stale"
alert_rule_test:
- eval_time: 1m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts:
- {}
- eval_time: 1m
groupname: group2
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
- eval_time: 6m
groupname: group1
alertname: SameAlertNameWithDifferentGroup
exp_alerts: []
metricsql_expr_test:
- expr: test
eval_time: 11m
exp_samples:
- labels: '{__name__="test"}'
value: 1
- expr: test
eval_time: 12m
exp_samples: []
- interval: 1m
name: "basic test2"
input_series:
- series: 'up{job="vmagent1", instance="localhost:9090"}'
values: "0+0x1440"
- series: "test"
values: "0+1x1440"
metricsql_expr_test:
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
eval_time: 4m
exp_samples:
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
value: 1
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
value: 1
- expr: t1
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t1", datacenter="dc-123"}'
- expr: t2
eval_time: 4m
exp_samples:
- value: 4
labels: '{__name__="t2", datacenter="dc-123"}'
- expr: t3
eval_time: 4m
exp_samples:
# t3 is 3 instead of 4 because group4 (which records t3) is evaluated before group2 (which records t1)
- value: 3
labels: '{__name__="t3", datacenter="dc-123"}'
alert_rule_test:
- eval_time: 10m
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent1
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: alerts
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,46 @@
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="vmagent2", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: vmagent2
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123

View file

@ -0,0 +1,83 @@
package unittest
import (
"fmt"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// parsedSample couples an already parsed label set with the sample value
// that belongs to it.
type parsedSample struct {
	Labels datasource.Labels
	Value  float64
}

// String renders the sample as "<labels> <value>". The value is formatted
// in 'E' notation using the smallest number of digits that represents it
// exactly.
func (ps *parsedSample) String() string {
	labels := ps.Labels.String()
	value := strconv.FormatFloat(ps.Value, 'E', -1, 64)
	return labels + " " + value
}
// parsedSamplesString renders all samples separated by ", ".
// An empty (or nil) slice is rendered as the literal "nil".
func parsedSamplesString(pss []parsedSample) string {
	if len(pss) == 0 {
		return "nil"
	}
	rendered := make([]string, len(pss))
	for i := range pss {
		rendered[i] = pss[i].String()
	}
	return strings.Join(rendered, ", ")
}
// labelAndAnnotation bundles the labels and the annotations of one alert.
type labelAndAnnotation struct {
	Labels      datasource.Labels
	Annotations datasource.Labels
}

// String renders the labels on a first line prefixed with "Labels:" and the
// annotations on a second line prefixed with "Annotations:".
func (la *labelAndAnnotation) String() string {
	var sb strings.Builder
	sb.WriteString("Labels:")
	sb.WriteString(la.Labels.String())
	sb.WriteString("\nAnnotations:")
	sb.WriteString(la.Annotations.String())
	return sb.String()
}
// labelsAndAnnotations is a sortable list of labelAndAnnotation; it provides
// the Len/Swap/Less methods required by sort.Sort.
type labelsAndAnnotations []labelAndAnnotation

// Len reports the number of entries.
func (la labelsAndAnnotations) Len() int {
	return len(la)
}

// Swap exchanges the entries at positions i and j.
func (la labelsAndAnnotations) Swap(i, j int) {
	tmp := la[i]
	la[i] = la[j]
	la[j] = tmp
}

// Less orders entries by their labels first and falls back to the
// annotations when the labels compare equal.
func (la labelsAndAnnotations) Less(i, j int) bool {
	left, right := la[i], la[j]
	if byLabels := datasource.LabelCompare(left.Labels, right.Labels); byLabels != 0 {
		return byLabels < 0
	}
	return datasource.LabelCompare(left.Annotations, right.Annotations) < 0
}
// String renders the collection as a bracketed, numbered list:
// every entry is introduced by "<index>:" and its text is indented via
// indentLines. An empty collection is rendered as "[]".
func (la labelsAndAnnotations) String() string {
	if len(la) == 0 {
		return "[]"
	}
	var sb strings.Builder
	sb.WriteString("[\n")
	for idx := range la {
		if idx > 0 {
			sb.WriteString(",\n")
		}
		fmt.Fprintf(&sb, "%d:", idx)
		sb.WriteString(indentLines("\n"+la[idx].String(), " "))
	}
	sb.WriteString("\n]")
	return sb.String()
}
// indentLines puts the given indent in front of every line of lines except
// the first one. Lines are separated by "\n"; the separators are preserved.
func indentLines(lines, indent string) string {
	parts := strings.Split(lines, "\n")
	// The first line is left untouched; only continuation lines are indented.
	for i := 1; i < len(parts); i++ {
		parts[i] = indent + parts[i]
	}
	return strings.Join(parts, "\n")
}

View file

@ -0,0 +1,443 @@
package unittest
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"reflect"
"sort"
"time"
"gopkg.in/yaml.v2"
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/VictoriaMetrics/metrics"
)
// Package-level state shared by the unit test runner.
var (
// storagePath is the storage data directory; UnitTest sets it to a
// directory below os.TempDir() and processFlags passes it to the
// storageDataPath flag.
storagePath string
// httpListenAddr is the address the embedded HTTP server listens on (see setUp).
httpListenAddr = ":8880"
// insert series from 1970-01-01T00:00:00
testStartTime = time.Unix(0, 0).UTC()
// testPromWriteHTTPPath is the URL handed to writeInputSeries for ingesting input series.
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
// testDataSourcePath is the value assigned to the datasource.url flag.
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
// testRemoteWritePath is the value assigned to the remoteWrite.url flag.
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
// testHealthHTTPPath is polled by setUp until the HTTP server answers with status 200.
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
// disableAlertgroupLabel mirrors the disableGroupLabel argument of UnitTest.
// When true, test cases must not set groupname and the "alertgroup" label
// is not expected on alerts.
disableAlertgroupLabel bool
)
const (
// testStoragePath is the directory name joined with os.TempDir() to form storagePath.
testStoragePath = "vmalert-unittest"
// testLogLevel is the value assigned to the loggerLevel flag.
testLogLevel = "ERROR"
)
// UnitTest runs the rule unit tests described by the given test files.
//
// It loads the templates, points storagePath at a directory below the OS
// temp dir, overrides the required flags (see processFlags), initialises
// vminsert and vmselect and then delegates to rulesUnitTest.
// disableGroupLabel is stored in the package-level disableAlertgroupLabel.
//
// It returns true when at least one file failed. A template loading error
// is reported via logger.Fatalf.
func UnitTest(files []string, disableGroupLabel bool) bool {
if err := templates.Load([]string{}, true); err != nil {
logger.Fatalf("failed to load template: %v", err)
}
storagePath = filepath.Join(os.TempDir(), testStoragePath)
processFlags()
vminsert.Init()
vmselect.Init()
// storagePath will be created again when closing vmselect, so remove it again.
// Deferred calls run in reverse order: vmselect.Stop, vminsert.Stop and
// finally the removal of storagePath.
defer fs.MustRemoveAll(storagePath)
defer vminsert.Stop()
defer vmselect.Stop()
disableAlertgroupLabel = disableGroupLabel
return rulesUnitTest(files)
}
// rulesUnitTest runs ruleUnitTest for every file, printing a SUCCESS or
// FAILED verdict (plus the collected errors) per file.
// It returns true when at least one file failed.
func rulesUnitTest(files []string) bool {
	anyFailed := false
	for i := range files {
		path := files[i]
		errs := ruleUnitTest(path)
		if errs == nil {
			fmt.Println(" SUCCESS")
			continue
		}
		fmt.Println(" FAILED")
		fmt.Printf("\nfailed to run unit test for file %q: \n%v", path, errs)
		anyFailed = true
	}
	return anyFailed
}
// ruleUnitTest runs all test groups defined in the unit test file filename.
//
// The file is read and strictly unmarshalled from YAML, its rule_files are
// resolved relative to the file's directory and glob-expanded, a missing
// evaluation_interval defaults to 1m, and group_eval_order is checked for
// duplicates. Every test group is validated with verifyTestGroup and then
// executed.
//
// It returns nil when everything passed, otherwise the collected errors.
func ruleUnitTest(filename string) []error {
	fmt.Println("\nUnit Testing: ", filename)

	raw, readErr := os.ReadFile(filename)
	if readErr != nil {
		return []error{fmt.Errorf("failed to read file: %w", readErr)}
	}

	var spec unitTestFile
	if err := yaml.UnmarshalStrict(raw, &spec); err != nil {
		return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
	}
	if err := resolveAndGlobFilepaths(filepath.Dir(filename), &spec); err != nil {
		return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
	}

	if spec.EvaluationInterval.Duration() == 0 {
		fmt.Println("evaluation_interval set to 1m by default")
		spec.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
	}

	// Map every group name to its position in group_eval_order.
	evalOrder := make(map[string]int)
	for pos, name := range spec.GroupEvalOrder {
		if _, seen := evalOrder[name]; seen {
			return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", name)}
		}
		evalOrder[name] = pos
	}

	parsedGroups, parseErr := vmalertconfig.Parse(spec.RuleFiles, nil, true)
	if parseErr != nil {
		return []error{fmt.Errorf("failed to parse `rule_files`: %w", parseErr)}
	}

	interval := spec.EvaluationInterval.Duration()
	var collected []error
	for _, tc := range spec.Tests {
		verifyErr := verifyTestGroup(tc)
		if verifyErr != nil {
			// An invalid test group is reported and skipped, the remaining groups still run.
			collected = append(collected, verifyErr)
			continue
		}
		collected = append(collected, tc.test(interval, evalOrder, parsedGroups)...)
	}
	if len(collected) == 0 {
		return nil
	}
	return collected
}
// verifyTestGroup validates the required fields of every test case in group.
//
// For alert_rule_test cases it requires alertname and eval_time; groupname
// is required when disableAlertgroupLabel is false and must be absent when
// it is true. For metricsql_expr_test cases it requires expr and eval_time.
//
// It returns an error describing the first violation found, or nil.
// The error is prefixed with the test group name when one is set.
func verifyTestGroup(group testGroup) error {
	var testGroupName string
	if group.TestGroupName != "" {
		testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
	}
	for _, at := range group.AlertRuleTests {
		if at.Alertname == "" {
			return fmt.Errorf("\n%s missing required field \"alertname\"", testGroupName)
		}
		if !disableAlertgroupLabel && at.GroupName == "" {
			return fmt.Errorf("\n%s missing required field \"groupname\" when flag \"disableAlertgroupLabel\" is false", testGroupName)
		}
		if disableAlertgroupLabel && at.GroupName != "" {
			return fmt.Errorf("\n%s shouldn't set field \"groupname\" when flag \"disableAlertgroupLabel\" is true", testGroupName)
		}
		if at.EvalTime == nil {
			return fmt.Errorf("\n%s missing required field \"eval_time\"", testGroupName)
		}
	}
	for _, et := range group.MetricsqlExprTests {
		if et.Expr == "" {
			return fmt.Errorf("\n%s missing required field \"expr\"", testGroupName)
		}
		if et.EvalTime == nil {
			return fmt.Errorf("\n%s missing required field \"eval_time\"", testGroupName)
		}
	}
	return nil
}
// processFlags parses the command-line flags and then forces the flag
// values the unit test runner depends on (storage path, log level, disabled
// search cache, retention, datasource and remote write URLs).
// A flag that rejects its value is reported via logger.Fatalf.
func processFlags() {
	flag.Parse()

	type override struct {
		name  string
		value string
	}
	overrides := []override{
		{name: "storageDataPath", value: storagePath},
		{name: "loggerLevel", value: testLogLevel},
		{name: "search.disableCache", value: "true"},
		// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
		{name: "retentionPeriod", value: "100y"},
		{name: "datasource.url", value: testDataSourcePath},
		{name: "remoteWrite.url", value: testRemoteWritePath},
	}
	for _, o := range overrides {
		// panics if flag doesn't exist
		target := flag.Lookup(o.name)
		if err := target.Value.Set(o.value); err != nil {
			logger.Fatalf("unable to set %q with value %q, err: %v", o.name, o.value, err)
		}
	}
}
// setUp initialises vmstorage and starts the HTTP server on httpListenAddr
// in a separate goroutine, then blocks until the server answers the health
// check with status 200. If that does not happen within 30 seconds the
// failure is reported via logger.Fatalf.
func setUp() {
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
// The request handler returns true for the paths it served itself (instant
// query and remote write) and false for every other path.
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
switch r.URL.Path {
case "/prometheus/api/v1/query":
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
case "/prometheus/api/v1/write", "/api/v1/write":
if err := promremotewrite.InsertHandler(r); err != nil {
httpserver.Errorf(w, r, "%s", err)
}
return true
default:
}
return false
})
// ctx bounds the total time spent waiting for the server to become ready.
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// readyCheckFunc reports whether the health endpoint answered with status 200.
readyCheckFunc := func() bool {
resp, err := http.Get(testHealthHTTPPath)
if err != nil {
return false
}
_ = resp.Body.Close()
return resp.StatusCode == 200
}
// Poll the health endpoint, sleeping 3 seconds after every failed attempt.
checkCheck:
for {
select {
case <-ctx.Done():
logger.Fatalf("http server can't be ready in 30s")
default:
if readyCheckFunc() {
break checkCheck
}
time.Sleep(3 * time.Second)
}
}
}
// tearDown reverts setUp: it stops the HTTP server on httpListenAddr (a
// failure is only logged), stops vmstorage, unregisters all metrics and
// removes storagePath.
func tearDown() {
if err := httpserver.Stop(httpListenAddr); err != nil {
logger.Errorf("cannot stop the webservice: %s", err)
}
vmstorage.Stop()
metrics.UnregisterAllMetrics()
fs.MustRemoveAll(storagePath)
}
// resolveAndGlobFilepaths rewrites utf.RuleFiles in two passes: first every
// non-empty relative entry is joined with baseDir, then every entry is
// treated as a glob pattern and replaced by the files it matches.
// A pattern without matches only produces a warning on stderr.
// It returns an error when a pattern is malformed.
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
	for idx := range utf.RuleFiles {
		pattern := utf.RuleFiles[idx]
		if pattern == "" || filepath.IsAbs(pattern) {
			continue
		}
		utf.RuleFiles[idx] = filepath.Join(baseDir, pattern)
	}

	var expanded []string
	for _, pattern := range utf.RuleFiles {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			return err
		}
		if len(matches) == 0 {
			fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", pattern)
		}
		expanded = append(expanded, matches...)
	}
	utf.RuleFiles = expanded
	return nil
}
// test executes this test group against a freshly started storage instance
// and returns every mismatch or evaluation error it found; an empty result
// means the group passed.
//
// Steps:
//  1. start vmstorage and the HTTP server (setUp) and schedule tearDown;
//  2. hand tg.InputSeries to writeInputSeries, starting at testStartTime;
//  3. build rule groups from testGroups, ordered by groupOrderMap;
//  4. evaluate all groups once per evalInterval, from testStartTime up to
//     and including tg.maxEvalTime(), flushing the storage after each group;
//  5. after each round, check every alert_rule_test whose eval_time lies in
//     [ts, ts+evalInterval) against the currently firing alerts;
//  6. finally run the metricsql_expr_test cases via checkMetricsqlCase.
//
// Note that testGroups is sorted in place, and that group names are reset
// to "" when disableAlertgroupLabel is set.
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
	// set up vmstorage and http server for ingest and read queries
	setUp()
	// tear down vmstorage and clean the data dir
	defer tearDown()

	err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
	if err != nil {
		return []error{err}
	}

	q, err := datasource.Init(nil)
	if err != nil {
		return []error{fmt.Errorf("failed to init datasource: %v", err)}
	}
	rw, err := remotewrite.NewDebugClient()
	if err != nil {
		return []error{fmt.Errorf("failed to init wr: %v", err)}
	}

	// Index the expected alerts by eval time -> group name -> alert name and
	// collect the distinct eval times.
	alertEvalTimesMap := map[time.Duration]struct{}{}
	alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
	for _, at := range tg.AlertRuleTests {
		et := at.EvalTime.Duration()
		alertEvalTimesMap[et] = struct{}{}
		if _, ok := alertExpResultMap[et]; !ok {
			alertExpResultMap[et] = make(map[string]map[string][]expAlert)
		}
		if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
			alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
		}
		alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
	}
	alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
	for k := range alertEvalTimesMap {
		alertEvalTimes = append(alertEvalTimes, k)
	}
	sort.Slice(alertEvalTimes, func(i, j int) bool {
		return alertEvalTimes[i] < alertEvalTimes[j]
	})

	// sort group eval order according to the given "group_eval_order".
	// Groups missing from groupOrderMap all get key 0; the stable sort keeps
	// their relative order deterministic.
	sort.SliceStable(testGroups, func(i, j int) bool {
		return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
	})

	// create groups with given rule
	var groups []*rule.Group
	for _, group := range testGroups {
		ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
		groups = append(groups, ng)
	}

	evalIndex := 0
	maxEvalTime := testStartTime.Add(tg.maxEvalTime())
	for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
		for _, g := range groups {
			errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
			for err := range errs {
				if err != nil {
					checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
						ts, err))
				}
			}
			// flush series after each group evaluation
			vmstorage.Storage.DebugFlush()
		}

		// check alert_rule_test case at every eval time
		for evalIndex < len(alertEvalTimes) {
			evalTime := alertEvalTimes[evalIndex]
			if ts.Sub(testStartTime) > evalTime ||
				evalTime >= ts.Add(evalInterval).Sub(testStartTime) {
				break
			}
			// Expectations are keyed by the requested eval time, which may lie
			// between two evaluation rounds, so it is used for every lookup
			// (not the round timestamp ts).
			expResults := alertExpResultMap[evalTime]

			gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
			for _, g := range groups {
				if disableAlertgroupLabel {
					g.Name = ""
				}
				expGroup, ok := expResults[g.Name]
				if !ok {
					continue
				}
				if _, ok := gotAlertsMap[g.Name]; !ok {
					gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
				}
				for _, r := range g.Rules {
					ar, isAlertRule := r.(*rule.AlertingRule)
					if !isAlertRule {
						continue
					}
					if _, ok := expGroup[ar.Name]; !ok {
						continue
					}
					for _, got := range ar.GetAlerts() {
						// Only firing alerts are compared with the expectations.
						if got.State != notifier.StateFiring {
							continue
						}
						if disableAlertgroupLabel {
							delete(got.Labels, "alertgroup")
						}
						laa := labelAndAnnotation{
							Labels:      datasource.ConvertToLabels(got.Labels),
							Annotations: datasource.ConvertToLabels(got.Annotations),
						}
						gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
					}
				}
			}

			for groupname, gres := range expResults {
				for alertname, res := range gres {
					var expAlerts labelsAndAnnotations
					for _, expAlert := range res {
						if expAlert.ExpLabels == nil {
							expAlert.ExpLabels = make(map[string]string)
						}
						// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
						if !disableAlertgroupLabel {
							expAlert.ExpLabels["alertgroup"] = groupname
						}
						// alertNameLabel is added as additional labels in vmalert.
						expAlert.ExpLabels["alertname"] = alertname
						expAlerts = append(expAlerts, labelAndAnnotation{
							Labels:      datasource.ConvertToLabels(expAlert.ExpLabels),
							Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
						})
					}
					sort.Sort(expAlerts)

					gotAlerts := gotAlertsMap[groupname][alertname]
					sort.Sort(gotAlerts)
					if !reflect.DeepEqual(expAlerts, gotAlerts) {
						var testGroupName string
						if tg.TestGroupName != "" {
							testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
						}
						expString := indentLines(expAlerts.String(), " ")
						gotString := indentLines(gotAlerts.String(), " ")
						checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
							testGroupName, groupname, alertname, evalTime.String(), expString, gotString))
					}
				}
			}
			evalIndex++
		}
	}

	checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
	return checkErrs
}
// unitTestFile holds the contents of a single unit test file
type unitTestFile struct {
// RuleFiles lists rule file paths or glob patterns; relative entries are
// resolved against the directory of the test file (see resolveAndGlobFilepaths).
RuleFiles []string `yaml:"rule_files"`
// EvaluationInterval is the step between rule evaluations; ruleUnitTest
// falls back to 1m when it is unset or zero.
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
// GroupEvalOrder lists group names in the order they should be evaluated;
// a repeated name is rejected by ruleUnitTest.
GroupEvalOrder []string `yaml:"group_eval_order"`
// Tests holds the test groups to run.
Tests []testGroup `yaml:"tests"`
}
// testGroup is a group of input series and test cases associated with it
type testGroup struct {
// Interval is passed to writeInputSeries together with InputSeries.
Interval *promutils.Duration `yaml:"interval"`
// InputSeries are the series written to storage before rules are evaluated.
InputSeries []series `yaml:"input_series"`
// AlertRuleTests are the expectations on firing alerts at given eval times.
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
// MetricsqlExprTests are the expectations on query results at given eval times.
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
// ExternalLabels are passed to rule.NewGroup for every group under test.
ExternalLabels map[string]string `yaml:"external_labels"`
// TestGroupName is an optional name used as a prefix in error messages.
TestGroupName string `yaml:"name"`
}
// maxEvalTime reports the largest eval_time found across the group's
// alert_rule_test and metricsql_expr_test cases; it is zero when there are
// no test cases.
func (tg *testGroup) maxEvalTime() time.Duration {
	latest := time.Duration(0)
	raise := func(candidate time.Duration) {
		if candidate > latest {
			latest = candidate
		}
	}
	for i := range tg.AlertRuleTests {
		raise(tg.AlertRuleTests[i].EvalTime.Duration())
	}
	for i := range tg.MetricsqlExprTests {
		raise(tg.MetricsqlExprTests[i].EvalTime.Duration())
	}
	return latest
}

View file

@ -0,0 +1,47 @@
package unittest
import (
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
)
// TestMain loads the templates before any test of the package runs and
// exits with status 1 when loading fails; otherwise it exits with the
// status returned by the test run.
func TestMain(m *testing.M) {
	loadErr := templates.Load([]string{}, true)
	if loadErr != nil {
		os.Exit(1)
	}
	exitCode := m.Run()
	os.Exit(exitCode)
}
// TestUnitRule runs UnitTest against the fixture files in ./testdata and
// checks that the reported failure status matches the expectation of each
// scenario. The first mismatch aborts the test.
func TestUnitRule(t *testing.T) {
	type scenario struct {
		name              string
		disableGroupLabel bool
		files             []string
		failed            bool
	}
	scenarios := []scenario{
		{
			name:   "run multi files",
			files:  []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
			failed: false,
		},
		{
			name:              "disable group label",
			disableGroupLabel: true,
			files:             []string{"./testdata/disable-group-label.yaml"},
			failed:            false,
		},
		{
			name:   "failing test",
			files:  []string{"./testdata/failed-test.yaml"},
			failed: true,
		},
	}
	for i := range scenarios {
		sc := scenarios[i]
		got := UnitTest(sc.files, sc.disableGroupLabel)
		if got == sc.failed {
			continue
		}
		t.Fatalf("failed to test %s, expect %t, got %t", sc.name, sc.failed, got)
	}
}

View file

@ -754,6 +754,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future); * `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future). * `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring ## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. `vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.

View file

@ -0,0 +1,131 @@
package datasource
import (
"context"
"net/http"
"sync"
"time"
)
// FakeQuerier is a mock querier that answers every query with a predefined
// list of metrics or a predefined error. Its state is guarded by the
// embedded mutex.
type FakeQuerier struct {
	sync.Mutex
	metrics []Metric
	err     error
}

// SetErr sets the error returned by subsequent queries.
func (fq *FakeQuerier) SetErr(err error) {
	fq.Lock()
	defer fq.Unlock()
	fq.err = err
}

// Reset clears both the stored error and the stored metrics.
func (fq *FakeQuerier) Reset() {
	fq.Lock()
	defer fq.Unlock()
	fq.err = nil
	fq.metrics = fq.metrics[:0]
}

// Add appends metrics to the list returned by subsequent queries.
func (fq *FakeQuerier) Add(metrics ...Metric) {
	fq.Lock()
	defer fq.Unlock()
	fq.metrics = append(fq.metrics, metrics...)
}

// BuildWithParams ignores the params and returns the FakeQuerier itself.
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
	return fq
}

// QueryRange ignores the time range and delegates to Query.
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
	res, _, err := fq.Query(ctx, q, time.Now())
	return res, err
}

// Query returns a copy of the metrics stored in the querier together with
// a placeholder request, or the stored error when one is set. All
// arguments are ignored.
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
	fq.Lock()
	defer fq.Unlock()
	if queryErr := fq.err; queryErr != nil {
		return Result{}, nil, queryErr
	}
	// Hand out a copy so callers cannot modify the stored metrics slice.
	snapshot := make([]Metric, len(fq.metrics))
	copy(snapshot, fq.metrics)
	req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
	return Result{Data: snapshot}, req, nil
}
// FakeQuerierWithRegistry is a mock querier that keeps a separate result
// per query expression. Its registry is guarded by the embedded mutex.
type FakeQuerierWithRegistry struct {
	sync.Mutex
	registry map[string][]Metric
}

// Set stores metrics as the result for the query expression key, replacing
// any previous result. The registry is created on first use.
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
	fqr.Lock()
	defer fqr.Unlock()
	if fqr.registry == nil {
		fqr.registry = make(map[string][]Metric)
	}
	fqr.registry[key] = metrics
}

// Reset drops all stored results.
func (fqr *FakeQuerierWithRegistry) Reset() {
	fqr.Lock()
	defer fqr.Unlock()
	fqr.registry = nil
}

// BuildWithParams ignores the params and returns the querier itself.
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
	return fqr
}

// QueryRange ignores the time range and delegates to Query.
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
	res, _, err := fqr.Query(ctx, q, time.Now())
	return res, err
}

// Query returns a copy of the metrics stored for expr together with a
// placeholder request. An unknown expr yields an empty Result and no error.
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
	fqr.Lock()
	defer fqr.Unlock()

	req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
	stored, found := fqr.registry[expr]
	if !found {
		return Result{}, req, nil
	}
	// Hand out a copy so callers cannot modify the stored metrics slice.
	snapshot := make([]Metric, len(stored))
	copy(snapshot, stored)
	return Result{Data: snapshot}, req, nil
}
// FakeQuerierWithDelay is a FakeQuerier whose Query call waits for Delay
// before answering.
type FakeQuerierWithDelay struct {
	FakeQuerier
	Delay time.Duration
}

// Query waits until Delay has elapsed or ctx is done, whichever happens
// first, and then returns the result of the embedded FakeQuerier. A done
// context only shortens the wait; the embedded query still runs.
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
	timer := time.NewTimer(fqd.Delay)
	// Stop the timer so it is released promptly when ctx finishes first.
	defer timer.Stop()
	select {
	case <-ctx.Done():
	case <-timer.C:
	}
	return fqd.FakeQuerier.Query(ctx, expr, ts)
}

// BuildWithParams ignores the params and returns the querier itself.
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
	return fqd
}

View file

@ -18,6 +18,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag" "github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates") validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine") validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent group.")
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.") externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+ externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+ externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.") "Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.") remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.") dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
) )
@ -229,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
return nil, fmt.Errorf("failed to init notifier: %w", err) return nil, fmt.Errorf("failed to init notifier: %w", err)
} }
manager := &manager{ manager := &manager{
groups: make(map[uint64]*Group), groups: make(map[uint64]*rule.Group),
querierBuilder: q, querierBuilder: q,
notifiers: nts, notifiers: nts,
labels: labels, labels: labels,

View file

@ -8,11 +8,19 @@ import (
"testing" "testing"
"time" "time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
) )
// init sets rule.SkipRandSleepOnGroupStart for all tests in this package.
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
rule.SkipRandSleepOnGroupStart = true
}
func TestGetExternalURL(t *testing.T) { func TestGetExternalURL(t *testing.T) {
expURL := "https://vicotriametrics.com/path" expURL := "https://vicotriametrics.com/path"
u, err := getExternalURL(expURL, "", false) u, err := getExternalURL(expURL, "", false)
@ -98,10 +106,10 @@ groups:
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
m := &manager{ m := &manager{
querierBuilder: &fakeQuerier{}, querierBuilder: &datasource.FakeQuerier{},
groups: make(map[uint64]*Group), groups: make(map[uint64]*rule.Group),
labels: map[string]string{}, labels: map[string]string{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }, notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
rw: &remotewrite.Client{}, rw: &remotewrite.Client{},
} }

View file

@ -3,14 +3,13 @@ package main
import ( import (
"context" "context"
"fmt" "fmt"
"net/url"
"sort"
"sync" "sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
) )
@ -19,7 +18,7 @@ type manager struct {
querierBuilder datasource.QuerierBuilder querierBuilder datasource.QuerierBuilder
notifiers func() []notifier.Notifier notifiers func() []notifier.Notifier
rw *remotewrite.Client rw remotewrite.RWClient
// remote read builder. // remote read builder.
rr datasource.QuerierBuilder rr datasource.QuerierBuilder
@ -27,28 +26,28 @@ type manager struct {
labels map[string]string labels map[string]string
groupsMu sync.RWMutex groupsMu sync.RWMutex
groups map[uint64]*Group groups map[uint64]*rule.Group
} }
// RuleAPI generates APIRule object from alert by its ID(hash) // ruleAPI generates apiRule object from alert by its ID(hash)
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) { func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
m.groupsMu.RLock() m.groupsMu.RLock()
defer m.groupsMu.RUnlock() defer m.groupsMu.RUnlock()
g, ok := m.groups[gID] g, ok := m.groups[gID]
if !ok { if !ok {
return APIRule{}, fmt.Errorf("can't find group with id %d", gID) return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
} }
for _, rule := range g.Rules { for _, rule := range g.Rules {
if rule.ID() == rID { if rule.ID() == rID {
return rule.ToAPI(), nil return ruleToAPI(rule), nil
} }
} }
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name) return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
} }
// AlertAPI generates APIAlert object from alert by its ID(hash) // alertAPI generates apiAlert object from alert by its ID(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) { func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
m.groupsMu.RLock() m.groupsMu.RLock()
defer m.groupsMu.RUnlock() defer m.groupsMu.RUnlock()
@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
if !ok { if !ok {
return nil, fmt.Errorf("can't find group with id %d", gID) return nil, fmt.Errorf("can't find group with id %d", gID)
} }
for _, rule := range g.Rules { for _, r := range g.Rules {
ar, ok := rule.(*AlertingRule) ar, ok := r.(*rule.AlertingRule)
if !ok { if !ok {
continue continue
} }
if apiAlert := ar.AlertAPI(aID); apiAlert != nil { if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
return apiAlert, nil return apiAlert, nil
} }
} }
@ -82,15 +81,15 @@ func (m *manager) close() {
m.wg.Wait() m.wg.Wait()
} }
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error { func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
m.wg.Add(1) m.wg.Add(1)
id := g.ID() id := g.ID()
go func() { go func() {
defer m.wg.Done() defer m.wg.Done()
if restore { if restore {
g.start(ctx, m.notifiers, m.rw, m.rr) g.Start(ctx, m.notifiers, m.rw, m.rr)
} else { } else {
g.start(ctx, m.notifiers, m.rw, nil) g.Start(ctx, m.notifiers, m.rw, nil)
} }
}() }()
m.groups[id] = g m.groups[id] = g
@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error { func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
var rrPresent, arPresent bool var rrPresent, arPresent bool
groupsRegistry := make(map[uint64]*Group) groupsRegistry := make(map[uint64]*rule.Group)
for _, cfg := range groupsCfg { for _, cfg := range groupsCfg {
for _, r := range cfg.Rules { for _, r := range cfg.Rules {
if rrPresent && arPresent { if rrPresent && arPresent {
@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
arPresent = true arPresent = true
} }
} }
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels) ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng groupsRegistry[ng.ID()] = ng
} }
@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
} }
type updateItem struct { type updateItem struct {
old *Group old *rule.Group
new *Group new *rule.Group
} }
var toUpdate []updateItem var toUpdate []updateItem
@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
if !ok { if !ok {
// old group is not present in new list, // old group is not present in new list,
// so must be stopped and deleted // so must be stopped and deleted
og.close() og.Close()
delete(m.groups, og.ID()) delete(m.groups, og.ID())
og = nil og = nil
continue continue
@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
var wg sync.WaitGroup var wg sync.WaitGroup
for _, item := range toUpdate { for _, item := range toUpdate {
wg.Add(1) wg.Add(1)
go func(old *Group, new *Group) { go func(old *rule.Group, new *rule.Group) {
old.updateCh <- new old.UpdateWith(new)
wg.Done() wg.Done()
}(item.old, item.new) }(item.old, item.new)
item.old.interruptEval() item.old.InterruptEval()
} }
wg.Wait() wg.Wait()
} }
return nil return nil
} }
// toAPI converts the group into its APIGroup representation.
// The group's read lock is held for the whole conversion.
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
Type: g.Type.String(),
File: g.File,
Interval: g.Interval.Seconds(),
LastEvaluation: g.LastEvaluation,
Concurrency: g.Concurrency,
Params: urlValuesToStrings(g.Params),
Headers: headersToStrings(g.Headers),
NotifierHeaders: headersToStrings(g.NotifierHeaders),
Labels: g.Labels,
}
// Rules starts as an empty non-nil slice, so a group without rules
// still carries a non-nil Rules value.
ag.Rules = make([]APIRule, 0)
for _, r := range g.Rules {
ag.Rules = append(ag.Rules, r.ToAPI())
}
return ag
}
// urlValuesToStrings flattens values into a list of "key=value" strings.
// Keys are emitted in sorted order; the values of a single key keep their
// original order. It returns nil if values is empty.
func urlValuesToStrings(values url.Values) []string {
	if len(values) == 0 {
		return nil
	}

	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)

	var out []string
	for i := range names {
		name := names[i]
		for _, val := range values[name] {
			out = append(out, fmt.Sprintf("%s=%s", name, val))
		}
	}
	return out
}
// headersToStrings renders headers as a list of "key: value" strings
// ordered by key. It returns nil if headers is empty.
func headersToStrings(headers map[string]string) []string {
	if len(headers) == 0 {
		return nil
	}

	names := make([]string, 0, len(headers))
	for name := range headers {
		names = append(names, name)
	}
	sort.Strings(names)

	var out []string
	for i := range names {
		name := names[i]
		out = append(out, fmt.Sprintf("%s: %s", name, headers[name]))
	}
	return out
}

View file

@ -10,8 +10,10 @@ import (
"time" "time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
) )
@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
// successful cases of // successful cases of
// starting with empty rules folder // starting with empty rules folder
func TestManagerEmptyRulesDir(t *testing.T) { func TestManagerEmptyRulesDir(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)} m := &manager{groups: make(map[uint64]*rule.Group)}
cfg := loadCfg(t, []string{"foo/bar"}, true, true) cfg := loadCfg(t, []string{"foo/bar"}, true, true)
if err := m.update(context.Background(), cfg, false); err != nil { if err := m.update(context.Background(), cfg, false); err != nil {
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err) t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
// Should be executed with -race flag // Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) { func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{ m := &manager{
groups: make(map[uint64]*Group), groups: make(map[uint64]*rule.Group),
querierBuilder: &fakeQuerier{}, querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }, notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
} }
paths := []string{ paths := []string{
"config/testdata/dir/rules0-good.rules", "config/testdata/dir/rules0-good.rules",
@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
}() }()
var ( var (
VMRows = &AlertingRule{ VMRows = &rule.AlertingRule{
Name: "VMRows", Name: "VMRows",
Expr: "vm_rows > 0", Expr: "vm_rows > 0",
For: 10 * time.Second, For: 10 * time.Second,
@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "{{$labels}}", "description": "{{$labels}}",
}, },
} }
Conns = &AlertingRule{ Conns = &rule.AlertingRule{
Name: "Conns", Name: "Conns",
Expr: "sum(vm_tcplistener_conns) by(instance) > 1", Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
Annotations: map[string]string{ Annotations: map[string]string{
@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
"description": "It is {{ $value }} connections for {{$labels.instance}}", "description": "It is {{ $value }} connections for {{$labels.instance}}",
}, },
} }
ExampleAlertAlwaysFiring = &AlertingRule{ ExampleAlertAlwaysFiring = &rule.AlertingRule{
Name: "ExampleAlertAlwaysFiring", Name: "ExampleAlertAlwaysFiring",
Expr: "sum by(job) (up == 1)", Expr: "sum by(job) (up == 1)",
} }
@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
name string name string
initPath string initPath string
updatePath string updatePath string
want []*Group want []*rule.Group
}{ }{
{ {
name: "update good rules", name: "update good rules",
initPath: "config/testdata/rules/rules0-good.rules", initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules1-good.rules", updatePath: "config/testdata/dir/rules1-good.rules",
want: []*Group{ want: []*rule.Group{
{ {
File: "config/testdata/dir/rules1-good.rules", File: "config/testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles", Name: "duplicatedGroupDiffFiles",
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Rules: []Rule{ Rules: []rule.Rule{
&AlertingRule{ &rule.AlertingRule{
Name: "VMRows", Name: "VMRows",
Expr: "vm_rows > 0", Expr: "vm_rows > 0",
For: 5 * time.Minute, For: 5 * time.Minute,
@ -153,19 +155,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update good rules from 1 to 2 groups", name: "update good rules from 1 to 2 groups",
initPath: "config/testdata/dir/rules/rules1-good.rules", initPath: "config/testdata/dir/rules/rules1-good.rules",
updatePath: "config/testdata/rules/rules0-good.rules", updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{ want: []*rule.Group{
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert", Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Rules: []Rule{VMRows},
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Rules: []rule.Rule{VMRows},
}, },
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{ Name: "TestGroup",
Rules: []rule.Rule{
Conns, Conns,
ExampleAlertAlwaysFiring, ExampleAlertAlwaysFiring,
}, },
@ -176,20 +179,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update with one bad rule file", name: "update with one bad rule file",
initPath: "config/testdata/rules/rules0-good.rules", initPath: "config/testdata/rules/rules0-good.rules",
updatePath: "config/testdata/dir/rules2-bad.rules", updatePath: "config/testdata/dir/rules2-bad.rules",
want: []*Group{ want: []*rule.Group{
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert", Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Rules: []Rule{VMRows}, Rules: []rule.Rule{VMRows},
}, },
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Name: "TestGroup", Name: "TestGroup",
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Rules: []Rule{ Rules: []rule.Rule{
Conns, Conns,
ExampleAlertAlwaysFiring, ExampleAlertAlwaysFiring,
}, },
@ -200,19 +203,20 @@ func TestManagerUpdate(t *testing.T) {
name: "update empty dir rules from 0 to 2 groups", name: "update empty dir rules from 0 to 2 groups",
initPath: "config/testdata/empty/*", initPath: "config/testdata/empty/*",
updatePath: "config/testdata/rules/rules0-good.rules", updatePath: "config/testdata/rules/rules0-good.rules",
want: []*Group{ want: []*rule.Group{
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Name: "groupGorSingleAlert", Name: "groupGorSingleAlert",
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Rules: []Rule{VMRows}, Rules: []rule.Rule{VMRows},
}, },
{ {
File: "config/testdata/rules/rules0-good.rules", File: "config/testdata/rules/rules0-good.rules",
Interval: defaultEvalInterval, Interval: defaultEvalInterval,
Type: config.NewPrometheusType(), Type: config.NewPrometheusType(),
Name: "TestGroup", Rules: []Rule{ Name: "TestGroup",
Rules: []rule.Rule{
Conns, Conns,
ExampleAlertAlwaysFiring, ExampleAlertAlwaysFiring,
}, },
@ -224,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO()) ctx, cancel := context.WithCancel(context.TODO())
m := &manager{ m := &manager{
groups: make(map[uint64]*Group), groups: make(map[uint64]*rule.Group),
querierBuilder: &fakeQuerier{}, querierBuilder: &datasource.FakeQuerier{},
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }, notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
} }
cfgInit := loadCfg(t, []string{tc.initPath}, true, true) cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
@ -255,11 +259,36 @@ func TestManagerUpdate(t *testing.T) {
}) })
} }
} }
// compareGroups fails the test when group b (the actual group) differs from
// group a (the expected group).
//
// It compares the name, file, interval, number of rules and group ID, and then
// compares rules pair-wise by position via rule.CompareRules.
func compareGroups(t *testing.T, a, b *rule.Group) {
	t.Helper()
	if a.Name != b.Name {
		t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
	}
	if a.File != b.File {
		t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
	}
	if a.Interval != b.Interval {
		t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
	}
	if len(a.Rules) != len(b.Rules) {
		t.Fatalf("expected group %s to have %d rules; got: %d",
			a.Name, len(a.Rules), len(b.Rules))
	}
	// The group ID does not depend on the rule being iterated, so it is checked
	// once here and reported as a group mismatch. IDs are numeric, hence %d.
	if a.ID() != b.ID() {
		t.Fatalf("expected group %q to have ID %d; got %d", a.Name, a.ID(), b.ID())
	}
	for i, exp := range a.Rules {
		act := b.Rules[i]
		// NOTE(review): the actual rule is passed first to keep the argument
		// order this helper has always used; confirm whether CompareRules
		// treats its arguments symmetrically.
		if err := rule.CompareRules(t, act, exp); err != nil {
			t.Fatalf("comparison error: %s", err)
		}
	}
}
func TestManagerUpdateNegative(t *testing.T) { func TestManagerUpdateNegative(t *testing.T) {
testCases := []struct { testCases := []struct {
notifiers []notifier.Notifier notifiers []notifier.Notifier
rw *remotewrite.Client rw remotewrite.RWClient
cfg config.Group cfg config.Group
expErr string expErr string
}{ }{
@ -286,7 +315,7 @@ func TestManagerUpdateNegative(t *testing.T) {
"contains alerting rules", "contains alerting rules",
}, },
{ {
[]notifier.Notifier{&fakeNotifier{}}, []notifier.Notifier{&notifier.FakeNotifier{}},
nil, nil,
config.Group{ config.Group{
Name: "Recording and alerting rules", Name: "Recording and alerting rules",
@ -316,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.cfg.Name, func(t *testing.T) { t.Run(tc.cfg.Name, func(t *testing.T) {
m := &manager{ m := &manager{
groups: make(map[uint64]*Group), groups: make(map[uint64]*rule.Group),
querierBuilder: &fakeQuerier{}, querierBuilder: &datasource.FakeQuerier{},
rw: tc.rw, rw: tc.rw,
} }
if tc.notifiers != nil { if tc.notifiers != nil {
@ -346,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
} }
return cfg return cfg
} }
// TestUrlValuesToStrings checks that query params are rendered as
// "key=value" strings sorted by key.
func TestUrlValuesToStrings(t *testing.T) {
	mapQueryParams := map[string][]string{
		"param1": {"param1"},
		"param2": {"anotherparam"},
	}
	expectedRes := []string{"param1=param1", "param2=anotherparam"}
	res := urlValuesToStrings(mapQueryParams)

	// Stop right away on a length mismatch: indexing res below would
	// otherwise panic with "index out of range" when res is shorter.
	if len(res) != len(expectedRes) {
		t.Fatalf("Expected length %d, but got %d", len(expectedRes), len(res))
	}
	for ind, val := range expectedRes {
		if val != res[ind] {
			t.Errorf("Expected %v; but got %v", val, res[ind])
		}
	}
}

View file

@ -0,0 +1,59 @@
package notifier
import (
"context"
"fmt"
"sync"
"time"
)
// FakeNotifier is a mock notifier.
// It remembers the alerts passed to the most recent Send call and counts
// all alerts received so far. The embedded mutex guards both fields.
type FakeNotifier struct {
sync.Mutex
// alerts holds the slice passed to the last Send call
alerts []Alert
// records number of received alerts in total
counter int
}
// Close does nothing; the mock holds no resources to release.
func (*FakeNotifier) Close() {}
// Addr returns an empty string.
func (*FakeNotifier) Addr() string { return "" }
// Send records the given alerts as the latest batch and adds their number
// to the total counter. It always returns nil.
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
	fn.Lock()
	fn.alerts = alerts
	fn.counter += len(alerts)
	fn.Unlock()
	return nil
}
// GetCounter reports how many alerts have been received in total.
func (fn *FakeNotifier) GetCounter() int {
	fn.Lock()
	total := fn.counter
	fn.Unlock()
	return total
}
// GetAlerts returns the alerts stored by the most recent Send call.
// The returned slice is not copied.
func (fn *FakeNotifier) GetAlerts() []Alert {
	fn.Lock()
	latest := fn.alerts
	fn.Unlock()
	return latest
}
// FaultyNotifier is a mock notifier whose Send always returns an error.
// It embeds FakeNotifier and overrides only Send.
type FaultyNotifier struct {
FakeNotifier
}
// Send always fails with a "send failed" error.
// When ctx carries a deadline, Send first sleeps until that deadline.
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
	if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
		time.Sleep(time.Until(deadline))
	}
	return fmt.Errorf("send failed")
}

View file

@ -0,0 +1,322 @@
package remotewrite
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
// Defaults used when the corresponding knob is not set.
const (
// number of run goroutines started by NewClient when Config.Concurrency is not positive
defaultConcurrency = 4
// fallback for Config.MaxBatchSize
defaultMaxBatchSize = 1e3
// fallback for Config.MaxQueueSize
defaultMaxQueueSize = 1e5
// fallback for Config.FlushInterval
defaultFlushInterval = 5 * time.Second
// timeout of the final flush performed on shutdown
defaultWriteTimeout = 30 * time.Second
)
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
// Series are queued via Push and sent by the goroutines started in NewClient.
type Client struct {
// addr is the remote storage address with the trailing "/" trimmed
addr string
c *http.Client
// authCfg, when non-nil, sets auth headers on every outgoing request
authCfg *promauth.Config
// input is the buffered queue filled by Push
input chan prompbmarshal.TimeSeries
flushInterval time.Duration
maxBatchSize int
maxQueueSize int
// wg tracks the goroutines started by run
wg sync.WaitGroup
// doneCh is closed by Close to signal shutdown
doneCh chan struct{}
}
// Config is config for remote write client.
// Zero values of MaxBatchSize, MaxQueueSize, FlushInterval and a nil Transport
// are replaced with defaults by NewClient.
type Config struct {
// Addr of remote storage. Must not be empty.
Addr string
// AuthCfg, when non-nil, is used for setting auth headers on requests
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
// NewClient builds an asynchronous remote-write client from cfg and starts
// its background flushers bound to ctx.
//
// cfg.Addr is mandatory. Unset batch size, queue size, flush interval and
// transport fall back to package defaults; a non-positive cfg.Concurrency
// means defaultConcurrency flushers.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
	if cfg.Addr == "" {
		return nil, fmt.Errorf("config.Addr can't be empty")
	}

	// fill in defaults for everything the caller left unset
	if cfg.Transport == nil {
		cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
	}
	if cfg.FlushInterval == 0 {
		cfg.FlushInterval = defaultFlushInterval
	}
	if cfg.MaxQueueSize == 0 {
		cfg.MaxQueueSize = defaultMaxQueueSize
	}
	if cfg.MaxBatchSize == 0 {
		cfg.MaxBatchSize = defaultMaxBatchSize
	}
	workers := cfg.Concurrency
	if workers <= 0 {
		workers = defaultConcurrency
	}

	httpClient := &http.Client{
		Timeout:   *sendTimeout,
		Transport: cfg.Transport,
	}
	c := &Client{
		c:             httpClient,
		addr:          strings.TrimSuffix(cfg.Addr, "/"),
		authCfg:       cfg.AuthCfg,
		flushInterval: cfg.FlushInterval,
		maxBatchSize:  cfg.MaxBatchSize,
		maxQueueSize:  cfg.MaxQueueSize,
		doneCh:        make(chan struct{}),
		input:         make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
	}
	for ; workers > 0; workers-- {
		c.run(ctx)
	}
	return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns an error if client is stopped or if queue is full.
// Push never blocks.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
	// Check for shutdown before attempting to enqueue. With both channels
	// closed, a single select could pick the send case at random, and a send
	// on a closed channel panics.
	// NOTE(review): this makes Push-after-Close deterministic; a Close running
	// concurrently with Push may still race — confirm callers serialize them.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	default:
	}

	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	case c.input <- s:
		return nil
	default:
		return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
			"Queue size is controlled by -remoteWrite.maxQueueSize flag",
			c.maxQueueSize)
	}
}
// Close stops the client and waits for all goroutines
// to exit.
// It returns an error if the client was never initialized via NewClient
// or has already been closed.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so detect an earlier Close by checking
	// whether the channel is already closed. Without this a repeated Close
	// would panic on closing a closed channel.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
// run starts one background goroutine that accumulates series from c.input
// and flushes them when the batch reaches c.maxBatchSize or on every
// c.flushInterval tick. The goroutine exits once c.doneCh is closed or ctx is
// done, after flushing what is left.
func (c *Client) run(ctx context.Context) {
ticker := time.NewTicker(c.flushInterval)
wr := &prompbmarshal.WriteRequest{}
// shutdown drains c.input until it is closed and empty, then performs the last flush
shutdown := func() {
for ts := range c.input {
wr.Timeseries = append(wr.Timeseries, ts)
}
// the last flush gets its own context with a timeout, independent of ctx
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
c.flush(lastCtx, wr)
cancel()
}
c.wg.Add(1)
go func() {
defer c.wg.Done()
defer ticker.Stop()
for {
select {
case <-c.doneCh:
shutdown()
return
case <-ctx.Done():
shutdown()
return
case <-ticker.C:
c.flush(ctx, wr)
case ts, ok := <-c.input:
if !ok {
// c.input is closed; keep looping until one of the shutdown cases is selected
continue
}
wr.Timeseries = append(wr.Timeseries, ts)
if len(wr.Timeseries) >= c.maxBatchSize {
c.flush(ctx, wr)
}
}
}
}()
}
// Metrics exposed by the remote write client.
var (
// rows and bytes (after compression) of successfully sent requests
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
// total time spent in send attempts, including retry backoff
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
// rows and bytes of requests dropped after failed send attempts
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
// NOTE(review): concurrency is declared outside this block — presumably a command-line flag; confirm.
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. flush performs limited amount of retries
// if request fails: the delay between attempts starts at
// -remoteWrite.retryMinInterval and doubles after every attempt, and retries
// stop once -remoteWrite.retryMaxTime has elapsed, ctx is done, or the error
// is not retriable. wr is reset before flush returns, so unsent series are dropped.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
if len(wr.Timeseries) < 1 {
return
}
// deferred calls run in reverse order: the duration is recorded first, then wr is reset
defer prompbmarshal.ResetWriteRequest(wr)
defer bufferFlushDuration.UpdateDuration(time.Now())
data, err := wr.Marshal()
if err != nil {
logger.Errorf("failed to marshal WriteRequest: %s", err)
return
}
b := snappy.Encode(nil, data)
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
// the very first delay must not exceed the total retry budget
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
defer func() {
sendDuration.Add(time.Since(timeStart).Seconds())
}()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
sentBytes.Add(len(b))
return
}
_, isNotRetriable := err.(*nonRetriableError)
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <-ctx.Done():
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
// the label is required: a plain break would only leave the select
break L
default:
}
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
// shorten the delay so the next attempt still fits into the retry budget
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
// (the sleep itself is not interrupted by ctx cancellation)
time.Sleep(retryInterval)
retryInterval *= 2
}
// reached only when all attempts failed
droppedRows.Add(len(wr.Timeseries))
droppedBytes.Add(len(b))
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
len(wr.Timeseries))
}
// send performs a single POST of the given payload to the remote-write
// endpoint and classifies the outcome:
//   - 2xx: success, nil is returned;
//   - 4xx other than 429: a *nonRetriableError is returned;
//   - anything else (including 429): a regular, retriable error is returned.
//
// Unless -remoteWrite.disablePathAppend is set, "/api/v1/write" is appended
// to the request path.
func (c *Client) send(ctx context.Context, data []byte) error {
	payload := bytes.NewReader(data)
	req, err := http.NewRequest(http.MethodPost, c.addr, payload)
	if err != nil {
		return fmt.Errorf("failed to create new HTTP request: %w", err)
	}

	hdr := req.Header
	// RFC standard compliant headers
	hdr.Set("Content-Encoding", "snappy")
	hdr.Set("Content-Type", "application/x-protobuf")
	// Prometheus compliant headers
	hdr.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	if c.authCfg != nil {
		c.authCfg.SetHeaders(req, true)
	}
	if !*disablePathAppend {
		req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
	}

	resp, err := c.c.Do(req.WithContext(ctx))
	if err != nil {
		return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
			req.URL.Redacted(), err, len(data), payload.Size())
	}
	defer func() { _ = resp.Body.Close() }()

	body, _ := io.ReadAll(resp.Body)

	// according to https://prometheus.io/docs/concepts/remote_write_spec/
	// Prometheus remote Write compatible receivers MUST
	// respond with a HTTP 2xx status code when the write is successful.
	code := resp.StatusCode
	if code/100 == 2 {
		return nil
	}
	respErr := fmt.Errorf("unexpected response code %d for %s. Response body %q",
		code, req.URL.Redacted(), body)
	if code/100 == 4 && code != http.StatusTooManyRequests {
		// MUST NOT retry write requests on HTTP 4xx responses other than 429
		return &nonRetriableError{respErr}
	}
	return respErr
}
// nonRetriableError marks an error after which the request must not be retried.
type nonRetriableError struct {
	err error
}

// Error returns the message of the wrapped error.
func (e *nonRetriableError) Error() string {
	return e.err.Error()
}

// Unwrap returns the wrapped error, so errors.Is and errors.As
// can inspect the underlying cause.
func (e *nonRetriableError) Unwrap() error {
	return e.err
}

View file

@ -0,0 +1,97 @@
package remotewrite
import (
"bytes"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// DebugClient won't push series periodically, but will write data to remote endpoint
// immediately when Push() is called
type DebugClient struct {
// addr is the remote storage address with the trailing "/" trimmed
addr string
c *http.Client
// wg tracks in-flight Push calls; Close waits on it
wg sync.WaitGroup
}
// NewDebugClient creates a DebugClient for the configured remote-write address.
// It returns a nil client and a nil error when no address is configured, so
// callers must check the returned client for nil before using it.
func NewDebugClient() (*DebugClient, error) {
	target := *addr
	if target == "" {
		return nil, nil
	}

	transport, err := utils.Transport(target, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}

	httpClient := &http.Client{
		Timeout:   *sendTimeout,
		Transport: transport,
	}
	return &DebugClient{
		c:    httpClient,
		addr: strings.TrimSuffix(target, "/"),
	}, nil
}
// Push synchronously sends the given timeseries to the remote storage
// as a write request with a single series.
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
	c.wg.Add(1)
	defer c.wg.Done()

	data, err := (&prompbmarshal.WriteRequest{
		Timeseries: []prompbmarshal.TimeSeries{s},
	}).Marshal()
	if err != nil {
		return fmt.Errorf("failed to marshal the given time series: %w", err)
	}
	return c.send(data)
}
// Close stops the DebugClient.
// It waits for in-flight Push calls to finish and always returns nil.
func (c *DebugClient) Close() error {
c.wg.Wait()
return nil
}
// send snappy-compresses data and POSTs it to the remote-write endpoint.
// Any non-2xx response is reported as an error that includes the response body.
// Unless -remoteWrite.disablePathAppend is set, "/api/v1/write" is appended
// to the request path.
func (c *DebugClient) send(data []byte) error {
	compressed := snappy.Encode(nil, data)
	payload := bytes.NewReader(compressed)
	req, err := http.NewRequest(http.MethodPost, c.addr, payload)
	if err != nil {
		return fmt.Errorf("failed to create new HTTP request: %w", err)
	}

	hdr := req.Header
	// RFC standard compliant headers
	hdr.Set("Content-Encoding", "snappy")
	hdr.Set("Content-Type", "application/x-protobuf")
	// Prometheus compliant headers
	hdr.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	if !*disablePathAppend {
		req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
	}

	resp, err := c.c.Do(req)
	if err != nil {
		return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
			req.URL.Redacted(), err, len(data), payload.Size())
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode/100 != 2 {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("unexpected response code %d for %s. Response body %q",
			resp.StatusCode, req.URL.Redacted(), body)
	}
	return nil
}

View file

@ -0,0 +1,50 @@
package remotewrite
import (
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestDebugClient_Push verifies that every series pushed via DebugClient
// is accepted by the test remote-write server.
func TestDebugClient_Push(t *testing.T) {
	testSrv := newRWServer()

	// point the client at the test server and restore the address afterwards
	oldAddr := *addr
	*addr = testSrv.URL
	defer func() {
		*addr = oldAddr
	}()

	client, err := NewDebugClient()
	if err != nil {
		t.Fatalf("failed to create debug client: %s", err)
	}

	const rowsN = 100
	var sent int
	for i := 0; i < rowsN; i++ {
		s := prompbmarshal.TimeSeries{
			Samples: []prompbmarshal.Sample{{
				Value:     float64(i),
				Timestamp: time.Now().Unix(),
			}},
		}
		// t.Fatalf stops the test, so reaching the increment means Push succeeded
		if err := client.Push(s); err != nil {
			t.Fatalf("unexpected err: %s", err)
		}
		sent++
	}
	if sent == 0 {
		t.Fatalf("0 series sent")
	}
	if err := client.Close(); err != nil {
		t.Fatalf("failed to close client: %s", err)
	}
	got := testSrv.accepted()
	if got != sent {
		t.Fatalf("expected to have %d series; got %d", sent, got)
	}
}

View file

@ -1,322 +1,13 @@
package remotewrite package remotewrite
import ( import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
) )
var ( // RWClient represents an HTTP client for pushing data via remote write protocol
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.") type RWClient interface {
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.") // Push pushes the give time series to remote storage
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval") Push(s prompbmarshal.TimeSeries) error
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval") // Close stops the client. Client can't be reused after Close call.
) Close() error
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
//
// Series are queued by Push and sent in batches by background
// goroutines started in NewClient.
type Client struct {
// addr is the remote storage URL requests are posted to
addr string
// c is the HTTP client used for sending requests
c *http.Client
// authCfg, when non-nil, sets auth headers on every request
authCfg *promauth.Config
// input is the queue filled by Push and drained by the background goroutines
input chan prompbmarshal.TimeSeries
// flushInterval is the period of the ticker that triggers flushing of the collected batch
flushInterval time.Duration
// maxBatchSize is the number of collected series that triggers an immediate flush
maxBatchSize int
// maxQueueSize is the configured queue length; it is reported in the "queue is full" error
maxQueueSize int
// wg tracks the background goroutines; Close waits on it
wg sync.WaitGroup
// doneCh is closed by Close to signal the background goroutines to stop
doneCh chan struct{}
}
// Config is config for remote write.
//
// Zero MaxBatchSize, MaxQueueSize and FlushInterval, as well as
// a nil Transport, are replaced with defaults by NewClient.
type Config struct {
// Addr of remote storage
Addr string
// AuthCfg, when non-nil, is used for setting auth headers on requests
AuthCfg *promauth.Config
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time.Duration
// Transport will be used by the underlying http.Client
Transport *http.Transport
}
// Defaults applied by the remote write client.
const (
// number of background flushing goroutines when Config.Concurrency isn't positive
defaultConcurrency = 4
// max number of series per flushed batch when Config.MaxBatchSize is zero
defaultMaxBatchSize = 1e3
// input queue length when Config.MaxQueueSize is zero
defaultMaxQueueSize = 1e5
// period between flushes when Config.FlushInterval is zero
defaultFlushInterval = 5 * time.Second
// timeout for the final flush performed on client shutdown
defaultWriteTimeout = 30 * time.Second
)
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
//
// cfg.Addr is mandatory. Non-positive MaxBatchSize, MaxQueueSize,
// FlushInterval and Concurrency are replaced with package defaults,
// so that negative values can't cause a panic on channel or ticker creation.
// A nil cfg.Transport is replaced with a clone of http.DefaultTransport.
//
// The returned client starts background goroutines which stop
// when Close is called or when ctx is cancelled.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
	if cfg.Addr == "" {
		return nil, fmt.Errorf("config.Addr can't be empty")
	}
	if cfg.MaxBatchSize <= 0 {
		cfg.MaxBatchSize = defaultMaxBatchSize
	}
	if cfg.MaxQueueSize <= 0 {
		// a negative size would make the `make(chan ...)` below panic
		cfg.MaxQueueSize = defaultMaxQueueSize
	}
	if cfg.FlushInterval <= 0 {
		// time.NewTicker panics on non-positive intervals
		cfg.FlushInterval = defaultFlushInterval
	}
	if cfg.Transport == nil {
		cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
	}
	cc := defaultConcurrency
	if cfg.Concurrency > 0 {
		cc = cfg.Concurrency
	}
	c := &Client{
		c: &http.Client{
			Timeout:   *sendTimeout,
			Transport: cfg.Transport,
		},
		addr:          strings.TrimSuffix(cfg.Addr, "/"),
		authCfg:       cfg.AuthCfg,
		flushInterval: cfg.FlushInterval,
		maxBatchSize:  cfg.MaxBatchSize,
		maxQueueSize:  cfg.MaxQueueSize,
		doneCh:        make(chan struct{}),
		input:         make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
	}

	// every run call starts one background goroutine reading from c.input
	for i := 0; i < cc; i++ {
		c.run(ctx)
	}
	return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns an error if client is stopped or if queue is full.
// Push never blocks.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
	// Check for the closed client before attempting to send:
	// Close closes c.input, and a send on a closed channel panics
	// even when it is one of several ready select cases.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	default:
	}

	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	case c.input <- s:
		return nil
	default:
		return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
			"Queue size is controlled by -remoteWrite.maxQueueSize flag",
			c.maxQueueSize)
	}
}
// Close stops the client and waits for all goroutines
// to exit.
//
// Close returns an error if the client has been closed already.
// It must not be called concurrently with itself.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so a closed channel is what
	// actually marks the closed client; closing it twice would panic.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
// run starts a single background goroutine which collects series from
// c.input into a batch and flushes the batch on every flushInterval tick
// or as soon as it reaches maxBatchSize entries.
// The goroutine drains the remaining input, performs the final flush
// and exits when c.doneCh is closed or ctx is done.
func (c *Client) run(ctx context.Context) {
	flushTicker := time.NewTicker(c.flushInterval)
	batch := &prompbmarshal.WriteRequest{}

	// finalFlush reads c.input until it is closed and sends whatever
	// has been collected, using a dedicated context with timeout.
	finalFlush := func() {
		for series := range c.input {
			batch.Timeseries = append(batch.Timeseries, series)
		}
		flushCtx, cancelFlush := context.WithTimeout(context.Background(), defaultWriteTimeout)
		logger.Infof("shutting down remote write client and flushing remained %d series", len(batch.Timeseries))
		c.flush(flushCtx, batch)
		cancelFlush()
	}

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		defer flushTicker.Stop()
		for {
			select {
			case <-c.doneCh:
				finalFlush()
				return
			case <-ctx.Done():
				finalFlush()
				return
			case <-flushTicker.C:
				c.flush(ctx, batch)
			case series, open := <-c.input:
				if open {
					batch.Timeseries = append(batch.Timeseries, series)
					if len(batch.Timeseries) >= c.maxBatchSize {
						c.flush(ctx, batch)
					}
				}
			}
		}
	}()
}
// Metrics exposed by the remote write client.
var (
// number of time series and compressed bytes sent successfully by flush
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
// seconds spent by flush on sending requests, retries and backoff included
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
// number of time series and compressed bytes dropped by flush after failed send attempts
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
// duration of flush calls that had series to send
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
// reports the value of the concurrency flag, which is declared outside of this block
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
return float64(*concurrency)
})
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
//
// wr is always reset before flush returns, no matter whether the data
// has been sent or dropped. Retries stop when the error isn't retriable,
// when ctx is cancelled or when -remoteWrite.retryMaxTime has elapsed.
// The backoff between retries is interrupted by ctx cancellation, so
// a cancelled context doesn't keep the caller blocked for the whole delay.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
	if len(wr.Timeseries) < 1 {
		return
	}
	defer prompbmarshal.ResetWriteRequest(wr)
	// time.Now() is evaluated right here, so the whole call is measured
	defer bufferFlushDuration.UpdateDuration(time.Now())

	data, err := wr.Marshal()
	if err != nil {
		logger.Errorf("failed to marshal WriteRequest: %s", err)
		return
	}

	b := snappy.Encode(nil, data)

	retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
	if retryInterval > maxRetryInterval {
		retryInterval = maxRetryInterval
	}
	timeStart := time.Now()
	defer func() {
		sendDuration.Add(time.Since(timeStart).Seconds())
	}()
L:
	for attempts := 0; ; attempts++ {
		err := c.send(ctx, b)
		if err == nil {
			sentRows.Add(len(wr.Timeseries))
			sentBytes.Add(len(b))
			return
		}

		_, isNotRetriable := err.(*nonRetriableError)
		logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)

		if isNotRetriable {
			// exit fast if error isn't retriable
			break
		}

		// check if request has been cancelled before backoff
		select {
		case <-ctx.Done():
			logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
			break L
		default:
		}

		timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
		if timeLeftForRetries < 0 {
			// the max retry time has passed, so we give up
			break
		}

		if retryInterval > timeLeftForRetries {
			retryInterval = timeLeftForRetries
		}
		// sleeping to prevent remote db hammering;
		// wake up early if the context gets cancelled meanwhile
		backoff := time.NewTimer(retryInterval)
		select {
		case <-ctx.Done():
			backoff.Stop()
			logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
			break L
		case <-backoff.C:
		}
		retryInterval *= 2
	}

	droppedRows.Add(len(wr.Timeseries))
	droppedBytes.Add(len(b))
	logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
		len(wr.Timeseries))
}
// send posts the given snappy-encoded payload to the remote storage.
// Unless -remoteWrite.disablePathAppend is set, "/api/v1/write" is
// appended to the configured address path.
//
// It returns nil on 2xx responses, *nonRetriableError on 4xx responses
// other than 429, and a regular error in all other cases.
func (c *Client) send(ctx context.Context, data []byte) error {
	payload := bytes.NewReader(data)
	req, err := http.NewRequest(http.MethodPost, c.addr, payload)
	if err != nil {
		return fmt.Errorf("failed to create new HTTP request: %w", err)
	}

	hdr := req.Header
	// RFC standard compliant headers
	hdr.Set("Content-Encoding", "snappy")
	hdr.Set("Content-Type", "application/x-protobuf")
	// Prometheus compliant headers
	hdr.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	if c.authCfg != nil {
		c.authCfg.SetHeaders(req, true)
	}
	if !*disablePathAppend {
		req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
	}

	resp, err := c.c.Do(req.WithContext(ctx))
	if err != nil {
		return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
			req.URL.Redacted(), err, len(data), payload.Size())
	}
	defer func() { _ = resp.Body.Close() }()

	respBody, _ := io.ReadAll(resp.Body)

	// according to https://prometheus.io/docs/concepts/remote_write_spec/
	// Prometheus remote Write compatible receivers MUST
	// respond with a HTTP 2xx status code when the write is successful.
	statusClass := resp.StatusCode / 100
	if statusClass == 2 {
		return nil
	}

	respErr := fmt.Errorf("unexpected response code %d for %s. Response body %q",
		resp.StatusCode, req.URL.Redacted(), respBody)
	if statusClass == 4 && resp.StatusCode != http.StatusTooManyRequests {
		// MUST NOT retry write requests on HTTP 4xx responses other than 429
		return &nonRetriableError{respErr}
	}
	return respErr
}
// nonRetriableError wraps an error returned by send for responses
// which must not be retried (HTTP 4xx other than 429).
type nonRetriableError struct {
// err is the wrapped error
err error
}
func (e *nonRetriableError) Error() string {
return e.err.Error()
} }

View file

@ -1,19 +1,16 @@
package main package main
import ( import (
"context"
"flag" "flag"
"fmt" "fmt"
"strings" "strings"
"time" "time"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
) )
var ( var (
@ -33,7 +30,7 @@ var (
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.") "Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
) )
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error { func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
if *replayMaxDatapoints < 1 { if *replayMaxDatapoints < 1 {
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1") return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
} }
@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
var total int var total int
for _, cfg := range groupsCfg { for _, cfg := range groupsCfg {
ng := newGroup(cfg, qb, *evaluationInterval, labels) ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
total += ng.replay(tFrom, tTo, rw) total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
} }
logger.Infof("replay finished! Imported %d samples", total) logger.Infof("replay finished! Imported %d samples", total)
if rw != nil { if rw != nil {
@ -77,99 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
} }
return nil return nil
} }
// replay executes every rule of the group over the [start, end] range,
// split into windows of g.Interval * -replay.maxDatapointsPerQuery,
// and pushes the results via rw.
// It returns the total number of samples reported by replayRule
// and terminates the process via logger.Fatalf if a rule fails.
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
	window := g.Interval * time.Duration(*replayMaxDatapoints)
	start = g.adjustReqTimestamp(start)
	requests := int(end.Sub(start)/window) + 1

	fmt.Printf("\nGroup %q"+
		"\ninterval: \t%v"+
		"\neval_offset: \t%v"+
		"\nrequests to make: \t%d"+
		"\nmax range per request: \t%v\n",
		g.Name, g.Interval, g.EvalOffset, requests, window)
	if g.Limit > 0 {
		fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
			g.Limit)
	}

	imported := 0
	it := rangeIterator{start: start, end: end, step: window}
	for _, r := range g.Rules {
		fmt.Printf("> Rule %q (ID: %d)\n", r, r.ID())

		var progress *pb.ProgressBar
		if !*disableProgressBar {
			progress = pb.StartNew(requests)
		}

		for it.reset(); it.next(); {
			samples, err := replayRule(r, it.s, it.e, rw)
			if err != nil {
				logger.Fatalf("rule %q: %s", r, err)
			}
			imported += samples
			if progress != nil {
				progress.Increment()
			}
		}

		if progress != nil {
			progress.Finish()
		}
		// sleep to let remote storage to flush data on-disk
		// so chained rules could be calculated correctly
		time.Sleep(*replayRulesDelay)
	}
	return imported
}
// replayRule executes the rule on the [start, end] range and pushes
// the resulting time series via rw.
//
// The execution is attempted up to -replay.ruleRetryAttempts times with
// a one second pause between attempts; the error of the last attempt is
// returned if all of them fail.
// It returns the number of samples pushed so far, also when Push fails.
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
	var err error
	var tss []prompbmarshal.TimeSeries
	attempts := *replayRuleRetryAttempts
	for i := 0; i < attempts; i++ {
		tss, err = rule.ExecRange(context.Background(), start, end)
		if err == nil {
			break
		}
		logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
		// no reason to wait after the last attempt - nothing will be retried
		if i+1 < attempts {
			time.Sleep(time.Second)
		}
	}
	if err != nil { // means all attempts failed
		return 0, err
	}

	var n int
	for _, ts := range tss {
		if err := rw.Push(ts); err != nil {
			// wrap with %w so callers can inspect the original error
			return n, fmt.Errorf("remote write failure: %w", err)
		}
		n += len(ts.Samples)
	}
	return n, nil
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}

View file

@ -12,7 +12,7 @@ import (
) )
type fakeReplayQuerier struct { type fakeReplayQuerier struct {
fakeQuerier datasource.FakeQuerier
registry map[string]map[string]struct{} registry map[string]map[string]struct{}
} }
@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
}) })
} }
} }
// TestRangeIterator verifies that rangeIterator produces exactly the expected
// sequence of [start, end] windows: nothing extra and nothing missing.
func TestRangeIterator(t *testing.T) {
	testCases := []struct {
		ri     rangeIterator
		result [][2]time.Time
	}{
		{
			ri: rangeIterator{
				start: parseTime(t, "2021-01-01T12:00:00.000Z"),
				end:   parseTime(t, "2021-01-01T12:30:00.000Z"),
				step:  5 * time.Minute,
			},
			result: [][2]time.Time{
				{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
				{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
				{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
				{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
				{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
				{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
			},
		},
		{
			// the step exceeds the range, so the only window is clamped to the range end
			ri: rangeIterator{
				start: parseTime(t, "2021-01-01T12:00:00.000Z"),
				end:   parseTime(t, "2021-01-01T12:30:00.000Z"),
				step:  45 * time.Minute,
			},
			result: [][2]time.Time{
				{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
			},
		},
		{
			ri: rangeIterator{
				start: parseTime(t, "2021-01-01T12:00:12.000Z"),
				end:   parseTime(t, "2021-01-01T12:00:17.000Z"),
				step:  time.Second,
			},
			result: [][2]time.Time{
				{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
				{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
				{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
				{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
				{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
			},
		},
	}

	for i, tc := range testCases {
		t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
			var j int
			for tc.ri.next() {
				if len(tc.result) < j+1 {
					t.Fatalf("unexpected result for iterator on step %d: %v - %v",
						j, tc.ri.s, tc.ri.e)
				}
				s, e := tc.ri.s, tc.ri.e
				expS, expE := tc.result[j][0], tc.result[j][1]
				if s != expS {
					t.Fatalf("expected to get start=%v; got %v", expS, s)
				}
				if e != expE {
					t.Fatalf("expected to get end=%v; got %v", expE, e)
				}
				j++
			}
			// make sure the iterator didn't stop earlier than expected
			if j != len(tc.result) {
				t.Fatalf("expected to get %d steps; got %d", len(tc.result), j)
			}
		})
	}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,118 +0,0 @@
package main
import (
"context"
"errors"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context at the given timestamp and limit.
// It returns the resulting time series, or an error
// if number of resulting time series exceeds the limit.
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// ExecRange executes the rule on the given time range
// and returns the resulting time series.
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// ToAPI converts Rule into APIRule
ToAPI() APIRule
// Close performs the shutdown procedures for rule
// such as metrics unregister
Close()
}
// errDuplicate is the sentinel error for rule results which contain
// several metrics with identical labelsets after rule labels are applied.
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
type ruleState struct {
sync.RWMutex
entries []ruleStateEntry
cur int
}
type ruleStateEntry struct {
// stores last moment of time rule.Exec was called
time time.Time
// stores the timestamp rule.Exec was called with
at time.Time
// stores the duration of the last rule.Exec call
duration time.Duration
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health ruleState
err error
// stores the number of samples returned during
// the last evaluation
samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If seriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
seriesFetched *int
// stores the curl command reflecting the HTTP request used during rule.Exec
curl string
}
func newRuleState(size int) *ruleState {
if size < 1 {
size = 1
}
return &ruleState{
entries: make([]ruleStateEntry, size),
}
}
func (s *ruleState) getLast() ruleStateEntry {
s.RLock()
defer s.RUnlock()
return s.entries[s.cur]
}
func (s *ruleState) size() int {
s.RLock()
defer s.RUnlock()
return len(s.entries)
}
func (s *ruleState) getAll() []ruleStateEntry {
entries := make([]ruleStateEntry, 0)
s.RLock()
defer s.RUnlock()
cur := s.cur
for {
e := s.entries[cur]
if !e.time.IsZero() || !e.at.IsZero() {
entries = append(entries, e)
}
cur--
if cur < 0 {
cur = cap(s.entries) - 1
}
if cur == s.cur {
return entries
}
}
}
func (s *ruleState) add(e ruleStateEntry) {
s.Lock()
defer s.Unlock()
s.cur++
if s.cur > cap(s.entries)-1 {
s.cur = 0
}
s.entries[s.cur] = e
}

View file

@ -1,11 +1,10 @@
package main package rule
import ( import (
"context" "context"
"fmt" "fmt"
"hash/fnv" "hash/fnv"
"sort" "sort"
"strconv"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
seriesFetched *utils.Gauge seriesFetched *utils.Gauge
} }
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule { // NewAlertingRule creates a new AlertingRule
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
ar := &AlertingRule{ ar := &AlertingRule{
Type: group.Type, Type: group.Type,
RuleID: cfg.ID, RuleID: cfg.ID,
@ -80,10 +80,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
metrics: &alertingRuleMetrics{}, metrics: &alertingRuleMetrics{},
} }
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil { if cfg.UpdateEntriesLimit != nil {
ar.state = newRuleState(*cfg.UpdateEntriesLimit) entrySize = *cfg.UpdateEntriesLimit
} else { }
ar.state = newRuleState(*ruleUpdateEntriesLimit) if entrySize < 1 {
entrySize = 1
}
ar.state = &ruleState{
entries: make([]StateEntry, entrySize),
} }
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID()) labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels), ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
func() float64 { func() float64 {
e := ar.state.getLast() e := ar.state.getLast()
if e.err == nil { if e.Err == nil {
return 0 return 0
} }
return 1 return 1
@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels), ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
func() float64 { func() float64 {
e := ar.state.getLast() e := ar.state.getLast()
return float64(e.samples) return float64(e.Samples)
}) })
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels), ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
func() float64 { func() float64 {
e := ar.state.getLast() e := ar.state.getLast()
if e.seriesFetched == nil { if e.SeriesFetched == nil {
// means seriesFetched is unsupported // means seriesFetched is unsupported
return -1 return -1
} }
seriesFetched := float64(*e.seriesFetched) seriesFetched := float64(*e.SeriesFetched)
if seriesFetched == 0 && e.samples > 0 { if seriesFetched == 0 && e.Samples > 0 {
// `alert: 0.95` will fetch no series // `alert: 0.95` will fetch no series
// but will get one time series in response. // but will get one time series in response.
seriesFetched = float64(e.samples) seriesFetched = float64(e.Samples)
} }
return seriesFetched return seriesFetched
}) })
return ar return ar
} }
// Close unregisters rule metrics // close unregisters rule metrics
func (ar *AlertingRule) Close() { func (ar *AlertingRule) close() {
ar.metrics.active.Unregister() ar.metrics.active.Unregister()
ar.metrics.pending.Unregister() ar.metrics.pending.Unregister()
ar.metrics.errors.Unregister() ar.metrics.errors.Unregister()
@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID return ar.RuleID
} }
// GetAlerts returns all the alerts currently tracked by the rule.
// The result is nil if the rule has no alerts.
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
	ar.alertsMu.RLock()
	defer ar.alertsMu.RUnlock()

	var result []*notifier.Alert
	for id := range ar.alerts {
		result = append(result, ar.alerts[id])
	}
	return result
}
// GetAlert returns the alert stored under the given id,
// or nil if the rule has no such alert.
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
	ar.alertsMu.RLock()
	defer ar.alertsMu.RUnlock()

	// lookup in a nil map is safe and yields nil as well
	if alert, found := ar.alerts[id]; found {
		return alert
	}
	return nil
}
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) { func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
if !ar.Debug { if !ar.Debug {
return return
@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
logger.Infof("%s", prefix+msg) logger.Infof("%s", prefix+msg)
} }
// updateWith overwrites the configuration fields, the querier and the state
// of ar with the ones from r, which must be *AlertingRule.
// The alerts of ar are left untouched since
// they should be updated in next 2 Execs.
func (ar *AlertingRule) updateWith(r Rule) error {
	src, isAlerting := r.(*AlertingRule)
	if !isAlerting {
		return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
	}

	// rule definition
	ar.Expr, ar.For, ar.KeepFiringFor = src.Expr, src.For, src.KeepFiringFor
	ar.Labels, ar.Annotations = src.Labels, src.Annotations
	ar.EvalInterval, ar.Debug = src.EvalInterval, src.Debug

	// runtime dependencies
	ar.q, ar.state = src.q, src.state
	return nil
}
type labelSet struct { type labelSet struct {
// origin labels extracted from received time series // origin labels extracted from received time series
// plus extra labels (group labels, service labels like alertNameLabel). // plus extra labels (group labels, service labels like alertNameLabel).
@ -248,11 +294,11 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
return ls, nil return ls, nil
} }
// ExecRange executes alerting rule on the given time range similarly to Exec. // execRange executes alerting rule on the given time range similarly to exec.
// It doesn't update internal states of the Rule and meant to be used just // It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling. // to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result. // It returns ALERT and ALERT_FOR_STATE time series as result.
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) { func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end) res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil { if err != nil {
return nil, err return nil, err
@ -297,19 +343,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
// is kept in memory state and consequently repeatedly sent to the AlertManager. // is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute const resolvedRetention = 15 * time.Minute
// Exec executes AlertingRule expression via the given Querier. // exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts // Based on the Querier results AlertingRule maintains notifier.Alerts
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) { func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now() start := time.Now()
res, req, err := ar.q.Query(ctx, ar.Expr, ts) res, req, err := ar.q.Query(ctx, ar.Expr, ts)
curState := ruleStateEntry{ curState := StateEntry{
time: start, Time: start,
at: ts, At: ts,
duration: time.Since(start), Duration: time.Since(start),
samples: len(res.Data), Samples: len(res.Data),
seriesFetched: res.SeriesFetched, SeriesFetched: res.SeriesFetched,
err: err, Err: err,
curl: requestToCurl(req), Curl: requestToCurl(req),
} }
defer func() { defer func() {
@ -323,7 +369,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err) return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
} }
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration) ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
for h, a := range ar.alerts { for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec // cleanup inactive alerts from previous Exec
@ -342,15 +388,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
for _, m := range res.Data { for _, m := range res.Data {
ls, err := ar.toLabels(m, qFn) ls, err := ar.toLabels(m, qFn)
if err != nil { if err != nil {
curState.err = fmt.Errorf("failed to expand labels: %s", err) curState.Err = fmt.Errorf("failed to expand labels: %s", err)
return nil, curState.err return nil, curState.Err
} }
h := hash(ls.processed) h := hash(ls.processed)
if _, ok := updated[h]; ok { if _, ok := updated[h]; ok {
// duplicate may be caused by extra labels // duplicate may be caused by extra labels
// conflicting with the metric labels // conflicting with the metric labels
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate) curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
return nil, curState.err return nil, curState.Err
} }
updated[h] = struct{}{} updated[h] = struct{}{}
if a, ok := ar.alerts[h]; ok { if a, ok := ar.alerts[h]; ok {
@ -373,8 +419,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
} }
a, err := ar.newAlert(m, ls, start, qFn) a, err := ar.newAlert(m, ls, start, qFn)
if err != nil { if err != nil {
curState.err = fmt.Errorf("failed to create alert: %w", err) curState.Err = fmt.Errorf("failed to create alert: %w", err)
return nil, curState.err return nil, curState.Err
} }
a.ID = h a.ID = h
a.State = notifier.StatePending a.State = notifier.StatePending
@ -423,8 +469,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
} }
if limit > 0 && numActivePending > limit { if limit > 0 && numActivePending > limit {
ar.alerts = map[uint64]*notifier.Alert{} ar.alerts = map[uint64]*notifier.Alert{}
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending) curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
return nil, curState.err return nil, curState.Err
} }
return ar.toTimeSeries(ts.Unix()), nil return ar.toTimeSeries(ts.Unix()), nil
} }
@ -441,26 +487,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
return tss return tss
} }
// UpdateWith overwrites the configuration fields, the querier and the state
// of ar with the ones from r, which must be *AlertingRule.
// The alerts of ar are left untouched since
// they should be updated in next 2 Execs.
func (ar *AlertingRule) UpdateWith(r Rule) error {
	src, isAlerting := r.(*AlertingRule)
	if !isAlerting {
		return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
	}

	// rule definition
	ar.Expr, ar.For, ar.KeepFiringFor = src.Expr, src.For, src.KeepFiringFor
	ar.Labels, ar.Annotations = src.Labels, src.Annotations
	ar.EvalInterval, ar.Debug = src.EvalInterval, src.Debug

	// runtime dependencies
	ar.q, ar.state = src.q, src.state
	return nil
}
// TODO: consider hashing algorithm in VM // TODO: consider hashing algorithm in VM
func hash(labels map[string]string) uint64 { func hash(labels map[string]string) uint64 {
hash := fnv.New64a() hash := fnv.New64a()
@ -503,102 +529,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
return a, err return a, err
} }
// AlertAPI returns the APIAlert representation of the alert
// with the given id (hash), or nil if there is no such alert.
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
	ar.alertsMu.RLock()
	defer ar.alertsMu.RUnlock()

	if alert, found := ar.alerts[id]; found {
		return ar.newAlertAPI(*alert)
	}
	return nil
}
// ToAPI builds the APIRule representation of the rule from its
// configuration, the last evaluation state and the current alerts.
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
func (ar *AlertingRule) ToAPI() APIRule {
	last := ar.state.getLast()
	apiRule := APIRule{
		Type:              "alerting",
		DatasourceType:    ar.Type.String(),
		Name:              ar.Name,
		Query:             ar.Expr,
		Duration:          ar.For.Seconds(),
		KeepFiringFor:     ar.KeepFiringFor.Seconds(),
		Labels:            ar.Labels,
		Annotations:       ar.Annotations,
		LastEvaluation:    last.time,
		EvaluationTime:    last.duration.Seconds(),
		Health:            "ok",
		State:             "inactive",
		Alerts:            ar.AlertsToAPI(),
		LastSamples:       last.samples,
		LastSeriesFetched: last.seriesFetched,
		MaxUpdates:        ar.state.size(),
		Updates:           ar.state.getAll(),
		Debug:             ar.Debug,

		// encode as strings to avoid rounding in JSON
		ID:      fmt.Sprintf("%d", ar.ID()),
		GroupID: fmt.Sprintf("%d", ar.GroupID),
	}

	if lastErr := last.err; lastErr != nil {
		apiRule.LastError = lastErr.Error()
		apiRule.Health = "err"
	}

	// satisfy APIRule.State logic: the rule is firing if at least
	// one of its alerts is firing, and pending if it has alerts at all
	if len(apiRule.Alerts) == 0 {
		return apiRule
	}
	firing := notifier.StateFiring.String()
	apiRule.State = notifier.StatePending.String()
	for _, alert := range apiRule.Alerts {
		if alert.State == firing {
			apiRule.State = firing
			break
		}
	}
	return apiRule
}
// AlertsToAPI returns APIAlert representations of all the rule alerts
// except the inactive ones. The result is nil if there are no such alerts.
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
	ar.alertsMu.RLock()
	defer ar.alertsMu.RUnlock()

	var result []*APIAlert
	for _, alert := range ar.alerts {
		if alert.State != notifier.StateInactive {
			result = append(result, ar.newAlertAPI(*alert))
		}
	}
	return result
}
// newAlertAPI converts the given alert of the rule into APIAlert.
// SourceLink is filled only if alertURLGeneratorFn is set.
// Stabilizing is set for firing alerts with non-zero KeepFiringSince.
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
	apiAlert := &APIAlert{
		// encode as strings to avoid rounding
		ID:      fmt.Sprintf("%d", a.ID),
		GroupID: fmt.Sprintf("%d", a.GroupID),
		RuleID:  fmt.Sprintf("%d", ar.RuleID),

		Name:        a.Name,
		Expression:  ar.Expr,
		Labels:      a.Labels,
		Annotations: a.Annotations,
		State:       a.State.String(),
		ActiveAt:    a.ActiveAt,
		Restored:    a.Restored,
		Value:       strconv.FormatFloat(a.Value, 'f', -1, 32),
		Stabilizing: a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero(),
	}
	if alertURLGeneratorFn != nil {
		apiAlert.SourceLink = alertURLGeneratorFn(a)
	}
	return apiAlert
}
const ( const (
// alertMetricName is the metric name for synthetic alert timeseries. // alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS" alertMetricName = "ALERTS"
@ -646,10 +576,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels) return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
} }
// Restore restores the value of ActiveAt field for active alerts, // restore restores the value of ActiveAt field for active alerts,
// based on previously written time series `alertForStateMetricName`. // based on previously written time series `alertForStateMetricName`.
// Only rules with For > 0 can be restored. // Only rules with For > 0 can be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error { func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
if ar.For < 1 { if ar.For < 1 {
return nil return nil
} }

View file

@ -1,4 +1,4 @@
package main package rule
import ( import (
"context" "context"
@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"} fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) { t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
tc.rule.q = fq tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID() tc.rule.GroupID = fakeGroup.ID()
for i, step := range tc.steps { for i, step := range tc.steps {
fq.reset() fq.Reset()
fq.add(step...) fq.Add(step...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil { if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err) t.Fatalf("unexpected err: %s", err)
} }
// artificial delay between applying steps // artificial delay between applying steps
@ -482,11 +482,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"} fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) { t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
tc.rule.q = fq tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID() tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.data...) fq.Add(tc.data...)
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now()) gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil { if err != nil {
t.Fatalf("unexpected err: %s", err) t.Fatalf("unexpected err: %s", err)
} }
@ -518,24 +518,24 @@ func TestAlertingRule_ExecRange(t *testing.T) {
func TestGroup_Restore(t *testing.T) { func TestGroup_Restore(t *testing.T) {
defaultTS := time.Now() defaultTS := time.Now()
fqr := &fakeQuerierWithRegistry{} fqr := &datasource.FakeQuerierWithRegistry{}
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) { fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
t.Helper() t.Helper()
defer fqr.reset() defer fqr.Reset()
for _, r := range rules { for _, r := range rules {
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert)) fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
} }
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil) fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
wg.Add(1) wg.Add(1)
go func() { go func() {
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} } nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
fg.start(context.Background(), nts, nil, fqr) fg.Start(context.Background(), nts, nil, fqr)
wg.Done() wg.Done()
}() }()
fg.close() fg.Close()
wg.Wait() wg.Wait()
gotAlerts := make(map[uint64]*notifier.Alert) gotAlerts := make(map[uint64]*notifier.Alert)
@ -582,11 +582,11 @@ func TestGroup_Restore(t *testing.T) {
ActiveAt: defaultTS, ActiveAt: defaultTS,
}, },
}) })
fqr.reset() fqr.Reset()
// one active alert with state restore // one active alert with state restore
ts := time.Now().Truncate(time.Hour) ts := time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts)) stateMetric("foo", ts))
fn( fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}}, []config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -598,7 +598,7 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, one with state restored // two rules, two active alerts, one with state restored
ts = time.Now().Truncate(time.Hour) ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("foo", ts)) stateMetric("foo", ts))
fn( fn(
[]config.Rule{ []config.Rule{
@ -616,9 +616,9 @@ func TestGroup_Restore(t *testing.T) {
// two rules, two active alerts, two with state restored // two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour) ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts)) stateMetric("foo", ts))
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts)) stateMetric("bar", ts))
fn( fn(
[]config.Rule{ []config.Rule{
@ -636,7 +636,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert but wrong state restore // one active alert but wrong state restore
ts = time.Now().Truncate(time.Hour) ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
stateMetric("wrong alert", ts)) stateMetric("wrong alert", ts))
fn( fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}}, []config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
@ -648,7 +648,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with labels // one active alert with labels
ts = time.Now().Truncate(time.Hour) ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev")) stateMetric("foo", ts, "env", "dev"))
fn( fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}}, []config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -660,7 +660,7 @@ func TestGroup_Restore(t *testing.T) {
// one active alert with restore labels missmatch // one active alert with restore labels missmatch
ts = time.Now().Truncate(time.Hour) ts = time.Now().Truncate(time.Hour)
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`, fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev", "team", "foo")) stateMetric("foo", ts, "env", "dev", "team", "foo"))
fn( fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}}, []config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
@ -672,30 +672,30 @@ func TestGroup_Restore(t *testing.T) {
} }
func TestAlertingRule_Exec_Negative(t *testing.T) { func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0) ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"} ar.Labels = map[string]string{"job": "test"}
ar.q = fq ar.q = fq
// successful attempt // successful attempt
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
_, err := ar.Exec(context.TODO(), time.Now(), 0) _, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
// label `job` will collide with rule extra label and will make both time series equal // label `job` will collide with rule extra label and will make both time series equal
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err = ar.Exec(context.TODO(), time.Now(), 0) _, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) { if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err) t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
} }
fq.reset() fq.Reset()
expErr := "connection reset by peer" expErr := "connection reset by peer"
fq.setErr(errors.New(expErr)) fq.SetErr(errors.New(expErr))
_, err = ar.Exec(context.TODO(), time.Now(), 0) _, err = ar.exec(context.TODO(), time.Now(), 0)
if err == nil { if err == nil {
t.Fatalf("expected to get err; got nil") t.Fatalf("expected to get err; got nil")
} }
@ -705,7 +705,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
} }
func TestAlertingRuleLimit(t *testing.T) { func TestAlertingRuleLimit(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0) ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"} ar.Labels = map[string]string{"job": "test"}
ar.q = fq ar.q = fq
@ -737,15 +737,15 @@ func TestAlertingRuleLimit(t *testing.T) {
err error err error
timestamp = time.Now() timestamp = time.Now()
) )
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
for _, testCase := range testCases { for _, testCase := range testCases {
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit) _, err = ar.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) { if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err) t.Fatal(err)
} }
} }
fq.reset() fq.Reset()
} }
func TestAlertingRule_Template(t *testing.T) { func TestAlertingRule_Template(t *testing.T) {
@ -870,12 +870,12 @@ func TestAlertingRule_Template(t *testing.T) {
fakeGroup := Group{Name: "TestRule_Exec"} fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) { t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
tc.rule.GroupID = fakeGroup.ID() tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq tc.rule.q = fq
tc.rule.state = newRuleState(10) tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
fq.add(tc.metrics...) fq.Add(tc.metrics...)
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil { if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err) t.Fatalf("unexpected err: %s", err)
} }
for hash, expAlert := range tc.expAlerts { for hash, expAlert := range tc.expAlerts {
@ -989,7 +989,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
For: waitFor, For: waitFor,
EvalInterval: waitFor, EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert), alerts: make(map[uint64]*notifier.Alert),
state: newRuleState(10), state: &ruleState{entries: make([]StateEntry, 10)},
} }
return &rule return &rule
} }

View file

@ -1,8 +1,10 @@
package main package rule
import ( import (
"context" "context"
"encoding/json"
"errors" "errors"
"flag"
"fmt" "fmt"
"hash/fnv" "hash/fnv"
"net/url" "net/url"
@ -11,7 +13,7 @@ import (
"sync" "sync"
"time" "time"
"github.com/VictoriaMetrics/metrics" "github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@ -21,6 +23,18 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
var (
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
"which by default is 4 times evaluationInterval of the parent ")
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
) )
// Group is an entity for grouping rules // Group is an entity for grouping rules
@ -96,7 +110,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
return r return r
} }
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group { // NewGroup returns a new group
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{ g := &Group{
Type: cfg.Type, Type: cfg.Type,
Name: cfg.Name, Name: cfg.Name,
@ -153,11 +168,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
return g return g
} }
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule { func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
if rule.Alert != "" { if r.Alert != "" {
return newAlertingRule(qb, g, rule) return NewAlertingRule(qb, g, r)
} }
return newRecordingRule(qb, g, rule) return NewRecordingRule(qb, g, r)
} }
// ID return unique group ID that consists of // ID return unique group ID that consists of
@ -178,8 +193,8 @@ func (g *Group) ID() uint64 {
return hash.Sum64() return hash.Sum64()
} }
// Restore restores alerts state for group rules // restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error { func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
for _, rule := range g.Rules { for _, rule := range g.Rules {
ar, ok := rule.(*AlertingRule) ar, ok := rule.(*AlertingRule)
if !ok { if !ok {
@ -195,7 +210,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
Headers: g.Headers, Headers: g.Headers,
Debug: ar.Debug, Debug: ar.Debug,
}) })
if err := ar.Restore(ctx, q, ts, lookback); err != nil { if err := ar.restore(ctx, q, ts, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err) return fmt.Errorf("error while restoring rule %q: %w", rule, err)
} }
} }
@ -205,7 +220,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
// updateWith updates existing group with // updateWith updates existing group with
// passed group object. This function ignores group // passed group object. This function ignores group
// evaluation interval change. It supposed to be updated // evaluation interval change. It supposed to be updated
// in group.start function. // in group.Start function.
// Not thread-safe. // Not thread-safe.
func (g *Group) updateWith(newGroup *Group) error { func (g *Group) updateWith(newGroup *Group) error {
rulesRegistry := make(map[uint64]Rule) rulesRegistry := make(map[uint64]Rule)
@ -218,11 +233,11 @@ func (g *Group) updateWith(newGroup *Group) error {
if !ok { if !ok {
// old rule is not present in the new list // old rule is not present in the new list
// so we mark it for removing // so we mark it for removing
g.Rules[i].Close() g.Rules[i].close()
g.Rules[i] = nil g.Rules[i] = nil
continue continue
} }
if err := or.UpdateWith(nr); err != nil { if err := or.updateWith(nr); err != nil {
return err return err
} }
delete(rulesRegistry, nr.ID()) delete(rulesRegistry, nr.ID())
@ -255,10 +270,10 @@ func (g *Group) updateWith(newGroup *Group) error {
return nil return nil
} }
// interruptEval interrupts in-flight rules evaluations // InterruptEval interrupts in-flight rules evaluations
// within the group. It is expected that g.evalCancel // within the group. It is expected that g.evalCancel
// will be repopulated after the call. // will be repopulated after the call.
func (g *Group) interruptEval() { func (g *Group) InterruptEval() {
g.mu.RLock() g.mu.RLock()
defer g.mu.RUnlock() defer g.mu.RUnlock()
@ -267,12 +282,13 @@ func (g *Group) interruptEval() {
} }
} }
func (g *Group) close() { // Close stops the group and it's rules, unregisters group metrics
func (g *Group) Close() {
if g.doneCh == nil { if g.doneCh == nil {
return return
} }
close(g.doneCh) close(g.doneCh)
g.interruptEval() g.InterruptEval()
<-g.finishedCh <-g.finishedCh
g.metrics.iterationDuration.Unregister() g.metrics.iterationDuration.Unregister()
@ -280,19 +296,21 @@ func (g *Group) close() {
g.metrics.iterationMissed.Unregister() g.metrics.iterationMissed.Unregister()
g.metrics.iterationInterval.Unregister() g.metrics.iterationInterval.Unregister()
for _, rule := range g.Rules { for _, rule := range g.Rules {
rule.Close() rule.close()
} }
} }
var skipRandSleepOnGroupStart bool // SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
var SkipRandSleepOnGroupStart bool
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) { // Start starts group's evaluation
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
defer func() { close(g.finishedCh) }() defer func() { close(g.finishedCh) }()
evalTS := time.Now() evalTS := time.Now()
// sleep random duration to spread group rules evaluation // sleep random duration to spread group rules evaluation
// over time in order to reduce load on datasource. // over time in order to reduce load on datasource.
if !skipRandSleepOnGroupStart { if !SkipRandSleepOnGroupStart {
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset) sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
g.infof("will start in %v", sleepBeforeStart) g.infof("will start in %v", sleepBeforeStart)
@ -310,10 +328,10 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
} }
e := &executor{ e := &executor{
rw: rw, Rw: rw,
notifiers: nts, Notifiers: nts,
notifierHeaders: g.NotifierHeaders, notifierHeaders: g.NotifierHeaders,
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label), PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
} }
g.infof("started") g.infof("started")
@ -355,7 +373,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
// restore the rules state after the first evaluation // restore the rules state after the first evaluation
// so only active alerts can be restored. // so only active alerts can be restored.
if rr != nil { if rr != nil {
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack) err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
if err != nil { if err != nil {
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err) logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
} }
@ -409,6 +427,22 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
} }
} }
// UpdateWith sends newGroup to the group's update channel (g.updateCh).
// NOTE(review): whether the send blocks depends on how updateCh is created
// and consumed, which is not visible here - confirm against Start.
func (g *Group) UpdateWith(newGroup *Group) {
	// the parameter is named newGroup so it doesn't shadow the builtin `new`
	g.updateCh <- newGroup
}
// DeepCopy returns a copy of the group produced by a JSON round-trip.
//
// Only the fields that take part in JSON encoding are carried over to the copy.
// Rules are NOT duplicated: the returned group refers to the same Rules slice
// as the original group.
//
// Marshal/unmarshal failures are logged; in that case the returned group
// contains only whatever was decoded plus the shared Rules.
func (g *Group) DeepCopy() *Group {
	// Take the whole snapshot (encoded fields, rules and name) under one
	// read lock, so the copy is consistent with a single state of the group.
	g.mu.RLock()
	data, err := json.Marshal(g)
	rules := g.Rules
	name := g.Name
	g.mu.RUnlock()

	newG := Group{}
	if err != nil {
		logger.Errorf("cannot marshal group %q for copying: %s", name, err)
	} else if err := json.Unmarshal(data, &newG); err != nil {
		logger.Errorf("cannot unmarshal copy of group %q: %s", name, err)
	}
	newG.Rules = rules
	return &newG
}
// delayBeforeStart returns a duration on the interval between [ts..ts+interval]. // delayBeforeStart returns a duration on the interval between [ts..ts+interval].
// delayBeforeStart accounts for `offset`, so returned duration should be always // delayBeforeStart accounts for `offset`, so returned duration should be always
// bigger than the `offset`. // bigger than the `offset`.
@ -438,6 +472,89 @@ func (g *Group) infof(format string, args ...interface{}) {
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency) g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
} }
// Replay evaluates every rule of the group over the [start, end] time range
// and returns the sum of the counts reported by replayRule.
//
// The range is split into windows of g.Interval*maxDataPoint; each window is
// passed to replayRule together with rw and replayRuleRetryAttempts.
// After each rule the function sleeps for replayDelay.
// A progress bar is shown per rule unless disableProgressBar is set.
//
// Any replayRule error, as well as a non-positive window size, is reported
// via logger.Fatalf.
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
	var total int
	step := g.Interval * time.Duration(maxDataPoint)
	if step <= 0 {
		// A zero step would panic on the division below,
		// a negative one would make the range iterator loop forever.
		logger.Fatalf("group %q: replay step must be positive; got %v (interval=%v, maxDataPoint=%d)",
			g.Name, step, g.Interval, maxDataPoint)
	}
	ri := rangeIterator{start: start, end: end, step: step}
	iterations := int(end.Sub(start)/step) + 1
	fmt.Printf("\nGroup %q"+
		"\ninterval: \t%v"+
		"\nrequests to make: \t%d"+
		"\nmax range per request: \t%v\n",
		g.Name, g.Interval, iterations, step)
	if g.Limit > 0 {
		fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
			g.Limit)
	}
	for _, rule := range g.Rules {
		fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
		var bar *pb.ProgressBar
		if !disableProgressBar {
			bar = pb.StartNew(iterations)
		}
		// the iterator is shared between rules, so rewind it for each one
		ri.reset()
		for ri.next() {
			n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
			if err != nil {
				logger.Fatalf("rule %q: %s", rule, err)
			}
			total += n
			if bar != nil {
				bar.Increment()
			}
		}
		if bar != nil {
			bar.Finish()
		}
		// sleep to let remote storage to flush data on-disk
		// so chained rules could be calculated correctly
		time.Sleep(replayDelay)
	}
	return total
}
// ExecOnce evaluates all the rules under group for once with given timestamp.
//
// The rules are run through a fresh executor configured with the given
// notifiers and remote-write client, using the group's concurrency and limit.
// The returned channel is the one produced by execConcurrently.
// It is nil when the group has no rules; note that receiving from or ranging
// over a nil channel blocks forever, so callers must handle that case.
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
	if len(g.Rules) == 0 {
		return nil
	}

	runner := &executor{
		Rw:                       rw,
		Notifiers:                nts,
		notifierHeaders:          g.NotifierHeaders,
		PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
	}
	resolveAfter := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
	return runner.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveAfter, g.Limit)
}
type rangeIterator struct {
step time.Duration
start, end time.Time
iter int
s, e time.Time
}
func (ri *rangeIterator) reset() {
ri.iter = 0
ri.s, ri.e = time.Time{}, time.Time{}
}
func (ri *rangeIterator) next() bool {
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
if !ri.end.After(ri.s) {
return false
}
ri.e = ri.s.Add(ri.step)
if ri.e.After(ri.end) {
ri.e = ri.end
}
ri.iter++
return true
}
// getResolveDuration returns the duration after which firing alert // getResolveDuration returns the duration after which firing alert
// can be considered as resolved. // can be considered as resolved.
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration { func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
@ -477,20 +594,22 @@ func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
return timestamp return timestamp
} }
// executor contains group's notify and rw configs
type executor struct { type executor struct {
notifiers func() []notifier.Notifier Notifiers func() []notifier.Notifier
notifierHeaders map[string]string notifierHeaders map[string]string
rw *remotewrite.Client Rw remotewrite.RWClient
previouslySentSeriesToRWMu sync.Mutex previouslySentSeriesToRWMu sync.Mutex
// previouslySentSeriesToRW stores series sent to RW on previous iteration // PreviouslySentSeriesToRW stores series sent to RW on previous iteration
// map[ruleID]map[ruleLabels][]prompb.Label // map[ruleID]map[ruleLabels][]prompb.Label
// where `ruleID` is ID of the Rule within a Group // where `ruleID` is ID of the Rule within a Group
// and `ruleLabels` is []prompb.Label marshalled to a string // and `ruleLabels` is []prompb.Label marshalled to a string
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label PreviouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
} }
// execConcurrently executes rules concurrently if concurrency>1
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error { func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
res := make(chan error, len(rules)) res := make(chan error, len(rules))
if concurrency == 1 { if concurrency == 1 {
@ -505,14 +624,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
sem := make(chan struct{}, concurrency) sem := make(chan struct{}, concurrency)
go func() { go func() {
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
for _, rule := range rules { for _, r := range rules {
sem <- struct{}{} sem <- struct{}{}
wg.Add(1) wg.Add(1)
go func(r Rule) { go func(r Rule) {
res <- e.exec(ctx, r, ts, resolveDuration, limit) res <- e.exec(ctx, r, ts, resolveDuration, limit)
<-sem <-sem
wg.Done() wg.Done()
}(rule) }(r)
} }
wg.Wait() wg.Wait()
close(res) close(res)
@ -530,10 +649,10 @@ var (
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`) remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
) )
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error { func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
execTotal.Inc() execTotal.Inc()
tss, err := rule.Exec(ctx, ts, limit) tss, err := r.exec(ctx, ts, limit)
if err != nil { if err != nil {
if errors.Is(err, context.Canceled) { if errors.Is(err, context.Canceled) {
// the context can be cancelled on graceful shutdown // the context can be cancelled on graceful shutdown
@ -541,17 +660,17 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return nil return nil
} }
execErrors.Inc() execErrors.Inc()
return fmt.Errorf("rule %q: failed to execute: %w", rule, err) return fmt.Errorf("rule %q: failed to execute: %w", r, err)
} }
if e.rw != nil { if e.Rw != nil {
pushToRW := func(tss []prompbmarshal.TimeSeries) error { pushToRW := func(tss []prompbmarshal.TimeSeries) error {
var lastErr error var lastErr error
for _, ts := range tss { for _, ts := range tss {
remoteWriteTotal.Inc() remoteWriteTotal.Inc()
if err := e.rw.Push(ts); err != nil { if err := e.Rw.Push(ts); err != nil {
remoteWriteErrors.Inc() remoteWriteErrors.Inc()
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err) lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
} }
} }
return lastErr return lastErr
@ -560,13 +679,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
return err return err
} }
staleSeries := e.getStaleSeries(rule, tss, ts) staleSeries := e.getStaleSeries(r, tss, ts)
if err := pushToRW(staleSeries); err != nil { if err := pushToRW(staleSeries); err != nil {
return err return err
} }
} }
ar, ok := rule.(*AlertingRule) ar, ok := r.(*AlertingRule)
if !ok { if !ok {
return nil return nil
} }
@ -578,11 +697,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
errGr := new(utils.ErrGroup) errGr := new(utils.ErrGroup)
for _, nt := range e.notifiers() { for _, nt := range e.Notifiers() {
wg.Add(1) wg.Add(1)
go func(nt notifier.Notifier) { go func(nt notifier.Notifier) {
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil { if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err)) errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
} }
wg.Done() wg.Done()
}(nt) }(nt)
@ -592,7 +711,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
} }
// getStaledSeries checks whether there are stale series from previously sent ones. // getStaledSeries checks whether there are stale series from previously sent ones.
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries { func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss)) ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
for _, ts := range tss { for _, ts := range tss {
// convert labels to strings so we can compare with previously sent series // convert labels to strings so we can compare with previously sent series
@ -600,11 +719,11 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
ruleLabels[key] = ts.Labels ruleLabels[key] = ts.Labels
} }
rID := rule.ID() rID := r.ID()
var staleS []prompbmarshal.TimeSeries var staleS []prompbmarshal.TimeSeries
// check whether there are series which disappeared and need to be marked as stale // check whether there are series which disappeared and need to be marked as stale
e.previouslySentSeriesToRWMu.Lock() e.previouslySentSeriesToRWMu.Lock()
for key, labels := range e.previouslySentSeriesToRW[rID] { for key, labels := range e.PreviouslySentSeriesToRW[rID] {
if _, ok := ruleLabels[key]; ok { if _, ok := ruleLabels[key]; ok {
continue continue
} }
@ -613,7 +732,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
staleS = append(staleS, ss) staleS = append(staleS, ss)
} }
// set previous series to current // set previous series to current
e.previouslySentSeriesToRW[rID] = ruleLabels e.PreviouslySentSeriesToRW[rID] = ruleLabels
e.previouslySentSeriesToRWMu.Unlock() e.previouslySentSeriesToRWMu.Unlock()
return staleS return staleS
@ -631,14 +750,14 @@ func (e *executor) purgeStaleSeries(activeRules []Rule) {
for _, rule := range activeRules { for _, rule := range activeRules {
id := rule.ID() id := rule.ID()
prev, ok := e.previouslySentSeriesToRW[id] prev, ok := e.PreviouslySentSeriesToRW[id]
if ok { if ok {
// keep previous series for staleness detection // keep previous series for staleness detection
newPreviouslySentSeriesToRW[id] = prev newPreviouslySentSeriesToRW[id] = prev
} }
} }
e.previouslySentSeriesToRW = nil e.PreviouslySentSeriesToRW = nil
e.previouslySentSeriesToRW = newPreviouslySentSeriesToRW e.PreviouslySentSeriesToRW = newPreviouslySentSeriesToRW
e.previouslySentSeriesToRWMu.Unlock() e.previouslySentSeriesToRWMu.Unlock()
} }

View file

@ -1,17 +1,22 @@
package main package rule
import ( import (
"context" "context"
"fmt" "fmt"
"math" "math"
"os"
"reflect" "reflect"
"sort" "sort"
"testing" "testing"
"time" "time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils" "github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
@ -20,7 +25,15 @@ import (
func init() { func init() {
// Disable rand sleep on group start during tests in order to speed up test execution. // Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code. // Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true SkipRandSleepOnGroupStart = true
}
func TestMain(m *testing.M) {
if err := templates.Load([]string{}, true); err != nil {
fmt.Println("failed to load template for test")
os.Exit(1)
}
os.Exit(m.Run())
} }
func TestUpdateWith(t *testing.T) { func TestUpdateWith(t *testing.T) {
@ -138,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
g := &Group{Name: "test"} g := &Group{Name: "test"}
qb := &fakeQuerier{} qb := &datasource.FakeQuerier{}
for _, r := range tc.currentRules { for _, r := range tc.currentRules {
r.ID = config.HashRule(r) r.ID = config.HashRule(r)
g.Rules = append(g.Rules, g.newRule(qb, r)) g.Rules = append(g.Rules, g.newRule(qb, r))
@ -170,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
if got.ID() != want.ID() { if got.ID() != want.ID() {
t.Fatalf("expected to have rule %q; got %q", want, got) t.Fatalf("expected to have rule %q; got %q", want, got)
} }
if err := compareRules(t, got, want); err != nil { if err := CompareRules(t, got, want); err != nil {
t.Fatalf("comparison error: %s", err) t.Fatalf("comparison error: %s", err)
} }
} }
@ -179,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
} }
func TestGroupStart(t *testing.T) { func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file const (
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true) rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil { if err != nil {
t.Fatalf("failed to parse rules: %s", err) t.Fatalf("failed to parse rules: %s", err)
} }
fs := &fakeQuerier{} fs := &datasource.FakeQuerier{}
fn := &fakeNotifier{} fn := &notifier.FakeNotifier{}
const evalInterval = time.Millisecond const evalInterval = time.Millisecond
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"}) g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2 g.Concurrency = 2
const inst1, inst2, job = "foo", "bar", "baz" const inst1, inst2, job = "foo", "bar", "baz"
@ -204,7 +231,7 @@ func TestGroupStart(t *testing.T) {
alert1.State = notifier.StateFiring alert1.State = notifier.StateFiring
// add external label // add external label
alert1.Labels["cluster"] = "east-1" alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules // add rule labels
alert1.Labels["label"] = "bar" alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1 alert1.Labels["host"] = inst1
// add service labels // add service labels
@ -219,7 +246,7 @@ func TestGroupStart(t *testing.T) {
alert2.State = notifier.StateFiring alert2.State = notifier.StateFiring
// add external label // add external label
alert2.Labels["cluster"] = "east-1" alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules // add rule labels
alert2.Labels["label"] = "bar" alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2 alert2.Labels["host"] = inst2
// add service labels // add service labels
@ -228,40 +255,40 @@ func TestGroupStart(t *testing.T) {
alert2.ID = hash(alert2.Labels) alert2.ID = hash(alert2.Labels)
finished := make(chan struct{}) finished := make(chan struct{})
fs.add(m1) fs.Add(m1)
fs.add(m2) fs.Add(m2)
go func() { go func() {
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs) g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
close(finished) close(finished)
}() }()
// wait for multiple evals // wait for multiple evals
time.Sleep(20 * evalInterval) time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts() gotAlerts := fn.GetAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2} expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts) compareAlerts(t, expectedAlerts, gotAlerts)
gotAlertsNum := fn.getCounter() gotAlertsNum := fn.GetCounter()
if gotAlertsNum < len(expectedAlerts)*2 { if gotAlertsNum < len(expectedAlerts)*2 {
t.Fatalf("expected to receive at least %d alerts; got %d instead", t.Fatalf("expected to receive at least %d alerts; got %d instead",
len(expectedAlerts)*2, gotAlertsNum) len(expectedAlerts)*2, gotAlertsNum)
} }
// reset previous data // reset previous data
fs.reset() fs.Reset()
// and set only one datapoint for response // and set only one datapoint for response
fs.add(m1) fs.Add(m1)
// wait for multiple evals // wait for multiple evals
time.Sleep(20 * evalInterval) time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts() gotAlerts = fn.GetAlerts()
alert2.State = notifier.StateInactive alert2.State = notifier.StateInactive
expectedAlerts = []notifier.Alert{*alert1, *alert2} expectedAlerts = []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts) compareAlerts(t, expectedAlerts, gotAlerts)
g.close() g.Close()
<-finished <-finished
} }
@ -294,15 +321,15 @@ func TestResolveDuration(t *testing.T) {
func TestGetStaleSeries(t *testing.T) { func TestGetStaleSeries(t *testing.T) {
ts := time.Now() ts := time.Now()
e := &executor{ e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label), PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
} }
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) { f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
t.Helper() t.Helper()
var tss []prompbmarshal.TimeSeries var tss []prompbmarshal.TimeSeries
for _, l := range labels { for _, l := range labels {
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l)) tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
} }
staleS := e.getStaleSeries(rule, tss, ts) staleS := e.getStaleSeries(r, tss, ts)
if staleS == nil && expLabels == nil { if staleS == nil && expLabels == nil {
return return
} }
@ -387,7 +414,7 @@ func TestPurgeStaleSeries(t *testing.T) {
f := func(curRules, newRules, expStaleRules []Rule) { f := func(curRules, newRules, expStaleRules []Rule) {
t.Helper() t.Helper()
e := &executor{ e := &executor{
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label), PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
} }
// seed executor with series for // seed executor with series for
// current rules // current rules
@ -397,13 +424,13 @@ func TestPurgeStaleSeries(t *testing.T) {
e.purgeStaleSeries(newRules) e.purgeStaleSeries(newRules)
if len(e.previouslySentSeriesToRW) != len(expStaleRules) { if len(e.PreviouslySentSeriesToRW) != len(expStaleRules) {
t.Fatalf("expected to get %d stale series, got %d", t.Fatalf("expected to get %d stale series, got %d",
len(expStaleRules), len(e.previouslySentSeriesToRW)) len(expStaleRules), len(e.PreviouslySentSeriesToRW))
} }
for _, exp := range expStaleRules { for _, exp := range expStaleRules {
if _, ok := e.previouslySentSeriesToRW[exp.ID()]; !ok { if _, ok := e.PreviouslySentSeriesToRW[exp.ID()]; !ok {
t.Fatalf("expected to have rule %d; got nil instead", exp.ID()) t.Fatalf("expected to have rule %d; got nil instead", exp.ID())
} }
} }
@ -438,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
} }
func TestFaultyNotifier(t *testing.T) { func TestFaultyNotifier(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := newTestAlertingRule("instant", 0) r := newTestAlertingRule("instant", 0)
r.q = fq r.q = fq
fn := &fakeNotifier{} fn := &notifier.FakeNotifier{}
e := &executor{ e := &executor{
notifiers: func() []notifier.Notifier { Notifiers: func() []notifier.Notifier {
return []notifier.Notifier{ return []notifier.Notifier{
&faultyNotifier{}, &notifier.FaultyNotifier{},
fn, fn,
} }
}, },
@ -464,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
tn := time.Now() tn := time.Now()
deadline := tn.Add(delay / 2) deadline := tn.Add(delay / 2)
for { for {
if fn.getCounter() > 0 { if fn.GetCounter() > 0 {
return return
} }
if tn.After(deadline) { if tn.After(deadline) {
@ -477,18 +504,18 @@ func TestFaultyNotifier(t *testing.T) {
} }
func TestFaultyRW(t *testing.T) { func TestFaultyRW(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
r := &RecordingRule{ r := &RecordingRule{
Name: "test", Name: "test",
state: newRuleState(10),
q: fq, q: fq,
state: &ruleState{entries: make([]StateEntry, 10)},
} }
e := &executor{ e := &executor{
rw: &remotewrite.Client{}, Rw: &remotewrite.Client{},
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label), PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
} }
err := e.exec(context.Background(), r, time.Now(), 0, 10) err := e.exec(context.Background(), r, time.Now(), 0, 10)
@ -498,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
} }
func TestCloseWithEvalInterruption(t *testing.T) { func TestCloseWithEvalInterruption(t *testing.T) {
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true) const (
rules = `
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"
`
)
var groups []config.Group
err := yaml.Unmarshal([]byte(rules), &groups)
if err != nil { if err != nil {
t.Fatalf("failed to parse rules: %s", err) t.Fatalf("failed to parse rules: %s", err)
} }
const delay = time.Second * 2 const delay = time.Second * 2
fq := &fakeQuerierWithDelay{delay: delay} fq := &datasource.FakeQuerierWithDelay{Delay: delay}
const evalInterval = time.Millisecond const evalInterval = time.Millisecond
g := newGroup(groups[0], fq, evalInterval, nil) g := NewGroup(groups[0], fq, evalInterval, nil)
go g.start(context.Background(), nil, nil, nil) go g.Start(context.Background(), nil, nil, nil)
time.Sleep(evalInterval * 20) time.Sleep(evalInterval * 20)
go func() { go func() {
g.close() g.Close()
}() }()
deadline := time.Tick(delay / 2) deadline := time.Tick(delay / 2)
@ -637,3 +679,81 @@ func TestGetPrometheusReqTimestamp(t *testing.T) {
} }
} }
} }
func TestRangeIterator(t *testing.T) {
testCases := []struct {
ri rangeIterator
result [][2]time.Time
}{
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 5 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
step: 45 * time.Minute,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
},
},
{
ri: rangeIterator{
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
step: time.Second,
},
result: [][2]time.Time{
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
var j int
for tc.ri.next() {
if len(tc.result) < j+1 {
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
j, tc.ri.s, tc.ri.e)
}
s, e := tc.ri.s, tc.ri.e
expS, expE := tc.result[j][0], tc.result[j][1]
if s != expS {
t.Fatalf("expected to get start=%v; got %v", expS, s)
}
if e != expE {
t.Fatalf("expected to get end=%v; got %v", expE, e)
}
j++
}
})
}
}
func parseTime(t *testing.T, s string) time.Time {
t.Helper()
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
if err != nil {
t.Fatal(err)
}
return tt
}

View file

@ -1,4 +1,4 @@
package main package rule
import ( import (
"context" "context"
@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
return rr.RuleID return rr.RuleID
} }
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule { // NewRecordingRule creates a new RecordingRule
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
rr := &RecordingRule{ rr := &RecordingRule{
Type: group.Type, Type: group.Type,
RuleID: cfg.ID, RuleID: cfg.ID,
@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
}), }),
} }
entrySize := *ruleUpdateEntriesLimit
if cfg.UpdateEntriesLimit != nil { if cfg.UpdateEntriesLimit != nil {
rr.state = newRuleState(*cfg.UpdateEntriesLimit) entrySize = *cfg.UpdateEntriesLimit
} else { }
rr.state = newRuleState(*ruleUpdateEntriesLimit) if entrySize < 1 {
entrySize = 1
}
rr.state = &ruleState{
entries: make([]StateEntry, entrySize),
} }
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID()) labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels), rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
func() float64 { func() float64 {
e := rr.state.getLast() e := rr.state.getLast()
if e.err == nil { if e.Err == nil {
return 0 return 0
} }
return 1 return 1
@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels), rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
func() float64 { func() float64 {
e := rr.state.getLast() e := rr.state.getLast()
return float64(e.samples) return float64(e.Samples)
}) })
return rr return rr
} }
// Close unregisters rule metrics // close unregisters rule metrics
func (rr *RecordingRule) Close() { func (rr *RecordingRule) close() {
rr.metrics.errors.Unregister() rr.metrics.errors.Unregister()
rr.metrics.samples.Unregister() rr.metrics.samples.Unregister()
} }
// ExecRange executes recording rule on the given time range similarly to Exec. // execRange executes recording rule on the given time range similarly to Exec.
// It doesn't update internal states of the Rule and meant to be used just // It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling. // to get time series for backfilling.
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) { func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end) res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
if err != nil { if err != nil {
return nil, err return nil, err
@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
return tss, nil return tss, nil
} }
// Exec executes RecordingRule expression via the given Querier. // exec executes RecordingRule expression via the given Querier.
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) { func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
start := time.Now() start := time.Now()
res, req, err := rr.q.Query(ctx, rr.Expr, ts) res, req, err := rr.q.Query(ctx, rr.Expr, ts)
curState := ruleStateEntry{ curState := StateEntry{
time: start, Time: start,
at: ts, At: ts,
duration: time.Since(start), Duration: time.Since(start),
samples: len(res.Data), Samples: len(res.Data),
seriesFetched: res.SeriesFetched, SeriesFetched: res.SeriesFetched,
curl: requestToCurl(req), Curl: requestToCurl(req),
} }
defer func() { defer func() {
@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
}() }()
if err != nil { if err != nil {
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err) curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
return nil, curState.err return nil, curState.Err
} }
qMetrics := res.Data qMetrics := res.Data
numSeries := len(qMetrics) numSeries := len(qMetrics)
if limit > 0 && numSeries > limit { if limit > 0 && numSeries > limit {
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries) curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
return nil, curState.err return nil, curState.Err
} }
duplicates := make(map[string]struct{}, len(qMetrics)) duplicates := make(map[string]struct{}, len(qMetrics))
@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
ts := rr.toTimeSeries(r) ts := rr.toTimeSeries(r)
key := stringifyLabels(ts) key := stringifyLabels(ts)
if _, ok := duplicates[key]; ok { if _, ok := duplicates[key]; ok {
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate) curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
return nil, curState.err return nil, curState.Err
} }
duplicates[key] = struct{}{} duplicates[key] = struct{}{}
tss = append(tss, ts) tss = append(tss, ts)
@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
return newTimeSeries(m.Values, m.Timestamps, labels) return newTimeSeries(m.Values, m.Timestamps, labels)
} }
// UpdateWith copies all significant fields. // updateWith copies all significant fields.
func (rr *RecordingRule) UpdateWith(r Rule) error { func (rr *RecordingRule) updateWith(r Rule) error {
nr, ok := r.(*RecordingRule) nr, ok := r.(*RecordingRule)
if !ok { if !ok {
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r) return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
rr.q = nr.q rr.q = nr.q
return nil return nil
} }
// ToAPI returns Rule's representation in form
// of APIRule
func (rr *RecordingRule) ToAPI() APIRule {
lastState := rr.state.getLast()
r := APIRule{
Type: "recording",
DatasourceType: rr.Type.String(),
Name: rr.Name,
Query: rr.Expr,
Labels: rr.Labels,
LastEvaluation: lastState.time,
EvaluationTime: lastState.duration.Seconds(),
Health: "ok",
LastSamples: lastState.samples,
LastSeriesFetched: lastState.seriesFetched,
MaxUpdates: rr.state.size(),
Updates: rr.state.getAll(),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", rr.ID()),
GroupID: fmt.Sprintf("%d", rr.GroupID),
}
if lastState.err != nil {
r.LastError = lastState.err.Error()
r.Health = "err"
}
return r
}

View file

@ -1,4 +1,4 @@
package main package rule
import ( import (
"context" "context"
@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
Name: "job:foo", Name: "job:foo",
Labels: map[string]string{ Labels: map[string]string{
"source": "test", "source": "test",
}}, },
},
[]datasource.Metric{ []datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")}, metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{ []prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo", "__name__": "job:foo",
@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
} }
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) { t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
fq.add(tc.metrics...) fq.Add(tc.metrics...)
tc.rule.q = fq tc.rule.q = fq
tc.rule.state = newRuleState(10) tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0) tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
if err != nil { if err != nil {
t.Fatalf("unexpected Exec err: %s", err) t.Fatalf("unexpected Exec err: %s", err)
} }
@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
}}, }},
[]datasource.Metric{ []datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"), metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")}, metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
},
[]prompbmarshal.TimeSeries{ []prompbmarshal.TimeSeries{
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{ newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": "job:foo", "__name__": "job:foo",
@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
} }
for _, tc := range testCases { for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) { t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
fq.add(tc.metrics...) fq.Add(tc.metrics...)
tc.rule.q = fq tc.rule.q = fq
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now()) tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
if err != nil { if err != nil {
t.Fatalf("unexpected Exec err: %s", err) t.Fatalf("unexpected Exec err: %s", err)
} }
@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"), metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"), metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
} }
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{ rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
"source": "test_limit", "source": "test_limit",
}} }}
var err error var err error
for _, testCase := range testCases { for _, testCase := range testCases {
fq := &fakeQuerier{} fq := &datasource.FakeQuerier{}
fq.add(testMetrics...) fq.Add(testMetrics...)
rule.q = fq rule.q = fq
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit) _, err = rule.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) { if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err) t.Fatal(err)
} }
@ -215,18 +218,17 @@ func TestRecordingRuleLimit(t *testing.T) {
func TestRecordingRule_ExecNegative(t *testing.T) { func TestRecordingRule_ExecNegative(t *testing.T) {
rr := &RecordingRule{ rr := &RecordingRule{
Name: "job:foo", Name: "job:foo",
state: newRuleState(10),
Labels: map[string]string{ Labels: map[string]string{
"job": "test", "job": "test",
}, },
state: &ruleState{entries: make([]StateEntry, 10)},
} }
fq := &datasource.FakeQuerier{}
fq := &fakeQuerier{}
expErr := "connection reset by peer" expErr := "connection reset by peer"
fq.setErr(errors.New(expErr)) fq.SetErr(errors.New(expErr))
rr.q = fq rr.q = fq
_, err := rr.Exec(context.TODO(), time.Now(), 0) _, err := rr.exec(context.TODO(), time.Now(), 0)
if err == nil { if err == nil {
t.Fatalf("expected to get err; got nil") t.Fatalf("expected to get err; got nil")
} }
@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err) t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
} }
fq.reset() fq.Reset()
// add metrics which differs only by `job` label // add metrics which differs only by `job` label
// which will be overridden by rule // which will be overridden by rule
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo")) fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar")) fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
_, err = rr.Exec(context.TODO(), time.Now(), 0) _, err = rr.exec(context.TODO(), time.Now(), 0)
if err == nil { if err == nil {
t.Fatalf("expected to get err; got nil") t.Fatalf("expected to get err; got nil")
} }

174
app/vmalert/rule/rule.go Normal file
View file

@ -0,0 +1,174 @@
package rule
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
//
// Every method except ID is unexported, so Rule can only be
// implemented by types declared in this package.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// exec executes the rule with given context at the given timestamp and limit.
// returns an err if number of resulting time series exceeds the limit.
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
// execRange executes the rule on the given time range.
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
// updateWith performs modification of current Rule
// with fields of the given Rule.
updateWith(Rule) error
// close performs the shutdown procedures for rule
// such as metrics unregister
close()
}
// errDuplicate is a sentinel error for evaluation results in which two or more
// series end up with an identical labelset once rule labels are applied.
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
// ruleState keeps a fixed-size history of StateEntry records for a single rule.
// The embedded RWMutex guards both entries and cur.
type ruleState struct {
sync.RWMutex
// entries holds the history; add overwrites its slots cyclically
entries []StateEntry
// cur is the index in entries of the slot written by the latest add
cur int
}
// StateEntry stores the outcome of a single rule execution
type StateEntry struct {
// stores last moment of time the rule's exec was called
Time time.Time
// stores the timestamp with which the rule's exec was called
At time.Time
// stores the duration of the last exec call
Duration time.Duration
// stores last error that happened in exec func
// resets on every successful exec
// may be used as a health indicator of the rule
Err error
// stores the number of samples returned during
// the last evaluation
Samples int
// stores the number of time series fetched during
// the last evaluation.
// Is supported by VictoriaMetrics only, starting from v1.90.0
// If SeriesFetched == nil, then this attribute was missing in
// datasource response (unsupported).
SeriesFetched *int
// stores the curl command reflecting the HTTP request used during exec
Curl string
}
// GetLastEntry returns the most recent StateEntry recorded for r.
// A zero StateEntry is returned when r is neither *AlertingRule
// nor *RecordingRule.
func GetLastEntry(r Rule) StateEntry {
	switch typed := r.(type) {
	case *AlertingRule:
		return typed.state.getLast()
	case *RecordingRule:
		return typed.state.getLast()
	}
	return StateEntry{}
}
// GetRuleStateSize returns how many StateEntry slots the state of r holds.
// It returns 0 when r is neither *AlertingRule nor *RecordingRule.
func GetRuleStateSize(r Rule) int {
	switch typed := r.(type) {
	case *AlertingRule:
		return typed.state.size()
	case *RecordingRule:
		return typed.state.size()
	}
	return 0
}
// GetAllRuleState returns every recorded StateEntry of r.
// An empty, non-nil slice is returned when r is neither *AlertingRule
// nor *RecordingRule.
func GetAllRuleState(r Rule) []StateEntry {
	switch typed := r.(type) {
	case *AlertingRule:
		return typed.state.getAll()
	case *RecordingRule:
		return typed.state.getAll()
	}
	return []StateEntry{}
}
// size reports the number of entry slots held by the state.
func (s *ruleState) size() int {
	s.RLock()
	n := len(s.entries)
	s.RUnlock()
	return n
}
// getLast returns the entry stored at the current position,
// or a zero StateEntry when the state holds no slots at all.
func (s *ruleState) getLast() StateEntry {
	s.RLock()
	defer s.RUnlock()

	var last StateEntry
	if len(s.entries) > 0 {
		last = s.entries[s.cur]
	}
	return last
}
// getAll returns all populated entries ordered from the most recent one
// to the oldest one. Slots that were never written (both Time and At are
// zero) are skipped. The returned slice is never nil.
func (s *ruleState) getAll() []StateEntry {
	s.RLock()
	defer s.RUnlock()

	// Use len rather than cap: only the first len(s.entries) slots are
	// addressable, so wrapping on cap could index out of range.
	n := len(s.entries)
	entries := make([]StateEntry, 0, n)
	if n == 0 {
		// nothing to walk; mirrors the empty-state handling in getLast
		return entries
	}

	// walk backwards from the latest slot, wrapping around at index 0,
	// until every slot has been visited exactly once
	cur := s.cur
	for i := 0; i < n; i++ {
		e := s.entries[cur]
		if !e.Time.IsZero() || !e.At.IsZero() {
			entries = append(entries, e)
		}
		cur--
		if cur < 0 {
			cur = n - 1
		}
	}
	return entries
}
// add stores e in the slot following the current one, wrapping to the
// first slot once the end is reached, so the oldest entry is overwritten
// when the state is full. It is a no-op when the state has no slots.
func (s *ruleState) add(e StateEntry) {
	s.Lock()
	defer s.Unlock()

	// Use len rather than cap: writing past len panics even if cap is larger.
	n := len(s.entries)
	if n == 0 {
		// no room to record anything; avoid an index-out-of-range panic
		return
	}

	s.cur++
	if s.cur >= n {
		s.cur = 0
	}
	s.entries[s.cur] = e
}
// replayRule executes r over the [start, end] range and pushes every
// resulting time series to rw.
//
// The range query is attempted up to replayRuleRetryAttempts times (always at
// least once), pausing one second between failed attempts. If all attempts
// fail, the error of the last attempt is returned.
//
// It returns the number of samples handed to rw.Push. If a push fails, the
// count accumulated so far is returned together with an error wrapping the
// push failure.
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
	// A non-positive retry budget would otherwise skip the rule entirely and
	// report success without ever querying the datasource.
	attempts := replayRuleRetryAttempts
	if attempts < 1 {
		attempts = 1
	}

	var err error
	var tss []prompbmarshal.TimeSeries
	for i := 0; i < attempts; i++ {
		tss, err = r.execRange(context.Background(), start, end)
		if err == nil {
			break
		}
		logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
		if i < attempts-1 {
			// pause only when another attempt follows
			time.Sleep(time.Second)
		}
	}
	if err != nil { // means all attempts failed
		return 0, err
	}

	var n int
	for _, ts := range tss {
		if err := rw.Push(ts); err != nil {
			// wrap with %w so callers can inspect the cause via errors.Is/As
			return n, fmt.Errorf("remote write failure: %w", err)
		}
		n += len(ts.Samples)
	}
	return n, nil
}

View file

@ -0,0 +1,81 @@
package rule
import (
"sync"
"testing"
"time"
)
// TestRule_state checks the ring-buffer behavior of ruleState:
// an untouched state yields a zero entry, getLast always reflects the
// latest add, and getAll never returns more entries than the buffer holds.
func TestRule_state(t *testing.T) {
	const stateEntriesN = 20
	r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, stateEntriesN)}}
	state := r.state

	if first := state.getLast(); !first.At.IsZero() {
		t.Fatalf("expected entry to be zero")
	}

	now := time.Now()
	state.add(StateEntry{At: now})
	if got := state.getLast().At; got != now {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, now)
	}

	// make sure the second entry gets a distinct timestamp
	time.Sleep(time.Millisecond)
	now2 := time.Now()
	state.add(StateEntry{At: now2})
	if got := state.getLast().At; got != now2 {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, now2)
	}

	if n := len(state.getAll()); n != 2 {
		t.Fatalf("expected for state to have 2 entries only; got %d",
			n,
		)
	}

	// overflow the buffer twice over to verify wrap-around
	var last time.Time
	for i := 0; i < stateEntriesN*2; i++ {
		last = time.Now()
		state.add(StateEntry{At: last})
	}

	if got := state.getLast().At; got != last {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, last)
	}

	if n := len(state.getAll()); n != stateEntriesN {
		t.Fatalf("expected for state to have %d entries only; got %d",
			stateEntriesN, n,
		)
	}
}
// TestRule_stateConcurrent exercises add, getAll and getLast from many
// goroutines at once. It makes no assertions of its own and is only
// meaningful when executed with the -race flag.
func TestRule_stateConcurrent(_ *testing.T) {
	const (
		workers    = 50
		iterations = 100
	)
	r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, 20)}}

	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for n := 0; n < iterations; n++ {
				r.state.add(StateEntry{At: time.Now()})
				r.state.getAll()
				r.state.getLast()
			}
		}()
	}
	wg.Wait()
}

View file

@ -1,239 +1,18 @@
package main package rule
import ( import (
"context"
"fmt" "fmt"
"net/http"
"reflect" "reflect"
"sort" "sort"
"sync"
"testing" "testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
) )
type fakeQuerier struct { // CompareRules is a test helper func for other tests
sync.Mutex func CompareRules(t *testing.T, a, b Rule) error {
metrics []datasource.Metric
err error
}
// setErr stores err in the querier under the lock.
// Query returns the stored error for as long as it is non-nil.
func (fq *fakeQuerier) setErr(err error) {
	fq.Lock()
	defer fq.Unlock()
	fq.err = err
}
// reset clears the stored error and drops all stored metrics,
// keeping the underlying slice storage for reuse.
func (fq *fakeQuerier) reset() {
	fq.Lock()
	defer fq.Unlock()
	fq.metrics = fq.metrics[:0]
	fq.err = nil
}
// add appends metrics to the set returned by subsequent Query calls.
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
	fq.Lock()
	defer fq.Unlock()
	fq.metrics = append(fq.metrics, metrics...)
}
// BuildWithParams ignores the given params and returns the receiver itself,
// so all callers share the same fake querier.
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fq
}
// QueryRange delegates to Query at the current time; the requested range
// bounds are ignored and the *http.Request produced by Query is dropped.
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
	res, _, err := fq.Query(ctx, q, time.Now())
	return res, err
}
// Query returns a copy of the stored metrics together with a placeholder
// HTTP request. All arguments are ignored.
// If an error was stored via setErr, it is returned instead, with an empty
// result and a nil request.
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) (datasource.Result, *http.Request, error) {
	fq.Lock()
	defer fq.Unlock()

	if err := fq.err; err != nil {
		return datasource.Result{}, nil, err
	}

	// copy, so callers cannot modify the querier's own slice
	snapshot := make([]datasource.Metric, len(fq.metrics))
	copy(snapshot, fq.metrics)

	req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
	return datasource.Result{Data: snapshot}, req, nil
}
// fakeQuerierWithRegistry is a test querier which returns a predefined
// set of metrics per query expression.
type fakeQuerierWithRegistry struct {
// guards registry
sync.Mutex
// maps a query expression to the metrics returned for it
registry map[string][]datasource.Metric
}
// set registers metrics as the response for the query expression key,
// replacing whatever was registered for it before.
// The registry is created lazily on first use.
func (fqr *fakeQuerierWithRegistry) set(key string, metrics ...datasource.Metric) {
	fqr.Lock()
	defer fqr.Unlock()

	if fqr.registry == nil {
		fqr.registry = map[string][]datasource.Metric{}
	}
	fqr.registry[key] = metrics
}
// reset drops all registered responses.
func (fqr *fakeQuerierWithRegistry) reset() {
	fqr.Lock()
	defer fqr.Unlock()
	fqr.registry = nil
}
// BuildWithParams ignores the given params and returns the receiver itself.
func (fqr *fakeQuerierWithRegistry) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqr
}
// QueryRange delegates to Query at the current time; the requested range
// bounds are ignored and the *http.Request produced by Query is dropped.
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
	res, _, err := fqr.Query(ctx, q, time.Now())
	return res, err
}
// Query returns a copy of the metrics registered for expr together with a
// placeholder HTTP request. An empty result is returned when nothing is
// registered for expr. The context and timestamp are ignored.
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (datasource.Result, *http.Request, error) {
	fqr.Lock()
	defer fqr.Unlock()

	req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)

	stored, found := fqr.registry[expr]
	if found {
		// copy, so callers cannot modify the registered slice
		snapshot := make([]datasource.Metric, len(stored))
		copy(snapshot, stored)
		return datasource.Result{Data: snapshot}, req, nil
	}
	return datasource.Result{}, req, nil
}
// fakeQuerierWithDelay is a fakeQuerier whose Query waits for delay
// (or for context cancellation) before responding.
type fakeQuerierWithDelay struct {
fakeQuerier
// how long Query waits before delegating to fakeQuerier.Query
delay time.Duration
}
// Query waits until either the configured delay elapses or ctx is done,
// whichever happens first, and then delegates to fakeQuerier.Query.
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (datasource.Result, *http.Request, error) {
	timer := time.NewTimer(fqd.delay)
	// Release the timer when ctx finishes before the delay elapses;
	// otherwise it lingers until it fires on its own.
	defer timer.Stop()
	select {
	case <-ctx.Done():
	case <-timer.C:
	}
	return fqd.fakeQuerier.Query(ctx, expr, ts)
}
// BuildWithParams ignores the given params and returns the receiver itself.
// It overrides the embedded fakeQuerier's method, which would return the
// embedded value and thus lose the delaying Query.
func (fqd *fakeQuerierWithDelay) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
return fqd
}
// fakeNotifier is a test notifier which records the alerts passed to Send
// instead of delivering them anywhere.
type fakeNotifier struct {
// guards alerts and counter
sync.Mutex
// alerts received by the most recent Send call
alerts []notifier.Alert
// records number of received alerts in total
counter int
}
// Close is a no-op.
func (*fakeNotifier) Close() {}
// Addr always returns an empty string.
func (*fakeNotifier) Addr() string { return "" }
// Send records alerts as the latest received batch and adds their number
// to the total counter. It never fails; the context and the extra
// map argument are ignored.
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert, _ map[string]string) error {
	fn.Lock()
	defer fn.Unlock()

	fn.alerts = alerts
	fn.counter += len(alerts)
	return nil
}
// getCounter returns the total number of alerts received via Send so far.
func (fn *fakeNotifier) getCounter() int {
	fn.Lock()
	total := fn.counter
	fn.Unlock()
	return total
}
// getAlerts returns the alerts passed to the most recent Send call.
// The slice is returned as is, without copying.
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
	fn.Lock()
	latest := fn.alerts
	fn.Unlock()
	return latest
}
// faultyNotifier is a fakeNotifier whose Send always fails.
type faultyNotifier struct {
fakeNotifier
}
// Send always fails. If ctx carries a deadline, the call first blocks
// until that deadline is reached.
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert, _ map[string]string) error {
	if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
		time.Sleep(time.Until(deadline))
	}
	return fmt.Errorf("send failed")
}
// metricWithValueAndLabels builds a datasource.Metric holding the single
// given value and the given labels, passed as name/value pairs.
// It is a thin wrapper around metricWithValuesAndLabels.
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
	// mark as helper so failures are attributed to the calling test
	t.Helper()
	return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
// metricWithValuesAndLabels builds a datasource.Metric with the given
// labels (name/value pairs) and values, appending one timestamp per value
// equal to the value's index.
//
// NOTE(review): metricWithLabels already seeds Timestamps with one element,
// so Timestamps ends up one element longer than Values. Confirm whether
// callers rely on this before changing it.
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
	t.Helper()

	metric := metricWithLabels(t, labels...)
	metric.Values = values
	for idx := 0; idx < len(values); idx++ {
		metric.Timestamps = append(metric.Timestamps, int64(idx))
	}
	return metric
}
// metricWithLabels builds a datasource.Metric with a single value 1 at
// timestamp 1 and the given labels, passed as name/value pairs.
// The test fails immediately if labels is empty or has odd length.
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
	t.Helper()

	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}

	m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
	// walk the pairs: labels[i-1] is the name, labels[i] is the value
	for i := 1; i < len(labels); i += 2 {
		pair := datasource.Label{
			Name:  labels[i-1],
			Value: labels[i],
		}
		m.Labels = append(m.Labels, pair)
	}
	return m
}
// toPromLabels converts name/value pairs into prompbmarshal labels,
// preserving their order.
// The test fails immediately if labels is empty or has odd length.
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
	t.Helper()

	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}

	var ls []prompbmarshal.Label
	// walk the pairs: labels[i-1] is the name, labels[i] is the value
	for i := 1; i < len(labels); i += 2 {
		pair := prompbmarshal.Label{
			Name:  labels[i-1],
			Value: labels[i],
		}
		ls = append(ls, pair)
	}
	return ls
}
// compareGroups fails the test if group b (actual) differs from group a
// (expected) in name, file, interval, number of rules, or in any of the
// rules compared pairwise by position.
func compareGroups(t *testing.T, a, b *Group) {
	t.Helper()
	if a.Name != b.Name {
		t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
	}
	if a.File != b.File {
		t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
	}
	if a.Interval != b.Interval {
		t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
	}
	if len(a.Rules) != len(b.Rules) {
		t.Fatalf("expected group %s to have %d rules; got: %d",
			a.Name, len(a.Rules), len(b.Rules))
	}
	for i, want := range a.Rules {
		// a holds the expected rules, b the actual ones
		got := b.Rules[i]
		// Compare the IDs of the rules themselves; comparing the group
		// IDs here would make this check independent of the rules.
		if want.ID() != got.ID() {
			// %v is used since the type of the ID is not visible here;
			// %q would render an integer ID as a character literal.
			t.Fatalf("expected to have rule %v; got %v", want.ID(), got.ID())
		}
		if err := compareRules(t, want, got); err != nil {
			t.Fatalf("comparison error: %s", err)
		}
	}
}
func compareRules(t *testing.T, a, b Rule) error {
t.Helper() t.Helper()
switch v := a.(type) { switch v := a.(type) {
case *AlertingRule: case *AlertingRule:
@ -287,6 +66,50 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
return nil return nil
} }
// metricWithValueAndLabels builds a datasource.Metric holding the single
// given value and the given labels, passed as name/value pairs.
// It is a thin wrapper around metricWithValuesAndLabels.
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
	// mark as helper so failures are attributed to the calling test
	t.Helper()
	return metricWithValuesAndLabels(t, []float64{value}, labels...)
}
// metricWithValuesAndLabels builds a datasource.Metric with the given
// labels (name/value pairs) and values, appending one timestamp per value
// equal to the value's index.
//
// NOTE(review): metricWithLabels already seeds Timestamps with one element,
// so Timestamps ends up one element longer than Values. Confirm whether
// callers rely on this before changing it.
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
	t.Helper()

	metric := metricWithLabels(t, labels...)
	metric.Values = values
	for idx := 0; idx < len(values); idx++ {
		metric.Timestamps = append(metric.Timestamps, int64(idx))
	}
	return metric
}
// metricWithLabels builds a datasource.Metric with a single value 1 at
// timestamp 1 and the given labels, passed as name/value pairs.
// The test fails immediately if labels is empty or has odd length.
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
	t.Helper()

	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}

	m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
	// walk the pairs: labels[i-1] is the name, labels[i] is the value
	for i := 1; i < len(labels); i += 2 {
		pair := datasource.Label{
			Name:  labels[i-1],
			Value: labels[i],
		}
		m.Labels = append(m.Labels, pair)
	}
	return m
}
// toPromLabels converts name/value pairs into prompbmarshal labels,
// preserving their order.
// The test fails immediately if labels is empty or has odd length.
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
	t.Helper()

	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}

	var ls []prompbmarshal.Label
	// walk the pairs: labels[i-1] is the name, labels[i] is the value
	for i := 1; i < len(labels); i += 2 {
		pair := prompbmarshal.Label{
			Name:  labels[i-1],
			Value: labels[i],
		}
		ls = append(ls, pair)
	}
	return ls
}
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error { func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
t.Helper() t.Helper()
if len(a) != len(b) { if len(a) != len(b) {

View file

@ -1,4 +1,4 @@
package main package rule
import ( import (
"fmt" "fmt"

View file

@ -1,4 +1,4 @@
package main package rule
import ( import (
"net/http" "net/http"

View file

@ -1,100 +0,0 @@
package main
import (
"sync"
"testing"
"time"
)
// TestRule_stateDisabled checks that a state created with a negative size
// starts with a zero entry and still exposes exactly one entry after
// several updates.
func TestRule_stateDisabled(t *testing.T) {
	state := newRuleState(-1)
	if first := state.getLast(); !first.at.IsZero() {
		t.Fatalf("expected entry to be zero")
	}

	for i := 0; i < 3; i++ {
		state.add(ruleStateEntry{at: time.Now()})
	}

	// state should store at least one update at any circumstances
	if n := len(state.getAll()); n != 1 {
		t.Fatalf("expected for state to have %d entries; got %d",
			1, n,
		)
	}
}
// TestRule_state checks that a fresh state yields a zero entry, that
// getLast always reflects the latest add, and that getAll never returns
// more entries than the size the state was created with.
func TestRule_state(t *testing.T) {
	const stateEntriesN = 20
	state := newRuleState(stateEntriesN)

	if first := state.getLast(); !first.at.IsZero() {
		t.Fatalf("expected entry to be zero")
	}

	now := time.Now()
	state.add(ruleStateEntry{at: now})
	if got := state.getLast().at; got != now {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, now)
	}

	// make sure the second entry gets a distinct timestamp
	time.Sleep(time.Millisecond)
	now2 := time.Now()
	state.add(ruleStateEntry{at: now2})
	if got := state.getLast().at; got != now2 {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, now2)
	}

	if n := len(state.getAll()); n != 2 {
		t.Fatalf("expected for state to have 2 entries only; got %d",
			n,
		)
	}

	// add twice as many entries as the requested size
	var last time.Time
	for i := 0; i < stateEntriesN*2; i++ {
		last = time.Now()
		state.add(ruleStateEntry{at: last})
	}

	if got := state.getLast().at; got != last {
		t.Fatalf("expected entry at %v to be equal to %v",
			got, last)
	}

	if n := len(state.getAll()); n != stateEntriesN {
		t.Fatalf("expected for state to have %d entries only; got %d",
			stateEntriesN, n,
		)
	}
}
// TestRule_stateConcurrent exercises add, getAll and getLast from many
// goroutines at once. It makes no assertions of its own and is only
// meaningful when executed with the -race flag.
func TestRule_stateConcurrent(_ *testing.T) {
	const (
		workers    = 50
		iterations = 100
	)
	state := newRuleState(20)

	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for n := 0; n < iterations; n++ {
				state.add(ruleStateEntry{at: time.Now()})
				state.getAll()
				state.getLast()
			}
		}()
	}
	wg.Wait()
}

View file

@ -10,6 +10,7 @@ import (
"strings" "strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@ -143,38 +144,32 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
} }
} }
const ( func (rh *requestHandler) getRule(r *http.Request) (apiRule, error) {
paramGroupID = "group_id" groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
paramAlertID = "alert_id"
paramRuleID = "rule_id"
)
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
if err != nil { if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err) return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
} }
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0) ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 64)
if err != nil { if err != nil {
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err) return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
} }
rule, err := rh.m.RuleAPI(groupID, ruleID) obj, err := rh.m.ruleAPI(groupID, ruleID)
if err != nil { if err != nil {
return APIRule{}, errResponse(err, http.StatusNotFound) return apiRule{}, errResponse(err, http.StatusNotFound)
} }
return rule, nil return obj, nil
} }
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) { func (rh *requestHandler) getAlert(r *http.Request) (*apiAlert, error) {
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0) groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramGroupID, err) return nil, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
} }
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 0) alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 64)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read %q param: %s", paramAlertID, err) return nil, fmt.Errorf("failed to read %q param: %s", paramAlertID, err)
} }
a, err := rh.m.AlertAPI(groupID, alertID) a, err := rh.m.alertAPI(groupID, alertID)
if err != nil { if err != nil {
return nil, errResponse(err, http.StatusNotFound) return nil, errResponse(err, http.StatusNotFound)
} }
@ -184,17 +179,17 @@ func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
type listGroupsResponse struct { type listGroupsResponse struct {
Status string `json:"status"` Status string `json:"status"`
Data struct { Data struct {
Groups []APIGroup `json:"groups"` Groups []apiGroup `json:"groups"`
} `json:"data"` } `json:"data"`
} }
func (rh *requestHandler) groups() []APIGroup { func (rh *requestHandler) groups() []apiGroup {
rh.m.groupsMu.RLock() rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock() defer rh.m.groupsMu.RUnlock()
groups := make([]APIGroup, 0) groups := make([]apiGroup, 0)
for _, g := range rh.m.groups { for _, g := range rh.m.groups {
groups = append(groups, g.toAPI()) groups = append(groups, groupToAPI(g))
} }
// sort list of alerts for deterministic output // sort list of alerts for deterministic output
@ -221,35 +216,35 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
type listAlertsResponse struct { type listAlertsResponse struct {
Status string `json:"status"` Status string `json:"status"`
Data struct { Data struct {
Alerts []*APIAlert `json:"alerts"` Alerts []*apiAlert `json:"alerts"`
} `json:"data"` } `json:"data"`
} }
func (rh *requestHandler) groupAlerts() []GroupAlerts { func (rh *requestHandler) groupAlerts() []groupAlerts {
rh.m.groupsMu.RLock() rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock() defer rh.m.groupsMu.RUnlock()
var groupAlerts []GroupAlerts var gAlerts []groupAlerts
for _, g := range rh.m.groups { for _, g := range rh.m.groups {
var alerts []*APIAlert var alerts []*apiAlert
for _, r := range g.Rules { for _, r := range g.Rules {
a, ok := r.(*AlertingRule) a, ok := r.(*rule.AlertingRule)
if !ok { if !ok {
continue continue
} }
alerts = append(alerts, a.AlertsToAPI()...) alerts = append(alerts, ruleToAPIAlert(a)...)
} }
if len(alerts) > 0 { if len(alerts) > 0 {
groupAlerts = append(groupAlerts, GroupAlerts{ gAlerts = append(gAlerts, groupAlerts{
Group: g.toAPI(), Group: groupToAPI(g),
Alerts: alerts, Alerts: alerts,
}) })
} }
} }
sort.Slice(groupAlerts, func(i, j int) bool { sort.Slice(gAlerts, func(i, j int) bool {
return groupAlerts[i].Group.Name < groupAlerts[j].Group.Name return gAlerts[i].Group.Name < gAlerts[j].Group.Name
}) })
return groupAlerts return gAlerts
} }
func (rh *requestHandler) listAlerts() ([]byte, error) { func (rh *requestHandler) listAlerts() ([]byte, error) {
@ -257,14 +252,14 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
defer rh.m.groupsMu.RUnlock() defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"} lr := listAlertsResponse{Status: "success"}
lr.Data.Alerts = make([]*APIAlert, 0) lr.Data.Alerts = make([]*apiAlert, 0)
for _, g := range rh.m.groups { for _, g := range rh.m.groups {
for _, r := range g.Rules { for _, r := range g.Rules {
a, ok := r.(*AlertingRule) a, ok := r.(*rule.AlertingRule)
if !ok { if !ok {
continue continue
} }
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsToAPI()...) lr.Data.Alerts = append(lr.Data.Alerts, ruleToAPIAlert(a)...)
} }
} }

View file

@ -38,7 +38,7 @@ btn-primary
{% endif %} {% endif %}
{% endfunc %} {% endfunc %}
{% func ListGroups(r *http.Request, originGroups []APIGroup) %} {% func ListGroups(r *http.Request, originGroups []apiGroup) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %} {%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
{%code {%code
@ -46,9 +46,9 @@ btn-primary
rOk := make(map[string]int) rOk := make(map[string]int)
rNotOk := make(map[string]int) rNotOk := make(map[string]int)
rNoMatch := make(map[string]int) rNoMatch := make(map[string]int)
var groups []APIGroup var groups []apiGroup
for _, g := range originGroups { for _, g := range originGroups {
var rules []APIRule var rules []apiRule
for _, r := range g.Rules { for _, r := range g.Rules {
if r.LastError != "" { if r.LastError != "" {
rNotOk[g.ID]++ rNotOk[g.ID]++
@ -166,7 +166,7 @@ btn-primary
{% endfunc %} {% endfunc %}
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %} {% func ListAlerts(r *http.Request, groupAlerts []groupAlerts) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %} {%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
{% if len(groupAlerts) > 0 %} {% if len(groupAlerts) > 0 %}
@ -183,7 +183,7 @@ btn-primary
</div> </div>
{%code {%code
var keys []string var keys []string
alertsByRule := make(map[string][]*APIAlert) alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts { for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 { if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID) keys = append(keys, alert.RuleID)
@ -310,7 +310,7 @@ btn-primary
{% endfunc %} {% endfunc %}
{% func Alert(r *http.Request, alert *APIAlert) %} {% func Alert(r *http.Request, alert *apiAlert) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %} {%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code {%code
@ -397,7 +397,7 @@ btn-primary
{% endfunc %} {% endfunc %}
{% func RuleDetails(r *http.Request, rule APIRule) %} {% func RuleDetails(r *http.Request, rule apiRule) %}
{%code prefix := utils.Prefix(r.URL.Path) %} {%code prefix := utils.Prefix(r.URL.Path) %}
{%= tpl.Header(r, navItems, "", getLastConfigError()) %} {%= tpl.Header(r, navItems, "", getLastConfigError()) %}
{%code {%code
@ -416,9 +416,9 @@ btn-primary
var seriesFetchedEnabled bool var seriesFetchedEnabled bool
var seriesFetchedWarning bool var seriesFetchedWarning bool
for _, u := range rule.Updates { for _, u := range rule.Updates {
if u.seriesFetched != nil { if u.SeriesFetched != nil {
seriesFetchedEnabled = true seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0{ if *u.SeriesFetched == 0 && u.Samples == 0{
seriesFetchedWarning = true seriesFetchedWarning = true
} }
} }
@ -537,23 +537,23 @@ btn-primary
<tbody> <tbody>
{% for _, u := range rule.Updates %} {% for _, u := range rule.Updates %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}> <tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td> <td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span> <span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.Time.Format(time.RFC3339) %}</span>
</td> </td>
<td class="text-center">{%d u.samples %}</td> <td class="text-center">{%d u.Samples %}</td>
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.seriesFetched != nil %}{%d *u.seriesFetched %}{% endif %}</td>{% endif %} {% if seriesFetchedEnabled %}<td class="text-center">{% if u.SeriesFetched != nil %}{%d *u.SeriesFetched %}{% endif %}</td>{% endif %}
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td> <td class="text-center">{%f.3 u.Duration.Seconds() %}s</td>
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td> <td class="text-center">{%s u.At.Format(time.RFC3339) %}</td>
<td> <td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.curl %}</textarea> <textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.Curl %}</textarea>
</td> </td>
</tr> </tr>
</li> </li>
{% if u.err != nil %} {% if u.Err != nil %}
<tr{% if u.err != nil %} class="alert-danger"{% endif %}> <tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
<td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}"> <td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}">
<span class="alert-danger">{%v u.err %}</span> <span class="alert-danger">{%v u.Err %}</span>
</td> </td>
</tr> </tr>
{% endif %} {% endif %}
@ -582,7 +582,7 @@ btn-primary
<span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span> <span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span>
{% endfunc %} {% endfunc %}
{% func seriesFetchedWarn(r APIRule) %} {% func seriesFetchedWarn(r apiRule) %}
{% if isNoMatch(r) %} {% if isNoMatch(r) %}
<svg xmlns="http://www.w3.org/2000/svg" <svg xmlns="http://www.w3.org/2000/svg"
data-bs-toggle="tooltip" data-bs-toggle="tooltip"
@ -596,7 +596,7 @@ btn-primary
{% endfunc %} {% endfunc %}
{%code {%code
func isNoMatch (r APIRule) bool { func isNoMatch (r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0 return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
} }
%} %}

View file

@ -196,7 +196,7 @@ func buttonActive(filter, expValue string) string {
} }
//line app/vmalert/web.qtpl:41 //line app/vmalert/web.qtpl:41
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []APIGroup) { func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:41 //line app/vmalert/web.qtpl:41
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -216,9 +216,9 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
rOk := make(map[string]int) rOk := make(map[string]int)
rNotOk := make(map[string]int) rNotOk := make(map[string]int)
rNoMatch := make(map[string]int) rNoMatch := make(map[string]int)
var groups []APIGroup var groups []apiGroup
for _, g := range originGroups { for _, g := range originGroups {
var rules []APIRule var rules []apiRule
for _, r := range g.Rules { for _, r := range g.Rules {
if r.LastError != "" { if r.LastError != "" {
rNotOk[g.ID]++ rNotOk[g.ID]++
@ -610,7 +610,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
} }
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []APIGroup) { func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []apiGroup) {
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
qw422016 := qt422016.AcquireWriter(qq422016) qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
@ -621,7 +621,7 @@ func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups [
} }
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
func ListGroups(r *http.Request, originGroups []APIGroup) string { func ListGroups(r *http.Request, originGroups []apiGroup) string {
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
qb422016 := qt422016.AcquireByteBuffer() qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:166 //line app/vmalert/web.qtpl:166
@ -636,7 +636,7 @@ func ListGroups(r *http.Request, originGroups []APIGroup) string {
} }
//line app/vmalert/web.qtpl:169 //line app/vmalert/web.qtpl:169
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []GroupAlerts) { func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:169 //line app/vmalert/web.qtpl:169
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -712,7 +712,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
`) `)
//line app/vmalert/web.qtpl:185 //line app/vmalert/web.qtpl:185
var keys []string var keys []string
alertsByRule := make(map[string][]*APIAlert) alertsByRule := make(map[string][]*apiAlert)
for _, alert := range ga.Alerts { for _, alert := range ga.Alerts {
if len(alertsByRule[alert.RuleID]) < 1 { if len(alertsByRule[alert.RuleID]) < 1 {
keys = append(keys, alert.RuleID) keys = append(keys, alert.RuleID)
@ -891,7 +891,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
} }
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []GroupAlerts) { func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
qw422016 := qt422016.AcquireWriter(qq422016) qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
@ -902,7 +902,7 @@ func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []
} }
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) string { func ListAlerts(r *http.Request, groupAlerts []groupAlerts) string {
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
qb422016 := qt422016.AcquireByteBuffer() qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:255 //line app/vmalert/web.qtpl:255
@ -1091,7 +1091,7 @@ func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Tar
} }
//line app/vmalert/web.qtpl:313 //line app/vmalert/web.qtpl:313
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) { func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:313 //line app/vmalert/web.qtpl:313
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -1274,7 +1274,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
} }
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) { func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *apiAlert) {
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
qw422016 := qt422016.AcquireWriter(qq422016) qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
@ -1285,7 +1285,7 @@ func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
} }
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
func Alert(r *http.Request, alert *APIAlert) string { func Alert(r *http.Request, alert *apiAlert) string {
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
qb422016 := qt422016.AcquireByteBuffer() qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:397 //line app/vmalert/web.qtpl:397
@ -1300,7 +1300,7 @@ func Alert(r *http.Request, alert *APIAlert) string {
} }
//line app/vmalert/web.qtpl:400 //line app/vmalert/web.qtpl:400
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule) { func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:400 //line app/vmalert/web.qtpl:400
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -1331,9 +1331,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
var seriesFetchedEnabled bool var seriesFetchedEnabled bool
var seriesFetchedWarning bool var seriesFetchedWarning bool
for _, u := range rule.Updates { for _, u := range rule.Updates {
if u.seriesFetched != nil { if u.SeriesFetched != nil {
seriesFetchedEnabled = true seriesFetchedEnabled = true
if *u.seriesFetched == 0 && u.samples == 0 { if *u.SeriesFetched == 0 && u.Samples == 0 {
seriesFetchedWarning = true seriesFetchedWarning = true
} }
} }
@ -1587,7 +1587,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(` qw422016.N().S(`
<tr`) <tr`)
//line app/vmalert/web.qtpl:540 //line app/vmalert/web.qtpl:540
if u.err != nil { if u.Err != nil {
//line app/vmalert/web.qtpl:540 //line app/vmalert/web.qtpl:540
qw422016.N().S(` class="alert-danger"`) qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:540 //line app/vmalert/web.qtpl:540
@ -1597,13 +1597,13 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
<td> <td>
<span class="badge bg-primary rounded-pill me-3" title="Updated at">`) <span class="badge bg-primary rounded-pill me-3" title="Updated at">`)
//line app/vmalert/web.qtpl:542 //line app/vmalert/web.qtpl:542
qw422016.E().S(u.time.Format(time.RFC3339)) qw422016.E().S(u.Time.Format(time.RFC3339))
//line app/vmalert/web.qtpl:542 //line app/vmalert/web.qtpl:542
qw422016.N().S(`</span> qw422016.N().S(`</span>
</td> </td>
<td class="text-center">`) <td class="text-center">`)
//line app/vmalert/web.qtpl:544 //line app/vmalert/web.qtpl:544
qw422016.N().D(u.samples) qw422016.N().D(u.Samples)
//line app/vmalert/web.qtpl:544 //line app/vmalert/web.qtpl:544
qw422016.N().S(`</td> qw422016.N().S(`</td>
`) `)
@ -1612,9 +1612,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
//line app/vmalert/web.qtpl:545 //line app/vmalert/web.qtpl:545
qw422016.N().S(`<td class="text-center">`) qw422016.N().S(`<td class="text-center">`)
//line app/vmalert/web.qtpl:545 //line app/vmalert/web.qtpl:545
if u.seriesFetched != nil { if u.SeriesFetched != nil {
//line app/vmalert/web.qtpl:545 //line app/vmalert/web.qtpl:545
qw422016.N().D(*u.seriesFetched) qw422016.N().D(*u.SeriesFetched)
//line app/vmalert/web.qtpl:545 //line app/vmalert/web.qtpl:545
} }
//line app/vmalert/web.qtpl:545 //line app/vmalert/web.qtpl:545
@ -1625,18 +1625,18 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(` qw422016.N().S(`
<td class="text-center">`) <td class="text-center">`)
//line app/vmalert/web.qtpl:546 //line app/vmalert/web.qtpl:546
qw422016.N().FPrec(u.duration.Seconds(), 3) qw422016.N().FPrec(u.Duration.Seconds(), 3)
//line app/vmalert/web.qtpl:546 //line app/vmalert/web.qtpl:546
qw422016.N().S(`s</td> qw422016.N().S(`s</td>
<td class="text-center">`) <td class="text-center">`)
//line app/vmalert/web.qtpl:547 //line app/vmalert/web.qtpl:547
qw422016.E().S(u.at.Format(time.RFC3339)) qw422016.E().S(u.At.Format(time.RFC3339))
//line app/vmalert/web.qtpl:547 //line app/vmalert/web.qtpl:547
qw422016.N().S(`</td> qw422016.N().S(`</td>
<td> <td>
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">`) <textarea class="curl-area" rows="1" onclick="this.focus();this.select()">`)
//line app/vmalert/web.qtpl:549 //line app/vmalert/web.qtpl:549
qw422016.E().S(u.curl) qw422016.E().S(u.Curl)
//line app/vmalert/web.qtpl:549 //line app/vmalert/web.qtpl:549
qw422016.N().S(`</textarea> qw422016.N().S(`</textarea>
</td> </td>
@ -1644,12 +1644,12 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
</li> </li>
`) `)
//line app/vmalert/web.qtpl:553 //line app/vmalert/web.qtpl:553
if u.err != nil { if u.Err != nil {
//line app/vmalert/web.qtpl:553 //line app/vmalert/web.qtpl:553
qw422016.N().S(` qw422016.N().S(`
<tr`) <tr`)
//line app/vmalert/web.qtpl:554 //line app/vmalert/web.qtpl:554
if u.err != nil { if u.Err != nil {
//line app/vmalert/web.qtpl:554 //line app/vmalert/web.qtpl:554
qw422016.N().S(` class="alert-danger"`) qw422016.N().S(` class="alert-danger"`)
//line app/vmalert/web.qtpl:554 //line app/vmalert/web.qtpl:554
@ -1671,7 +1671,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
qw422016.N().S(`"> qw422016.N().S(`">
<span class="alert-danger">`) <span class="alert-danger">`)
//line app/vmalert/web.qtpl:556 //line app/vmalert/web.qtpl:556
qw422016.E().V(u.err) qw422016.E().V(u.Err)
//line app/vmalert/web.qtpl:556 //line app/vmalert/web.qtpl:556
qw422016.N().S(`</span> qw422016.N().S(`</span>
</td> </td>
@ -1697,7 +1697,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
} }
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule) { func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule apiRule) {
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
qw422016 := qt422016.AcquireWriter(qq422016) qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
@ -1708,7 +1708,7 @@ func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule)
} }
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
func RuleDetails(r *http.Request, rule APIRule) string { func RuleDetails(r *http.Request, rule apiRule) string {
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
qb422016 := qt422016.AcquireByteBuffer() qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:563 //line app/vmalert/web.qtpl:563
@ -1853,7 +1853,7 @@ func badgeStabilizing() string {
} }
//line app/vmalert/web.qtpl:585 //line app/vmalert/web.qtpl:585
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) { func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:585 //line app/vmalert/web.qtpl:585
qw422016.N().S(` qw422016.N().S(`
`) `)
@ -1879,7 +1879,7 @@ func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
} }
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) { func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r apiRule) {
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
qw422016 := qt422016.AcquireWriter(qq422016) qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
@ -1890,7 +1890,7 @@ func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
} }
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
func seriesFetchedWarn(r APIRule) string { func seriesFetchedWarn(r apiRule) string {
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
qb422016 := qt422016.AcquireByteBuffer() qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/web.qtpl:596 //line app/vmalert/web.qtpl:596
@ -1905,6 +1905,6 @@ func seriesFetchedWarn(r APIRule) string {
} }
//line app/vmalert/web.qtpl:599 //line app/vmalert/web.qtpl:599
func isNoMatch(r APIRule) bool { func isNoMatch(r apiRule) bool {
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0 return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
} }

View file

@ -1,6 +1,7 @@
package main package main
import ( import (
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"net/http" "net/http"
@ -9,32 +10,29 @@ import (
"testing" "testing"
"time" "time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier" "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
) )
func TestHandler(t *testing.T) { func TestHandler(t *testing.T) {
ar := &AlertingRule{ fq := &datasource.FakeQuerier{}
Name: "alert", fq.Add(datasource.Metric{
alerts: map[uint64]*notifier.Alert{ Values: []float64{1}, Timestamps: []int64{0},
0: {State: notifier.StateFiring},
},
state: newRuleState(10),
}
ar.state.add(ruleStateEntry{
time: time.Now(),
at: time.Now(),
samples: 10,
}) })
rr := &RecordingRule{ g := &rule.Group{
Name: "record", Name: "group",
state: newRuleState(10), Concurrency: 1,
} }
g := &Group{ ar := rule.NewAlertingRule(fq, g, config.Rule{ID: 0, Alert: "alert"})
Name: "group", rr := rule.NewRecordingRule(fq, g, config.Rule{ID: 1, Record: "record"})
Rules: []Rule{ar, rr}, g.Rules = []rule.Rule{ar, rr}
} g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, nil, time.Time{})
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g m := &manager{groups: map[uint64]*rule.Group{
g.ID(): g,
}}
rh := &requestHandler{m: m} rh := &requestHandler{m: m}
getResp := func(url string, to interface{}, code int) { getResp := func(url string, to interface{}, code int) {
@ -70,13 +68,13 @@ func TestHandler(t *testing.T) {
}) })
t.Run("/vmalert/rule", func(t *testing.T) { t.Run("/vmalert/rule", func(t *testing.T) {
a := ar.ToAPI() a := ruleToAPI(ar)
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200) getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
r := rr.ToAPI() r := ruleToAPI(rr)
getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200) getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200)
}) })
t.Run("/vmalert/alert", func(t *testing.T) { t.Run("/vmalert/alert", func(t *testing.T) {
alerts := ar.AlertsToAPI() alerts := ruleToAPIAlert(ar)
for _, a := range alerts { for _, a := range alerts {
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200) getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
} }
@ -103,14 +101,14 @@ func TestHandler(t *testing.T) {
} }
}) })
t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) { t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) {
expAlert := ar.newAlertAPI(*ar.alerts[0]) expAlert := newAlertAPI(ar, ar.GetAlerts()[0])
alert := &APIAlert{} alert := &apiAlert{}
getResp(ts.URL+"/"+expAlert.APILink(), alert, 200) getResp(ts.URL+"/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) { if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert) t.Errorf("expected %v is equal to %v", alert, expAlert)
} }
alert = &APIAlert{} alert = &apiAlert{}
getResp(ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200) getResp(ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200)
if !reflect.DeepEqual(alert, expAlert) { if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert) t.Errorf("expected %v is equal to %v", alert, expAlert)
@ -148,7 +146,7 @@ func TestHandler(t *testing.T) {
} }
func TestEmptyResponse(t *testing.T) { func TestEmptyResponse(t *testing.T) {
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*Group)}} rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*rule.Group)}}
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) })) ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) }))
defer ts.Close() defer ts.Close()
@ -201,7 +199,7 @@ func TestEmptyResponse(t *testing.T) {
} }
}) })
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*Group{0: {Name: "test"}}}} rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*rule.Group{0: {Name: "test"}}}}
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) }) ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) })
t.Run("empty group /api/v1/rules", func(t *testing.T) { t.Run("empty group /api/v1/rules", func(t *testing.T) {

View file

@ -2,13 +2,28 @@ package main
import ( import (
"fmt" "fmt"
"net/url"
"sort"
"strconv"
"time" "time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
) )
// APIAlert represents a notifier.AlertingRule state const (
// paramGroupID is the URL query parameter key holding the group ID
paramGroupID = "group_id"
// paramAlertID is the URL query parameter key holding the alert ID
paramAlertID = "alert_id"
// paramRuleID is the URL query parameter key holding the rule ID
paramRuleID = "rule_id"
)
// apiAlert represents a notifier.AlertingRule state
// for WEB view // for WEB view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules // https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIAlert struct { type apiAlert struct {
State string `json:"state"` State string `json:"state"`
Name string `json:"name"` Name string `json:"name"`
Value string `json:"value"` Value string `json:"value"`
@ -38,24 +53,24 @@ type APIAlert struct {
} }
// WebLink returns a link to the alert which can be used in UI. // WebLink returns a link to the alert which can be used in UI.
func (aa *APIAlert) WebLink() string { func (aa *apiAlert) WebLink() string {
return fmt.Sprintf("alert?%s=%s&%s=%s", return fmt.Sprintf("alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID) paramGroupID, aa.GroupID, paramAlertID, aa.ID)
} }
// APILink returns a link to the alert's JSON representation. // APILink returns a link to the alert's JSON representation.
func (aa *APIAlert) APILink() string { func (aa *apiAlert) APILink() string {
return fmt.Sprintf("api/v1/alert?%s=%s&%s=%s", return fmt.Sprintf("api/v1/alert?%s=%s&%s=%s",
paramGroupID, aa.GroupID, paramAlertID, aa.ID) paramGroupID, aa.GroupID, paramAlertID, aa.ID)
} }
// APIGroup represents Group for WEB view // apiGroup represents Group for web view
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules // https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIGroup struct { type apiGroup struct {
// Name is the group name as present in the config // Name is the group name as present in the config
Name string `json:"name"` Name string `json:"name"`
// Rules contains both recording and alerting rules // Rules contains both recording and alerting rules
Rules []APIRule `json:"rules"` Rules []apiRule `json:"rules"`
// Interval is the Group's evaluation interval in float seconds as present in the file. // Interval is the Group's evaluation interval in float seconds as present in the file.
Interval float64 `json:"interval"` Interval float64 `json:"interval"`
// LastEvaluation is the timestamp of the last time the Group was executed // LastEvaluation is the timestamp of the last time the Group was executed
@ -81,15 +96,15 @@ type APIGroup struct {
Labels map[string]string `json:"labels,omitempty"` Labels map[string]string `json:"labels,omitempty"`
} }
// GroupAlerts represents a group of alerts for WEB view // groupAlerts represents a group of alerts for WEB view
type GroupAlerts struct { type groupAlerts struct {
Group APIGroup Group apiGroup
Alerts []*APIAlert Alerts []*apiAlert
} }
// APIRule represents a Rule for WEB view // apiRule represents a Rule for web view
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules // see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
type APIRule struct { type apiRule struct {
// State must be one of these under following scenarios // State must be one of these under following scenarios
// "pending": at least 1 alert in the rule in pending state and no other alert in firing ruleState. // "pending": at least 1 alert in the rule in pending state and no other alert in firing ruleState.
// "firing": at least 1 alert in the rule in firing state. // "firing": at least 1 alert in the rule in firing state.
@ -111,7 +126,7 @@ type APIRule struct {
// LastEvaluation is the timestamp of the last time the rule was executed // LastEvaluation is the timestamp of the last time the rule was executed
LastEvaluation time.Time `json:"lastEvaluation"` LastEvaluation time.Time `json:"lastEvaluation"`
// Alerts is the list of all the alerts in this rule that are currently pending or firing // Alerts is the list of all the alerts in this rule that are currently pending or firing
Alerts []*APIAlert `json:"alerts,omitempty"` Alerts []*apiAlert `json:"alerts,omitempty"`
// Health is the health of rule evaluation. // Health is the health of rule evaluation.
// It MUST be one of "ok", "err", "unknown" // It MUST be one of "ok", "err", "unknown"
Health string `json:"health"` Health string `json:"health"`
@ -138,11 +153,206 @@ type APIRule struct {
// MaxUpdates is the max number of recorded ruleStateEntry objects // MaxUpdates is the max number of recorded ruleStateEntry objects
MaxUpdates int `json:"max_updates_entries"` MaxUpdates int `json:"max_updates_entries"`
// Updates contains the ordered list of recorded ruleStateEntry objects // Updates contains the ordered list of recorded ruleStateEntry objects
Updates []ruleStateEntry `json:"-"` Updates []rule.StateEntry `json:"-"`
} }
// WebLink returns a link to the alert which can be used in UI. // WebLink returns a link to the alert which can be used in UI.
func (ar APIRule) WebLink() string { func (ar apiRule) WebLink() string {
return fmt.Sprintf("rule?%s=%s&%s=%s", return fmt.Sprintf("rule?%s=%s&%s=%s",
paramGroupID, ar.GroupID, paramRuleID, ar.ID) paramGroupID, ar.GroupID, paramRuleID, ar.ID)
} }
// ruleToAPI converts a rule of any supported kind into its apiRule form.
//
// Alerting and recording rules are dispatched to their dedicated
// converters; any other value yields a zero apiRule.
func ruleToAPI(r interface{}) apiRule {
	switch typed := r.(type) {
	case *rule.AlertingRule:
		return alertingToAPI(typed)
	case *rule.RecordingRule:
		return recordingToAPI(typed)
	default:
		return apiRule{}
	}
}
// recordingToAPI returns the apiRule representation of a recording rule.
//
// Evaluation details (time, duration, samples, fetched series, error) are
// taken from the rule's most recent state entry. Health is "err" when that
// entry carries an error and "ok" otherwise.
func recordingToAPI(rr *rule.RecordingRule) apiRule {
	last := rule.GetLastEntry(rr)

	// Derive health and error text up front so the result can be built in one go.
	health, lastErr := "ok", ""
	if err := last.Err; err != nil {
		health, lastErr = "err", err.Error()
	}

	return apiRule{
		Type:              "recording",
		DatasourceType:    rr.Type.String(),
		Name:              rr.Name,
		Query:             rr.Expr,
		Labels:            rr.Labels,
		LastEvaluation:    last.Time,
		EvaluationTime:    last.Duration.Seconds(),
		Health:            health,
		LastError:         lastErr,
		LastSamples:       last.Samples,
		LastSeriesFetched: last.SeriesFetched,
		MaxUpdates:        rule.GetRuleStateSize(rr),
		Updates:           rule.GetAllRuleState(rr),

		// IDs are encoded as strings to avoid rounding
		ID:      fmt.Sprintf("%d", rr.ID()),
		GroupID: fmt.Sprintf("%d", rr.GroupID),
	}
}
// alertingToAPI returns the apiRule representation of an alerting rule.
//
// Evaluation details come from the rule's most recent state entry. Health is
// "err" when that entry carries an error. State is "inactive" when the rule
// has no reportable alerts, "firing" when at least one alert is firing, and
// "pending" otherwise.
func alertingToAPI(ar *rule.AlertingRule) apiRule {
	last := rule.GetLastEntry(ar)
	res := apiRule{
		Type:              "alerting",
		DatasourceType:    ar.Type.String(),
		Name:              ar.Name,
		Query:             ar.Expr,
		Duration:          ar.For.Seconds(),
		KeepFiringFor:     ar.KeepFiringFor.Seconds(),
		Labels:            ar.Labels,
		Annotations:       ar.Annotations,
		LastEvaluation:    last.Time,
		EvaluationTime:    last.Duration.Seconds(),
		Health:            "ok",
		State:             "inactive",
		Alerts:            ruleToAPIAlert(ar),
		LastSamples:       last.Samples,
		LastSeriesFetched: last.SeriesFetched,
		MaxUpdates:        rule.GetRuleStateSize(ar),
		Updates:           rule.GetAllRuleState(ar),
		Debug:             ar.Debug,

		// IDs are encoded as strings to avoid rounding in JSON
		ID:      fmt.Sprintf("%d", ar.ID()),
		GroupID: fmt.Sprintf("%d", ar.GroupID),
	}
	if err := last.Err; err != nil {
		res.LastError = err.Error()
		res.Health = "err"
	}

	// No alerts: the rule stays "inactive".
	if len(res.Alerts) == 0 {
		return res
	}

	// At least one alert exists, so the rule is pending unless any alert fires.
	res.State = notifier.StatePending.String()
	firing := notifier.StateFiring.String()
	for _, alert := range res.Alerts {
		if alert.State == firing {
			res.State = firing
			break
		}
	}
	return res
}
// ruleToAPIAlert builds apiAlert objects for every alert of the given rule
// that is not in the inactive state. It returns nil when there are none.
func ruleToAPIAlert(ar *rule.AlertingRule) []*apiAlert {
	var res []*apiAlert
	for _, alert := range ar.GetAlerts() {
		if alert.State != notifier.StateInactive {
			res = append(res, newAlertAPI(ar, alert))
		}
	}
	return res
}
// alertToAPI looks up the rule's alert by its id (hash) and converts it into
// an apiAlert. It returns nil when the rule has no alert with that id.
func alertToAPI(ar *rule.AlertingRule, id uint64) *apiAlert {
	if alert := ar.GetAlert(id); alert != nil {
		return newAlertAPI(ar, alert)
	}
	return nil
}
// newAlertAPI creates an apiAlert for the given notifier.Alert that belongs
// to the alerting rule ar.
//
// SourceLink is filled only when alertURLGeneratorFn is set. Stabilizing is
// true for a firing alert whose KeepFiringSince timestamp is non-zero.
func newAlertAPI(ar *rule.AlertingRule, a *notifier.Alert) *apiAlert {
	res := &apiAlert{
		// IDs are encoded as strings to avoid rounding
		ID:      fmt.Sprintf("%d", a.ID),
		GroupID: fmt.Sprintf("%d", a.GroupID),
		RuleID:  fmt.Sprintf("%d", ar.RuleID),

		Name:        a.Name,
		Expression:  ar.Expr,
		Labels:      a.Labels,
		Annotations: a.Annotations,
		State:       a.State.String(),
		ActiveAt:    a.ActiveAt,
		Restored:    a.Restored,
		// the value is formatted with bitSize 32, i.e. rounded as a float32 would be
		Value:       strconv.FormatFloat(a.Value, 'f', -1, 32),
		Stabilizing: a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero(),
	}
	if alertURLGeneratorFn != nil {
		res.SourceLink = alertURLGeneratorFn(*a)
	}
	return res
}
// groupToAPI returns the apiGroup representation of the given group.
//
// The conversion works on a deep copy of the group, so the passed-in group
// is only read by DeepCopy itself. Rules is always a non-nil slice, even
// when the group has no rules.
func groupToAPI(g *rule.Group) apiGroup {
	snapshot := g.DeepCopy()

	res := apiGroup{
		// the ID is encoded as a string to avoid rounding
		ID:              fmt.Sprintf("%d", snapshot.ID()),
		Name:            snapshot.Name,
		Type:            snapshot.Type.String(),
		File:            snapshot.File,
		Interval:        snapshot.Interval.Seconds(),
		LastEvaluation:  snapshot.LastEvaluation,
		Concurrency:     snapshot.Concurrency,
		Params:          urlValuesToStrings(snapshot.Params),
		Headers:         headersToStrings(snapshot.Headers),
		NotifierHeaders: headersToStrings(snapshot.NotifierHeaders),
		Labels:          snapshot.Labels,
		Rules:           make([]apiRule, 0),
	}
	for _, r := range snapshot.Rules {
		res.Rules = append(res.Rules, ruleToAPI(r))
	}
	return res
}
// urlValuesToStrings flattens url.Values into a list of "key=value" strings.
//
// Keys are emitted in ascending order; the values of a single key keep their
// original order. It returns nil for empty input.
func urlValuesToStrings(values url.Values) []string {
	if len(values) == 0 {
		return nil
	}

	// Map iteration order is random, so sort the keys for a stable output.
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)

	var pairs []string
	for _, name := range names {
		for _, val := range values[name] {
			pairs = append(pairs, fmt.Sprintf("%s=%s", name, val))
		}
	}
	return pairs
}
// headersToStrings renders a header map as a list of "key: value" strings
// ordered by key. It returns nil for empty input.
func headersToStrings(headers map[string]string) []string {
	if len(headers) == 0 {
		return nil
	}

	// Map iteration order is random, so sort the keys for a stable output.
	names := make([]string, 0, len(headers))
	for name := range headers {
		names = append(names, name)
	}
	sort.Strings(names)

	lines := make([]string, len(names))
	for i, name := range names {
		lines[i] = fmt.Sprintf("%s: %s", name, headers[name])
	}
	return lines
}

View file

@ -0,0 +1,23 @@
package main
import (
"testing"
)
// TestUrlValuesToStrings verifies that urlValuesToStrings renders query
// params as "key=value" strings ordered by key.
func TestUrlValuesToStrings(t *testing.T) {
	mapQueryParams := map[string][]string{
		"param1": {"param1"},
		"param2": {"anotherparam"},
	}
	expectedRes := []string{"param1=param1", "param2=anotherparam"}
	res := urlValuesToStrings(mapQueryParams)

	// Stop right away on a length mismatch: the element-wise comparison below
	// indexes res by position and would panic if res were shorter.
	if len(res) != len(expectedRes) {
		t.Fatalf("Expected length %d, but got %d", len(expectedRes), len(res))
	}
	for ind, val := range expectedRes {
		if val != res[ind] {
			t.Errorf("Expected %v; but got %v", val, res[ind])
		}
	}
}

View file

@ -33,7 +33,7 @@ var (
) )
var ( var (
saCfgReloaderStopCh = make(chan struct{}) saCfgReloaderStopCh chan struct{}
saCfgReloaderWG sync.WaitGroup saCfgReloaderWG sync.WaitGroup
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`) saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
@ -62,6 +62,8 @@ func CheckStreamAggrConfig() error {
// //
// MustStopStreamAggr must be called when stream aggr is no longer needed. // MustStopStreamAggr must be called when stream aggr is no longer needed.
func InitStreamAggr() { func InitStreamAggr() {
saCfgReloaderStopCh = make(chan struct{})
if *streamAggrConfig == "" { if *streamAggrConfig == "" {
return return
} }

View file

@ -1,11 +1,11 @@
--- ---
sort: 20 sort: 29
weight: 20 weight: 29
title: Articles title: Articles
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 20 weight: 29
aliases: aliases:
- /Articles.html - /Articles.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 23 sort: 32
weight: 23 weight: 32
title: VictoriaMetrics best practices title: VictoriaMetrics best practices
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 23 weight: 32
aliases: aliases:
- /BestPractices.html - /BestPractices.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 16 sort: 25
weight: 16 weight: 25
title: CHANGELOG title: CHANGELOG
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 16 weight: 25
aliases: aliases:
- /CHANGELOG.html - /CHANGELOG.html
--- ---
@ -44,8 +44,10 @@ The sandbox cluster installation is running under the constant load generated by
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve repeated VMUI page load times by enabling caching of static js and css at web browser side according to [these recommendations](https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve repeated VMUI page load times by enabling caching of static js and css at web browser side according to [these recommendations](https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show information about lines with bigger values at the top of the legend under the graph in order to simplify graph analysis. * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show information about lines with bigger values at the top of the legend under the graph in order to simplify graph analysis.
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reduce vertical space usage, so more information is visible on the screen without scrolling. * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reduce vertical space usage, so more information is visible on the screen without scrolling.
* FEATURE: [vmalert-tool](https://docs.victoriametrics.com/#vmalert-tool): add `unittest` command for running unit tests for alerting and recording rules. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4789) for details.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on machines with 32-bit architectures.
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087) for details. * BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087) for details.
* BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component. * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.

View file

@ -1,11 +1,11 @@
--- ---
sort: 19 sort: 28
weight: 19 weight: 28
title: CHANGELOG for the year 2020 title: CHANGELOG for the year 2020
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 19 weight: 28
aliases: aliases:
- /CHANGELOG.html - /CHANGELOG.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 18 sort: 27
weight: 18 weight: 27
title: CHANGELOG for the year 2021 title: CHANGELOG for the year 2021
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 18 weight: 27
aliases: aliases:
- /CHANGELOG.html - /CHANGELOG.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 17 sort: 26
weight: 17 weight: 26
title: CHANGELOG for the year 2022 title: CHANGELOG for the year 2022
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 17 weight: 26
aliases: aliases:
- /CHANGELOG.html - /CHANGELOG.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 12 sort: 21
weight: 12 weight: 21
title: Case studies and talks title: Case studies and talks
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 12 weight: 21
aliases: aliases:
- /CaseStudies.html - /CaseStudies.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 15 sort: 24
weight: 15 weight: 24
title: FAQ title: FAQ
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 15 weight: 24
aliases: aliases:
- /FAQ.html - /FAQ.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 14 sort: 23
weight: 14 weight: 23
title: MetricsQL title: MetricsQL
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 14 weight: 23
aliases: aliases:
- /ExtendedPromQL.html - /ExtendedPromQL.html
- /MetricsQL.html - /MetricsQL.html

View file

@ -1,11 +1,11 @@
--- ---
sort: 22 sort: 31
weight: 22 weight: 31
title: VictoriaMetrics Cluster Per Tenant Statistic title: VictoriaMetrics Cluster Per Tenant Statistic
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 22 weight: 31
aliases: aliases:
- /PerTenantStatistic.html - /PerTenantStatistic.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 13 sort: 22
weight: 13 weight: 22
title: Quick start title: Quick start
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 13 weight: 22
aliases: aliases:
- /Quick-Start.html - /Quick-Start.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 21 sort: 30
weight: 21 weight: 30
title: Release process guidance title: Release process guidance
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 21 weight: 30
aliases: aliases:
- /Release-Guide.html - /Release-Guide.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 26 sort: 35
weight: 26 weight: 35
title: Troubleshooting title: Troubleshooting
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 26 weight: 35
aliases: aliases:
- /Troubleshooting.html - /Troubleshooting.html
--- ---

View file

@ -4,7 +4,7 @@ weight: 99
title: VictoriaMetrics Enterprise title: VictoriaMetrics Enterprise
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 99 weight: 99
aliases: aliases:
- /enterprise.html - /enterprise.html

View file

@ -1,11 +1,11 @@
--- ---
sort: 25 sort: 34
weight: 25 weight: 34
title: Key concepts title: Key concepts
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 25 weight: 34
aliases: aliases:
- /keyConcepts.html - /keyConcepts.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 28 sort: 37
weight: 28 weight: 37
title: Relabeling cookbook title: Relabeling cookbook
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 25 weight: 37
aliases: aliases:
- /relabeling.html - /relabeling.html
--- ---

View file

@ -1,11 +1,11 @@
--- ---
sort: 27 sort: 36
weight: 27 weight: 36
title: Prometheus service discovery title: Prometheus service discovery
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 27 weight: 36
aliases: aliases:
- /sd_configs.html - /sd_configs.html
--- ---

View file

@ -4,7 +4,7 @@ weight: 98
title: Streaming aggregation title: Streaming aggregation
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 98 weight: 98
aliases: aliases:
- /stream-aggregation.html - /stream-aggregation.html

View file

@ -1,11 +1,11 @@
--- ---
sort: 24 sort: 33
weight: 24 weight: 33
title: VictoriaMetrics API examples title: VictoriaMetrics API examples
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 24 weight: 33
--- ---
# VictoriaMetrics API examples # VictoriaMetrics API examples

253
docs/vmalert-tool.md Normal file
View file

@ -0,0 +1,253 @@
---
sort: 12
weight: 12
menu:
docs:
parent: 'victoriametrics'
weight: 12
title: vmalert-tool
---
# vmalert-tool
VMAlert command-line tool
## Unit testing for rules
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
It will perform the following actions:
* sets up an isolated VictoriaMetrics instance;
* simulates the periodic ingestion of time series;
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
* checks whether the firing alerts or resulting recording rules match the expected results.
See how to run vmalert-tool for unit test below:
```
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
./vmalert-tool unittest --files test1.yaml --files test2.yaml
```
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
### Test file format
The configuration format for files specified in `--files` cmd-line flag is the following:
```
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
rule_files:
[ - <string> ]
# The evaluation interval for rules specified in `rule_files`
[ evaluation_interval: <duration> | default = 1m ]
# Groups listed below will be evaluated in the given order.
# Not all the groups need to be mentioned: groups omitted here will be evaluated in the order they are defined in rule_files.
group_eval_order:
[ - <string> ]
# The list of unit test groups to be checked during evaluation.
tests:
[ - <test_group> ]
```
#### `<test_group>`
```
# Interval between samples for input series
interval: <duration>
# Time series to persist into the database according to configured <interval> before running tests.
input_series:
[ - <series> ]
# Name of the test group, optional
[ name: <string> ]
# Unit tests for alerting rules
alert_rule_test:
[ - <alert_test_case> ]
# Unit tests for MetricsQL expressions.
metricsql_expr_test:
[ - <metricsql_expr_test> ]
# External labels accessible for templating.
external_labels:
[ <labelname>: <string> ... ]
```
#### `<series>`
```
# series in the following format '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
series: <string>
# values support several special equations:
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
# Read this as series starts at a, then c further samples incrementing by b.
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
# '_' represents a missing sample from scrape
# 'stale' indicates a stale sample
# Examples:
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
values: <string>
```
#### `<alert_test_case>`
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
but no need to add them under `exp_alerts`.
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
```
# The time elapsed from time=0s when this alerting rule should be checked.
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
eval_time: <duration>
# Name of the group to be tested.
groupname: <string>
# Name of the alert to be tested.
alertname: <string>
# List of the expected alerts that are firing under the given alertname at
# the given evaluation time. If you want to test if an alerting rule should
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
exp_alerts:
[ - <alert> ]
```
#### `<alert>`
```
# These are the expanded labels and annotations of the expected alert.
# Note: labels also include the labels of the sample associated with the alert
exp_labels:
[ <labelname>: <string> ]
exp_annotations:
[ <labelname>: <string> ]
```
#### `<metricsql_expr_test>`
```
# Expression to evaluate
expr: <string>
# The time elapsed from time=0s when this expression should be evaluated.
eval_time: <duration>
# Expected samples at the given evaluation time.
exp_samples:
[ - <sample> ]
```
#### `<sample>`
```
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
# Examples:
# series_name{label1="value1", label2="value2"}
# go_goroutines{job="prometheus", instance="localhost:9090"}
labels: <string>
# The expected value of the MetricsQL expression.
value: <number>
```
### Example
This is an example input file for unit testing which will pass.
`test.yaml` is the test file which follows the syntax above and `rules.yaml` contains the alerting and recording rules.
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
#### `test.yaml`
```
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'up{job="prometheus", instance="localhost:9090"}'
values: "0+0x1440"
metricsql_expr_test:
- expr: suquery_interval_test
eval_time: 4m
exp_samples:
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
value: 1
alert_rule_test:
- eval_time: 2h
groupname: group1
alertname: InstanceDown
exp_alerts:
- exp_labels:
job: prometheus
severity: page
instance: localhost:9090
datacenter: dc-123
exp_annotations:
summary: "Instance localhost:9090 down"
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
- eval_time: 0
groupname: group1
alertname: AlwaysFiring
exp_alerts:
- exp_labels:
datacenter: dc-123
- eval_time: 0
groupname: group1
alertname: InstanceDown
exp_alerts: []
external_labels:
datacenter: dc-123
```
#### `rules.yaml`
```
# This is the rules file.
groups:
- name: group1
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: AlwaysFiring
expr: 1
- name: group2
rules:
- record: job:test:count_over_time1m
expr: sum without(instance) (count_over_time(test[1m]))
- record: suquery_interval_test
expr: count_over_time(up[5m:])
```

View file

@ -765,6 +765,11 @@ See full description for these flags in `./vmalert -help`.
* `limit` group's param has no effect during replay (might be changed in future); * `limit` group's param has no effect during replay (might be changed in future);
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future). * `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
## Unit Testing for Rules
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
## Monitoring ## Monitoring
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page. `vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.

View file

@ -4,7 +4,7 @@ weight: 11
title: vmanomaly title: vmanomaly
menu: menu:
docs: docs:
parent: "victoriametrics" parent: 'victoriametrics'
weight: 11 weight: 11
aliases: aliases:
- /vmanomaly.html - /vmanomaly.html

View file

@ -53,3 +53,11 @@ func ParseDuration(s string) (time.Duration, error) {
} }
return time.Duration(ms) * time.Millisecond, nil return time.Duration(ms) * time.Millisecond, nil
} }
// ParseTime converts pd into a time.Time by treating the duration it holds
// as a millisecond offset from the Unix epoch.
//
// A nil pd yields the zero time.Time value. The conversion goes through
// whole milliseconds, so any sub-millisecond part of the duration is dropped.
func (pd *Duration) ParseTime() time.Time {
	var t time.Time
	if pd != nil {
		offsetMs := pd.Duration().Milliseconds()
		t = time.UnixMilli(offsetMs)
	}
	return t
}