mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vmalert-tool: implement unittest (#4789)
1. split package rule under /app/vmalert, expose needed objects 2. add vmalert-tool with unittest subcmd https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2945
This commit is contained in:
parent
98a5007d32
commit
dc28196237
74 changed files with 3997 additions and 1683 deletions
62
Makefile
62
Makefile
|
@ -28,7 +28,8 @@ all: \
|
|||
vmauth-prod \
|
||||
vmbackup-prod \
|
||||
vmrestore-prod \
|
||||
vmctl-prod
|
||||
vmctl-prod \
|
||||
vmalert-tool-prod
|
||||
|
||||
clean:
|
||||
rm -rf bin/*
|
||||
|
@ -40,7 +41,8 @@ publish: package-base \
|
|||
publish-vmauth \
|
||||
publish-vmbackup \
|
||||
publish-vmrestore \
|
||||
publish-vmctl
|
||||
publish-vmctl \
|
||||
publish-vmalert-tool
|
||||
|
||||
package: \
|
||||
package-victoria-metrics \
|
||||
|
@ -50,7 +52,8 @@ package: \
|
|||
package-vmauth \
|
||||
package-vmbackup \
|
||||
package-vmrestore \
|
||||
package-vmctl
|
||||
package-vmctl \
|
||||
package-vmalert-tool
|
||||
|
||||
vmutils: \
|
||||
vmagent \
|
||||
|
@ -58,7 +61,8 @@ vmutils: \
|
|||
vmauth \
|
||||
vmbackup \
|
||||
vmrestore \
|
||||
vmctl
|
||||
vmctl \
|
||||
vmalert-tool
|
||||
|
||||
vmutils-pure: \
|
||||
vmagent-pure \
|
||||
|
@ -66,7 +70,8 @@ vmutils-pure: \
|
|||
vmauth-pure \
|
||||
vmbackup-pure \
|
||||
vmrestore-pure \
|
||||
vmctl-pure
|
||||
vmctl-pure \
|
||||
vmalert-tool-pure
|
||||
|
||||
vmutils-linux-amd64: \
|
||||
vmagent-linux-amd64 \
|
||||
|
@ -74,7 +79,8 @@ vmutils-linux-amd64: \
|
|||
vmauth-linux-amd64 \
|
||||
vmbackup-linux-amd64 \
|
||||
vmrestore-linux-amd64 \
|
||||
vmctl-linux-amd64
|
||||
vmctl-linux-amd64 \
|
||||
vmalert-tool-linux-amd64
|
||||
|
||||
vmutils-linux-arm64: \
|
||||
vmagent-linux-arm64 \
|
||||
|
@ -82,7 +88,8 @@ vmutils-linux-arm64: \
|
|||
vmauth-linux-arm64 \
|
||||
vmbackup-linux-arm64 \
|
||||
vmrestore-linux-arm64 \
|
||||
vmctl-linux-arm64
|
||||
vmctl-linux-arm64 \
|
||||
vmalert-tool-linux-arm64
|
||||
|
||||
vmutils-linux-arm: \
|
||||
vmagent-linux-arm \
|
||||
|
@ -90,7 +97,8 @@ vmutils-linux-arm: \
|
|||
vmauth-linux-arm \
|
||||
vmbackup-linux-arm \
|
||||
vmrestore-linux-arm \
|
||||
vmctl-linux-arm
|
||||
vmctl-linux-arm \
|
||||
vmalert-tool-linux-arm
|
||||
|
||||
vmutils-linux-386: \
|
||||
vmagent-linux-386 \
|
||||
|
@ -98,7 +106,8 @@ vmutils-linux-386: \
|
|||
vmauth-linux-386 \
|
||||
vmbackup-linux-386 \
|
||||
vmrestore-linux-386 \
|
||||
vmctl-linux-386
|
||||
vmctl-linux-386 \
|
||||
vmalert-tool-linux-386
|
||||
|
||||
vmutils-linux-ppc64le: \
|
||||
vmagent-linux-ppc64le \
|
||||
|
@ -106,7 +115,8 @@ vmutils-linux-ppc64le: \
|
|||
vmauth-linux-ppc64le \
|
||||
vmbackup-linux-ppc64le \
|
||||
vmrestore-linux-ppc64le \
|
||||
vmctl-linux-ppc64le
|
||||
vmctl-linux-ppc64le \
|
||||
vmalert-tool-linux-ppc64le
|
||||
|
||||
vmutils-darwin-amd64: \
|
||||
vmagent-darwin-amd64 \
|
||||
|
@ -114,7 +124,8 @@ vmutils-darwin-amd64: \
|
|||
vmauth-darwin-amd64 \
|
||||
vmbackup-darwin-amd64 \
|
||||
vmrestore-darwin-amd64 \
|
||||
vmctl-darwin-amd64
|
||||
vmctl-darwin-amd64 \
|
||||
vmalert-tool-darwin-amd64
|
||||
|
||||
vmutils-darwin-arm64: \
|
||||
vmagent-darwin-arm64 \
|
||||
|
@ -122,7 +133,8 @@ vmutils-darwin-arm64: \
|
|||
vmauth-darwin-arm64 \
|
||||
vmbackup-darwin-arm64 \
|
||||
vmrestore-darwin-arm64 \
|
||||
vmctl-darwin-arm64
|
||||
vmctl-darwin-arm64 \
|
||||
vmalert-tool-darwin-arm64
|
||||
|
||||
vmutils-freebsd-amd64: \
|
||||
vmagent-freebsd-amd64 \
|
||||
|
@ -130,7 +142,8 @@ vmutils-freebsd-amd64: \
|
|||
vmauth-freebsd-amd64 \
|
||||
vmbackup-freebsd-amd64 \
|
||||
vmrestore-freebsd-amd64 \
|
||||
vmctl-freebsd-amd64
|
||||
vmctl-freebsd-amd64 \
|
||||
vmalert-tool-freebsd-amd64
|
||||
|
||||
vmutils-openbsd-amd64: \
|
||||
vmagent-openbsd-amd64 \
|
||||
|
@ -138,7 +151,8 @@ vmutils-openbsd-amd64: \
|
|||
vmauth-openbsd-amd64 \
|
||||
vmbackup-openbsd-amd64 \
|
||||
vmrestore-openbsd-amd64 \
|
||||
vmctl-openbsd-amd64
|
||||
vmctl-openbsd-amd64 \
|
||||
vmalert-tool-openbsd-amd64
|
||||
|
||||
vmutils-windows-amd64: \
|
||||
vmagent-windows-amd64 \
|
||||
|
@ -146,7 +160,8 @@ vmutils-windows-amd64: \
|
|||
vmauth-windows-amd64 \
|
||||
vmbackup-windows-amd64 \
|
||||
vmrestore-windows-amd64 \
|
||||
vmctl-windows-amd64
|
||||
vmctl-windows-amd64 \
|
||||
vmalert-tool-windows-amd64
|
||||
|
||||
victoria-metrics-crossbuild: \
|
||||
victoria-metrics-linux-386 \
|
||||
|
@ -342,7 +357,8 @@ release-vmutils-goos-goarch: \
|
|||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod
|
||||
cd bin && \
|
||||
tar --transform="flags=r;s|-$(GOOS)-$(GOARCH)||" -czf vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
|
@ -351,6 +367,7 @@ release-vmutils-goos-goarch: \
|
|||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod
|
||||
&& sha256sum vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG).tar.gz \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-$(GOOS)-$(GOARCH)-prod \
|
||||
|
@ -358,6 +375,7 @@ release-vmutils-goos-goarch: \
|
|||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod \
|
||||
| sed s/-$(GOOS)-$(GOARCH)-prod/-prod/ > vmutils-$(GOOS)-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
cd bin && rm -rf \
|
||||
vmagent-$(GOOS)-$(GOARCH)-prod \
|
||||
|
@ -365,7 +383,8 @@ release-vmutils-goos-goarch: \
|
|||
vmauth-$(GOOS)-$(GOARCH)-prod \
|
||||
vmbackup-$(GOOS)-$(GOARCH)-prod \
|
||||
vmrestore-$(GOOS)-$(GOARCH)-prod \
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod
|
||||
vmctl-$(GOOS)-$(GOARCH)-prod \
|
||||
vmalert-tool-$(GOOS)-$(GOARCH)-prod
|
||||
|
||||
release-vmutils-windows-goarch: \
|
||||
vmagent-windows-$(GOARCH)-prod \
|
||||
|
@ -373,7 +392,8 @@ release-vmutils-windows-goarch: \
|
|||
vmauth-windows-$(GOARCH)-prod \
|
||||
vmbackup-windows-$(GOARCH)-prod \
|
||||
vmrestore-windows-$(GOARCH)-prod \
|
||||
vmctl-windows-$(GOARCH)-prod
|
||||
vmctl-windows-$(GOARCH)-prod \
|
||||
vmalert-tool-windows-$(GOARCH)-prod
|
||||
cd bin && \
|
||||
zip vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
|
@ -382,6 +402,7 @@ release-vmutils-windows-goarch: \
|
|||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
vmctl-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe \
|
||||
&& sha256sum vmutils-windows-$(GOARCH)-$(PKG_TAG).zip \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-windows-$(GOARCH)-prod.exe \
|
||||
|
@ -389,6 +410,7 @@ release-vmutils-windows-goarch: \
|
|||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
vmctl-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe \
|
||||
> vmutils-windows-$(GOARCH)-$(PKG_TAG)_checksums.txt
|
||||
cd bin && rm -rf \
|
||||
vmagent-windows-$(GOARCH)-prod.exe \
|
||||
|
@ -396,7 +418,8 @@ release-vmutils-windows-goarch: \
|
|||
vmauth-windows-$(GOARCH)-prod.exe \
|
||||
vmbackup-windows-$(GOARCH)-prod.exe \
|
||||
vmrestore-windows-$(GOARCH)-prod.exe \
|
||||
vmctl-windows-$(GOARCH)-prod.exe
|
||||
vmctl-windows-$(GOARCH)-prod.exe \
|
||||
vmalert-tool-windows-$(GOARCH)-prod.exe
|
||||
|
||||
pprof-cpu:
|
||||
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
|
||||
|
@ -514,3 +537,4 @@ docs-sync:
|
|||
SRC=app/vmctl/README.md DST=docs/vmctl.md OLD_URL='/vmctl.html' ORDER=8 TITLE=vmctl $(MAKE) copy-docs
|
||||
SRC=app/vmgateway/README.md DST=docs/vmgateway.md OLD_URL='/vmgateway.html' ORDER=9 TITLE=vmgateway $(MAKE) copy-docs
|
||||
SRC=app/vmbackupmanager/README.md DST=docs/vmbackupmanager.md OLD_URL='/vmbackupmanager.html' ORDER=10 TITLE=vmbackupmanager $(MAKE) copy-docs
|
||||
SRC=app/vmalert-tool/README.md DST=docs/vmalert-tool.md OLD_URL='' ORDER=12 TITLE=vmalert-tool $(MAKE) copy-docs
|
||||
|
|
103
app/vmalert-tool/Makefile
Normal file
103
app/vmalert-tool/Makefile
Normal file
|
@ -0,0 +1,103 @@
|
|||
# All these commands must run from repository root.
|
||||
|
||||
vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-local
|
||||
|
||||
vmalert-tool-race:
|
||||
APP_NAME=vmalert-tool RACE=-race $(MAKE) app-local
|
||||
|
||||
vmalert-tool-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker
|
||||
|
||||
vmalert-tool-pure-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-pure
|
||||
|
||||
vmalert-tool-linux-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-amd64
|
||||
|
||||
vmalert-tool-linux-arm-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm
|
||||
|
||||
vmalert-tool-linux-arm64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-arm64
|
||||
|
||||
vmalert-tool-linux-ppc64le-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-ppc64le
|
||||
|
||||
vmalert-tool-linux-386-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmalert-tool-darwin-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
vmalert-tool-darwin-arm64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-arm64
|
||||
|
||||
vmalert-tool-freebsd-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-freebsd-amd64
|
||||
|
||||
vmalert-tool-openbsd-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-openbsd-amd64
|
||||
|
||||
vmalert-tool-windows-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-windows-amd64
|
||||
|
||||
package-vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker
|
||||
|
||||
package-vmalert-tool-pure:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-pure
|
||||
|
||||
package-vmalert-tool-amd64:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-amd64
|
||||
|
||||
package-vmalert-tool-arm:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm
|
||||
|
||||
package-vmalert-tool-arm64:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-arm64
|
||||
|
||||
package-vmalert-tool-ppc64le:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-ppc64le
|
||||
|
||||
package-vmalert-tool-386:
|
||||
APP_NAME=vmalert-tool $(MAKE) package-via-docker-386
|
||||
|
||||
publish-vmalert-tool:
|
||||
APP_NAME=vmalert-tool $(MAKE) publish-via-docker
|
||||
|
||||
vmalert-tool-linux-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-arm:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-arm64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-ppc64le:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-s390x:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-linux-386:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-darwin-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-darwin-arm64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-freebsd-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-openbsd-amd64:
|
||||
APP_NAME=vmalert-tool CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch
|
||||
|
||||
vmalert-tool-windows-amd64:
|
||||
GOARCH=amd64 APP_NAME=vmalert-tool $(MAKE) app-local-windows-goarch
|
||||
|
||||
vmalert-tool-pure:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-local-pure
|
244
app/vmalert-tool/README.md
Normal file
244
app/vmalert-tool/README.md
Normal file
|
@ -0,0 +1,244 @@
|
|||
|
||||
# vmalert-tool
|
||||
|
||||
VMAlert command-line tool
|
||||
|
||||
## Unit testing for rules
|
||||
|
||||
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
|
||||
It will perform the following actions:
|
||||
* sets up an isolated VictoriaMetrics instance;
|
||||
* simulates the periodic ingestion of time series;
|
||||
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
|
||||
* checks whether the firing alerts or resulting recording rules match the expected results.
|
||||
|
||||
See how to run vmalert-tool for unit test below:
|
||||
```
|
||||
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
|
||||
./vmalert-tool unittest --files test1.yaml --files test2.yaml
|
||||
```
|
||||
|
||||
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
|
||||
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
|
||||
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
|
||||
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
|
||||
|
||||
### Test file format
|
||||
|
||||
The configuration format for files specified in `--files` cmd-line flag is the following:
|
||||
```
|
||||
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
|
||||
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
|
||||
rule_files:
|
||||
[ - <string> ]
|
||||
|
||||
# The evaluation interval for rules specified in `rule_files`
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
# Groups listed below will be evaluated by order.
|
||||
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
|
||||
group_eval_order:
|
||||
[ - <string> ]
|
||||
|
||||
# The list of unit test files to be checked during evaluation.
|
||||
tests:
|
||||
[ - <test_group> ]
|
||||
```
|
||||
|
||||
#### `<test_group>`
|
||||
|
||||
```
|
||||
# Interval between samples for input series
|
||||
interval: <duration>
|
||||
# Time series to persist into the database according to configured <interval> before running tests.
|
||||
input_series:
|
||||
[ - <series> ]
|
||||
|
||||
# Name of the test group, optional
|
||||
[ name: <string> ]
|
||||
|
||||
# Unit tests for alerting rules
|
||||
alert_rule_test:
|
||||
[ - <alert_test_case> ]
|
||||
|
||||
# Unit tests for Metricsql expressions.
|
||||
metricsql_expr_test:
|
||||
[ - <metricsql_expr_test> ]
|
||||
|
||||
# External labels accessible for templating.
|
||||
external_labels:
|
||||
[ <labelname>: <string> ... ]
|
||||
|
||||
```
|
||||
|
||||
#### `<series>`
|
||||
|
||||
```
|
||||
# series in the following format '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
series: <string>
|
||||
|
||||
# values support several special equations:
|
||||
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
|
||||
# Read this as series starts at a, then c further samples incrementing by b.
|
||||
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
|
||||
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
|
||||
# '_' represents a missing sample from scrape
|
||||
# 'stale' indicates a stale sample
|
||||
# Examples:
|
||||
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
|
||||
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
|
||||
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
|
||||
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
|
||||
values: <string>
|
||||
```
|
||||
|
||||
#### `<alert_test_case>`
|
||||
|
||||
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
|
||||
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
|
||||
but no need to add them under `exp_alerts`.
|
||||
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
|
||||
|
||||
```
|
||||
# The time elapsed from time=0s when this alerting rule should be checked.
|
||||
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
|
||||
eval_time: <duration>
|
||||
|
||||
# Name of the group name to be tested.
|
||||
groupname: <string>
|
||||
|
||||
# Name of the alert to be tested.
|
||||
alertname: <string>
|
||||
|
||||
# List of the expected alerts that are firing under the given alertname at
|
||||
# the given evaluation time. If you want to test if an alerting rule should
|
||||
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
|
||||
exp_alerts:
|
||||
[ - <alert> ]
|
||||
```
|
||||
|
||||
#### `<alert>`
|
||||
|
||||
```
|
||||
# These are the expanded labels and annotations of the expected alert.
|
||||
# Note: labels also include the labels of the sample associated with the alert
|
||||
exp_labels:
|
||||
[ <labelname>: <string> ]
|
||||
exp_annotations:
|
||||
[ <labelname>: <string> ]
|
||||
```
|
||||
|
||||
#### `<metricsql_expr_test>`
|
||||
|
||||
```
|
||||
# Expression to evaluate
|
||||
expr: <string>
|
||||
|
||||
# The time elapsed from time=0s when this expression be evaluated.
|
||||
eval_time: <duration>
|
||||
|
||||
# Expected samples at the given evaluation time.
|
||||
exp_samples:
|
||||
[ - <sample> ]
|
||||
```
|
||||
|
||||
#### `<sample>`
|
||||
|
||||
```
|
||||
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
labels: <string>
|
||||
|
||||
# The expected value of the Metricsql expression.
|
||||
value: <number>
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
This is an example input file for unit testing which will pass.
|
||||
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
|
||||
|
||||
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
|
||||
|
||||
#### `test.yaml`
|
||||
|
||||
```
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="prometheus", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: prometheus
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
```
|
||||
|
||||
#### `alerts.yaml`
|
||||
|
||||
```
|
||||
# This is the rules file.
|
||||
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
```
|
54
app/vmalert-tool/main.go
Normal file
54
app/vmalert-tool/main.go
Normal file
|
@ -0,0 +1,54 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert-tool/unittest"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
)
|
||||
|
||||
func main() {
|
||||
start := time.Now()
|
||||
app := &cli.App{
|
||||
Name: "vmalert-tool",
|
||||
Usage: "VMAlert command-line tool",
|
||||
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html",
|
||||
Version: buildinfo.Version,
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
Name: "unittest",
|
||||
Usage: "Run unittest for alerting and recording rules.",
|
||||
UsageText: "More info in https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringSliceFlag{
|
||||
Name: "files",
|
||||
Usage: "files to run unittest with. Supports an array of values separated by comma or specified via multiple flags.",
|
||||
Required: true,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "disableAlertgroupLabel",
|
||||
Usage: "disable adding group's Name as label to generated alerts and time series.",
|
||||
Required: false,
|
||||
},
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
if failed := unittest.UnitTest(c.StringSlice("files"), c.Bool("disableAlertgroupLabel")); failed {
|
||||
return fmt.Errorf("unittest failed")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
err := app.Run(os.Args)
|
||||
if err != nil {
|
||||
log.Fatalln(err)
|
||||
}
|
||||
log.Printf("Total time: %v", time.Since(start))
|
||||
}
|
19
app/vmalert-tool/unittest/alerting.go
Normal file
19
app/vmalert-tool/unittest/alerting.go
Normal file
|
@ -0,0 +1,19 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
)
|
||||
|
||||
// alertTestCase holds alert_rule_test cases defined in test file
|
||||
type alertTestCase struct {
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
GroupName string `yaml:"groupname"`
|
||||
Alertname string `yaml:"alertname"`
|
||||
ExpAlerts []expAlert `yaml:"exp_alerts"`
|
||||
}
|
||||
|
||||
// expAlert holds exp_alerts defined in test file
|
||||
type expAlert struct {
|
||||
ExpLabels map[string]string `yaml:"exp_labels"`
|
||||
ExpAnnotations map[string]string `yaml:"exp_annotations"`
|
||||
}
|
182
app/vmalert-tool/unittest/input.go
Normal file
182
app/vmalert-tool/unittest/input.go
Normal file
|
@ -0,0 +1,182 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
testutil "github.com/VictoriaMetrics/VictoriaMetrics/app/victoria-metrics/test"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// series holds input_series defined in the test file
|
||||
type series struct {
|
||||
Series string `yaml:"series"`
|
||||
Values string `yaml:"values"`
|
||||
}
|
||||
|
||||
// sequenceValue is an omittable value in a sequence of time series values.
|
||||
type sequenceValue struct {
|
||||
Value float64
|
||||
Omitted bool
|
||||
}
|
||||
|
||||
func httpWrite(address string, r io.Reader) {
|
||||
resp, err := http.Post(address, "", r)
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to send to storage: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// writeInputSeries send input series to vmstorage and flush them
|
||||
func writeInputSeries(input []series, interval *promutils.Duration, startStamp time.Time, dst string) error {
|
||||
r := testutil.WriteRequest{}
|
||||
for _, data := range input {
|
||||
expr, err := metricsql.Parse(data.Series)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse series %s: %v", data.Series, err)
|
||||
}
|
||||
promvals, err := parseInputValue(data.Values, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse input series value %s: %v", data.Values, err)
|
||||
}
|
||||
metricExpr, ok := expr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to parse series %s to metric expr: %v", data.Series, err)
|
||||
}
|
||||
samples := make([]testutil.Sample, 0, len(promvals))
|
||||
ts := startStamp
|
||||
for _, v := range promvals {
|
||||
if !v.Omitted {
|
||||
samples = append(samples, testutil.Sample{
|
||||
Timestamp: ts.UnixMilli(),
|
||||
Value: v.Value,
|
||||
})
|
||||
}
|
||||
ts = ts.Add(interval.Duration())
|
||||
}
|
||||
var ls []testutil.Label
|
||||
for _, filter := range metricExpr.LabelFilterss[0] {
|
||||
ls = append(ls, testutil.Label{Name: filter.Label, Value: filter.Value})
|
||||
}
|
||||
r.Timeseries = append(r.Timeseries, testutil.TimeSeries{Labels: ls, Samples: samples})
|
||||
}
|
||||
|
||||
data, err := testutil.Compress(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compress data: %v", err)
|
||||
}
|
||||
// write input series to vm
|
||||
httpWrite(dst, bytes.NewBuffer(data))
|
||||
vmstorage.Storage.DebugFlush()
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseInputValue support input like "1", "1+1x1 _ -4 3+20x1", see more examples in test.
|
||||
func parseInputValue(input string, origin bool) ([]sequenceValue, error) {
|
||||
var res []sequenceValue
|
||||
items := strings.Split(input, " ")
|
||||
reg := regexp.MustCompile(`\D?\d*\D?`)
|
||||
for _, item := range items {
|
||||
if item == "stale" {
|
||||
res = append(res, sequenceValue{Value: decimal.StaleNaN})
|
||||
continue
|
||||
}
|
||||
vals := reg.FindAllString(item, -1)
|
||||
switch len(vals) {
|
||||
case 1:
|
||||
if vals[0] == "_" {
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v, err := strconv.ParseFloat(vals[0], 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v})
|
||||
continue
|
||||
case 2:
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v2, err := strconv.ParseInt(vals[1], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
switch option {
|
||||
case '+':
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, sequenceValue{Value: v1 + float64(v2)})
|
||||
case 'x':
|
||||
for i := int64(0); i <= v2; i++ {
|
||||
if p1 == "_" {
|
||||
if i == 0 {
|
||||
i = 1
|
||||
}
|
||||
res = append(res, sequenceValue{Omitted: true})
|
||||
continue
|
||||
}
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !origin || v1 == 0 {
|
||||
res = append(res, sequenceValue{Value: v1 * float64(i)})
|
||||
continue
|
||||
}
|
||||
newVal := fmt.Sprintf("%s+0x%s", p1, vals[1])
|
||||
newRes, err := parseInputValue(newVal, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res = append(res, newRes...)
|
||||
break
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("got invalid operation %b", option)
|
||||
}
|
||||
case 3:
|
||||
r1, err := parseInputValue(fmt.Sprintf("%s%s", vals[1], vals[2]), false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
p1 := vals[0][:len(vals[0])-1]
|
||||
v1, err := strconv.ParseFloat(p1, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
option := vals[0][len(vals[0])-1]
|
||||
var isAdd bool
|
||||
if option == '+' {
|
||||
isAdd = true
|
||||
}
|
||||
for _, r := range r1 {
|
||||
if isAdd {
|
||||
res = append(res, sequenceValue{
|
||||
Value: r.Value + v1,
|
||||
})
|
||||
} else {
|
||||
res = append(res, sequenceValue{
|
||||
Value: v1 - r.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported input %s", input)
|
||||
}
|
||||
}
|
||||
return res, nil
|
||||
}
|
93
app/vmalert-tool/unittest/input_test.go
Normal file
93
app/vmalert-tool/unittest/input_test.go
Normal file
|
@ -0,0 +1,93 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
)
|
||||
|
||||
func TestParseInputValue(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
exp []sequenceValue
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
"",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"testfailed",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
// stale doesn't support operations
|
||||
{
|
||||
"stalex3",
|
||||
nil,
|
||||
true,
|
||||
},
|
||||
{
|
||||
"-4",
|
||||
[]sequenceValue{{Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"stale",
|
||||
[]sequenceValue{{Value: decimal.StaleNaN}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"-4x1",
|
||||
[]sequenceValue{{Value: -4}, {Value: -4}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"_x1",
|
||||
[]sequenceValue{{Omitted: true}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x4",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Value: 3}, {Value: 4}, {Value: 5}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"2-1x4",
|
||||
[]sequenceValue{{Value: 2}, {Value: 1}, {Value: 0}, {Value: -1}, {Value: -2}},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"1+1x1 _ -4 stale 3+20x1",
|
||||
[]sequenceValue{{Value: 1}, {Value: 2}, {Omitted: true}, {Value: -4}, {Value: decimal.StaleNaN}, {Value: 3}, {Value: 23}},
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
output, err := parseInputValue(tc.input, true)
|
||||
if err != nil != tc.failed {
|
||||
t.Fatalf("failed to parse %s, expect %t, got %t", tc.input, tc.failed, err != nil)
|
||||
}
|
||||
if len(tc.exp) != len(output) {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
for i := 0; i < len(tc.exp); i++ {
|
||||
if tc.exp[i].Omitted != output[i].Omitted {
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
if tc.exp[i].Value != output[i].Value {
|
||||
if decimal.IsStaleNaN(tc.exp[i].Value) && decimal.IsStaleNaN(output[i].Value) {
|
||||
continue
|
||||
}
|
||||
t.Fatalf("expect %v, got %v", tc.exp, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
92
app/vmalert-tool/unittest/recording.go
Normal file
92
app/vmalert-tool/unittest/recording.go
Normal file
|
@ -0,0 +1,92 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metricsql"
|
||||
)
|
||||
|
||||
// metricsqlTestCase holds metricsql_expr_test cases defined in test file
|
||||
type metricsqlTestCase struct {
|
||||
Expr string `yaml:"expr"`
|
||||
EvalTime *promutils.Duration `yaml:"eval_time"`
|
||||
ExpSamples []expSample `yaml:"exp_samples"`
|
||||
}
|
||||
|
||||
type expSample struct {
|
||||
Labels string `yaml:"labels"`
|
||||
Value float64 `yaml:"value"`
|
||||
}
|
||||
|
||||
// checkMetricsqlCase will check metricsql_expr_test cases
|
||||
func checkMetricsqlCase(cases []metricsqlTestCase, q datasource.QuerierBuilder) (checkErrs []error) {
|
||||
queries := q.BuildWithParams(datasource.QuerierParams{QueryParams: url.Values{"nocache": {"1"}, "latency_offset": {"1ms"}}, DataSourceType: "prometheus"})
|
||||
Outer:
|
||||
for _, mt := range cases {
|
||||
result, _, err := queries.Query(context.Background(), mt.Expr, mt.EvalTime.ParseTime())
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf(" expr: %q, time: %s, err: %w", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), err))
|
||||
continue
|
||||
}
|
||||
var gotSamples []parsedSample
|
||||
for _, s := range result.Data {
|
||||
sort.Slice(s.Labels, func(i, j int) bool {
|
||||
return s.Labels[i].Name < s.Labels[j].Name
|
||||
})
|
||||
gotSamples = append(gotSamples, parsedSample{
|
||||
Labels: s.Labels,
|
||||
Value: s.Values[0],
|
||||
})
|
||||
}
|
||||
var expSamples []parsedSample
|
||||
for _, s := range mt.ExpSamples {
|
||||
expLb := datasource.Labels{}
|
||||
if s.Labels != "" {
|
||||
metricsqlExpr, err := metricsql.Parse(s.Labels)
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("failed to parse labels %q: %w", s.Labels, err)))
|
||||
continue Outer
|
||||
}
|
||||
metricsqlMetricExpr, ok := metricsqlExpr.(*metricsql.MetricExpr)
|
||||
if !ok {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s, err: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), fmt.Errorf("got unsupported metricsql type")))
|
||||
continue Outer
|
||||
}
|
||||
for _, l := range metricsqlMetricExpr.LabelFilterss[0] {
|
||||
expLb = append(expLb, datasource.Label{
|
||||
Name: l.Label,
|
||||
Value: l.Value,
|
||||
})
|
||||
}
|
||||
}
|
||||
sort.Slice(expLb, func(i, j int) bool {
|
||||
return expLb[i].Name < expLb[j].Name
|
||||
})
|
||||
expSamples = append(expSamples, parsedSample{
|
||||
Labels: expLb,
|
||||
Value: s.Value,
|
||||
})
|
||||
}
|
||||
sort.Slice(expSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(expSamples[i].Labels, expSamples[j].Labels) <= 0
|
||||
})
|
||||
sort.Slice(gotSamples, func(i, j int) bool {
|
||||
return datasource.LabelCompare(gotSamples[i].Labels, gotSamples[j].Labels) <= 0
|
||||
})
|
||||
if !reflect.DeepEqual(expSamples, gotSamples) {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n expr: %q, time: %s,\n exp: %v\n got: %v", mt.Expr,
|
||||
mt.EvalTime.Duration().String(), parsedSamplesString(expSamples), parsedSamplesString(gotSamples)))
|
||||
}
|
||||
|
||||
}
|
||||
return
|
||||
}
|
43
app/vmalert-tool/unittest/testdata/disable-group-label.yaml
vendored
Normal file
43
app/vmalert-tool/unittest/testdata/disable-group-label.yaml
vendored
Normal file
|
@ -0,0 +1,43 @@
|
|||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
49
app/vmalert-tool/unittest/testdata/failed-test.yaml
vendored
Normal file
49
app/vmalert-tool/unittest/testdata/failed-test.yaml
vendored
Normal file
|
@ -0,0 +1,49 @@
|
|||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "Failing test"
|
||||
input_series:
|
||||
- series: test
|
||||
values: "0"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
labels: test
|
||||
|
||||
# will failed cause there is no "Test" group and rule defined
|
||||
alert_rule_test:
|
||||
- eval_time: 0m
|
||||
groupname: Test
|
||||
alertname: Test
|
||||
exp_alerts:
|
||||
- exp_labels: {}
|
||||
|
||||
- interval: 1m
|
||||
name: Failing alert test
|
||||
input_series:
|
||||
- series: 'up{job="test"}'
|
||||
values: 0x10
|
||||
|
||||
alert_rule_test:
|
||||
# will failed cause rule is firing
|
||||
- eval_time: 5m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
- interval: 1m
|
||||
name: Failing alert test with missing groupname
|
||||
input_series:
|
||||
- series: 'up{job="test"}'
|
||||
values: 0x10
|
||||
|
||||
alert_rule_test:
|
||||
# will failed cause missing groupname
|
||||
- eval_time: 5m
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts: []
|
30
app/vmalert-tool/unittest/testdata/long-period.yaml
vendored
Normal file
30
app/vmalert-tool/unittest/testdata/long-period.yaml
vendored
Normal file
|
@ -0,0 +1,30 @@
|
|||
# can be executed successfully but will take more than 1 minute
|
||||
# not included in unit test now
|
||||
evaluation_interval: 100d
|
||||
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
tests:
|
||||
- interval: 1d
|
||||
input_series:
|
||||
- series: test
|
||||
# Max time in time.Duration is 106751d from 1970 (2^63/10^9), i.e. 2262.
|
||||
# But VictoriaMetrics supports maxTimestamp value +2 days from now. see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/827.
|
||||
# We input series to 2024-01-01T00:00:00 here.
|
||||
values: "0+1x19723"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: timestamp(test)
|
||||
eval_time: 0m
|
||||
exp_samples:
|
||||
- value: 0
|
||||
- expr: test
|
||||
eval_time: 100d
|
||||
exp_samples:
|
||||
- labels: test
|
||||
value: 100
|
||||
- expr: timestamp(test)
|
||||
eval_time: 19000d
|
||||
exp_samples:
|
||||
- value: 1641600000 # 19000d -> seconds.
|
39
app/vmalert-tool/unittest/testdata/rules.yaml
vendored
Normal file
39
app/vmalert-tool/unittest/testdata/rules.yaml
vendored
Normal file
|
@ -0,0 +1,39 @@
|
|||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 1m
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: t1
|
||||
expr: test
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
|
||||
- alert: SameAlertNameWithDifferentGroup
|
||||
expr: absent(test)
|
||||
for: 5m
|
||||
|
||||
- name: group3
|
||||
rules:
|
||||
- record: t2
|
||||
expr: t1
|
||||
|
||||
- name: group4
|
||||
rules:
|
||||
- record: t3
|
||||
expr: t1
|
99
app/vmalert-tool/unittest/testdata/test1.yaml
vendored
Normal file
99
app/vmalert-tool/unittest/testdata/test1.yaml
vendored
Normal file
|
@ -0,0 +1,99 @@
|
|||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
group_eval_order: ["group4", "group2", "group3"]
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
name: "basic test"
|
||||
input_series:
|
||||
- series: "test"
|
||||
values: "_x5 1x5 _ stale"
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 1m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts:
|
||||
- {}
|
||||
- eval_time: 1m
|
||||
groupname: group2
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
- eval_time: 6m
|
||||
groupname: group1
|
||||
alertname: SameAlertNameWithDifferentGroup
|
||||
exp_alerts: []
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: test
|
||||
eval_time: 11m
|
||||
exp_samples:
|
||||
- labels: '{__name__="test"}'
|
||||
value: 1
|
||||
- expr: test
|
||||
eval_time: 12m
|
||||
exp_samples: []
|
||||
|
||||
- interval: 1m
|
||||
name: "basic test2"
|
||||
input_series:
|
||||
- series: 'up{job="vmagent1", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
- series: "test"
|
||||
values: "0+1x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: count(ALERTS) by (alertgroup, alertname, alertstate)
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{alertgroup="group1", alertname="AlwaysFiring", alertstate="firing"}'
|
||||
value: 1
|
||||
- labels: '{alertgroup="group1", alertname="InstanceDown", alertstate="pending"}'
|
||||
value: 1
|
||||
- expr: t1
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t1", datacenter="dc-123"}'
|
||||
- expr: t2
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- value: 4
|
||||
labels: '{__name__="t2", datacenter="dc-123"}'
|
||||
- expr: t3
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
# t3 is 3 instead of 4 cause it's rules3 is evaluated before rules1
|
||||
- value: 3
|
||||
labels: '{__name__="t3", datacenter="dc-123"}'
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent1
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent1 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: alerts
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
46
app/vmalert-tool/unittest/testdata/test2.yaml
vendored
Normal file
46
app/vmalert-tool/unittest/testdata/test2.yaml
vendored
Normal file
|
@ -0,0 +1,46 @@
|
|||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="vmagent2", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test",datacenter="dc-123", instance="localhost:9090", job="vmagent2"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: vmagent2
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job vmagent2 has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
83
app/vmalert-tool/unittest/type.go
Normal file
83
app/vmalert-tool/unittest/type.go
Normal file
|
@ -0,0 +1,83 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
)
|
||||
|
||||
// parsedSample is a sample with parsed Labels
|
||||
type parsedSample struct {
|
||||
Labels datasource.Labels
|
||||
Value float64
|
||||
}
|
||||
|
||||
func (ps *parsedSample) String() string {
|
||||
return ps.Labels.String() + " " + strconv.FormatFloat(ps.Value, 'E', -1, 64)
|
||||
}
|
||||
|
||||
func parsedSamplesString(pss []parsedSample) string {
|
||||
if len(pss) == 0 {
|
||||
return "nil"
|
||||
}
|
||||
s := pss[0].String()
|
||||
for _, ps := range pss[1:] {
|
||||
s += ", " + ps.String()
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// labelAndAnnotation holds labels and annotations
|
||||
type labelAndAnnotation struct {
|
||||
Labels datasource.Labels
|
||||
Annotations datasource.Labels
|
||||
}
|
||||
|
||||
func (la *labelAndAnnotation) String() string {
|
||||
return "Labels:" + la.Labels.String() + "\nAnnotations:" + la.Annotations.String()
|
||||
}
|
||||
|
||||
// labelsAndAnnotations is collection of LabelAndAnnotation
|
||||
type labelsAndAnnotations []labelAndAnnotation
|
||||
|
||||
func (la labelsAndAnnotations) Len() int { return len(la) }
|
||||
|
||||
func (la labelsAndAnnotations) Swap(i, j int) { la[i], la[j] = la[j], la[i] }
|
||||
func (la labelsAndAnnotations) Less(i, j int) bool {
|
||||
diff := datasource.LabelCompare(la[i].Labels, la[j].Labels)
|
||||
if diff != 0 {
|
||||
return diff < 0
|
||||
}
|
||||
return datasource.LabelCompare(la[i].Annotations, la[j].Annotations) < 0
|
||||
}
|
||||
|
||||
func (la labelsAndAnnotations) String() string {
|
||||
if len(la) == 0 {
|
||||
return "[]"
|
||||
}
|
||||
s := "[\n0:" + indentLines("\n"+la[0].String(), " ")
|
||||
for i, l := range la[1:] {
|
||||
s += ",\n" + fmt.Sprintf("%d", i+1) + ":" + indentLines("\n"+l.String(), " ")
|
||||
}
|
||||
s += "\n]"
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// indentLines prefixes each line in the supplied string with the given "indent" string.
|
||||
func indentLines(lines, indent string) string {
|
||||
sb := strings.Builder{}
|
||||
n := strings.Split(lines, "\n")
|
||||
for i, l := range n {
|
||||
if i > 0 {
|
||||
sb.WriteString(indent)
|
||||
}
|
||||
sb.WriteString(l)
|
||||
if i != len(n)-1 {
|
||||
sb.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
return sb.String()
|
||||
}
|
443
app/vmalert-tool/unittest/unittest.go
Normal file
443
app/vmalert-tool/unittest/unittest.go
Normal file
|
@ -0,0 +1,443 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
vmalertconfig "github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
storagePath string
|
||||
httpListenAddr = ":8880"
|
||||
// insert series from 1970-01-01T00:00:00
|
||||
testStartTime = time.Unix(0, 0).UTC()
|
||||
|
||||
testPromWriteHTTPPath = "http://127.0.0.1" + httpListenAddr + "/api/v1/write"
|
||||
testDataSourcePath = "http://127.0.0.1" + httpListenAddr + "/prometheus"
|
||||
testRemoteWritePath = "http://127.0.0.1" + httpListenAddr
|
||||
testHealthHTTPPath = "http://127.0.0.1" + httpListenAddr + "/health"
|
||||
|
||||
disableAlertgroupLabel bool
|
||||
)
|
||||
|
||||
const (
|
||||
testStoragePath = "vmalert-unittest"
|
||||
testLogLevel = "ERROR"
|
||||
)
|
||||
|
||||
// UnitTest runs unittest for files
|
||||
func UnitTest(files []string, disableGroupLabel bool) bool {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
logger.Fatalf("failed to load template: %v", err)
|
||||
}
|
||||
storagePath = filepath.Join(os.TempDir(), testStoragePath)
|
||||
processFlags()
|
||||
vminsert.Init()
|
||||
vmselect.Init()
|
||||
// storagePath will be created again when closing vmselect, so remove it again.
|
||||
defer fs.MustRemoveAll(storagePath)
|
||||
defer vminsert.Stop()
|
||||
defer vmselect.Stop()
|
||||
disableAlertgroupLabel = disableGroupLabel
|
||||
return rulesUnitTest(files)
|
||||
}
|
||||
|
||||
func rulesUnitTest(files []string) bool {
|
||||
var failed bool
|
||||
for _, f := range files {
|
||||
if err := ruleUnitTest(f); err != nil {
|
||||
fmt.Println(" FAILED")
|
||||
fmt.Printf("\nfailed to run unit test for file %q: \n%v", f, err)
|
||||
failed = true
|
||||
} else {
|
||||
fmt.Println(" SUCCESS")
|
||||
}
|
||||
}
|
||||
return failed
|
||||
}
|
||||
|
||||
func ruleUnitTest(filename string) []error {
|
||||
fmt.Println("\nUnit Testing: ", filename)
|
||||
b, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to read file: %w", err)}
|
||||
}
|
||||
|
||||
var unitTestInp unitTestFile
|
||||
if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to unmarshal file: %w", err)}
|
||||
}
|
||||
if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil {
|
||||
return []error{fmt.Errorf("failed to resolve path for `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
if unitTestInp.EvaluationInterval.Duration() == 0 {
|
||||
fmt.Println("evaluation_interval set to 1m by default")
|
||||
unitTestInp.EvaluationInterval = &promutils.Duration{D: 1 * time.Minute}
|
||||
}
|
||||
|
||||
groupOrderMap := make(map[string]int)
|
||||
for i, gn := range unitTestInp.GroupEvalOrder {
|
||||
if _, ok := groupOrderMap[gn]; ok {
|
||||
return []error{fmt.Errorf("group name repeated in `group_eval_order`: %s", gn)}
|
||||
}
|
||||
groupOrderMap[gn] = i
|
||||
}
|
||||
|
||||
testGroups, err := vmalertconfig.Parse(unitTestInp.RuleFiles, nil, true)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to parse `rule_files`: %w", err)}
|
||||
}
|
||||
|
||||
var errs []error
|
||||
for _, t := range unitTestInp.Tests {
|
||||
if err := verifyTestGroup(t); err != nil {
|
||||
errs = append(errs, err)
|
||||
continue
|
||||
}
|
||||
testErrs := t.test(unitTestInp.EvaluationInterval.Duration(), groupOrderMap, testGroups)
|
||||
errs = append(errs, testErrs...)
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyTestGroup(group testGroup) error {
|
||||
var testGroupName string
|
||||
if group.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s\n", group.TestGroupName)
|
||||
}
|
||||
for _, at := range group.AlertRuleTests {
|
||||
if at.Alertname == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"alertname\"", testGroupName)
|
||||
}
|
||||
if !disableAlertgroupLabel && at.GroupName == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"groupname\" when flag \"disableAlertGroupLabel\" is false", testGroupName)
|
||||
}
|
||||
if disableAlertgroupLabel && at.GroupName != "" {
|
||||
return fmt.Errorf("\n%s shouldn't set filed \"groupname\" when flag \"disableAlertGroupLabel\" is true", testGroupName)
|
||||
}
|
||||
if at.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
for _, et := range group.MetricsqlExprTests {
|
||||
if et.Expr == "" {
|
||||
return fmt.Errorf("\n%s missing required filed \"expr\"", testGroupName)
|
||||
}
|
||||
if et.EvalTime == nil {
|
||||
return fmt.Errorf("\n%s missing required filed \"eval_time\"", testGroupName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func processFlags() {
|
||||
flag.Parse()
|
||||
for _, fv := range []struct {
|
||||
flag string
|
||||
value string
|
||||
}{
|
||||
{flag: "storageDataPath", value: storagePath},
|
||||
{flag: "loggerLevel", value: testLogLevel},
|
||||
{flag: "search.disableCache", value: "true"},
|
||||
// set storage retention time to 100 years, allow to store series from 1970-01-01T00:00:00.
|
||||
{flag: "retentionPeriod", value: "100y"},
|
||||
{flag: "datasource.url", value: testDataSourcePath},
|
||||
{flag: "remoteWrite.url", value: testRemoteWritePath},
|
||||
} {
|
||||
// panics if flag doesn't exist
|
||||
if err := flag.Lookup(fv.flag).Value.Set(fv.value); err != nil {
|
||||
logger.Fatalf("unable to set %q with value %q, err: %v", fv.flag, fv.value, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func setUp() {
|
||||
vmstorage.Init(promql.ResetRollupResultCacheIfNeeded)
|
||||
go httpserver.Serve(httpListenAddr, false, func(w http.ResponseWriter, r *http.Request) bool {
|
||||
switch r.URL.Path {
|
||||
case "/prometheus/api/v1/query":
|
||||
if err := prometheus.QueryHandler(nil, time.Now(), w, r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
case "/prometheus/api/v1/write", "/api/v1/write":
|
||||
if err := promremotewrite.InsertHandler(r); err != nil {
|
||||
httpserver.Errorf(w, r, "%s", err)
|
||||
}
|
||||
return true
|
||||
default:
|
||||
}
|
||||
return false
|
||||
})
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
readyCheckFunc := func() bool {
|
||||
resp, err := http.Get(testHealthHTTPPath)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
return resp.StatusCode == 200
|
||||
}
|
||||
checkCheck:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Fatalf("http server can't be ready in 30s")
|
||||
default:
|
||||
if readyCheckFunc() {
|
||||
break checkCheck
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tearDown() {
|
||||
if err := httpserver.Stop(httpListenAddr); err != nil {
|
||||
logger.Errorf("cannot stop the webservice: %s", err)
|
||||
}
|
||||
vmstorage.Stop()
|
||||
metrics.UnregisterAllMetrics()
|
||||
fs.MustRemoveAll(storagePath)
|
||||
}
|
||||
|
||||
// resolveAndGlobFilepaths joins all relative paths in a configuration
|
||||
// with a given base directory and replaces all globs with matching files.
|
||||
func resolveAndGlobFilepaths(baseDir string, utf *unitTestFile) error {
|
||||
for i, rf := range utf.RuleFiles {
|
||||
if rf != "" && !filepath.IsAbs(rf) {
|
||||
utf.RuleFiles[i] = filepath.Join(baseDir, rf)
|
||||
}
|
||||
}
|
||||
|
||||
var globbedFiles []string
|
||||
for _, rf := range utf.RuleFiles {
|
||||
m, err := filepath.Glob(rf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(m) == 0 {
|
||||
fmt.Fprintln(os.Stderr, " WARNING: no file match pattern", rf)
|
||||
}
|
||||
globbedFiles = append(globbedFiles, m...)
|
||||
}
|
||||
utf.RuleFiles = globbedFiles
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]int, testGroups []vmalertconfig.Group) (checkErrs []error) {
|
||||
// set up vmstorage and http server for ingest and read queries
|
||||
setUp()
|
||||
// tear down vmstorage and clean the data dir
|
||||
defer tearDown()
|
||||
|
||||
err := writeInputSeries(tg.InputSeries, tg.Interval, testStartTime, testPromWriteHTTPPath)
|
||||
if err != nil {
|
||||
return []error{err}
|
||||
}
|
||||
|
||||
q, err := datasource.Init(nil)
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init datasource: %v", err)}
|
||||
}
|
||||
rw, err := remotewrite.NewDebugClient()
|
||||
if err != nil {
|
||||
return []error{fmt.Errorf("failed to init wr: %v", err)}
|
||||
}
|
||||
|
||||
alertEvalTimesMap := map[time.Duration]struct{}{}
|
||||
alertExpResultMap := map[time.Duration]map[string]map[string][]expAlert{}
|
||||
for _, at := range tg.AlertRuleTests {
|
||||
et := at.EvalTime.Duration()
|
||||
alertEvalTimesMap[et] = struct{}{}
|
||||
if _, ok := alertExpResultMap[et]; !ok {
|
||||
alertExpResultMap[et] = make(map[string]map[string][]expAlert)
|
||||
}
|
||||
if _, ok := alertExpResultMap[et][at.GroupName]; !ok {
|
||||
alertExpResultMap[et][at.GroupName] = make(map[string][]expAlert)
|
||||
}
|
||||
alertExpResultMap[et][at.GroupName][at.Alertname] = at.ExpAlerts
|
||||
}
|
||||
alertEvalTimes := make([]time.Duration, 0, len(alertEvalTimesMap))
|
||||
for k := range alertEvalTimesMap {
|
||||
alertEvalTimes = append(alertEvalTimes, k)
|
||||
}
|
||||
sort.Slice(alertEvalTimes, func(i, j int) bool {
|
||||
return alertEvalTimes[i] < alertEvalTimes[j]
|
||||
})
|
||||
|
||||
// sort group eval order according to the given "group_eval_order".
|
||||
sort.Slice(testGroups, func(i, j int) bool {
|
||||
return groupOrderMap[testGroups[i].Name] < groupOrderMap[testGroups[j].Name]
|
||||
})
|
||||
|
||||
// create groups with given rule
|
||||
var groups []*rule.Group
|
||||
for _, group := range testGroups {
|
||||
ng := rule.NewGroup(group, q, time.Minute, tg.ExternalLabels)
|
||||
groups = append(groups, ng)
|
||||
}
|
||||
|
||||
evalIndex := 0
|
||||
maxEvalTime := testStartTime.Add(tg.maxEvalTime())
|
||||
for ts := testStartTime; ts.Before(maxEvalTime) || ts.Equal(maxEvalTime); ts = ts.Add(evalInterval) {
|
||||
for _, g := range groups {
|
||||
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
|
||||
ts, err))
|
||||
}
|
||||
}
|
||||
// flush series after each group evaluation
|
||||
vmstorage.Storage.DebugFlush()
|
||||
}
|
||||
|
||||
// check alert_rule_test case at every eval time
|
||||
for evalIndex < len(alertEvalTimes) {
|
||||
if ts.Sub(testStartTime) > alertEvalTimes[evalIndex] ||
|
||||
alertEvalTimes[evalIndex] >= ts.Add(evalInterval).Sub(testStartTime) {
|
||||
break
|
||||
}
|
||||
gotAlertsMap := map[string]map[string]labelsAndAnnotations{}
|
||||
for _, g := range groups {
|
||||
if disableAlertgroupLabel {
|
||||
g.Name = ""
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, ok := gotAlertsMap[g.Name]; !ok {
|
||||
gotAlertsMap[g.Name] = make(map[string]labelsAndAnnotations)
|
||||
}
|
||||
for _, r := range g.Rules {
|
||||
ar, isAlertRule := r.(*rule.AlertingRule)
|
||||
if !isAlertRule {
|
||||
continue
|
||||
}
|
||||
if _, ok := alertExpResultMap[time.Duration(ts.UnixNano())][g.Name][ar.Name]; ok {
|
||||
for _, got := range ar.GetAlerts() {
|
||||
if got.State != notifier.StateFiring {
|
||||
continue
|
||||
}
|
||||
if disableAlertgroupLabel {
|
||||
delete(got.Labels, "alertgroup")
|
||||
}
|
||||
laa := labelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(got.Labels),
|
||||
Annotations: datasource.ConvertToLabels(got.Annotations),
|
||||
}
|
||||
gotAlertsMap[g.Name][ar.Name] = append(gotAlertsMap[g.Name][ar.Name], laa)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
for groupname, gres := range alertExpResultMap[alertEvalTimes[evalIndex]] {
|
||||
for alertname, res := range gres {
|
||||
var expAlerts labelsAndAnnotations
|
||||
for _, expAlert := range res {
|
||||
if expAlert.ExpLabels == nil {
|
||||
expAlert.ExpLabels = make(map[string]string)
|
||||
}
|
||||
// alertGroupNameLabel is added as additional labels when `disableAlertGroupLabel` is false
|
||||
if !disableAlertgroupLabel {
|
||||
expAlert.ExpLabels["alertgroup"] = groupname
|
||||
}
|
||||
// alertNameLabel is added as additional labels in vmalert.
|
||||
expAlert.ExpLabels["alertname"] = alertname
|
||||
expAlerts = append(expAlerts, labelAndAnnotation{
|
||||
Labels: datasource.ConvertToLabels(expAlert.ExpLabels),
|
||||
Annotations: datasource.ConvertToLabels(expAlert.ExpAnnotations),
|
||||
})
|
||||
}
|
||||
sort.Sort(expAlerts)
|
||||
|
||||
gotAlerts := gotAlertsMap[groupname][alertname]
|
||||
sort.Sort(gotAlerts)
|
||||
if !reflect.DeepEqual(expAlerts, gotAlerts) {
|
||||
var testGroupName string
|
||||
if tg.TestGroupName != "" {
|
||||
testGroupName = fmt.Sprintf("testGroupName: %s,\n", tg.TestGroupName)
|
||||
}
|
||||
expString := indentLines(expAlerts.String(), " ")
|
||||
gotString := indentLines(gotAlerts.String(), " ")
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\n%s groupname: %s, alertname: %s, time: %s, \n exp:%v, \n got:%v ",
|
||||
testGroupName, groupname, alertname, alertEvalTimes[evalIndex].String(), expString, gotString))
|
||||
}
|
||||
}
|
||||
}
|
||||
evalIndex++
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
checkErrs = append(checkErrs, checkMetricsqlCase(tg.MetricsqlExprTests, q)...)
|
||||
return checkErrs
|
||||
}
|
||||
|
||||
// unitTestFile holds the contents of a single unit test file
|
||||
type unitTestFile struct {
|
||||
RuleFiles []string `yaml:"rule_files"`
|
||||
EvaluationInterval *promutils.Duration `yaml:"evaluation_interval"`
|
||||
GroupEvalOrder []string `yaml:"group_eval_order"`
|
||||
Tests []testGroup `yaml:"tests"`
|
||||
}
|
||||
|
||||
// testGroup is a group of input series and test cases associated with it
|
||||
type testGroup struct {
|
||||
Interval *promutils.Duration `yaml:"interval"`
|
||||
InputSeries []series `yaml:"input_series"`
|
||||
AlertRuleTests []alertTestCase `yaml:"alert_rule_test"`
|
||||
MetricsqlExprTests []metricsqlTestCase `yaml:"metricsql_expr_test"`
|
||||
ExternalLabels map[string]string `yaml:"external_labels"`
|
||||
TestGroupName string `yaml:"name"`
|
||||
}
|
||||
|
||||
// maxEvalTime returns the max eval time among all alert_rule_test and metricsql_expr_test
|
||||
func (tg *testGroup) maxEvalTime() time.Duration {
|
||||
var maxd time.Duration
|
||||
for _, alert := range tg.AlertRuleTests {
|
||||
if alert.EvalTime.Duration() > maxd {
|
||||
maxd = alert.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
for _, met := range tg.MetricsqlExprTests {
|
||||
if met.EvalTime.Duration() > maxd {
|
||||
maxd = met.EvalTime.Duration()
|
||||
}
|
||||
}
|
||||
return maxd
|
||||
}
|
47
app/vmalert-tool/unittest/unittest_test.go
Normal file
47
app/vmalert-tool/unittest/unittest_test.go
Normal file
|
@ -0,0 +1,47 @@
|
|||
package unittest
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestUnitRule(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
disableGroupLabel bool
|
||||
files []string
|
||||
failed bool
|
||||
}{
|
||||
{
|
||||
name: "run multi files",
|
||||
files: []string{"./testdata/test1.yaml", "./testdata/test2.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "disable group label",
|
||||
disableGroupLabel: true,
|
||||
files: []string{"./testdata/disable-group-label.yaml"},
|
||||
failed: false,
|
||||
},
|
||||
{
|
||||
name: "failing test",
|
||||
files: []string{"./testdata/failed-test.yaml"},
|
||||
failed: true,
|
||||
},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
fail := UnitTest(tc.files, tc.disableGroupLabel)
|
||||
if fail != tc.failed {
|
||||
t.Fatalf("failed to test %s, expect %t, got %t", tc.name, tc.failed, fail)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -754,6 +754,11 @@ See full description for these flags in `./vmalert -help`.
|
|||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
|
||||
|
||||
## Unit Testing for Rules
|
||||
|
||||
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
|
||||
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
|
|
131
app/vmalert/datasource/faker.go
Normal file
131
app/vmalert/datasource/faker.go
Normal file
|
@ -0,0 +1,131 @@
|
|||
package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FakeQuerier is a mock querier that return predefined results and error message
|
||||
type FakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []Metric
|
||||
err error
|
||||
}
|
||||
|
||||
// SetErr sets query error message
|
||||
func (fq *FakeQuerier) SetErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// Reset reset querier's error message and results
|
||||
func (fq *FakeQuerier) Reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// Add appends metrics to querier result metrics
|
||||
func (fq *FakeQuerier) Add(metrics ...Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
// BuildWithParams return FakeQuerier itself
|
||||
func (fq *FakeQuerier) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fq
|
||||
}
|
||||
|
||||
// QueryRange performs query
|
||||
func (fq *FakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
|
||||
req, _, err := fq.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
// Query returns metrics restored in querier
|
||||
func (fq *FakeQuerier) Query(_ context.Context, _ string, _ time.Time) (Result, *http.Request, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return Result{}, nil, fq.err
|
||||
}
|
||||
cp := make([]Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
return Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
// FakeQuerierWithRegistry can store different results for different query expr
|
||||
type FakeQuerierWithRegistry struct {
|
||||
sync.Mutex
|
||||
registry map[string][]Metric
|
||||
}
|
||||
|
||||
// Set stores query result for given key
|
||||
func (fqr *FakeQuerierWithRegistry) Set(key string, metrics ...Metric) {
|
||||
fqr.Lock()
|
||||
if fqr.registry == nil {
|
||||
fqr.registry = make(map[string][]Metric)
|
||||
}
|
||||
fqr.registry[key] = metrics
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
// Reset clean querier's results registry
|
||||
func (fqr *FakeQuerierWithRegistry) Reset() {
|
||||
fqr.Lock()
|
||||
fqr.registry = nil
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
// BuildWithParams returns itself
|
||||
func (fqr *FakeQuerierWithRegistry) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fqr
|
||||
}
|
||||
|
||||
// QueryRange performs query
|
||||
func (fqr *FakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (Result, error) {
|
||||
req, _, err := fqr.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
// Query returns metrics restored in querier registry
|
||||
func (fqr *FakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (Result, *http.Request, error) {
|
||||
fqr.Lock()
|
||||
defer fqr.Unlock()
|
||||
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
metrics, ok := fqr.registry[expr]
|
||||
if !ok {
|
||||
return Result{}, req, nil
|
||||
}
|
||||
cp := make([]Metric, len(metrics))
|
||||
copy(cp, metrics)
|
||||
return Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
// FakeQuerierWithDelay mock querier with given delay duration
|
||||
type FakeQuerierWithDelay struct {
|
||||
FakeQuerier
|
||||
Delay time.Duration
|
||||
}
|
||||
|
||||
// Query returns query result after delay duration
|
||||
func (fqd *FakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (Result, *http.Request, error) {
|
||||
timer := time.NewTimer(fqd.Delay)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-timer.C:
|
||||
}
|
||||
return fqd.FakeQuerier.Query(ctx, expr, ts)
|
||||
}
|
||||
|
||||
// BuildWithParams returns itself
|
||||
func (fqd *FakeQuerierWithDelay) BuildWithParams(_ QuerierParams) Querier {
|
||||
return fqd
|
||||
}
|
|
@ -18,6 +18,7 @@ import (
|
|||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
||||
|
@ -66,11 +67,6 @@ absolute path to all .tpl files in root.
|
|||
|
||||
validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
|
||||
validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")
|
||||
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+
|
||||
"which by default is 4 times evaluationInterval of the parent group.")
|
||||
resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier")
|
||||
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
|
||||
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
|
||||
|
||||
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier. By default, hostname is used as address.")
|
||||
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager `+
|
||||
|
@ -82,12 +78,8 @@ absolute path to all .tpl files in root.
|
|||
externalLabels = flagutil.NewArrayString("external.label", "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. "+
|
||||
"Pass multiple -label flags in order to add multiple label sets.")
|
||||
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup. DEPRECATED - this flag has no effect and will be removed in the next releases.")
|
||||
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified.")
|
||||
)
|
||||
|
||||
|
@ -229,7 +221,7 @@ func newManager(ctx context.Context) (*manager, error) {
|
|||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: q,
|
||||
notifiers: nts,
|
||||
labels: labels,
|
||||
|
|
|
@ -8,11 +8,19 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
)
|
||||
|
||||
func init() {
|
||||
// Disable rand sleep on group start during tests in order to speed up test execution.
|
||||
// Rand sleep is needed only in prod code.
|
||||
rule.SkipRandSleepOnGroupStart = true
|
||||
}
|
||||
|
||||
func TestGetExternalURL(t *testing.T) {
|
||||
expURL := "https://vicotriametrics.com/path"
|
||||
u, err := getExternalURL(expURL, "", false)
|
||||
|
@ -98,10 +106,10 @@ groups:
|
|||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
m := &manager{
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
labels: map[string]string{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
rw: &remotewrite.Client{},
|
||||
}
|
||||
|
||||
|
|
|
@ -3,14 +3,13 @@ package main
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
)
|
||||
|
||||
|
@ -19,7 +18,7 @@ type manager struct {
|
|||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers func() []notifier.Notifier
|
||||
|
||||
rw *remotewrite.Client
|
||||
rw remotewrite.RWClient
|
||||
// remote read builder.
|
||||
rr datasource.QuerierBuilder
|
||||
|
||||
|
@ -27,28 +26,28 @@ type manager struct {
|
|||
labels map[string]string
|
||||
|
||||
groupsMu sync.RWMutex
|
||||
groups map[uint64]*Group
|
||||
groups map[uint64]*rule.Group
|
||||
}
|
||||
|
||||
// RuleAPI generates APIRule object from alert by its ID(hash)
|
||||
func (m *manager) RuleAPI(gID, rID uint64) (APIRule, error) {
|
||||
// ruleAPI generates apiRule object from alert by its ID(hash)
|
||||
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
g, ok := m.groups[gID]
|
||||
if !ok {
|
||||
return APIRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
if rule.ID() == rID {
|
||||
return rule.ToAPI(), nil
|
||||
return ruleToAPI(rule), nil
|
||||
}
|
||||
}
|
||||
return APIRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
||||
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
||||
// alertAPI generates apiAlert object from alert by its ID(hash)
|
||||
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
|
||||
m.groupsMu.RLock()
|
||||
defer m.groupsMu.RUnlock()
|
||||
|
||||
|
@ -56,12 +55,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
|||
if !ok {
|
||||
return nil, fmt.Errorf("can't find group with id %d", gID)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
for _, r := range g.Rules {
|
||||
ar, ok := r.(*rule.AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
||||
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
|
||||
return apiAlert, nil
|
||||
}
|
||||
}
|
||||
|
@ -82,15 +81,15 @@ func (m *manager) close() {
|
|||
m.wg.Wait()
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error {
|
||||
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
|
||||
m.wg.Add(1)
|
||||
id := g.ID()
|
||||
go func() {
|
||||
defer m.wg.Done()
|
||||
if restore {
|
||||
g.start(ctx, m.notifiers, m.rw, m.rr)
|
||||
g.Start(ctx, m.notifiers, m.rw, m.rr)
|
||||
} else {
|
||||
g.start(ctx, m.notifiers, m.rw, nil)
|
||||
g.Start(ctx, m.notifiers, m.rw, nil)
|
||||
}
|
||||
}()
|
||||
m.groups[id] = g
|
||||
|
@ -99,7 +98,7 @@ func (m *manager) startGroup(ctx context.Context, g *Group, restore bool) error
|
|||
|
||||
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
||||
var rrPresent, arPresent bool
|
||||
groupsRegistry := make(map[uint64]*Group)
|
||||
groupsRegistry := make(map[uint64]*rule.Group)
|
||||
for _, cfg := range groupsCfg {
|
||||
for _, r := range cfg.Rules {
|
||||
if rrPresent && arPresent {
|
||||
|
@ -112,7 +111,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
|||
arPresent = true
|
||||
}
|
||||
}
|
||||
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
||||
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
||||
groupsRegistry[ng.ID()] = ng
|
||||
}
|
||||
|
||||
|
@ -124,8 +123,8 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
|||
}
|
||||
|
||||
type updateItem struct {
|
||||
old *Group
|
||||
new *Group
|
||||
old *rule.Group
|
||||
new *rule.Group
|
||||
}
|
||||
var toUpdate []updateItem
|
||||
|
||||
|
@ -135,7 +134,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
|||
if !ok {
|
||||
// old group is not present in new list,
|
||||
// so must be stopped and deleted
|
||||
og.close()
|
||||
og.Close()
|
||||
delete(m.groups, og.ID())
|
||||
og = nil
|
||||
continue
|
||||
|
@ -157,81 +156,13 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
|||
var wg sync.WaitGroup
|
||||
for _, item := range toUpdate {
|
||||
wg.Add(1)
|
||||
go func(old *Group, new *Group) {
|
||||
old.updateCh <- new
|
||||
go func(old *rule.Group, new *rule.Group) {
|
||||
old.UpdateWith(new)
|
||||
wg.Done()
|
||||
}(item.old, item.new)
|
||||
item.old.interruptEval()
|
||||
item.old.InterruptEval()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) toAPI() APIGroup {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
|
||||
ag := APIGroup{
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.Seconds(),
|
||||
LastEvaluation: g.LastEvaluation,
|
||||
Concurrency: g.Concurrency,
|
||||
Params: urlValuesToStrings(g.Params),
|
||||
Headers: headersToStrings(g.Headers),
|
||||
NotifierHeaders: headersToStrings(g.NotifierHeaders),
|
||||
|
||||
Labels: g.Labels,
|
||||
}
|
||||
ag.Rules = make([]APIRule, 0)
|
||||
for _, r := range g.Rules {
|
||||
ag.Rules = append(ag.Rules, r.ToAPI())
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
func urlValuesToStrings(values url.Values) []string {
|
||||
if len(values) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(values))
|
||||
for k := range values {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
params := values[k]
|
||||
for _, v := range params {
|
||||
res = append(res, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func headersToStrings(headers map[string]string) []string {
|
||||
if len(headers) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(headers))
|
||||
for k := range headers {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
v := headers[k]
|
||||
res = append(res, fmt.Sprintf("%s: %s", k, v))
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
|
|
@ -10,8 +10,10 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
)
|
||||
|
||||
|
@ -26,7 +28,7 @@ func TestMain(m *testing.M) {
|
|||
// successful cases of
|
||||
// starting with empty rules folder
|
||||
func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m := &manager{groups: make(map[uint64]*rule.Group)}
|
||||
cfg := loadCfg(t, []string{"foo/bar"}, true, true)
|
||||
if err := m.update(context.Background(), cfg, false); err != nil {
|
||||
t.Fatalf("expected to load successfully with empty rules dir; got err instead: %v", err)
|
||||
|
@ -38,9 +40,9 @@ func TestManagerEmptyRulesDir(t *testing.T) {
|
|||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
|
@ -91,7 +93,7 @@ func TestManagerUpdate(t *testing.T) {
|
|||
}()
|
||||
|
||||
var (
|
||||
VMRows = &AlertingRule{
|
||||
VMRows = &rule.AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 10 * time.Second,
|
||||
|
@ -104,7 +106,7 @@ func TestManagerUpdate(t *testing.T) {
|
|||
"description": "{{$labels}}",
|
||||
},
|
||||
}
|
||||
Conns = &AlertingRule{
|
||||
Conns = &rule.AlertingRule{
|
||||
Name: "Conns",
|
||||
Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
|
||||
Annotations: map[string]string{
|
||||
|
@ -112,7 +114,7 @@ func TestManagerUpdate(t *testing.T) {
|
|||
"description": "It is {{ $value }} connections for {{$labels.instance}}",
|
||||
},
|
||||
}
|
||||
ExampleAlertAlwaysFiring = &AlertingRule{
|
||||
ExampleAlertAlwaysFiring = &rule.AlertingRule{
|
||||
Name: "ExampleAlertAlwaysFiring",
|
||||
Expr: "sum by(job) (up == 1)",
|
||||
}
|
||||
|
@ -122,20 +124,20 @@ func TestManagerUpdate(t *testing.T) {
|
|||
name string
|
||||
initPath string
|
||||
updatePath string
|
||||
want []*Group
|
||||
want []*rule.Group
|
||||
}{
|
||||
{
|
||||
name: "update good rules",
|
||||
initPath: "config/testdata/rules/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules1-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/dir/rules1-good.rules",
|
||||
Name: "duplicatedGroupDiffFiles",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{
|
||||
&AlertingRule{
|
||||
Rules: []rule.Rule{
|
||||
&rule.AlertingRule{
|
||||
Name: "VMRows",
|
||||
Expr: "vm_rows > 0",
|
||||
For: 5 * time.Minute,
|
||||
|
@ -153,19 +155,20 @@ func TestManagerUpdate(t *testing.T) {
|
|||
name: "update good rules from 1 to 2 groups",
|
||||
initPath: "config/testdata/dir/rules/rules1-good.rules",
|
||||
updatePath: "config/testdata/rules/rules0-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Rules: []Rule{VMRows},
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Type: config.NewPrometheusType(),
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Name: "TestGroup",
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
},
|
||||
|
@ -176,20 +179,20 @@ func TestManagerUpdate(t *testing.T) {
|
|||
name: "update with one bad rule file",
|
||||
initPath: "config/testdata/rules/rules0-good.rules",
|
||||
updatePath: "config/testdata/dir/rules2-bad.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Name: "TestGroup",
|
||||
Type: config.NewPrometheusType(),
|
||||
Rules: []Rule{
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
},
|
||||
|
@ -200,19 +203,20 @@ func TestManagerUpdate(t *testing.T) {
|
|||
name: "update empty dir rules from 0 to 2 groups",
|
||||
initPath: "config/testdata/empty/*",
|
||||
updatePath: "config/testdata/rules/rules0-good.rules",
|
||||
want: []*Group{
|
||||
want: []*rule.Group{
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Name: "groupGorSingleAlert",
|
||||
Type: config.NewPrometheusType(),
|
||||
Interval: defaultEvalInterval,
|
||||
Rules: []Rule{VMRows},
|
||||
Rules: []rule.Rule{VMRows},
|
||||
},
|
||||
{
|
||||
File: "config/testdata/rules/rules0-good.rules",
|
||||
Interval: defaultEvalInterval,
|
||||
Type: config.NewPrometheusType(),
|
||||
Name: "TestGroup", Rules: []Rule{
|
||||
Name: "TestGroup",
|
||||
Rules: []rule.Rule{
|
||||
Conns,
|
||||
ExampleAlertAlwaysFiring,
|
||||
},
|
||||
|
@ -224,9 +228,9 @@ func TestManagerUpdate(t *testing.T) {
|
|||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.TODO())
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} },
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} },
|
||||
}
|
||||
|
||||
cfgInit := loadCfg(t, []string{tc.initPath}, true, true)
|
||||
|
@ -255,11 +259,36 @@ func TestManagerUpdate(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
func compareGroups(t *testing.T, a, b *rule.Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := rule.CompareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparison error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestManagerUpdateNegative(t *testing.T) {
|
||||
testCases := []struct {
|
||||
notifiers []notifier.Notifier
|
||||
rw *remotewrite.Client
|
||||
rw remotewrite.RWClient
|
||||
cfg config.Group
|
||||
expErr string
|
||||
}{
|
||||
|
@ -286,7 +315,7 @@ func TestManagerUpdateNegative(t *testing.T) {
|
|||
"contains alerting rules",
|
||||
},
|
||||
{
|
||||
[]notifier.Notifier{&fakeNotifier{}},
|
||||
[]notifier.Notifier{¬ifier.FakeNotifier{}},
|
||||
nil,
|
||||
config.Group{
|
||||
Name: "Recording and alerting rules",
|
||||
|
@ -316,8 +345,8 @@ func TestManagerUpdateNegative(t *testing.T) {
|
|||
for _, tc := range testCases {
|
||||
t.Run(tc.cfg.Name, func(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*Group),
|
||||
querierBuilder: &fakeQuerier{},
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
rw: tc.rw,
|
||||
}
|
||||
if tc.notifiers != nil {
|
||||
|
@ -346,21 +375,3 @@ func loadCfg(t *testing.T, path []string, validateAnnotations, validateExpressio
|
|||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func TestUrlValuesToStrings(t *testing.T) {
|
||||
mapQueryParams := map[string][]string{
|
||||
"param1": {"param1"},
|
||||
"param2": {"anotherparam"},
|
||||
}
|
||||
expectedRes := []string{"param1=param1", "param2=anotherparam"}
|
||||
res := urlValuesToStrings(mapQueryParams)
|
||||
|
||||
if len(res) != len(expectedRes) {
|
||||
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
|
||||
}
|
||||
for ind, val := range expectedRes {
|
||||
if val != res[ind] {
|
||||
t.Errorf("Expected %v; but got %v", val, res[ind])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
59
app/vmalert/notifier/faker.go
Normal file
59
app/vmalert/notifier/faker.go
Normal file
|
@ -0,0 +1,59 @@
|
|||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FakeNotifier is a mock notifier
|
||||
type FakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []Alert
|
||||
// records number of received alerts in total
|
||||
counter int
|
||||
}
|
||||
|
||||
// Close does nothing
|
||||
func (*FakeNotifier) Close() {}
|
||||
|
||||
// Addr returns ""
|
||||
func (*FakeNotifier) Addr() string { return "" }
|
||||
|
||||
// Send sets alerts and increases counter
|
||||
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.counter += len(alerts)
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetCounter returns received alerts count
|
||||
func (fn *FakeNotifier) GetCounter() int {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.counter
|
||||
}
|
||||
|
||||
// GetAlerts returns stored alerts
|
||||
func (fn *FakeNotifier) GetAlerts() []Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
// FaultyNotifier is a mock notifier that Send() will return failed response
|
||||
type FaultyNotifier struct {
|
||||
FakeNotifier
|
||||
}
|
||||
|
||||
// Send returns failed response
|
||||
func (fn *FaultyNotifier) Send(ctx context.Context, _ []Alert, _ map[string]string) error {
|
||||
d, ok := ctx.Deadline()
|
||||
if ok {
|
||||
time.Sleep(time.Until(d))
|
||||
}
|
||||
return fmt.Errorf("send failed")
|
||||
}
|
322
app/vmalert/remotewrite/client.go
Normal file
322
app/vmalert/remotewrite/client.go
Normal file
|
@ -0,0 +1,322 @@
|
|||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
var (
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
|
||||
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
|
||||
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write client.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer func() {
|
||||
sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
}()
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with a HTTP 2xx status code when the write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
type nonRetriableError struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func (e *nonRetriableError) Error() string {
|
||||
return e.err.Error()
|
||||
}
|
97
app/vmalert/remotewrite/debug_client.go
Normal file
97
app/vmalert/remotewrite/debug_client.go
Normal file
|
@ -0,0 +1,97 @@
|
|||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// DebugClient won't push series periodically, but will write data to remote endpoint
|
||||
// immediately when Push() is called
|
||||
type DebugClient struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewDebugClient initiates and returns a new DebugClient
|
||||
func NewDebugClient() (*DebugClient, error) {
|
||||
if *addr == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
t, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create transport: %w", err)
|
||||
}
|
||||
c := &DebugClient{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: t,
|
||||
},
|
||||
addr: strings.TrimSuffix(*addr, "/"),
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push sends the given timeseries to the remote storage.
|
||||
func (c *DebugClient) Push(s prompbmarshal.TimeSeries) error {
|
||||
c.wg.Add(1)
|
||||
defer c.wg.Done()
|
||||
wr := &prompbmarshal.WriteRequest{Timeseries: []prompbmarshal.TimeSeries{s}}
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal the given time series: %w", err)
|
||||
}
|
||||
|
||||
return c.send(data)
|
||||
}
|
||||
|
||||
// Close stops the DebugClient
|
||||
func (c *DebugClient) Close() error {
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *DebugClient) send(data []byte) error {
|
||||
b := snappy.Encode(nil, data)
|
||||
r := bytes.NewReader(b)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode/100 == 2 {
|
||||
return nil
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
50
app/vmalert/remotewrite/debug_client_test.go
Normal file
50
app/vmalert/remotewrite/debug_client_test.go
Normal file
|
@ -0,0 +1,50 @@
|
|||
package remotewrite
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
func TestDebugClient_Push(t *testing.T) {
|
||||
testSrv := newRWServer()
|
||||
oldAddr := *addr
|
||||
*addr = testSrv.URL
|
||||
defer func() {
|
||||
*addr = oldAddr
|
||||
}()
|
||||
|
||||
client, err := NewDebugClient()
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create debug client: %s", err)
|
||||
}
|
||||
|
||||
const rowsN = 100
|
||||
var sent int
|
||||
for i := 0; i < rowsN; i++ {
|
||||
s := prompbmarshal.TimeSeries{
|
||||
Samples: []prompbmarshal.Sample{{
|
||||
Value: float64(i),
|
||||
Timestamp: time.Now().Unix(),
|
||||
}},
|
||||
}
|
||||
err := client.Push(s)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
if err == nil {
|
||||
sent++
|
||||
}
|
||||
}
|
||||
if sent == 0 {
|
||||
t.Fatalf("0 series sent")
|
||||
}
|
||||
if err := client.Close(); err != nil {
|
||||
t.Fatalf("failed to close client: %s", err)
|
||||
}
|
||||
got := testSrv.accepted()
|
||||
if got != sent {
|
||||
t.Fatalf("expected to have %d series; got %d", sent, got)
|
||||
}
|
||||
}
|
|
@ -1,322 +1,13 @@
|
|||
package remotewrite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
|
||||
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
|
||||
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
// timeseries via remote write protocol.
|
||||
type Client struct {
|
||||
addr string
|
||||
c *http.Client
|
||||
authCfg *promauth.Config
|
||||
input chan prompbmarshal.TimeSeries
|
||||
flushInterval time.Duration
|
||||
maxBatchSize int
|
||||
maxQueueSize int
|
||||
|
||||
wg sync.WaitGroup
|
||||
doneCh chan struct{}
|
||||
}
|
||||
|
||||
// Config is config for remote write.
|
||||
type Config struct {
|
||||
// Addr of remote storage
|
||||
Addr string
|
||||
AuthCfg *promauth.Config
|
||||
|
||||
// Concurrency defines number of readers that
|
||||
// concurrently read from the queue and flush data
|
||||
Concurrency int
|
||||
// MaxBatchSize defines max number of timeseries
|
||||
// to be flushed at once
|
||||
MaxBatchSize int
|
||||
// MaxQueueSize defines max length of input queue
|
||||
// populated by Push method.
|
||||
// Push will be rejected once queue is full.
|
||||
MaxQueueSize int
|
||||
// FlushInterval defines time interval for flushing batches
|
||||
FlushInterval time.Duration
|
||||
// Transport will be used by the underlying http.Client
|
||||
Transport *http.Transport
|
||||
}
|
||||
|
||||
const (
|
||||
defaultConcurrency = 4
|
||||
defaultMaxBatchSize = 1e3
|
||||
defaultMaxQueueSize = 1e5
|
||||
defaultFlushInterval = 5 * time.Second
|
||||
defaultWriteTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
// NewClient returns asynchronous client for
|
||||
// writing timeseries via remotewrite protocol.
|
||||
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
|
||||
if cfg.Addr == "" {
|
||||
return nil, fmt.Errorf("config.Addr can't be empty")
|
||||
}
|
||||
if cfg.MaxBatchSize == 0 {
|
||||
cfg.MaxBatchSize = defaultMaxBatchSize
|
||||
}
|
||||
if cfg.MaxQueueSize == 0 {
|
||||
cfg.MaxQueueSize = defaultMaxQueueSize
|
||||
}
|
||||
if cfg.FlushInterval == 0 {
|
||||
cfg.FlushInterval = defaultFlushInterval
|
||||
}
|
||||
if cfg.Transport == nil {
|
||||
cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
|
||||
}
|
||||
cc := defaultConcurrency
|
||||
if cfg.Concurrency > 0 {
|
||||
cc = cfg.Concurrency
|
||||
}
|
||||
c := &Client{
|
||||
c: &http.Client{
|
||||
Timeout: *sendTimeout,
|
||||
Transport: cfg.Transport,
|
||||
},
|
||||
addr: strings.TrimSuffix(cfg.Addr, "/"),
|
||||
authCfg: cfg.AuthCfg,
|
||||
flushInterval: cfg.FlushInterval,
|
||||
maxBatchSize: cfg.MaxBatchSize,
|
||||
maxQueueSize: cfg.MaxQueueSize,
|
||||
doneCh: make(chan struct{}),
|
||||
input: make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
|
||||
}
|
||||
|
||||
for i := 0; i < cc; i++ {
|
||||
c.run(ctx)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Push adds timeseries into queue for writing into remote storage.
|
||||
// Push returns and error if client is stopped or if queue is full.
|
||||
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
return fmt.Errorf("client is closed")
|
||||
case c.input <- s:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
|
||||
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
|
||||
c.maxQueueSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Close stops the client and waits for all goroutines
|
||||
// to exit.
|
||||
func (c *Client) Close() error {
|
||||
if c.doneCh == nil {
|
||||
return fmt.Errorf("client is already closed")
|
||||
}
|
||||
close(c.input)
|
||||
close(c.doneCh)
|
||||
c.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) run(ctx context.Context) {
|
||||
ticker := time.NewTicker(c.flushInterval)
|
||||
wr := &prompbmarshal.WriteRequest{}
|
||||
shutdown := func() {
|
||||
for ts := range c.input {
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
}
|
||||
lastCtx, cancel := context.WithTimeout(context.Background(), defaultWriteTimeout)
|
||||
logger.Infof("shutting down remote write client and flushing remained %d series", len(wr.Timeseries))
|
||||
c.flush(lastCtx, wr)
|
||||
cancel()
|
||||
}
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-c.doneCh:
|
||||
shutdown()
|
||||
return
|
||||
case <-ctx.Done():
|
||||
shutdown()
|
||||
return
|
||||
case <-ticker.C:
|
||||
c.flush(ctx, wr)
|
||||
case ts, ok := <-c.input:
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
wr.Timeseries = append(wr.Timeseries, ts)
|
||||
if len(wr.Timeseries) >= c.maxBatchSize {
|
||||
c.flush(ctx, wr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
|
||||
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
|
||||
sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
|
||||
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
|
||||
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
|
||||
_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
|
||||
return float64(*concurrency)
|
||||
})
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
if len(wr.Timeseries) < 1 {
|
||||
return
|
||||
}
|
||||
defer prompbmarshal.ResetWriteRequest(wr)
|
||||
defer bufferFlushDuration.UpdateDuration(time.Now())
|
||||
|
||||
data, err := wr.Marshal()
|
||||
if err != nil {
|
||||
logger.Errorf("failed to marshal WriteRequest: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
defer func() {
|
||||
sendDuration.Add(time.Since(timeStart).Seconds())
|
||||
}()
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
sentBytes.Add(len(b))
|
||||
return
|
||||
}
|
||||
|
||||
_, isNotRetriable := err.(*nonRetriableError)
|
||||
logger.Warnf("attempt %d to send request failed: %s (retriable: %v)", attempts+1, err, !isNotRetriable)
|
||||
|
||||
if isNotRetriable {
|
||||
// exit fast if error isn't retriable
|
||||
break
|
||||
}
|
||||
|
||||
// check if request has been cancelled before backoff
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Errorf("interrupting retry attempt %d: context cancelled", attempts+1)
|
||||
break L
|
||||
default:
|
||||
}
|
||||
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
droppedBytes.Add(len(b))
|
||||
logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
|
||||
len(wr.Timeseries))
|
||||
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, data []byte) error {
|
||||
r := bytes.NewReader(data)
|
||||
req, err := http.NewRequest(http.MethodPost, c.addr, r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create new HTTP request: %w", err)
|
||||
}
|
||||
|
||||
// RFC standard compliant headers
|
||||
req.Header.Set("Content-Encoding", "snappy")
|
||||
req.Header.Set("Content-Type", "application/x-protobuf")
|
||||
|
||||
// Prometheus compliant headers
|
||||
req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
|
||||
|
||||
if c.authCfg != nil {
|
||||
c.authCfg.SetHeaders(req, true)
|
||||
}
|
||||
if !*disablePathAppend {
|
||||
req.URL.Path = path.Join(req.URL.Path, "/api/v1/write")
|
||||
}
|
||||
resp, err := c.c.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
|
||||
req.URL.Redacted(), err, len(data), r.Size())
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|
||||
// according to https://prometheus.io/docs/concepts/remote_write_spec/
|
||||
// Prometheus remote Write compatible receivers MUST
|
||||
switch resp.StatusCode / 100 {
|
||||
case 2:
|
||||
// respond with a HTTP 2xx status code when the write is successful.
|
||||
return nil
|
||||
case 4:
|
||||
if resp.StatusCode != http.StatusTooManyRequests {
|
||||
// MUST NOT retry write requests on HTTP 4xx responses other than 429
|
||||
return &nonRetriableError{fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)}
|
||||
}
|
||||
fallthrough
|
||||
default:
|
||||
return fmt.Errorf("unexpected response code %d for %s. Response body %q",
|
||||
resp.StatusCode, req.URL.Redacted(), body)
|
||||
}
|
||||
}
|
||||
|
||||
type nonRetriableError struct {
|
||||
err error
|
||||
}
|
||||
|
||||
func (e *nonRetriableError) Error() string {
|
||||
return e.err.Error()
|
||||
// RWClient represents an HTTP client for pushing data via remote write protocol
|
||||
type RWClient interface {
|
||||
// Push pushes the give time series to remote storage
|
||||
Push(s prompbmarshal.TimeSeries) error
|
||||
// Close stops the client. Client can't be reused after Close call.
|
||||
Close() error
|
||||
}
|
||||
|
|
|
@ -1,19 +1,16 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -33,7 +30,7 @@ var (
|
|||
"Progress bar rendering might be verbose or break the logs parsing, so it is recommended to be disabled when not used in interactive mode.")
|
||||
)
|
||||
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewrite.Client) error {
|
||||
func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw remotewrite.RWClient) error {
|
||||
if *replayMaxDatapoints < 1 {
|
||||
return fmt.Errorf("replay.maxDatapointsPerQuery can't be lower than 1")
|
||||
}
|
||||
|
@ -68,8 +65,8 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
|
|||
|
||||
var total int
|
||||
for _, cfg := range groupsCfg {
|
||||
ng := newGroup(cfg, qb, *evaluationInterval, labels)
|
||||
total += ng.replay(tFrom, tTo, rw)
|
||||
ng := rule.NewGroup(cfg, qb, *evaluationInterval, labels)
|
||||
total += ng.Replay(tFrom, tTo, rw, *replayMaxDatapoints, *replayRuleRetryAttempts, *replayRulesDelay, *disableProgressBar)
|
||||
}
|
||||
logger.Infof("replay finished! Imported %d samples", total)
|
||||
if rw != nil {
|
||||
|
@ -77,99 +74,3 @@ func replay(groupsCfg []config.Group, qb datasource.QuerierBuilder, rw *remotewr
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (g *Group) replay(start, end time.Time, rw *remotewrite.Client) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(*replayMaxDatapoints)
|
||||
start = g.adjustReqTimestamp(start)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
iterations := int(end.Sub(start)/step) + 1
|
||||
fmt.Printf("\nGroup %q"+
|
||||
"\ninterval: \t%v"+
|
||||
"\neval_offset: \t%v"+
|
||||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, g.EvalOffset, iterations, step)
|
||||
if g.Limit > 0 {
|
||||
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
|
||||
g.Limit)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
var bar *pb.ProgressBar
|
||||
if !*disableProgressBar {
|
||||
bar = pb.StartNew(iterations)
|
||||
}
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
total += n
|
||||
if bar != nil {
|
||||
bar.Increment()
|
||||
}
|
||||
}
|
||||
if bar != nil {
|
||||
bar.Finish()
|
||||
}
|
||||
// sleep to let remote storage to flush data on-disk
|
||||
// so chained rules could be calculated correctly
|
||||
time.Sleep(*replayRulesDelay)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func replayRule(rule Rule, start, end time.Time, rw *remotewrite.Client) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < *replayRuleRetryAttempts; i++ {
|
||||
tss, err = rule.ExecRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, rule, err)
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
if err != nil { // means all attempts failed
|
||||
return 0, err
|
||||
}
|
||||
if len(tss) < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
var n int
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
return n, fmt.Errorf("remote write failure: %s", err)
|
||||
}
|
||||
n += len(ts.Samples)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
type rangeIterator struct {
|
||||
step time.Duration
|
||||
start, end time.Time
|
||||
|
||||
iter int
|
||||
s, e time.Time
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) reset() {
|
||||
ri.iter = 0
|
||||
ri.s, ri.e = time.Time{}, time.Time{}
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) next() bool {
|
||||
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
|
||||
if !ri.end.After(ri.s) {
|
||||
return false
|
||||
}
|
||||
ri.e = ri.s.Add(ri.step)
|
||||
if ri.e.After(ri.end) {
|
||||
ri.e = ri.end
|
||||
}
|
||||
ri.iter++
|
||||
return true
|
||||
}
|
||||
|
|
|
@ -12,7 +12,7 @@ import (
|
|||
)
|
||||
|
||||
type fakeReplayQuerier struct {
|
||||
fakeQuerier
|
||||
datasource.FakeQuerier
|
||||
registry map[string]map[string]struct{}
|
||||
}
|
||||
|
||||
|
@ -170,81 +170,3 @@ func TestReplay(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeIterator(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ri rangeIterator
|
||||
result [][2]time.Time
|
||||
}{
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 5 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 45 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
|
||||
step: time.Second,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
var j int
|
||||
for tc.ri.next() {
|
||||
if len(tc.result) < j+1 {
|
||||
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
|
||||
j, tc.ri.s, tc.ri.e)
|
||||
}
|
||||
s, e := tc.ri.s, tc.ri.e
|
||||
expS, expE := tc.result[j][0], tc.result[j][1]
|
||||
if s != expS {
|
||||
t.Fatalf("expected to get start=%v; got %v", expS, s)
|
||||
}
|
||||
if e != expE {
|
||||
t.Fatalf("expected to get end=%v; got %v", expE, e)
|
||||
}
|
||||
j++
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func parseTime(t *testing.T, s string) time.Time {
|
||||
t.Helper()
|
||||
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return tt
|
||||
}
|
||||
|
|
|
@ -1,118 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// Exec executes the rule with given context at the given timestamp and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// ExecRange executes the rule on the given time range.
|
||||
ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// UpdateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
UpdateWith(Rule) error
|
||||
// ToAPI converts Rule into APIRule
|
||||
ToAPI() APIRule
|
||||
// Close performs the shutdown procedures for rule
|
||||
// such as metrics unregister
|
||||
Close()
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
|
||||
|
||||
type ruleState struct {
|
||||
sync.RWMutex
|
||||
entries []ruleStateEntry
|
||||
cur int
|
||||
}
|
||||
|
||||
type ruleStateEntry struct {
|
||||
// stores last moment of time rule.Exec was called
|
||||
time time.Time
|
||||
// stores the timestamp rule.Exec was called with
|
||||
at time.Time
|
||||
// stores the duration of the last rule.Exec call
|
||||
duration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health ruleState
|
||||
err error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
samples int
|
||||
// stores the number of time series fetched during
|
||||
// the last evaluation.
|
||||
// Is supported by VictoriaMetrics only, starting from v1.90.0
|
||||
// If seriesFetched == nil, then this attribute was missing in
|
||||
// datasource response (unsupported).
|
||||
seriesFetched *int
|
||||
// stores the curl command reflecting the HTTP request used during rule.Exec
|
||||
curl string
|
||||
}
|
||||
|
||||
func newRuleState(size int) *ruleState {
|
||||
if size < 1 {
|
||||
size = 1
|
||||
}
|
||||
return &ruleState{
|
||||
entries: make([]ruleStateEntry, size),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) getLast() ruleStateEntry {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return s.entries[s.cur]
|
||||
}
|
||||
|
||||
func (s *ruleState) size() int {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return len(s.entries)
|
||||
}
|
||||
|
||||
func (s *ruleState) getAll() []ruleStateEntry {
|
||||
entries := make([]ruleStateEntry, 0)
|
||||
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
|
||||
cur := s.cur
|
||||
for {
|
||||
e := s.entries[cur]
|
||||
if !e.time.IsZero() || !e.at.IsZero() {
|
||||
entries = append(entries, e)
|
||||
}
|
||||
cur--
|
||||
if cur < 0 {
|
||||
cur = cap(s.entries) - 1
|
||||
}
|
||||
if cur == s.cur {
|
||||
return entries
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) add(e ruleStateEntry) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
s.cur++
|
||||
if s.cur > cap(s.entries)-1 {
|
||||
s.cur = 0
|
||||
}
|
||||
s.entries[s.cur] = e
|
||||
}
|
|
@ -1,11 +1,10 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -55,7 +54,8 @@ type alertingRuleMetrics struct {
|
|||
seriesFetched *utils.Gauge
|
||||
}
|
||||
|
||||
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
// NewAlertingRule creates a new AlertingRule
|
||||
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||
ar := &AlertingRule{
|
||||
Type: group.Type,
|
||||
RuleID: cfg.ID,
|
||||
|
@ -80,10 +80,15 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
metrics: &alertingRuleMetrics{},
|
||||
}
|
||||
|
||||
entrySize := *ruleUpdateEntriesLimit
|
||||
if cfg.UpdateEntriesLimit != nil {
|
||||
ar.state = newRuleState(*cfg.UpdateEntriesLimit)
|
||||
} else {
|
||||
ar.state = newRuleState(*ruleUpdateEntriesLimit)
|
||||
entrySize = *cfg.UpdateEntriesLimit
|
||||
}
|
||||
if entrySize < 1 {
|
||||
entrySize = 1
|
||||
}
|
||||
ar.state = &ruleState{
|
||||
entries: make([]StateEntry, entrySize),
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||
|
@ -114,7 +119,7 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
if e.err == nil {
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
|
@ -122,28 +127,28 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
return float64(e.samples)
|
||||
return float64(e.Samples)
|
||||
})
|
||||
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
|
||||
func() float64 {
|
||||
e := ar.state.getLast()
|
||||
if e.seriesFetched == nil {
|
||||
if e.SeriesFetched == nil {
|
||||
// means seriesFetched is unsupported
|
||||
return -1
|
||||
}
|
||||
seriesFetched := float64(*e.seriesFetched)
|
||||
if seriesFetched == 0 && e.samples > 0 {
|
||||
seriesFetched := float64(*e.SeriesFetched)
|
||||
if seriesFetched == 0 && e.Samples > 0 {
|
||||
// `alert: 0.95` will fetch no series
|
||||
// but will get one time series in response.
|
||||
seriesFetched = float64(e.samples)
|
||||
seriesFetched = float64(e.Samples)
|
||||
}
|
||||
return seriesFetched
|
||||
})
|
||||
return ar
|
||||
}
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (ar *AlertingRule) Close() {
|
||||
// close unregisters rule metrics
|
||||
func (ar *AlertingRule) close() {
|
||||
ar.metrics.active.Unregister()
|
||||
ar.metrics.pending.Unregister()
|
||||
ar.metrics.errors.Unregister()
|
||||
|
@ -162,6 +167,27 @@ func (ar *AlertingRule) ID() uint64 {
|
|||
return ar.RuleID
|
||||
}
|
||||
|
||||
// GetAlerts returns active alerts of rule
|
||||
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
var alerts []*notifier.Alert
|
||||
for _, a := range ar.alerts {
|
||||
alerts = append(alerts, a)
|
||||
}
|
||||
return alerts
|
||||
}
|
||||
|
||||
// GetAlert returns alert if id exists
|
||||
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
if ar.alerts == nil {
|
||||
return nil
|
||||
}
|
||||
return ar.alerts[id]
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
|
||||
if !ar.Debug {
|
||||
return
|
||||
|
@ -188,6 +214,26 @@ func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string
|
|||
logger.Infof("%s", prefix+msg)
|
||||
}
|
||||
|
||||
// updateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) updateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.KeepFiringFor = nr.KeepFiringFor
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
ar.EvalInterval = nr.EvalInterval
|
||||
ar.Debug = nr.Debug
|
||||
ar.q = nr.q
|
||||
ar.state = nr.state
|
||||
return nil
|
||||
}
|
||||
|
||||
type labelSet struct {
|
||||
// origin labels extracted from received time series
|
||||
// plus extra labels (group labels, service labels like alertNameLabel).
|
||||
|
@ -248,11 +294,11 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
|
|||
return ls, nil
|
||||
}
|
||||
|
||||
// ExecRange executes alerting rule on the given time range similarly to Exec.
|
||||
// execRange executes alerting rule on the given time range similarly to exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
||||
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -297,19 +343,19 @@ func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]
|
|||
// is kept in memory state and consequently repeatedly sent to the AlertManager.
|
||||
const resolvedRetention = 15 * time.Minute
|
||||
|
||||
// Exec executes AlertingRule expression via the given Querier.
|
||||
// exec executes AlertingRule expression via the given Querier.
|
||||
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
||||
func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(res.Data),
|
||||
seriesFetched: res.SeriesFetched,
|
||||
err: err,
|
||||
curl: requestToCurl(req),
|
||||
curState := StateEntry{
|
||||
Time: start,
|
||||
At: ts,
|
||||
Duration: time.Since(start),
|
||||
Samples: len(res.Data),
|
||||
SeriesFetched: res.SeriesFetched,
|
||||
Err: err,
|
||||
Curl: requestToCurl(req),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
|
@ -323,7 +369,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||
}
|
||||
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.samples, curState.duration)
|
||||
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
|
||||
|
||||
for h, a := range ar.alerts {
|
||||
// cleanup inactive alerts from previous Exec
|
||||
|
@ -342,15 +388,15 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
for _, m := range res.Data {
|
||||
ls, err := ar.toLabels(m, qFn)
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to expand labels: %s", err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
h := hash(ls.processed)
|
||||
if _, ok := updated[h]; ok {
|
||||
// duplicate may be caused by extra labels
|
||||
// conflicting with the metric labels
|
||||
curState.err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
||||
return nil, curState.Err
|
||||
}
|
||||
updated[h] = struct{}{}
|
||||
if a, ok := ar.alerts[h]; ok {
|
||||
|
@ -373,8 +419,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
}
|
||||
a, err := ar.newAlert(m, ls, start, qFn)
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to create alert: %w", err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
a.ID = h
|
||||
a.State = notifier.StatePending
|
||||
|
@ -423,8 +469,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
|
|||
}
|
||||
if limit > 0 && numActivePending > limit {
|
||||
ar.alerts = map[uint64]*notifier.Alert{}
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||
return nil, curState.Err
|
||||
}
|
||||
return ar.toTimeSeries(ts.Unix()), nil
|
||||
}
|
||||
|
@ -441,26 +487,6 @@ func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries
|
|||
return tss
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
// alerts state isn't copied since
|
||||
// it should be updated in next 2 Execs
|
||||
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
||||
nr, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
||||
}
|
||||
ar.Expr = nr.Expr
|
||||
ar.For = nr.For
|
||||
ar.KeepFiringFor = nr.KeepFiringFor
|
||||
ar.Labels = nr.Labels
|
||||
ar.Annotations = nr.Annotations
|
||||
ar.EvalInterval = nr.EvalInterval
|
||||
ar.Debug = nr.Debug
|
||||
ar.q = nr.q
|
||||
ar.state = nr.state
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: consider hashing algorithm in VM
|
||||
func hash(labels map[string]string) uint64 {
|
||||
hash := fnv.New64a()
|
||||
|
@ -503,102 +529,6 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
|||
return a, err
|
||||
}
|
||||
|
||||
// AlertAPI generates APIAlert object from alert by its id(hash)
|
||||
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
||||
ar.alertsMu.RLock()
|
||||
defer ar.alertsMu.RUnlock()
|
||||
a, ok := ar.alerts[id]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return ar.newAlertAPI(*a)
|
||||
}
|
||||
|
||||
// ToAPI returns Rule representation in form of APIRule
|
||||
// Isn't thread-safe. Call must be protected by AlertingRule mutex.
|
||||
func (ar *AlertingRule) ToAPI() APIRule {
|
||||
lastState := ar.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
Name: ar.Name,
|
||||
Query: ar.Expr,
|
||||
Duration: ar.For.Seconds(),
|
||||
KeepFiringFor: ar.KeepFiringFor.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ar.AlertsToAPI(),
|
||||
LastSamples: lastState.samples,
|
||||
LastSeriesFetched: lastState.seriesFetched,
|
||||
MaxUpdates: ar.state.size(),
|
||||
Updates: ar.state.getAll(),
|
||||
Debug: ar.Debug,
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
// satisfy APIRule.State logic
|
||||
if len(r.Alerts) > 0 {
|
||||
r.State = notifier.StatePending.String()
|
||||
stateFiring := notifier.StateFiring.String()
|
||||
for _, a := range r.Alerts {
|
||||
if a.State == stateFiring {
|
||||
r.State = stateFiring
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// AlertsToAPI generates list of APIAlert objects from existing alerts
|
||||
func (ar *AlertingRule) AlertsToAPI() []*APIAlert {
|
||||
var alerts []*APIAlert
|
||||
ar.alertsMu.RLock()
|
||||
for _, a := range ar.alerts {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, ar.newAlertAPI(*a))
|
||||
}
|
||||
ar.alertsMu.RUnlock()
|
||||
return alerts
|
||||
}
|
||||
|
||||
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
||||
aa := &APIAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
RuleID: fmt.Sprintf("%d", ar.RuleID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.ActiveAt,
|
||||
Restored: a.Restored,
|
||||
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
|
||||
}
|
||||
if alertURLGeneratorFn != nil {
|
||||
aa.SourceLink = alertURLGeneratorFn(a)
|
||||
}
|
||||
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
|
||||
aa.Stabilizing = true
|
||||
}
|
||||
return aa
|
||||
}
|
||||
|
||||
const (
|
||||
// alertMetricName is the metric name for synthetic alert timeseries.
|
||||
alertMetricName = "ALERTS"
|
||||
|
@ -646,10 +576,10 @@ func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.Time
|
|||
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
|
||||
}
|
||||
|
||||
// Restore restores the value of ActiveAt field for active alerts,
|
||||
// restore restores the value of ActiveAt field for active alerts,
|
||||
// based on previously written time series `alertForStateMetricName`.
|
||||
// Only rules with For > 0 can be restored.
|
||||
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
|
||||
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
|
||||
if ar.For < 1 {
|
||||
return nil
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
@ -303,13 +303,13 @@ func TestAlertingRule_Exec(t *testing.T) {
|
|||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
for i, step := range tc.steps {
|
||||
fq.reset()
|
||||
fq.add(step...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
fq.Reset()
|
||||
fq.Add(step...)
|
||||
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
// artificial delay between applying steps
|
||||
|
@ -482,11 +482,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||
fakeGroup := Group{Name: "TestRule_ExecRange"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.q = fq
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
fq.add(tc.data...)
|
||||
gotTS, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
fq.Add(tc.data...)
|
||||
gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
|
@ -518,24 +518,24 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||
|
||||
func TestGroup_Restore(t *testing.T) {
|
||||
defaultTS := time.Now()
|
||||
fqr := &fakeQuerierWithRegistry{}
|
||||
fqr := &datasource.FakeQuerierWithRegistry{}
|
||||
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
|
||||
t.Helper()
|
||||
defer fqr.reset()
|
||||
defer fqr.Reset()
|
||||
|
||||
for _, r := range rules {
|
||||
fqr.set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
|
||||
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
|
||||
}
|
||||
|
||||
fg := newGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
|
||||
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
nts := func() []notifier.Notifier { return []notifier.Notifier{&fakeNotifier{}} }
|
||||
fg.start(context.Background(), nts, nil, fqr)
|
||||
nts := func() []notifier.Notifier { return []notifier.Notifier{¬ifier.FakeNotifier{}} }
|
||||
fg.Start(context.Background(), nts, nil, fqr)
|
||||
wg.Done()
|
||||
}()
|
||||
fg.close()
|
||||
fg.Close()
|
||||
wg.Wait()
|
||||
|
||||
gotAlerts := make(map[uint64]*notifier.Alert)
|
||||
|
@ -582,11 +582,11 @@ func TestGroup_Restore(t *testing.T) {
|
|||
ActiveAt: defaultTS,
|
||||
},
|
||||
})
|
||||
fqr.reset()
|
||||
fqr.Reset()
|
||||
|
||||
// one active alert with state restore
|
||||
ts := time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
|
||||
|
@ -598,7 +598,7 @@ func TestGroup_Restore(t *testing.T) {
|
|||
|
||||
// two rules, two active alerts, one with state restored
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fn(
|
||||
[]config.Rule{
|
||||
|
@ -616,9 +616,9 @@ func TestGroup_Restore(t *testing.T) {
|
|||
|
||||
// two rules, two active alerts, two with state restored
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
|
||||
stateMetric("foo", ts))
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
|
||||
stateMetric("bar", ts))
|
||||
fn(
|
||||
[]config.Rule{
|
||||
|
@ -636,7 +636,7 @@ func TestGroup_Restore(t *testing.T) {
|
|||
|
||||
// one active alert but wrong state restore
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
|
||||
stateMetric("wrong alert", ts))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
|
||||
|
@ -648,7 +648,7 @@ func TestGroup_Restore(t *testing.T) {
|
|||
|
||||
// one active alert with labels
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
stateMetric("foo", ts, "env", "dev"))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
|
||||
|
@ -660,7 +660,7 @@ func TestGroup_Restore(t *testing.T) {
|
|||
|
||||
// one active alert with restore labels missmatch
|
||||
ts = time.Now().Truncate(time.Hour)
|
||||
fqr.set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
|
||||
stateMetric("foo", ts, "env", "dev", "team", "foo"))
|
||||
fn(
|
||||
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
|
||||
|
@ -672,30 +672,30 @@ func TestGroup_Restore(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestAlertingRule_Exec_Negative(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
ar := newTestAlertingRule("test", 0)
|
||||
ar.Labels = map[string]string{"job": "test"}
|
||||
ar.q = fq
|
||||
|
||||
// successful attempt
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
_, err := ar.exec(context.TODO(), time.Now(), 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// label `job` will collide with rule extra label and will make both time series equal
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
|
||||
_, err = ar.exec(context.TODO(), time.Now(), 0)
|
||||
if !errors.Is(err, errDuplicate) {
|
||||
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
_, err = ar.Exec(context.TODO(), time.Now(), 0)
|
||||
fq.SetErr(errors.New(expErr))
|
||||
_, err = ar.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
|
@ -705,7 +705,7 @@ func TestAlertingRule_Exec_Negative(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestAlertingRuleLimit(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
ar := newTestAlertingRule("test", 0)
|
||||
ar.Labels = map[string]string{"job": "test"}
|
||||
ar.q = fq
|
||||
|
@ -737,15 +737,15 @@ func TestAlertingRuleLimit(t *testing.T) {
|
|||
err error
|
||||
timestamp = time.Now()
|
||||
)
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
|
||||
for _, testCase := range testCases {
|
||||
_, err = ar.Exec(context.TODO(), timestamp, testCase.limit)
|
||||
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
|
||||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
}
|
||||
|
||||
func TestAlertingRule_Template(t *testing.T) {
|
||||
|
@ -870,12 +870,12 @@ func TestAlertingRule_Template(t *testing.T) {
|
|||
fakeGroup := Group{Name: "TestRule_Exec"}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
tc.rule.GroupID = fakeGroup.ID()
|
||||
tc.rule.q = fq
|
||||
tc.rule.state = newRuleState(10)
|
||||
fq.add(tc.metrics...)
|
||||
if _, err := tc.rule.Exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
|
||||
fq.Add(tc.metrics...)
|
||||
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
|
||||
t.Fatalf("unexpected err: %s", err)
|
||||
}
|
||||
for hash, expAlert := range tc.expAlerts {
|
||||
|
@ -989,7 +989,7 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
|||
For: waitFor,
|
||||
EvalInterval: waitFor,
|
||||
alerts: make(map[uint64]*notifier.Alert),
|
||||
state: newRuleState(10),
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
return &rule
|
||||
}
|
|
@ -1,8 +1,10 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"net/url"
|
||||
|
@ -11,7 +13,7 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
|
@ -21,6 +23,18 @@ import (
|
|||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
var (
|
||||
ruleUpdateEntriesLimit = flag.Int("rule.updateEntriesLimit", 20, "Defines the max number of rule's state updates stored in-memory. "+
|
||||
"Rule's updates are available on rule's Details page and are used for debugging purposes. The number of stored updates can be overridden per rule via update_entries_limit param.")
|
||||
resendDelay = flag.Duration("rule.resendDelay", 0, "MiniMum amount of time to wait before resending an alert to notifier")
|
||||
maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maxiMum duration for automatic alert expiration, "+
|
||||
"which by default is 4 times evaluationInterval of the parent ")
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
|
||||
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
|
@ -96,7 +110,8 @@ func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[s
|
|||
return r
|
||||
}
|
||||
|
||||
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
// NewGroup returns a new group
|
||||
func NewGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
||||
g := &Group{
|
||||
Type: cfg.Type,
|
||||
Name: cfg.Name,
|
||||
|
@ -153,11 +168,11 @@ func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval ti
|
|||
return g
|
||||
}
|
||||
|
||||
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
|
||||
if rule.Alert != "" {
|
||||
return newAlertingRule(qb, g, rule)
|
||||
func (g *Group) newRule(qb datasource.QuerierBuilder, r config.Rule) Rule {
|
||||
if r.Alert != "" {
|
||||
return NewAlertingRule(qb, g, r)
|
||||
}
|
||||
return newRecordingRule(qb, g, rule)
|
||||
return NewRecordingRule(qb, g, r)
|
||||
}
|
||||
|
||||
// ID return unique group ID that consists of
|
||||
|
@ -178,8 +193,8 @@ func (g *Group) ID() uint64 {
|
|||
return hash.Sum64()
|
||||
}
|
||||
|
||||
// Restore restores alerts state for group rules
|
||||
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
|
||||
// restore restores alerts state for group rules
|
||||
func (g *Group) restore(ctx context.Context, qb datasource.QuerierBuilder, ts time.Time, lookback time.Duration) error {
|
||||
for _, rule := range g.Rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
|
@ -195,7 +210,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
|
|||
Headers: g.Headers,
|
||||
Debug: ar.Debug,
|
||||
})
|
||||
if err := ar.Restore(ctx, q, ts, lookback); err != nil {
|
||||
if err := ar.restore(ctx, q, ts, lookback); err != nil {
|
||||
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
||||
}
|
||||
}
|
||||
|
@ -205,7 +220,7 @@ func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, ts ti
|
|||
// updateWith updates existing group with
|
||||
// passed group object. This function ignores group
|
||||
// evaluation interval change. It supposed to be updated
|
||||
// in group.start function.
|
||||
// in group.Start function.
|
||||
// Not thread-safe.
|
||||
func (g *Group) updateWith(newGroup *Group) error {
|
||||
rulesRegistry := make(map[uint64]Rule)
|
||||
|
@ -218,11 +233,11 @@ func (g *Group) updateWith(newGroup *Group) error {
|
|||
if !ok {
|
||||
// old rule is not present in the new list
|
||||
// so we mark it for removing
|
||||
g.Rules[i].Close()
|
||||
g.Rules[i].close()
|
||||
g.Rules[i] = nil
|
||||
continue
|
||||
}
|
||||
if err := or.UpdateWith(nr); err != nil {
|
||||
if err := or.updateWith(nr); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(rulesRegistry, nr.ID())
|
||||
|
@ -255,10 +270,10 @@ func (g *Group) updateWith(newGroup *Group) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// interruptEval interrupts in-flight rules evaluations
|
||||
// InterruptEval interrupts in-flight rules evaluations
|
||||
// within the group. It is expected that g.evalCancel
|
||||
// will be repopulated after the call.
|
||||
func (g *Group) interruptEval() {
|
||||
func (g *Group) InterruptEval() {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
|
||||
|
@ -267,12 +282,13 @@ func (g *Group) interruptEval() {
|
|||
}
|
||||
}
|
||||
|
||||
func (g *Group) close() {
|
||||
// Close stops the group and it's rules, unregisters group metrics
|
||||
func (g *Group) Close() {
|
||||
if g.doneCh == nil {
|
||||
return
|
||||
}
|
||||
close(g.doneCh)
|
||||
g.interruptEval()
|
||||
g.InterruptEval()
|
||||
<-g.finishedCh
|
||||
|
||||
g.metrics.iterationDuration.Unregister()
|
||||
|
@ -280,19 +296,21 @@ func (g *Group) close() {
|
|||
g.metrics.iterationMissed.Unregister()
|
||||
g.metrics.iterationInterval.Unregister()
|
||||
for _, rule := range g.Rules {
|
||||
rule.Close()
|
||||
rule.close()
|
||||
}
|
||||
}
|
||||
|
||||
var skipRandSleepOnGroupStart bool
|
||||
// SkipRandSleepOnGroupStart will skip random sleep delay in group first evaluation
|
||||
var SkipRandSleepOnGroupStart bool
|
||||
|
||||
func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *remotewrite.Client, rr datasource.QuerierBuilder) {
|
||||
// Start starts group's evaluation
|
||||
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
|
||||
evalTS := time.Now()
|
||||
// sleep random duration to spread group rules evaluation
|
||||
// over time in order to reduce load on datasource.
|
||||
if !skipRandSleepOnGroupStart {
|
||||
if !SkipRandSleepOnGroupStart {
|
||||
sleepBeforeStart := delayBeforeStart(evalTS, g.ID(), g.Interval, g.EvalOffset)
|
||||
g.infof("will start in %v", sleepBeforeStart)
|
||||
|
||||
|
@ -310,10 +328,10 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
|||
}
|
||||
|
||||
e := &executor{
|
||||
rw: rw,
|
||||
notifiers: nts,
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
|
||||
g.infof("started")
|
||||
|
@ -355,7 +373,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
|||
// restore the rules state after the first evaluation
|
||||
// so only active alerts can be restored.
|
||||
if rr != nil {
|
||||
err := g.Restore(ctx, rr, evalTS, *remoteReadLookBack)
|
||||
err := g.restore(ctx, rr, evalTS, *remoteReadLookBack)
|
||||
if err != nil {
|
||||
logger.Errorf("error while restoring ruleState for group %q: %s", g.Name, err)
|
||||
}
|
||||
|
@ -409,6 +427,22 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r
|
|||
}
|
||||
}
|
||||
|
||||
// UpdateWith inserts new group to updateCh
|
||||
func (g *Group) UpdateWith(new *Group) {
|
||||
g.updateCh <- new
|
||||
}
|
||||
|
||||
// DeepCopy returns a deep copy of group
|
||||
func (g *Group) DeepCopy() *Group {
|
||||
g.mu.RLock()
|
||||
data, _ := json.Marshal(g)
|
||||
g.mu.RUnlock()
|
||||
newG := Group{}
|
||||
_ = json.Unmarshal(data, &newG)
|
||||
newG.Rules = g.Rules
|
||||
return &newG
|
||||
}
|
||||
|
||||
// delayBeforeStart returns a duration on the interval between [ts..ts+interval].
|
||||
// delayBeforeStart accounts for `offset`, so returned duration should be always
|
||||
// bigger than the `offset`.
|
||||
|
@ -438,6 +472,89 @@ func (g *Group) infof(format string, args ...interface{}) {
|
|||
g.Name, msg, g.Interval, g.EvalOffset, g.Concurrency)
|
||||
}
|
||||
|
||||
// Replay performs group replay
|
||||
func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoint, replayRuleRetryAttempts int, replayDelay time.Duration, disableProgressBar bool) int {
|
||||
var total int
|
||||
step := g.Interval * time.Duration(maxDataPoint)
|
||||
ri := rangeIterator{start: start, end: end, step: step}
|
||||
iterations := int(end.Sub(start)/step) + 1
|
||||
fmt.Printf("\nGroup %q"+
|
||||
"\ninterval: \t%v"+
|
||||
"\nrequests to make: \t%d"+
|
||||
"\nmax range per request: \t%v\n",
|
||||
g.Name, g.Interval, iterations, step)
|
||||
if g.Limit > 0 {
|
||||
fmt.Printf("\nPlease note, `limit: %d` param has no effect during replay.\n",
|
||||
g.Limit)
|
||||
}
|
||||
for _, rule := range g.Rules {
|
||||
fmt.Printf("> Rule %q (ID: %d)\n", rule, rule.ID())
|
||||
var bar *pb.ProgressBar
|
||||
if !disableProgressBar {
|
||||
bar = pb.StartNew(iterations)
|
||||
}
|
||||
ri.reset()
|
||||
for ri.next() {
|
||||
n, err := replayRule(rule, ri.s, ri.e, rw, replayRuleRetryAttempts)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", rule, err)
|
||||
}
|
||||
total += n
|
||||
if bar != nil {
|
||||
bar.Increment()
|
||||
}
|
||||
}
|
||||
if bar != nil {
|
||||
bar.Finish()
|
||||
}
|
||||
// sleep to let remote storage to flush data on-disk
|
||||
// so chained rules could be calculated correctly
|
||||
time.Sleep(replayDelay)
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// ExecOnce evaluates all the rules under group for once with given timestamp.
|
||||
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
|
||||
e := &executor{
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
if len(g.Rules) < 1 {
|
||||
return nil
|
||||
}
|
||||
resolveDuration := getResolveDuration(g.Interval, *resendDelay, *maxResolveDuration)
|
||||
return e.execConcurrently(ctx, g.Rules, evalTS, g.Concurrency, resolveDuration, g.Limit)
|
||||
}
|
||||
|
||||
type rangeIterator struct {
|
||||
step time.Duration
|
||||
start, end time.Time
|
||||
|
||||
iter int
|
||||
s, e time.Time
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) reset() {
|
||||
ri.iter = 0
|
||||
ri.s, ri.e = time.Time{}, time.Time{}
|
||||
}
|
||||
|
||||
func (ri *rangeIterator) next() bool {
|
||||
ri.s = ri.start.Add(ri.step * time.Duration(ri.iter))
|
||||
if !ri.end.After(ri.s) {
|
||||
return false
|
||||
}
|
||||
ri.e = ri.s.Add(ri.step)
|
||||
if ri.e.After(ri.end) {
|
||||
ri.e = ri.end
|
||||
}
|
||||
ri.iter++
|
||||
return true
|
||||
}
|
||||
|
||||
// getResolveDuration returns the duration after which firing alert
|
||||
// can be considered as resolved.
|
||||
func getResolveDuration(groupInterval, delta, maxDuration time.Duration) time.Duration {
|
||||
|
@ -477,20 +594,22 @@ func (g *Group) adjustReqTimestamp(timestamp time.Time) time.Time {
|
|||
return timestamp
|
||||
}
|
||||
|
||||
// executor contains group's notify and rw configs
|
||||
type executor struct {
|
||||
notifiers func() []notifier.Notifier
|
||||
Notifiers func() []notifier.Notifier
|
||||
notifierHeaders map[string]string
|
||||
|
||||
rw *remotewrite.Client
|
||||
Rw remotewrite.RWClient
|
||||
|
||||
previouslySentSeriesToRWMu sync.Mutex
|
||||
// previouslySentSeriesToRW stores series sent to RW on previous iteration
|
||||
// PreviouslySentSeriesToRW stores series sent to RW on previous iteration
|
||||
// map[ruleID]map[ruleLabels][]prompb.Label
|
||||
// where `ruleID` is ID of the Rule within a Group
|
||||
// and `ruleLabels` is []prompb.Label marshalled to a string
|
||||
previouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
|
||||
PreviouslySentSeriesToRW map[uint64]map[string][]prompbmarshal.Label
|
||||
}
|
||||
|
||||
// execConcurrently executes rules concurrently if concurrency>1
|
||||
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.Time, concurrency int, resolveDuration time.Duration, limit int) chan error {
|
||||
res := make(chan error, len(rules))
|
||||
if concurrency == 1 {
|
||||
|
@ -505,14 +624,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
|
|||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for _, rule := range rules {
|
||||
for _, r := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, ts, resolveDuration, limit)
|
||||
<-sem
|
||||
wg.Done()
|
||||
}(rule)
|
||||
}(r)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
|
@ -530,10 +649,10 @@ var (
|
|||
remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`)
|
||||
)
|
||||
|
||||
func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
|
||||
func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
|
||||
execTotal.Inc()
|
||||
|
||||
tss, err := rule.Exec(ctx, ts, limit)
|
||||
tss, err := r.exec(ctx, ts, limit)
|
||||
if err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
// the context can be cancelled on graceful shutdown
|
||||
|
@ -541,17 +660,17 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
|||
return nil
|
||||
}
|
||||
execErrors.Inc()
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
||||
return fmt.Errorf("rule %q: failed to execute: %w", r, err)
|
||||
}
|
||||
|
||||
if e.rw != nil {
|
||||
if e.Rw != nil {
|
||||
pushToRW := func(tss []prompbmarshal.TimeSeries) error {
|
||||
var lastErr error
|
||||
for _, ts := range tss {
|
||||
remoteWriteTotal.Inc()
|
||||
if err := e.rw.Push(ts); err != nil {
|
||||
if err := e.Rw.Push(ts); err != nil {
|
||||
remoteWriteErrors.Inc()
|
||||
lastErr = fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
||||
lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
|
||||
}
|
||||
}
|
||||
return lastErr
|
||||
|
@ -560,13 +679,13 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
|||
return err
|
||||
}
|
||||
|
||||
staleSeries := e.getStaleSeries(rule, tss, ts)
|
||||
staleSeries := e.getStaleSeries(r, tss, ts)
|
||||
if err := pushToRW(staleSeries); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
ar, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
@ -578,11 +697,11 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
|||
|
||||
wg := sync.WaitGroup{}
|
||||
errGr := new(utils.ErrGroup)
|
||||
for _, nt := range e.notifiers() {
|
||||
for _, nt := range e.Notifiers() {
|
||||
wg.Add(1)
|
||||
go func(nt notifier.Notifier) {
|
||||
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", rule, nt.Addr(), err))
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
|
||||
}
|
||||
wg.Done()
|
||||
}(nt)
|
||||
|
@ -592,7 +711,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, ts time.Time, resolveDur
|
|||
}
|
||||
|
||||
// getStaledSeries checks whether there are stale series from previously sent ones.
|
||||
func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
func (e *executor) getStaleSeries(r Rule, tss []prompbmarshal.TimeSeries, timestamp time.Time) []prompbmarshal.TimeSeries {
|
||||
ruleLabels := make(map[string][]prompbmarshal.Label, len(tss))
|
||||
for _, ts := range tss {
|
||||
// convert labels to strings so we can compare with previously sent series
|
||||
|
@ -600,11 +719,11 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
|
|||
ruleLabels[key] = ts.Labels
|
||||
}
|
||||
|
||||
rID := rule.ID()
|
||||
rID := r.ID()
|
||||
var staleS []prompbmarshal.TimeSeries
|
||||
// check whether there are series which disappeared and need to be marked as stale
|
||||
e.previouslySentSeriesToRWMu.Lock()
|
||||
for key, labels := range e.previouslySentSeriesToRW[rID] {
|
||||
for key, labels := range e.PreviouslySentSeriesToRW[rID] {
|
||||
if _, ok := ruleLabels[key]; ok {
|
||||
continue
|
||||
}
|
||||
|
@ -613,7 +732,7 @@ func (e *executor) getStaleSeries(rule Rule, tss []prompbmarshal.TimeSeries, tim
|
|||
staleS = append(staleS, ss)
|
||||
}
|
||||
// set previous series to current
|
||||
e.previouslySentSeriesToRW[rID] = ruleLabels
|
||||
e.PreviouslySentSeriesToRW[rID] = ruleLabels
|
||||
e.previouslySentSeriesToRWMu.Unlock()
|
||||
|
||||
return staleS
|
||||
|
@ -631,14 +750,14 @@ func (e *executor) purgeStaleSeries(activeRules []Rule) {
|
|||
|
||||
for _, rule := range activeRules {
|
||||
id := rule.ID()
|
||||
prev, ok := e.previouslySentSeriesToRW[id]
|
||||
prev, ok := e.PreviouslySentSeriesToRW[id]
|
||||
if ok {
|
||||
// keep previous series for staleness detection
|
||||
newPreviouslySentSeriesToRW[id] = prev
|
||||
}
|
||||
}
|
||||
e.previouslySentSeriesToRW = nil
|
||||
e.previouslySentSeriesToRW = newPreviouslySentSeriesToRW
|
||||
e.PreviouslySentSeriesToRW = nil
|
||||
e.PreviouslySentSeriesToRW = newPreviouslySentSeriesToRW
|
||||
|
||||
e.previouslySentSeriesToRWMu.Unlock()
|
||||
}
|
|
@ -1,17 +1,22 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v2"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
|
||||
|
@ -20,7 +25,15 @@ import (
|
|||
func init() {
|
||||
// Disable rand sleep on group start during tests in order to speed up test execution.
|
||||
// Rand sleep is needed only in prod code.
|
||||
skipRandSleepOnGroupStart = true
|
||||
SkipRandSleepOnGroupStart = true
|
||||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
if err := templates.Load([]string{}, true); err != nil {
|
||||
fmt.Println("failed to load template for test")
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestUpdateWith(t *testing.T) {
|
||||
|
@ -138,7 +151,7 @@ func TestUpdateWith(t *testing.T) {
|
|||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
g := &Group{Name: "test"}
|
||||
qb := &fakeQuerier{}
|
||||
qb := &datasource.FakeQuerier{}
|
||||
for _, r := range tc.currentRules {
|
||||
r.ID = config.HashRule(r)
|
||||
g.Rules = append(g.Rules, g.newRule(qb, r))
|
||||
|
@ -170,7 +183,7 @@ func TestUpdateWith(t *testing.T) {
|
|||
if got.ID() != want.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want, got)
|
||||
}
|
||||
if err := compareRules(t, got, want); err != nil {
|
||||
if err := CompareRules(t, got, want); err != nil {
|
||||
t.Fatalf("comparison error: %s", err)
|
||||
}
|
||||
}
|
||||
|
@ -179,17 +192,31 @@ func TestUpdateWith(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
// TODO: make parsing from string instead of file
|
||||
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
|
||||
const (
|
||||
rules = `
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
`
|
||||
)
|
||||
var groups []config.Group
|
||||
err := yaml.Unmarshal([]byte(rules), &groups)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
|
||||
fs := &fakeQuerier{}
|
||||
fn := &fakeNotifier{}
|
||||
fs := &datasource.FakeQuerier{}
|
||||
fn := ¬ifier.FakeNotifier{}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
g.Concurrency = 2
|
||||
|
||||
const inst1, inst2, job = "foo", "bar", "baz"
|
||||
|
@ -204,7 +231,7 @@ func TestGroupStart(t *testing.T) {
|
|||
alert1.State = notifier.StateFiring
|
||||
// add external label
|
||||
alert1.Labels["cluster"] = "east-1"
|
||||
// add rule labels - see config/testdata/rules1-good.rules
|
||||
// add rule labels
|
||||
alert1.Labels["label"] = "bar"
|
||||
alert1.Labels["host"] = inst1
|
||||
// add service labels
|
||||
|
@ -219,7 +246,7 @@ func TestGroupStart(t *testing.T) {
|
|||
alert2.State = notifier.StateFiring
|
||||
// add external label
|
||||
alert2.Labels["cluster"] = "east-1"
|
||||
// add rule labels - see config/testdata/rules1-good.rules
|
||||
// add rule labels
|
||||
alert2.Labels["label"] = "bar"
|
||||
alert2.Labels["host"] = inst2
|
||||
// add service labels
|
||||
|
@ -228,40 +255,40 @@ func TestGroupStart(t *testing.T) {
|
|||
alert2.ID = hash(alert2.Labels)
|
||||
|
||||
finished := make(chan struct{})
|
||||
fs.add(m1)
|
||||
fs.add(m2)
|
||||
fs.Add(m1)
|
||||
fs.Add(m2)
|
||||
go func() {
|
||||
g.start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
|
||||
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts := fn.getAlerts()
|
||||
gotAlerts := fn.GetAlerts()
|
||||
expectedAlerts := []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
gotAlertsNum := fn.getCounter()
|
||||
gotAlertsNum := fn.GetCounter()
|
||||
if gotAlertsNum < len(expectedAlerts)*2 {
|
||||
t.Fatalf("expected to receive at least %d alerts; got %d instead",
|
||||
len(expectedAlerts)*2, gotAlertsNum)
|
||||
}
|
||||
|
||||
// reset previous data
|
||||
fs.reset()
|
||||
fs.Reset()
|
||||
// and set only one datapoint for response
|
||||
fs.add(m1)
|
||||
fs.Add(m1)
|
||||
|
||||
// wait for multiple evals
|
||||
time.Sleep(20 * evalInterval)
|
||||
|
||||
gotAlerts = fn.getAlerts()
|
||||
gotAlerts = fn.GetAlerts()
|
||||
alert2.State = notifier.StateInactive
|
||||
expectedAlerts = []notifier.Alert{*alert1, *alert2}
|
||||
compareAlerts(t, expectedAlerts, gotAlerts)
|
||||
|
||||
g.close()
|
||||
g.Close()
|
||||
<-finished
|
||||
}
|
||||
|
||||
|
@ -294,15 +321,15 @@ func TestResolveDuration(t *testing.T) {
|
|||
func TestGetStaleSeries(t *testing.T) {
|
||||
ts := time.Now()
|
||||
e := &executor{
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
f := func(rule Rule, labels, expLabels [][]prompbmarshal.Label) {
|
||||
f := func(r Rule, labels, expLabels [][]prompbmarshal.Label) {
|
||||
t.Helper()
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for _, l := range labels {
|
||||
tss = append(tss, newTimeSeriesPB([]float64{1}, []int64{ts.Unix()}, l))
|
||||
}
|
||||
staleS := e.getStaleSeries(rule, tss, ts)
|
||||
staleS := e.getStaleSeries(r, tss, ts)
|
||||
if staleS == nil && expLabels == nil {
|
||||
return
|
||||
}
|
||||
|
@ -387,7 +414,7 @@ func TestPurgeStaleSeries(t *testing.T) {
|
|||
f := func(curRules, newRules, expStaleRules []Rule) {
|
||||
t.Helper()
|
||||
e := &executor{
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
// seed executor with series for
|
||||
// current rules
|
||||
|
@ -397,13 +424,13 @@ func TestPurgeStaleSeries(t *testing.T) {
|
|||
|
||||
e.purgeStaleSeries(newRules)
|
||||
|
||||
if len(e.previouslySentSeriesToRW) != len(expStaleRules) {
|
||||
if len(e.PreviouslySentSeriesToRW) != len(expStaleRules) {
|
||||
t.Fatalf("expected to get %d stale series, got %d",
|
||||
len(expStaleRules), len(e.previouslySentSeriesToRW))
|
||||
len(expStaleRules), len(e.PreviouslySentSeriesToRW))
|
||||
}
|
||||
|
||||
for _, exp := range expStaleRules {
|
||||
if _, ok := e.previouslySentSeriesToRW[exp.ID()]; !ok {
|
||||
if _, ok := e.PreviouslySentSeriesToRW[exp.ID()]; !ok {
|
||||
t.Fatalf("expected to have rule %d; got nil instead", exp.ID())
|
||||
}
|
||||
}
|
||||
|
@ -438,17 +465,17 @@ func TestPurgeStaleSeries(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestFaultyNotifier(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
|
||||
r := newTestAlertingRule("instant", 0)
|
||||
r.q = fq
|
||||
|
||||
fn := &fakeNotifier{}
|
||||
fn := ¬ifier.FakeNotifier{}
|
||||
e := &executor{
|
||||
notifiers: func() []notifier.Notifier {
|
||||
Notifiers: func() []notifier.Notifier {
|
||||
return []notifier.Notifier{
|
||||
&faultyNotifier{},
|
||||
¬ifier.FaultyNotifier{},
|
||||
fn,
|
||||
}
|
||||
},
|
||||
|
@ -464,7 +491,7 @@ func TestFaultyNotifier(t *testing.T) {
|
|||
tn := time.Now()
|
||||
deadline := tn.Add(delay / 2)
|
||||
for {
|
||||
if fn.getCounter() > 0 {
|
||||
if fn.GetCounter() > 0 {
|
||||
return
|
||||
}
|
||||
if tn.After(deadline) {
|
||||
|
@ -477,18 +504,18 @@ func TestFaultyNotifier(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestFaultyRW(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
|
||||
|
||||
r := &RecordingRule{
|
||||
Name: "test",
|
||||
state: newRuleState(10),
|
||||
q: fq,
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
|
||||
e := &executor{
|
||||
rw: &remotewrite.Client{},
|
||||
previouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
Rw: &remotewrite.Client{},
|
||||
PreviouslySentSeriesToRW: make(map[uint64]map[string][]prompbmarshal.Label),
|
||||
}
|
||||
|
||||
err := e.exec(context.Background(), r, time.Now(), 0, 10)
|
||||
|
@ -498,23 +525,38 @@ func TestFaultyRW(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestCloseWithEvalInterruption(t *testing.T) {
|
||||
groups, err := config.Parse([]string{"config/testdata/rules/rules1-good.rules"}, notifier.ValidateTemplates, true)
|
||||
const (
|
||||
rules = `
|
||||
- name: groupTest
|
||||
rules:
|
||||
- alert: VMRows
|
||||
for: 1ms
|
||||
expr: vm_rows > 0
|
||||
labels:
|
||||
label: bar
|
||||
host: "{{ $labels.instance }}"
|
||||
annotations:
|
||||
summary: "{{ $value }}"
|
||||
`
|
||||
)
|
||||
var groups []config.Group
|
||||
err := yaml.Unmarshal([]byte(rules), &groups)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse rules: %s", err)
|
||||
}
|
||||
|
||||
const delay = time.Second * 2
|
||||
fq := &fakeQuerierWithDelay{delay: delay}
|
||||
fq := &datasource.FakeQuerierWithDelay{Delay: delay}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := newGroup(groups[0], fq, evalInterval, nil)
|
||||
g := NewGroup(groups[0], fq, evalInterval, nil)
|
||||
|
||||
go g.start(context.Background(), nil, nil, nil)
|
||||
go g.Start(context.Background(), nil, nil, nil)
|
||||
|
||||
time.Sleep(evalInterval * 20)
|
||||
|
||||
go func() {
|
||||
g.close()
|
||||
g.Close()
|
||||
}()
|
||||
|
||||
deadline := time.Tick(delay / 2)
|
||||
|
@ -637,3 +679,81 @@ func TestGetPrometheusReqTimestamp(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeIterator(t *testing.T) {
|
||||
testCases := []struct {
|
||||
ri rangeIterator
|
||||
result [][2]time.Time
|
||||
}{
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 5 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:05:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:05:00.000Z"), parseTime(t, "2021-01-01T12:10:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:10:00.000Z"), parseTime(t, "2021-01-01T12:15:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:15:00.000Z"), parseTime(t, "2021-01-01T12:20:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:20:00.000Z"), parseTime(t, "2021-01-01T12:25:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:25:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:00.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:30:00.000Z"),
|
||||
step: 45 * time.Minute,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:30:00.000Z"), parseTime(t, "2021-01-01T12:30:00.000Z")},
|
||||
},
|
||||
},
|
||||
{
|
||||
ri: rangeIterator{
|
||||
start: parseTime(t, "2021-01-01T12:00:12.000Z"),
|
||||
end: parseTime(t, "2021-01-01T12:00:17.000Z"),
|
||||
step: time.Second,
|
||||
},
|
||||
result: [][2]time.Time{
|
||||
{parseTime(t, "2021-01-01T12:00:12.000Z"), parseTime(t, "2021-01-01T12:00:13.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:13.000Z"), parseTime(t, "2021-01-01T12:00:14.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:14.000Z"), parseTime(t, "2021-01-01T12:00:15.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:15.000Z"), parseTime(t, "2021-01-01T12:00:16.000Z")},
|
||||
{parseTime(t, "2021-01-01T12:00:16.000Z"), parseTime(t, "2021-01-01T12:00:17.000Z")},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range testCases {
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
var j int
|
||||
for tc.ri.next() {
|
||||
if len(tc.result) < j+1 {
|
||||
t.Fatalf("unexpected result for iterator on step %d: %v - %v",
|
||||
j, tc.ri.s, tc.ri.e)
|
||||
}
|
||||
s, e := tc.ri.s, tc.ri.e
|
||||
expS, expE := tc.result[j][0], tc.result[j][1]
|
||||
if s != expS {
|
||||
t.Fatalf("expected to get start=%v; got %v", expS, s)
|
||||
}
|
||||
if e != expE {
|
||||
t.Fatalf("expected to get end=%v; got %v", expE, e)
|
||||
}
|
||||
j++
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func parseTime(t *testing.T, s string) time.Time {
|
||||
t.Helper()
|
||||
tt, err := time.Parse("2006-01-02T15:04:05.000Z", s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
return tt
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
@ -49,7 +49,8 @@ func (rr *RecordingRule) ID() uint64 {
|
|||
return rr.RuleID
|
||||
}
|
||||
|
||||
func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
||||
// NewRecordingRule creates a new RecordingRule
|
||||
func NewRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *RecordingRule {
|
||||
rr := &RecordingRule{
|
||||
Type: group.Type,
|
||||
RuleID: cfg.ID,
|
||||
|
@ -66,17 +67,22 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
}),
|
||||
}
|
||||
|
||||
entrySize := *ruleUpdateEntriesLimit
|
||||
if cfg.UpdateEntriesLimit != nil {
|
||||
rr.state = newRuleState(*cfg.UpdateEntriesLimit)
|
||||
} else {
|
||||
rr.state = newRuleState(*ruleUpdateEntriesLimit)
|
||||
entrySize = *cfg.UpdateEntriesLimit
|
||||
}
|
||||
if entrySize < 1 {
|
||||
entrySize = 1
|
||||
}
|
||||
rr.state = &ruleState{
|
||||
entries: make([]StateEntry, entrySize),
|
||||
}
|
||||
|
||||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||
rr.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
if e.err == nil {
|
||||
if e.Err == nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
|
@ -84,21 +90,21 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||
rr.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||
func() float64 {
|
||||
e := rr.state.getLast()
|
||||
return float64(e.samples)
|
||||
return float64(e.Samples)
|
||||
})
|
||||
return rr
|
||||
}
|
||||
|
||||
// Close unregisters rule metrics
|
||||
func (rr *RecordingRule) Close() {
|
||||
// close unregisters rule metrics
|
||||
func (rr *RecordingRule) close() {
|
||||
rr.metrics.errors.Unregister()
|
||||
rr.metrics.samples.Unregister()
|
||||
}
|
||||
|
||||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||
// execRange executes recording rule on the given time range similarly to Exec.
|
||||
// It doesn't update internal states of the Rule and meant to be used just
|
||||
// to get time series for backfilling.
|
||||
func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
func (rr *RecordingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
||||
res, err := rr.q.QueryRange(ctx, rr.Expr, start, end)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -117,17 +123,17 @@ func (rr *RecordingRule) ExecRange(ctx context.Context, start, end time.Time) ([
|
|||
return tss, nil
|
||||
}
|
||||
|
||||
// Exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
// exec executes RecordingRule expression via the given Querier.
|
||||
func (rr *RecordingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
||||
start := time.Now()
|
||||
res, req, err := rr.q.Query(ctx, rr.Expr, ts)
|
||||
curState := ruleStateEntry{
|
||||
time: start,
|
||||
at: ts,
|
||||
duration: time.Since(start),
|
||||
samples: len(res.Data),
|
||||
seriesFetched: res.SeriesFetched,
|
||||
curl: requestToCurl(req),
|
||||
curState := StateEntry{
|
||||
Time: start,
|
||||
At: ts,
|
||||
Duration: time.Since(start),
|
||||
Samples: len(res.Data),
|
||||
SeriesFetched: res.SeriesFetched,
|
||||
Curl: requestToCurl(req),
|
||||
}
|
||||
|
||||
defer func() {
|
||||
|
@ -135,15 +141,15 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
|||
}()
|
||||
|
||||
if err != nil {
|
||||
curState.err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||
return nil, curState.Err
|
||||
}
|
||||
|
||||
qMetrics := res.Data
|
||||
numSeries := len(qMetrics)
|
||||
if limit > 0 && numSeries > limit {
|
||||
curState.err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d series", limit, numSeries)
|
||||
return nil, curState.Err
|
||||
}
|
||||
|
||||
duplicates := make(map[string]struct{}, len(qMetrics))
|
||||
|
@ -152,8 +158,8 @@ func (rr *RecordingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]p
|
|||
ts := rr.toTimeSeries(r)
|
||||
key := stringifyLabels(ts)
|
||||
if _, ok := duplicates[key]; ok {
|
||||
curState.err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.err
|
||||
curState.Err = fmt.Errorf("original metric %v; resulting labels %q: %w", r, key, errDuplicate)
|
||||
return nil, curState.Err
|
||||
}
|
||||
duplicates[key] = struct{}{}
|
||||
tss = append(tss, ts)
|
||||
|
@ -193,8 +199,8 @@ func (rr *RecordingRule) toTimeSeries(m datasource.Metric) prompbmarshal.TimeSer
|
|||
return newTimeSeries(m.Values, m.Timestamps, labels)
|
||||
}
|
||||
|
||||
// UpdateWith copies all significant fields.
|
||||
func (rr *RecordingRule) UpdateWith(r Rule) error {
|
||||
// updateWith copies all significant fields.
|
||||
func (rr *RecordingRule) updateWith(r Rule) error {
|
||||
nr, ok := r.(*RecordingRule)
|
||||
if !ok {
|
||||
return fmt.Errorf("BUG: attempt to update recroding rule with wrong type %#v", r)
|
||||
|
@ -204,32 +210,3 @@ func (rr *RecordingRule) UpdateWith(r Rule) error {
|
|||
rr.q = nr.q
|
||||
return nil
|
||||
}
|
||||
|
||||
// ToAPI returns Rule's representation in form
|
||||
// of APIRule
|
||||
func (rr *RecordingRule) ToAPI() APIRule {
|
||||
lastState := rr.state.getLast()
|
||||
r := APIRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: lastState.time,
|
||||
EvaluationTime: lastState.duration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: lastState.samples,
|
||||
LastSeriesFetched: lastState.seriesFetched,
|
||||
MaxUpdates: rr.state.size(),
|
||||
Updates: rr.state.getAll(),
|
||||
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
if lastState.err != nil {
|
||||
r.LastError = lastState.err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
@ -56,10 +56,12 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
Name: "job:foo",
|
||||
Labels: map[string]string{
|
||||
"source": "test",
|
||||
}},
|
||||
},
|
||||
},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
|
@ -76,11 +78,11 @@ func TestRecordingRule_Exec(t *testing.T) {
|
|||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tc.rule.state = newRuleState(10)
|
||||
tss, err := tc.rule.Exec(context.TODO(), time.Now(), 0)
|
||||
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
|
||||
tss, err := tc.rule.exec(context.TODO(), time.Now(), 0)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
|
@ -141,7 +143,8 @@ func TestRecordingRule_ExecRange(t *testing.T) {
|
|||
}},
|
||||
[]datasource.Metric{
|
||||
metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar")},
|
||||
metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
|
||||
},
|
||||
[]prompbmarshal.TimeSeries{
|
||||
newTimeSeries([]float64{2}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||
"__name__": "job:foo",
|
||||
|
@ -158,10 +161,10 @@ func TestRecordingRule_ExecRange(t *testing.T) {
|
|||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.rule.Name, func(t *testing.T) {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(tc.metrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(tc.metrics...)
|
||||
tc.rule.q = fq
|
||||
tss, err := tc.rule.ExecRange(context.TODO(), time.Now(), time.Now())
|
||||
tss, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected Exec err: %s", err)
|
||||
}
|
||||
|
@ -198,15 +201,15 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
metricWithValuesAndLabels(t, []float64{2, 3}, "__name__", "bar", "job", "bar"),
|
||||
metricWithValuesAndLabels(t, []float64{4, 5, 6}, "__name__", "baz", "job", "baz"),
|
||||
}
|
||||
rule := &RecordingRule{Name: "job:foo", state: newRuleState(10), Labels: map[string]string{
|
||||
rule := &RecordingRule{Name: "job:foo", state: &ruleState{entries: make([]StateEntry, 10)}, Labels: map[string]string{
|
||||
"source": "test_limit",
|
||||
}}
|
||||
var err error
|
||||
for _, testCase := range testCases {
|
||||
fq := &fakeQuerier{}
|
||||
fq.add(testMetrics...)
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(testMetrics...)
|
||||
rule.q = fq
|
||||
_, err = rule.Exec(context.TODO(), timestamp, testCase.limit)
|
||||
_, err = rule.exec(context.TODO(), timestamp, testCase.limit)
|
||||
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -216,17 +219,16 @@ func TestRecordingRuleLimit(t *testing.T) {
|
|||
func TestRecordingRule_ExecNegative(t *testing.T) {
|
||||
rr := &RecordingRule{
|
||||
Name: "job:foo",
|
||||
state: newRuleState(10),
|
||||
Labels: map[string]string{
|
||||
"job": "test",
|
||||
},
|
||||
state: &ruleState{entries: make([]StateEntry, 10)},
|
||||
}
|
||||
|
||||
fq := &fakeQuerier{}
|
||||
fq := &datasource.FakeQuerier{}
|
||||
expErr := "connection reset by peer"
|
||||
fq.setErr(errors.New(expErr))
|
||||
fq.SetErr(errors.New(expErr))
|
||||
rr.q = fq
|
||||
_, err := rr.Exec(context.TODO(), time.Now(), 0)
|
||||
_, err := rr.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
||||
|
@ -234,14 +236,14 @@ func TestRecordingRule_ExecNegative(t *testing.T) {
|
|||
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
|
||||
}
|
||||
|
||||
fq.reset()
|
||||
fq.Reset()
|
||||
|
||||
// add metrics which differs only by `job` label
|
||||
// which will be overridden by rule
|
||||
fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
|
||||
fq.Add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
|
||||
|
||||
_, err = rr.Exec(context.TODO(), time.Now(), 0)
|
||||
_, err = rr.exec(context.TODO(), time.Now(), 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected to get err; got nil")
|
||||
}
|
174
app/vmalert/rule/rule.go
Normal file
174
app/vmalert/rule/rule.go
Normal file
|
@ -0,0 +1,174 @@
|
|||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
// Rule represents alerting or recording rule
|
||||
// that has unique ID, can be Executed and
|
||||
// updated with other Rule.
|
||||
type Rule interface {
|
||||
// ID returns unique ID that may be used for
|
||||
// identifying this Rule among others.
|
||||
ID() uint64
|
||||
// exec executes the rule with given context at the given timestamp and limit.
|
||||
// returns an err if number of resulting time series exceeds the limit.
|
||||
exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error)
|
||||
// execRange executes the rule on the given time range.
|
||||
execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error)
|
||||
// updateWith performs modification of current Rule
|
||||
// with fields of the given Rule.
|
||||
updateWith(Rule) error
|
||||
// close performs the shutdown procedures for rule
|
||||
// such as metrics unregister
|
||||
close()
|
||||
}
|
||||
|
||||
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels. See https://docs.victoriametrics.com/vmalert.html#series-with-the-same-labelset for details")
|
||||
|
||||
type ruleState struct {
|
||||
sync.RWMutex
|
||||
entries []StateEntry
|
||||
cur int
|
||||
}
|
||||
|
||||
// StateEntry stores rule's execution states
|
||||
type StateEntry struct {
|
||||
// stores last moment of time rule.Exec was called
|
||||
Time time.Time
|
||||
// stores the timesteamp with which rule.Exec was called
|
||||
At time.Time
|
||||
// stores the duration of the last rule.Exec call
|
||||
Duration time.Duration
|
||||
// stores last error that happened in Exec func
|
||||
// resets on every successful Exec
|
||||
// may be used as Health ruleState
|
||||
Err error
|
||||
// stores the number of samples returned during
|
||||
// the last evaluation
|
||||
Samples int
|
||||
// stores the number of time series fetched during
|
||||
// the last evaluation.
|
||||
// Is supported by VictoriaMetrics only, starting from v1.90.0
|
||||
// If seriesFetched == nil, then this attribute was missing in
|
||||
// datasource response (unsupported).
|
||||
SeriesFetched *int
|
||||
// stores the curl command reflecting the HTTP request used during rule.Exec
|
||||
Curl string
|
||||
}
|
||||
|
||||
// GetLastEntry returns latest stateEntry of rule
|
||||
func GetLastEntry(r Rule) StateEntry {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.getLast()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.getLast()
|
||||
}
|
||||
return StateEntry{}
|
||||
}
|
||||
|
||||
// GetRuleStateSize returns size of rule stateEntry
|
||||
func GetRuleStateSize(r Rule) int {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.size()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.size()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// GetAllRuleState returns rule entire stateEntries
|
||||
func GetAllRuleState(r Rule) []StateEntry {
|
||||
if rule, ok := r.(*AlertingRule); ok {
|
||||
return rule.state.getAll()
|
||||
}
|
||||
if rule, ok := r.(*RecordingRule); ok {
|
||||
return rule.state.getAll()
|
||||
}
|
||||
return []StateEntry{}
|
||||
}
|
||||
|
||||
func (s *ruleState) size() int {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return len(s.entries)
|
||||
}
|
||||
|
||||
func (s *ruleState) getLast() StateEntry {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
if len(s.entries) == 0 {
|
||||
return StateEntry{}
|
||||
}
|
||||
return s.entries[s.cur]
|
||||
}
|
||||
|
||||
func (s *ruleState) getAll() []StateEntry {
|
||||
entries := make([]StateEntry, 0)
|
||||
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
|
||||
cur := s.cur
|
||||
for {
|
||||
e := s.entries[cur]
|
||||
if !e.Time.IsZero() || !e.At.IsZero() {
|
||||
entries = append(entries, e)
|
||||
}
|
||||
cur--
|
||||
if cur < 0 {
|
||||
cur = cap(s.entries) - 1
|
||||
}
|
||||
if cur == s.cur {
|
||||
return entries
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ruleState) add(e StateEntry) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
s.cur++
|
||||
if s.cur > cap(s.entries)-1 {
|
||||
s.cur = 0
|
||||
}
|
||||
s.entries[s.cur] = e
|
||||
}
|
||||
|
||||
func replayRule(r Rule, start, end time.Time, rw remotewrite.RWClient, replayRuleRetryAttempts int) (int, error) {
|
||||
var err error
|
||||
var tss []prompbmarshal.TimeSeries
|
||||
for i := 0; i < replayRuleRetryAttempts; i++ {
|
||||
tss, err = r.execRange(context.Background(), start, end)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
logger.Errorf("attempt %d to execute rule %q failed: %s", i+1, r, err)
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
if err != nil { // means all attempts failed
|
||||
return 0, err
|
||||
}
|
||||
if len(tss) < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
var n int
|
||||
for _, ts := range tss {
|
||||
if err := rw.Push(ts); err != nil {
|
||||
return n, fmt.Errorf("remote write failure: %s", err)
|
||||
}
|
||||
n += len(ts.Samples)
|
||||
}
|
||||
return n, nil
|
||||
}
|
81
app/vmalert/rule/rule_test.go
Normal file
81
app/vmalert/rule/rule_test.go
Normal file
|
@ -0,0 +1,81 @@
|
|||
package rule
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRule_state(t *testing.T) {
|
||||
stateEntriesN := 20
|
||||
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, stateEntriesN)}}
|
||||
e := r.state.getLast()
|
||||
if !e.At.IsZero() {
|
||||
t.Fatalf("expected entry to be zero")
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
r.state.add(StateEntry{At: now})
|
||||
|
||||
e = r.state.getLast()
|
||||
if e.At != now {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.At, now)
|
||||
}
|
||||
|
||||
time.Sleep(time.Millisecond)
|
||||
now2 := time.Now()
|
||||
r.state.add(StateEntry{At: now2})
|
||||
|
||||
e = r.state.getLast()
|
||||
if e.At != now2 {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.At, now2)
|
||||
}
|
||||
|
||||
if len(r.state.getAll()) != 2 {
|
||||
t.Fatalf("expected for state to have 2 entries only; got %d",
|
||||
len(r.state.getAll()),
|
||||
)
|
||||
}
|
||||
|
||||
var last time.Time
|
||||
for i := 0; i < stateEntriesN*2; i++ {
|
||||
last = time.Now()
|
||||
r.state.add(StateEntry{At: last})
|
||||
}
|
||||
|
||||
e = r.state.getLast()
|
||||
if e.At != last {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.At, last)
|
||||
}
|
||||
|
||||
if len(r.state.getAll()) != stateEntriesN {
|
||||
t.Fatalf("expected for state to have %d entries only; got %d",
|
||||
stateEntriesN, len(r.state.getAll()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRule_stateConcurrent supposed to test concurrent
|
||||
// execution of state updates.
|
||||
// Should be executed with -race flag
|
||||
func TestRule_stateConcurrent(_ *testing.T) {
|
||||
r := &AlertingRule{state: &ruleState{entries: make([]StateEntry, 20)}}
|
||||
const workers = 50
|
||||
const iterations = 100
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
r.state.add(StateEntry{At: time.Now()})
|
||||
r.state.getAll()
|
||||
r.state.getLast()
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
|
@ -1,239 +1,18 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
||||
)
|
||||
|
||||
type fakeQuerier struct {
|
||||
sync.Mutex
|
||||
metrics []datasource.Metric
|
||||
err error
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) setErr(err error) {
|
||||
fq.Lock()
|
||||
fq.err = err
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) reset() {
|
||||
fq.Lock()
|
||||
fq.err = nil
|
||||
fq.metrics = fq.metrics[:0]
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
|
||||
fq.Lock()
|
||||
fq.metrics = append(fq.metrics, metrics...)
|
||||
fq.Unlock()
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
|
||||
return fq
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
|
||||
req, _, err := fq.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
func (fq *fakeQuerier) Query(_ context.Context, _ string, _ time.Time) (datasource.Result, *http.Request, error) {
|
||||
fq.Lock()
|
||||
defer fq.Unlock()
|
||||
if fq.err != nil {
|
||||
return datasource.Result{}, nil, fq.err
|
||||
}
|
||||
cp := make([]datasource.Metric, len(fq.metrics))
|
||||
copy(cp, fq.metrics)
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
return datasource.Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
type fakeQuerierWithRegistry struct {
|
||||
sync.Mutex
|
||||
registry map[string][]datasource.Metric
|
||||
}
|
||||
|
||||
func (fqr *fakeQuerierWithRegistry) set(key string, metrics ...datasource.Metric) {
|
||||
fqr.Lock()
|
||||
if fqr.registry == nil {
|
||||
fqr.registry = make(map[string][]datasource.Metric)
|
||||
}
|
||||
fqr.registry[key] = metrics
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
func (fqr *fakeQuerierWithRegistry) reset() {
|
||||
fqr.Lock()
|
||||
fqr.registry = nil
|
||||
fqr.Unlock()
|
||||
}
|
||||
|
||||
func (fqr *fakeQuerierWithRegistry) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
|
||||
return fqr
|
||||
}
|
||||
|
||||
func (fqr *fakeQuerierWithRegistry) QueryRange(ctx context.Context, q string, _, _ time.Time) (datasource.Result, error) {
|
||||
req, _, err := fqr.Query(ctx, q, time.Now())
|
||||
return req, err
|
||||
}
|
||||
|
||||
func (fqr *fakeQuerierWithRegistry) Query(_ context.Context, expr string, _ time.Time) (datasource.Result, *http.Request, error) {
|
||||
fqr.Lock()
|
||||
defer fqr.Unlock()
|
||||
|
||||
req, _ := http.NewRequest(http.MethodPost, "foo.com", nil)
|
||||
metrics, ok := fqr.registry[expr]
|
||||
if !ok {
|
||||
return datasource.Result{}, req, nil
|
||||
}
|
||||
cp := make([]datasource.Metric, len(metrics))
|
||||
copy(cp, metrics)
|
||||
return datasource.Result{Data: cp}, req, nil
|
||||
}
|
||||
|
||||
type fakeQuerierWithDelay struct {
|
||||
fakeQuerier
|
||||
delay time.Duration
|
||||
}
|
||||
|
||||
func (fqd *fakeQuerierWithDelay) Query(ctx context.Context, expr string, ts time.Time) (datasource.Result, *http.Request, error) {
|
||||
timer := time.NewTimer(fqd.delay)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-timer.C:
|
||||
}
|
||||
return fqd.fakeQuerier.Query(ctx, expr, ts)
|
||||
}
|
||||
|
||||
func (fqd *fakeQuerierWithDelay) BuildWithParams(_ datasource.QuerierParams) datasource.Querier {
|
||||
return fqd
|
||||
}
|
||||
|
||||
type fakeNotifier struct {
|
||||
sync.Mutex
|
||||
alerts []notifier.Alert
|
||||
// records number of received alerts in total
|
||||
counter int
|
||||
}
|
||||
|
||||
func (*fakeNotifier) Close() {}
|
||||
func (*fakeNotifier) Addr() string { return "" }
|
||||
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert, _ map[string]string) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.counter += len(alerts)
|
||||
fn.alerts = alerts
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getCounter() int {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.counter
|
||||
}
|
||||
|
||||
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
return fn.alerts
|
||||
}
|
||||
|
||||
type faultyNotifier struct {
|
||||
fakeNotifier
|
||||
}
|
||||
|
||||
func (fn *faultyNotifier) Send(ctx context.Context, _ []notifier.Alert, _ map[string]string) error {
|
||||
d, ok := ctx.Deadline()
|
||||
if ok {
|
||||
time.Sleep(time.Until(d))
|
||||
}
|
||||
return fmt.Errorf("send failed")
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
return metricWithValuesAndLabels(t, []float64{value}, labels...)
|
||||
}
|
||||
|
||||
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Values = values
|
||||
for i := range values {
|
||||
m.Timestamps = append(m.Timestamps, int64(i))
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
var ls []prompbmarshal.Label
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
ls = append(ls, prompbmarshal.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return ls
|
||||
}
|
||||
|
||||
func compareGroups(t *testing.T, a, b *Group) {
|
||||
t.Helper()
|
||||
if a.Name != b.Name {
|
||||
t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
|
||||
}
|
||||
if a.File != b.File {
|
||||
t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
|
||||
}
|
||||
if a.Interval != b.Interval {
|
||||
t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
|
||||
}
|
||||
if len(a.Rules) != len(b.Rules) {
|
||||
t.Fatalf("expected group %s to have %d rules; got: %d",
|
||||
a.Name, len(a.Rules), len(b.Rules))
|
||||
}
|
||||
for i, r := range a.Rules {
|
||||
got, want := r, b.Rules[i]
|
||||
if a.ID() != b.ID() {
|
||||
t.Fatalf("expected to have rule %q; got %q", want.ID(), got.ID())
|
||||
}
|
||||
if err := compareRules(t, want, got); err != nil {
|
||||
t.Fatalf("comparison error: %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func compareRules(t *testing.T, a, b Rule) error {
|
||||
// CompareRules is a test helper func for other tests
|
||||
func CompareRules(t *testing.T, a, b Rule) error {
|
||||
t.Helper()
|
||||
switch v := a.(type) {
|
||||
case *AlertingRule:
|
||||
|
@ -287,6 +66,50 @@ func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
|
||||
return metricWithValuesAndLabels(t, []float64{value}, labels...)
|
||||
}
|
||||
|
||||
func metricWithValuesAndLabels(t *testing.T, values []float64, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
m := metricWithLabels(t, labels...)
|
||||
m.Values = values
|
||||
for i := range values {
|
||||
m.Timestamps = append(m.Timestamps, int64(i))
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
m := datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
m.Labels = append(m.Labels, datasource.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func toPromLabels(t *testing.T, labels ...string) []prompbmarshal.Label {
|
||||
t.Helper()
|
||||
if len(labels) == 0 || len(labels)%2 != 0 {
|
||||
t.Fatalf("expected to get even number of labels")
|
||||
}
|
||||
var ls []prompbmarshal.Label
|
||||
for i := 0; i < len(labels); i += 2 {
|
||||
ls = append(ls, prompbmarshal.Label{
|
||||
Name: labels[i],
|
||||
Value: labels[i+1],
|
||||
})
|
||||
}
|
||||
return ls
|
||||
}
|
||||
|
||||
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
|
||||
t.Helper()
|
||||
if len(a) != len(b) {
|
|
@ -1,4 +1,4 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"fmt"
|
|
@ -1,4 +1,4 @@
|
|||
package main
|
||||
package rule
|
||||
|
||||
import (
|
||||
"net/http"
|
|
@ -1,100 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRule_stateDisabled(t *testing.T) {
|
||||
state := newRuleState(-1)
|
||||
e := state.getLast()
|
||||
if !e.at.IsZero() {
|
||||
t.Fatalf("expected entry to be zero")
|
||||
}
|
||||
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
|
||||
if len(state.getAll()) != 1 {
|
||||
// state should store at least one update at any circumstances
|
||||
t.Fatalf("expected for state to have %d entries; got %d",
|
||||
1, len(state.getAll()),
|
||||
)
|
||||
}
|
||||
}
|
||||
func TestRule_state(t *testing.T) {
|
||||
stateEntriesN := 20
|
||||
state := newRuleState(stateEntriesN)
|
||||
e := state.getLast()
|
||||
if !e.at.IsZero() {
|
||||
t.Fatalf("expected entry to be zero")
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
state.add(ruleStateEntry{at: now})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now)
|
||||
}
|
||||
|
||||
time.Sleep(time.Millisecond)
|
||||
now2 := time.Now()
|
||||
state.add(ruleStateEntry{at: now2})
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != now2 {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, now2)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != 2 {
|
||||
t.Fatalf("expected for state to have 2 entries only; got %d",
|
||||
len(state.getAll()),
|
||||
)
|
||||
}
|
||||
|
||||
var last time.Time
|
||||
for i := 0; i < stateEntriesN*2; i++ {
|
||||
last = time.Now()
|
||||
state.add(ruleStateEntry{at: last})
|
||||
}
|
||||
|
||||
e = state.getLast()
|
||||
if e.at != last {
|
||||
t.Fatalf("expected entry at %v to be equal to %v",
|
||||
e.at, last)
|
||||
}
|
||||
|
||||
if len(state.getAll()) != stateEntriesN {
|
||||
t.Fatalf("expected for state to have %d entries only; got %d",
|
||||
stateEntriesN, len(state.getAll()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRule_stateConcurrent supposed to test concurrent
|
||||
// execution of state updates.
|
||||
// Should be executed with -race flag
|
||||
func TestRule_stateConcurrent(_ *testing.T) {
|
||||
state := newRuleState(20)
|
||||
|
||||
const workers = 50
|
||||
const iterations = 100
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < iterations; i++ {
|
||||
state.add(ruleStateEntry{at: time.Now()})
|
||||
state.getAll()
|
||||
state.getLast()
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
|
@ -10,6 +10,7 @@ import (
|
|||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/tpl"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
|
@ -143,38 +144,32 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
|
|||
}
|
||||
}
|
||||
|
||||
const (
|
||||
paramGroupID = "group_id"
|
||||
paramAlertID = "alert_id"
|
||||
paramRuleID = "rule_id"
|
||||
)
|
||||
|
||||
func (rh *requestHandler) getRule(r *http.Request) (APIRule, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
func (rh *requestHandler) getRule(r *http.Request) (apiRule, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
|
||||
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
|
||||
}
|
||||
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 0)
|
||||
ruleID, err := strconv.ParseUint(r.FormValue(paramRuleID), 10, 64)
|
||||
if err != nil {
|
||||
return APIRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
|
||||
return apiRule{}, fmt.Errorf("failed to read %q param: %s", paramRuleID, err)
|
||||
}
|
||||
rule, err := rh.m.RuleAPI(groupID, ruleID)
|
||||
obj, err := rh.m.ruleAPI(groupID, ruleID)
|
||||
if err != nil {
|
||||
return APIRule{}, errResponse(err, http.StatusNotFound)
|
||||
return apiRule{}, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
return rule, nil
|
||||
return obj, nil
|
||||
}
|
||||
|
||||
func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 0)
|
||||
func (rh *requestHandler) getAlert(r *http.Request) (*apiAlert, error) {
|
||||
groupID, err := strconv.ParseUint(r.FormValue(paramGroupID), 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read %q param: %s", paramGroupID, err)
|
||||
}
|
||||
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 0)
|
||||
alertID, err := strconv.ParseUint(r.FormValue(paramAlertID), 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read %q param: %s", paramAlertID, err)
|
||||
}
|
||||
a, err := rh.m.AlertAPI(groupID, alertID)
|
||||
a, err := rh.m.alertAPI(groupID, alertID)
|
||||
if err != nil {
|
||||
return nil, errResponse(err, http.StatusNotFound)
|
||||
}
|
||||
|
@ -184,17 +179,17 @@ func (rh *requestHandler) getAlert(r *http.Request) (*APIAlert, error) {
|
|||
type listGroupsResponse struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
Groups []APIGroup `json:"groups"`
|
||||
Groups []apiGroup `json:"groups"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) groups() []APIGroup {
|
||||
func (rh *requestHandler) groups() []apiGroup {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
groups := make([]APIGroup, 0)
|
||||
groups := make([]apiGroup, 0)
|
||||
for _, g := range rh.m.groups {
|
||||
groups = append(groups, g.toAPI())
|
||||
groups = append(groups, groupToAPI(g))
|
||||
}
|
||||
|
||||
// sort list of alerts for deterministic output
|
||||
|
@ -221,35 +216,35 @@ func (rh *requestHandler) listGroups() ([]byte, error) {
|
|||
type listAlertsResponse struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
Alerts []*APIAlert `json:"alerts"`
|
||||
Alerts []*apiAlert `json:"alerts"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
func (rh *requestHandler) groupAlerts() []GroupAlerts {
|
||||
func (rh *requestHandler) groupAlerts() []groupAlerts {
|
||||
rh.m.groupsMu.RLock()
|
||||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
var groupAlerts []GroupAlerts
|
||||
var gAlerts []groupAlerts
|
||||
for _, g := range rh.m.groups {
|
||||
var alerts []*APIAlert
|
||||
var alerts []*apiAlert
|
||||
for _, r := range g.Rules {
|
||||
a, ok := r.(*AlertingRule)
|
||||
a, ok := r.(*rule.AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, a.AlertsToAPI()...)
|
||||
alerts = append(alerts, ruleToAPIAlert(a)...)
|
||||
}
|
||||
if len(alerts) > 0 {
|
||||
groupAlerts = append(groupAlerts, GroupAlerts{
|
||||
Group: g.toAPI(),
|
||||
gAlerts = append(gAlerts, groupAlerts{
|
||||
Group: groupToAPI(g),
|
||||
Alerts: alerts,
|
||||
})
|
||||
}
|
||||
}
|
||||
sort.Slice(groupAlerts, func(i, j int) bool {
|
||||
return groupAlerts[i].Group.Name < groupAlerts[j].Group.Name
|
||||
sort.Slice(gAlerts, func(i, j int) bool {
|
||||
return gAlerts[i].Group.Name < gAlerts[j].Group.Name
|
||||
})
|
||||
return groupAlerts
|
||||
return gAlerts
|
||||
}
|
||||
|
||||
func (rh *requestHandler) listAlerts() ([]byte, error) {
|
||||
|
@ -257,14 +252,14 @@ func (rh *requestHandler) listAlerts() ([]byte, error) {
|
|||
defer rh.m.groupsMu.RUnlock()
|
||||
|
||||
lr := listAlertsResponse{Status: "success"}
|
||||
lr.Data.Alerts = make([]*APIAlert, 0)
|
||||
lr.Data.Alerts = make([]*apiAlert, 0)
|
||||
for _, g := range rh.m.groups {
|
||||
for _, r := range g.Rules {
|
||||
a, ok := r.(*AlertingRule)
|
||||
a, ok := r.(*rule.AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, a.AlertsToAPI()...)
|
||||
lr.Data.Alerts = append(lr.Data.Alerts, ruleToAPIAlert(a)...)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ btn-primary
|
|||
{% endif %}
|
||||
{% endfunc %}
|
||||
|
||||
{% func ListGroups(r *http.Request, originGroups []APIGroup) %}
|
||||
{% func ListGroups(r *http.Request, originGroups []apiGroup) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Groups", getLastConfigError()) %}
|
||||
{%code
|
||||
|
@ -46,9 +46,9 @@ btn-primary
|
|||
rOk := make(map[string]int)
|
||||
rNotOk := make(map[string]int)
|
||||
rNoMatch := make(map[string]int)
|
||||
var groups []APIGroup
|
||||
var groups []apiGroup
|
||||
for _, g := range originGroups {
|
||||
var rules []APIRule
|
||||
var rules []apiRule
|
||||
for _, r := range g.Rules {
|
||||
if r.LastError != "" {
|
||||
rNotOk[g.ID]++
|
||||
|
@ -166,7 +166,7 @@ btn-primary
|
|||
{% endfunc %}
|
||||
|
||||
|
||||
{% func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) %}
|
||||
{% func ListAlerts(r *http.Request, groupAlerts []groupAlerts) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "Alerts", getLastConfigError()) %}
|
||||
{% if len(groupAlerts) > 0 %}
|
||||
|
@ -183,7 +183,7 @@ btn-primary
|
|||
</div>
|
||||
{%code
|
||||
var keys []string
|
||||
alertsByRule := make(map[string][]*APIAlert)
|
||||
alertsByRule := make(map[string][]*apiAlert)
|
||||
for _, alert := range ga.Alerts {
|
||||
if len(alertsByRule[alert.RuleID]) < 1 {
|
||||
keys = append(keys, alert.RuleID)
|
||||
|
@ -310,7 +310,7 @@ btn-primary
|
|||
|
||||
{% endfunc %}
|
||||
|
||||
{% func Alert(r *http.Request, alert *APIAlert) %}
|
||||
{% func Alert(r *http.Request, alert *apiAlert) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
|
@ -397,7 +397,7 @@ btn-primary
|
|||
{% endfunc %}
|
||||
|
||||
|
||||
{% func RuleDetails(r *http.Request, rule APIRule) %}
|
||||
{% func RuleDetails(r *http.Request, rule apiRule) %}
|
||||
{%code prefix := utils.Prefix(r.URL.Path) %}
|
||||
{%= tpl.Header(r, navItems, "", getLastConfigError()) %}
|
||||
{%code
|
||||
|
@ -416,9 +416,9 @@ btn-primary
|
|||
var seriesFetchedEnabled bool
|
||||
var seriesFetchedWarning bool
|
||||
for _, u := range rule.Updates {
|
||||
if u.seriesFetched != nil {
|
||||
if u.SeriesFetched != nil {
|
||||
seriesFetchedEnabled = true
|
||||
if *u.seriesFetched == 0 && u.samples == 0{
|
||||
if *u.SeriesFetched == 0 && u.Samples == 0{
|
||||
seriesFetchedWarning = true
|
||||
}
|
||||
}
|
||||
|
@ -537,23 +537,23 @@ btn-primary
|
|||
<tbody>
|
||||
|
||||
{% for _, u := range rule.Updates %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
|
||||
<td>
|
||||
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.time.Format(time.RFC3339) %}</span>
|
||||
<span class="badge bg-primary rounded-pill me-3" title="Updated at">{%s u.Time.Format(time.RFC3339) %}</span>
|
||||
</td>
|
||||
<td class="text-center">{%d u.samples %}</td>
|
||||
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.seriesFetched != nil %}{%d *u.seriesFetched %}{% endif %}</td>{% endif %}
|
||||
<td class="text-center">{%f.3 u.duration.Seconds() %}s</td>
|
||||
<td class="text-center">{%s u.at.Format(time.RFC3339) %}</td>
|
||||
<td class="text-center">{%d u.Samples %}</td>
|
||||
{% if seriesFetchedEnabled %}<td class="text-center">{% if u.SeriesFetched != nil %}{%d *u.SeriesFetched %}{% endif %}</td>{% endif %}
|
||||
<td class="text-center">{%f.3 u.Duration.Seconds() %}s</td>
|
||||
<td class="text-center">{%s u.At.Format(time.RFC3339) %}</td>
|
||||
<td>
|
||||
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.curl %}</textarea>
|
||||
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">{%s u.Curl %}</textarea>
|
||||
</td>
|
||||
</tr>
|
||||
</li>
|
||||
{% if u.err != nil %}
|
||||
<tr{% if u.err != nil %} class="alert-danger"{% endif %}>
|
||||
{% if u.Err != nil %}
|
||||
<tr{% if u.Err != nil %} class="alert-danger"{% endif %}>
|
||||
<td colspan="{% if seriesFetchedEnabled %}6{%else%}5{%endif%}">
|
||||
<span class="alert-danger">{%v u.err %}</span>
|
||||
<span class="alert-danger">{%v u.Err %}</span>
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
|
@ -582,7 +582,7 @@ btn-primary
|
|||
<span class="badge bg-warning text-dark" title="This firing state is kept because of `keep_firing_for`">stabilizing</span>
|
||||
{% endfunc %}
|
||||
|
||||
{% func seriesFetchedWarn(r APIRule) %}
|
||||
{% func seriesFetchedWarn(r apiRule) %}
|
||||
{% if isNoMatch(r) %}
|
||||
<svg xmlns="http://www.w3.org/2000/svg"
|
||||
data-bs-toggle="tooltip"
|
||||
|
@ -596,7 +596,7 @@ btn-primary
|
|||
{% endfunc %}
|
||||
|
||||
{%code
|
||||
func isNoMatch (r APIRule) bool {
|
||||
func isNoMatch (r apiRule) bool {
|
||||
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
|
||||
}
|
||||
%}
|
||||
|
|
|
@ -196,7 +196,7 @@ func buttonActive(filter, expValue string) string {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:41
|
||||
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []APIGroup) {
|
||||
func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups []apiGroup) {
|
||||
//line app/vmalert/web.qtpl:41
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -216,9 +216,9 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
|
|||
rOk := make(map[string]int)
|
||||
rNotOk := make(map[string]int)
|
||||
rNoMatch := make(map[string]int)
|
||||
var groups []APIGroup
|
||||
var groups []apiGroup
|
||||
for _, g := range originGroups {
|
||||
var rules []APIRule
|
||||
var rules []apiRule
|
||||
for _, r := range g.Rules {
|
||||
if r.LastError != "" {
|
||||
rNotOk[g.ID]++
|
||||
|
@ -610,7 +610,7 @@ func StreamListGroups(qw422016 *qt422016.Writer, r *http.Request, originGroups [
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:166
|
||||
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []APIGroup) {
|
||||
func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups []apiGroup) {
|
||||
//line app/vmalert/web.qtpl:166
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/web.qtpl:166
|
||||
|
@ -621,7 +621,7 @@ func WriteListGroups(qq422016 qtio422016.Writer, r *http.Request, originGroups [
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:166
|
||||
func ListGroups(r *http.Request, originGroups []APIGroup) string {
|
||||
func ListGroups(r *http.Request, originGroups []apiGroup) string {
|
||||
//line app/vmalert/web.qtpl:166
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/web.qtpl:166
|
||||
|
@ -636,7 +636,7 @@ func ListGroups(r *http.Request, originGroups []APIGroup) string {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:169
|
||||
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
|
||||
func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
|
||||
//line app/vmalert/web.qtpl:169
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -712,7 +712,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
|
|||
`)
|
||||
//line app/vmalert/web.qtpl:185
|
||||
var keys []string
|
||||
alertsByRule := make(map[string][]*APIAlert)
|
||||
alertsByRule := make(map[string][]*apiAlert)
|
||||
for _, alert := range ga.Alerts {
|
||||
if len(alertsByRule[alert.RuleID]) < 1 {
|
||||
keys = append(keys, alert.RuleID)
|
||||
|
@ -891,7 +891,7 @@ func StreamListAlerts(qw422016 *qt422016.Writer, r *http.Request, groupAlerts []
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:255
|
||||
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []GroupAlerts) {
|
||||
func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []groupAlerts) {
|
||||
//line app/vmalert/web.qtpl:255
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/web.qtpl:255
|
||||
|
@ -902,7 +902,7 @@ func WriteListAlerts(qq422016 qtio422016.Writer, r *http.Request, groupAlerts []
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:255
|
||||
func ListAlerts(r *http.Request, groupAlerts []GroupAlerts) string {
|
||||
func ListAlerts(r *http.Request, groupAlerts []groupAlerts) string {
|
||||
//line app/vmalert/web.qtpl:255
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/web.qtpl:255
|
||||
|
@ -1091,7 +1091,7 @@ func ListTargets(r *http.Request, targets map[notifier.TargetType][]notifier.Tar
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:313
|
||||
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
|
||||
func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *apiAlert) {
|
||||
//line app/vmalert/web.qtpl:313
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -1274,7 +1274,7 @@ func StreamAlert(qw422016 *qt422016.Writer, r *http.Request, alert *APIAlert) {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:397
|
||||
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
|
||||
func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *apiAlert) {
|
||||
//line app/vmalert/web.qtpl:397
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/web.qtpl:397
|
||||
|
@ -1285,7 +1285,7 @@ func WriteAlert(qq422016 qtio422016.Writer, r *http.Request, alert *APIAlert) {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:397
|
||||
func Alert(r *http.Request, alert *APIAlert) string {
|
||||
func Alert(r *http.Request, alert *apiAlert) string {
|
||||
//line app/vmalert/web.qtpl:397
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/web.qtpl:397
|
||||
|
@ -1300,7 +1300,7 @@ func Alert(r *http.Request, alert *APIAlert) string {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:400
|
||||
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule) {
|
||||
func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule apiRule) {
|
||||
//line app/vmalert/web.qtpl:400
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -1331,9 +1331,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
var seriesFetchedEnabled bool
|
||||
var seriesFetchedWarning bool
|
||||
for _, u := range rule.Updates {
|
||||
if u.seriesFetched != nil {
|
||||
if u.SeriesFetched != nil {
|
||||
seriesFetchedEnabled = true
|
||||
if *u.seriesFetched == 0 && u.samples == 0 {
|
||||
if *u.SeriesFetched == 0 && u.Samples == 0 {
|
||||
seriesFetchedWarning = true
|
||||
}
|
||||
}
|
||||
|
@ -1587,7 +1587,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
qw422016.N().S(`
|
||||
<tr`)
|
||||
//line app/vmalert/web.qtpl:540
|
||||
if u.err != nil {
|
||||
if u.Err != nil {
|
||||
//line app/vmalert/web.qtpl:540
|
||||
qw422016.N().S(` class="alert-danger"`)
|
||||
//line app/vmalert/web.qtpl:540
|
||||
|
@ -1597,13 +1597,13 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
<td>
|
||||
<span class="badge bg-primary rounded-pill me-3" title="Updated at">`)
|
||||
//line app/vmalert/web.qtpl:542
|
||||
qw422016.E().S(u.time.Format(time.RFC3339))
|
||||
qw422016.E().S(u.Time.Format(time.RFC3339))
|
||||
//line app/vmalert/web.qtpl:542
|
||||
qw422016.N().S(`</span>
|
||||
</td>
|
||||
<td class="text-center">`)
|
||||
//line app/vmalert/web.qtpl:544
|
||||
qw422016.N().D(u.samples)
|
||||
qw422016.N().D(u.Samples)
|
||||
//line app/vmalert/web.qtpl:544
|
||||
qw422016.N().S(`</td>
|
||||
`)
|
||||
|
@ -1612,9 +1612,9 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
//line app/vmalert/web.qtpl:545
|
||||
qw422016.N().S(`<td class="text-center">`)
|
||||
//line app/vmalert/web.qtpl:545
|
||||
if u.seriesFetched != nil {
|
||||
if u.SeriesFetched != nil {
|
||||
//line app/vmalert/web.qtpl:545
|
||||
qw422016.N().D(*u.seriesFetched)
|
||||
qw422016.N().D(*u.SeriesFetched)
|
||||
//line app/vmalert/web.qtpl:545
|
||||
}
|
||||
//line app/vmalert/web.qtpl:545
|
||||
|
@ -1625,18 +1625,18 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
qw422016.N().S(`
|
||||
<td class="text-center">`)
|
||||
//line app/vmalert/web.qtpl:546
|
||||
qw422016.N().FPrec(u.duration.Seconds(), 3)
|
||||
qw422016.N().FPrec(u.Duration.Seconds(), 3)
|
||||
//line app/vmalert/web.qtpl:546
|
||||
qw422016.N().S(`s</td>
|
||||
<td class="text-center">`)
|
||||
//line app/vmalert/web.qtpl:547
|
||||
qw422016.E().S(u.at.Format(time.RFC3339))
|
||||
qw422016.E().S(u.At.Format(time.RFC3339))
|
||||
//line app/vmalert/web.qtpl:547
|
||||
qw422016.N().S(`</td>
|
||||
<td>
|
||||
<textarea class="curl-area" rows="1" onclick="this.focus();this.select()">`)
|
||||
//line app/vmalert/web.qtpl:549
|
||||
qw422016.E().S(u.curl)
|
||||
qw422016.E().S(u.Curl)
|
||||
//line app/vmalert/web.qtpl:549
|
||||
qw422016.N().S(`</textarea>
|
||||
</td>
|
||||
|
@ -1644,12 +1644,12 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
</li>
|
||||
`)
|
||||
//line app/vmalert/web.qtpl:553
|
||||
if u.err != nil {
|
||||
if u.Err != nil {
|
||||
//line app/vmalert/web.qtpl:553
|
||||
qw422016.N().S(`
|
||||
<tr`)
|
||||
//line app/vmalert/web.qtpl:554
|
||||
if u.err != nil {
|
||||
if u.Err != nil {
|
||||
//line app/vmalert/web.qtpl:554
|
||||
qw422016.N().S(` class="alert-danger"`)
|
||||
//line app/vmalert/web.qtpl:554
|
||||
|
@ -1671,7 +1671,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
qw422016.N().S(`">
|
||||
<span class="alert-danger">`)
|
||||
//line app/vmalert/web.qtpl:556
|
||||
qw422016.E().V(u.err)
|
||||
qw422016.E().V(u.Err)
|
||||
//line app/vmalert/web.qtpl:556
|
||||
qw422016.N().S(`</span>
|
||||
</td>
|
||||
|
@ -1697,7 +1697,7 @@ func StreamRuleDetails(qw422016 *qt422016.Writer, r *http.Request, rule APIRule)
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:563
|
||||
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule) {
|
||||
func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule apiRule) {
|
||||
//line app/vmalert/web.qtpl:563
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/web.qtpl:563
|
||||
|
@ -1708,7 +1708,7 @@ func WriteRuleDetails(qq422016 qtio422016.Writer, r *http.Request, rule APIRule)
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:563
|
||||
func RuleDetails(r *http.Request, rule APIRule) string {
|
||||
func RuleDetails(r *http.Request, rule apiRule) string {
|
||||
//line app/vmalert/web.qtpl:563
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/web.qtpl:563
|
||||
|
@ -1853,7 +1853,7 @@ func badgeStabilizing() string {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:585
|
||||
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
|
||||
func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r apiRule) {
|
||||
//line app/vmalert/web.qtpl:585
|
||||
qw422016.N().S(`
|
||||
`)
|
||||
|
@ -1879,7 +1879,7 @@ func streamseriesFetchedWarn(qw422016 *qt422016.Writer, r APIRule) {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:596
|
||||
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
|
||||
func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r apiRule) {
|
||||
//line app/vmalert/web.qtpl:596
|
||||
qw422016 := qt422016.AcquireWriter(qq422016)
|
||||
//line app/vmalert/web.qtpl:596
|
||||
|
@ -1890,7 +1890,7 @@ func writeseriesFetchedWarn(qq422016 qtio422016.Writer, r APIRule) {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:596
|
||||
func seriesFetchedWarn(r APIRule) string {
|
||||
func seriesFetchedWarn(r apiRule) string {
|
||||
//line app/vmalert/web.qtpl:596
|
||||
qb422016 := qt422016.AcquireByteBuffer()
|
||||
//line app/vmalert/web.qtpl:596
|
||||
|
@ -1905,6 +1905,6 @@ func seriesFetchedWarn(r APIRule) string {
|
|||
}
|
||||
|
||||
//line app/vmalert/web.qtpl:599
|
||||
func isNoMatch(r APIRule) bool {
|
||||
func isNoMatch(r apiRule) bool {
|
||||
return r.LastSamples == 0 && r.LastSeriesFetched != nil && *r.LastSeriesFetched == 0
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
@ -9,32 +10,29 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
)
|
||||
|
||||
func TestHandler(t *testing.T) {
|
||||
ar := &AlertingRule{
|
||||
Name: "alert",
|
||||
alerts: map[uint64]*notifier.Alert{
|
||||
0: {State: notifier.StateFiring},
|
||||
},
|
||||
state: newRuleState(10),
|
||||
}
|
||||
ar.state.add(ruleStateEntry{
|
||||
time: time.Now(),
|
||||
at: time.Now(),
|
||||
samples: 10,
|
||||
fq := &datasource.FakeQuerier{}
|
||||
fq.Add(datasource.Metric{
|
||||
Values: []float64{1}, Timestamps: []int64{0},
|
||||
})
|
||||
rr := &RecordingRule{
|
||||
Name: "record",
|
||||
state: newRuleState(10),
|
||||
}
|
||||
g := &Group{
|
||||
g := &rule.Group{
|
||||
Name: "group",
|
||||
Rules: []Rule{ar, rr},
|
||||
Concurrency: 1,
|
||||
}
|
||||
m := &manager{groups: make(map[uint64]*Group)}
|
||||
m.groups[0] = g
|
||||
ar := rule.NewAlertingRule(fq, g, config.Rule{ID: 0, Alert: "alert"})
|
||||
rr := rule.NewRecordingRule(fq, g, config.Rule{ID: 1, Record: "record"})
|
||||
g.Rules = []rule.Rule{ar, rr}
|
||||
g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, nil, time.Time{})
|
||||
|
||||
m := &manager{groups: map[uint64]*rule.Group{
|
||||
g.ID(): g,
|
||||
}}
|
||||
rh := &requestHandler{m: m}
|
||||
|
||||
getResp := func(url string, to interface{}, code int) {
|
||||
|
@ -70,13 +68,13 @@ func TestHandler(t *testing.T) {
|
|||
})
|
||||
|
||||
t.Run("/vmalert/rule", func(t *testing.T) {
|
||||
a := ar.ToAPI()
|
||||
a := ruleToAPI(ar)
|
||||
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
|
||||
r := rr.ToAPI()
|
||||
r := ruleToAPI(rr)
|
||||
getResp(ts.URL+"/vmalert/"+r.WebLink(), nil, 200)
|
||||
})
|
||||
t.Run("/vmalert/alert", func(t *testing.T) {
|
||||
alerts := ar.AlertsToAPI()
|
||||
alerts := ruleToAPIAlert(ar)
|
||||
for _, a := range alerts {
|
||||
getResp(ts.URL+"/vmalert/"+a.WebLink(), nil, 200)
|
||||
}
|
||||
|
@ -103,14 +101,14 @@ func TestHandler(t *testing.T) {
|
|||
}
|
||||
})
|
||||
t.Run("/api/v1/alert?alertID&groupID", func(t *testing.T) {
|
||||
expAlert := ar.newAlertAPI(*ar.alerts[0])
|
||||
alert := &APIAlert{}
|
||||
expAlert := newAlertAPI(ar, ar.GetAlerts()[0])
|
||||
alert := &apiAlert{}
|
||||
getResp(ts.URL+"/"+expAlert.APILink(), alert, 200)
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
}
|
||||
|
||||
alert = &APIAlert{}
|
||||
alert = &apiAlert{}
|
||||
getResp(ts.URL+"/vmalert/"+expAlert.APILink(), alert, 200)
|
||||
if !reflect.DeepEqual(alert, expAlert) {
|
||||
t.Errorf("expected %v is equal to %v", alert, expAlert)
|
||||
|
@ -148,7 +146,7 @@ func TestHandler(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestEmptyResponse(t *testing.T) {
|
||||
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*Group)}}
|
||||
rhWithNoGroups := &requestHandler{m: &manager{groups: make(map[uint64]*rule.Group)}}
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithNoGroups.handler(w, r) }))
|
||||
defer ts.Close()
|
||||
|
||||
|
@ -201,7 +199,7 @@ func TestEmptyResponse(t *testing.T) {
|
|||
}
|
||||
})
|
||||
|
||||
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*Group{0: {Name: "test"}}}}
|
||||
rhWithEmptyGroup := &requestHandler{m: &manager{groups: map[uint64]*rule.Group{0: {Name: "test"}}}}
|
||||
ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rhWithEmptyGroup.handler(w, r) })
|
||||
|
||||
t.Run("empty group /api/v1/rules", func(t *testing.T) {
|
||||
|
|
|
@ -2,13 +2,28 @@ package main
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
||||
)
|
||||
|
||||
// APIAlert represents a notifier.AlertingRule state
|
||||
const (
|
||||
// ParamGroupID is group id key in url parameter
|
||||
paramGroupID = "group_id"
|
||||
// ParamAlertID is alert id key in url parameter
|
||||
paramAlertID = "alert_id"
|
||||
// ParamRuleID is rule id key in url parameter
|
||||
paramRuleID = "rule_id"
|
||||
)
|
||||
|
||||
// apiAlert represents a notifier.AlertingRule state
|
||||
// for WEB view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIAlert struct {
|
||||
type apiAlert struct {
|
||||
State string `json:"state"`
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
|
@ -38,24 +53,24 @@ type APIAlert struct {
|
|||
}
|
||||
|
||||
// WebLink returns a link to the alert which can be used in UI.
|
||||
func (aa *APIAlert) WebLink() string {
|
||||
func (aa *apiAlert) WebLink() string {
|
||||
return fmt.Sprintf("alert?%s=%s&%s=%s",
|
||||
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
|
||||
}
|
||||
|
||||
// APILink returns a link to the alert's JSON representation.
|
||||
func (aa *APIAlert) APILink() string {
|
||||
func (aa *apiAlert) APILink() string {
|
||||
return fmt.Sprintf("api/v1/alert?%s=%s&%s=%s",
|
||||
paramGroupID, aa.GroupID, paramAlertID, aa.ID)
|
||||
}
|
||||
|
||||
// APIGroup represents Group for WEB view
|
||||
// apiGroup represents Group for web view
|
||||
// https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIGroup struct {
|
||||
type apiGroup struct {
|
||||
// Name is the group name as present in the config
|
||||
Name string `json:"name"`
|
||||
// Rules contains both recording and alerting rules
|
||||
Rules []APIRule `json:"rules"`
|
||||
Rules []apiRule `json:"rules"`
|
||||
// Interval is the Group's evaluation interval in float seconds as present in the file.
|
||||
Interval float64 `json:"interval"`
|
||||
// LastEvaluation is the timestamp of the last time the Group was executed
|
||||
|
@ -81,15 +96,15 @@ type APIGroup struct {
|
|||
Labels map[string]string `json:"labels,omitempty"`
|
||||
}
|
||||
|
||||
// GroupAlerts represents a group of alerts for WEB view
|
||||
type GroupAlerts struct {
|
||||
Group APIGroup
|
||||
Alerts []*APIAlert
|
||||
// groupAlerts represents a group of alerts for WEB view
|
||||
type groupAlerts struct {
|
||||
Group apiGroup
|
||||
Alerts []*apiAlert
|
||||
}
|
||||
|
||||
// APIRule represents a Rule for WEB view
|
||||
// apiRule represents a Rule for web view
|
||||
// see https://github.com/prometheus/compliance/blob/main/alert_generator/specification.md#get-apiv1rules
|
||||
type APIRule struct {
|
||||
type apiRule struct {
|
||||
// State must be one of these under following scenarios
|
||||
// "pending": at least 1 alert in the rule in pending state and no other alert in firing ruleState.
|
||||
// "firing": at least 1 alert in the rule in firing state.
|
||||
|
@ -111,7 +126,7 @@ type APIRule struct {
|
|||
// LastEvaluation is the timestamp of the last time the rule was executed
|
||||
LastEvaluation time.Time `json:"lastEvaluation"`
|
||||
// Alerts is the list of all the alerts in this rule that are currently pending or firing
|
||||
Alerts []*APIAlert `json:"alerts,omitempty"`
|
||||
Alerts []*apiAlert `json:"alerts,omitempty"`
|
||||
// Health is the health of rule evaluation.
|
||||
// It MUST be one of "ok", "err", "unknown"
|
||||
Health string `json:"health"`
|
||||
|
@ -138,11 +153,206 @@ type APIRule struct {
|
|||
// MaxUpdates is the max number of recorded ruleStateEntry objects
|
||||
MaxUpdates int `json:"max_updates_entries"`
|
||||
// Updates contains the ordered list of recorded ruleStateEntry objects
|
||||
Updates []ruleStateEntry `json:"-"`
|
||||
Updates []rule.StateEntry `json:"-"`
|
||||
}
|
||||
|
||||
// WebLink returns a link to the alert which can be used in UI.
|
||||
func (ar APIRule) WebLink() string {
|
||||
func (ar apiRule) WebLink() string {
|
||||
return fmt.Sprintf("rule?%s=%s&%s=%s",
|
||||
paramGroupID, ar.GroupID, paramRuleID, ar.ID)
|
||||
}
|
||||
|
||||
func ruleToAPI(r interface{}) apiRule {
|
||||
if ar, ok := r.(*rule.AlertingRule); ok {
|
||||
return alertingToAPI(ar)
|
||||
}
|
||||
if rr, ok := r.(*rule.RecordingRule); ok {
|
||||
return recordingToAPI(rr)
|
||||
}
|
||||
return apiRule{}
|
||||
}
|
||||
|
||||
func recordingToAPI(rr *rule.RecordingRule) apiRule {
|
||||
lastState := rule.GetLastEntry(rr)
|
||||
r := apiRule{
|
||||
Type: "recording",
|
||||
DatasourceType: rr.Type.String(),
|
||||
Name: rr.Name,
|
||||
Query: rr.Expr,
|
||||
Labels: rr.Labels,
|
||||
LastEvaluation: lastState.Time,
|
||||
EvaluationTime: lastState.Duration.Seconds(),
|
||||
Health: "ok",
|
||||
LastSamples: lastState.Samples,
|
||||
LastSeriesFetched: lastState.SeriesFetched,
|
||||
MaxUpdates: rule.GetRuleStateSize(rr),
|
||||
Updates: rule.GetAllRuleState(rr),
|
||||
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", rr.ID()),
|
||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||
}
|
||||
if lastState.Err != nil {
|
||||
r.LastError = lastState.Err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// alertingToAPI returns Rule representation in form of apiRule
|
||||
func alertingToAPI(ar *rule.AlertingRule) apiRule {
|
||||
lastState := rule.GetLastEntry(ar)
|
||||
r := apiRule{
|
||||
Type: "alerting",
|
||||
DatasourceType: ar.Type.String(),
|
||||
Name: ar.Name,
|
||||
Query: ar.Expr,
|
||||
Duration: ar.For.Seconds(),
|
||||
KeepFiringFor: ar.KeepFiringFor.Seconds(),
|
||||
Labels: ar.Labels,
|
||||
Annotations: ar.Annotations,
|
||||
LastEvaluation: lastState.Time,
|
||||
EvaluationTime: lastState.Duration.Seconds(),
|
||||
Health: "ok",
|
||||
State: "inactive",
|
||||
Alerts: ruleToAPIAlert(ar),
|
||||
LastSamples: lastState.Samples,
|
||||
LastSeriesFetched: lastState.SeriesFetched,
|
||||
MaxUpdates: rule.GetRuleStateSize(ar),
|
||||
Updates: rule.GetAllRuleState(ar),
|
||||
Debug: ar.Debug,
|
||||
|
||||
// encode as strings to avoid rounding in JSON
|
||||
ID: fmt.Sprintf("%d", ar.ID()),
|
||||
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
||||
}
|
||||
if lastState.Err != nil {
|
||||
r.LastError = lastState.Err.Error()
|
||||
r.Health = "err"
|
||||
}
|
||||
// satisfy apiRule.State logic
|
||||
if len(r.Alerts) > 0 {
|
||||
r.State = notifier.StatePending.String()
|
||||
stateFiring := notifier.StateFiring.String()
|
||||
for _, a := range r.Alerts {
|
||||
if a.State == stateFiring {
|
||||
r.State = stateFiring
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// ruleToAPIAlert generates list of apiAlert objects from existing alerts
|
||||
func ruleToAPIAlert(ar *rule.AlertingRule) []*apiAlert {
|
||||
var alerts []*apiAlert
|
||||
for _, a := range ar.GetAlerts() {
|
||||
if a.State == notifier.StateInactive {
|
||||
continue
|
||||
}
|
||||
alerts = append(alerts, newAlertAPI(ar, a))
|
||||
}
|
||||
return alerts
|
||||
}
|
||||
|
||||
// alertToAPI generates apiAlert object from alert by its id(hash)
|
||||
func alertToAPI(ar *rule.AlertingRule, id uint64) *apiAlert {
|
||||
a := ar.GetAlert(id)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
return newAlertAPI(ar, a)
|
||||
}
|
||||
|
||||
// NewAlertAPI creates apiAlert for notifier.Alert
|
||||
func newAlertAPI(ar *rule.AlertingRule, a *notifier.Alert) *apiAlert {
|
||||
aa := &apiAlert{
|
||||
// encode as strings to avoid rounding
|
||||
ID: fmt.Sprintf("%d", a.ID),
|
||||
GroupID: fmt.Sprintf("%d", a.GroupID),
|
||||
RuleID: fmt.Sprintf("%d", ar.RuleID),
|
||||
|
||||
Name: a.Name,
|
||||
Expression: ar.Expr,
|
||||
Labels: a.Labels,
|
||||
Annotations: a.Annotations,
|
||||
State: a.State.String(),
|
||||
ActiveAt: a.ActiveAt,
|
||||
Restored: a.Restored,
|
||||
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
|
||||
}
|
||||
if alertURLGeneratorFn != nil {
|
||||
aa.SourceLink = alertURLGeneratorFn(*a)
|
||||
}
|
||||
if a.State == notifier.StateFiring && !a.KeepFiringSince.IsZero() {
|
||||
aa.Stabilizing = true
|
||||
}
|
||||
return aa
|
||||
}
|
||||
|
||||
func groupToAPI(g *rule.Group) apiGroup {
|
||||
g = g.DeepCopy()
|
||||
ag := apiGroup{
|
||||
// encode as string to avoid rounding
|
||||
ID: fmt.Sprintf("%d", g.ID()),
|
||||
|
||||
Name: g.Name,
|
||||
Type: g.Type.String(),
|
||||
File: g.File,
|
||||
Interval: g.Interval.Seconds(),
|
||||
LastEvaluation: g.LastEvaluation,
|
||||
Concurrency: g.Concurrency,
|
||||
Params: urlValuesToStrings(g.Params),
|
||||
Headers: headersToStrings(g.Headers),
|
||||
NotifierHeaders: headersToStrings(g.NotifierHeaders),
|
||||
|
||||
Labels: g.Labels,
|
||||
}
|
||||
ag.Rules = make([]apiRule, 0)
|
||||
for _, r := range g.Rules {
|
||||
ag.Rules = append(ag.Rules, ruleToAPI(r))
|
||||
}
|
||||
return ag
|
||||
}
|
||||
|
||||
func urlValuesToStrings(values url.Values) []string {
|
||||
if len(values) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(values))
|
||||
for k := range values {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
params := values[k]
|
||||
for _, v := range params {
|
||||
res = append(res, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func headersToStrings(headers map[string]string) []string {
|
||||
if len(headers) < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := make([]string, 0, len(headers))
|
||||
for k := range headers {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
var res []string
|
||||
for _, k := range keys {
|
||||
v := headers[k]
|
||||
res = append(res, fmt.Sprintf("%s: %s", k, v))
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
|
|
23
app/vmalert/web_types_test.go
Normal file
23
app/vmalert/web_types_test.go
Normal file
|
@ -0,0 +1,23 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestUrlValuesToStrings(t *testing.T) {
|
||||
mapQueryParams := map[string][]string{
|
||||
"param1": {"param1"},
|
||||
"param2": {"anotherparam"},
|
||||
}
|
||||
expectedRes := []string{"param1=param1", "param2=anotherparam"}
|
||||
res := urlValuesToStrings(mapQueryParams)
|
||||
|
||||
if len(res) != len(expectedRes) {
|
||||
t.Errorf("Expected length %d, but got %d", len(expectedRes), len(res))
|
||||
}
|
||||
for ind, val := range expectedRes {
|
||||
if val != res[ind] {
|
||||
t.Errorf("Expected %v; but got %v", val, res[ind])
|
||||
}
|
||||
}
|
||||
}
|
|
@ -33,7 +33,7 @@ var (
|
|||
)
|
||||
|
||||
var (
|
||||
saCfgReloaderStopCh = make(chan struct{})
|
||||
saCfgReloaderStopCh chan struct{}
|
||||
saCfgReloaderWG sync.WaitGroup
|
||||
|
||||
saCfgReloads = metrics.NewCounter(`vminsert_streamagg_config_reloads_total`)
|
||||
|
@ -62,6 +62,8 @@ func CheckStreamAggrConfig() error {
|
|||
//
|
||||
// MustStopStreamAggr must be called when stream aggr is no longer needed.
|
||||
func InitStreamAggr() {
|
||||
saCfgReloaderStopCh = make(chan struct{})
|
||||
|
||||
if *streamAggrConfig == "" {
|
||||
return
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 20
|
||||
weight: 20
|
||||
sort: 29
|
||||
weight: 29
|
||||
title: Articles
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 20
|
||||
parent: 'victoriametrics'
|
||||
weight: 29
|
||||
aliases:
|
||||
- /Articles.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 23
|
||||
weight: 23
|
||||
sort: 32
|
||||
weight: 32
|
||||
title: VictoriaMetrics best practices
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 23
|
||||
parent: 'victoriametrics'
|
||||
weight: 32
|
||||
aliases:
|
||||
- /BestPractices.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 16
|
||||
weight: 16
|
||||
sort: 25
|
||||
weight: 25
|
||||
title: CHANGELOG
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 16
|
||||
parent: 'victoriametrics'
|
||||
weight: 25
|
||||
aliases:
|
||||
- /CHANGELOG.html
|
||||
---
|
||||
|
@ -44,8 +44,10 @@ The sandbox cluster installation is running under the constant load generated by
|
|||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve repeated VMUI page load times by enabling caching of static js and css at web browser side according to [these recommendations](https://developer.chrome.com/docs/lighthouse/performance/uses-long-cache-ttl/).
|
||||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show information about lines with bigger values at the top of the legend under the graph in order to simplify graph analysis.
|
||||
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): reduce vertical space usage, so more information is visible on the screen without scrolling.
|
||||
* FEATURE: [vmalert-tool](https://docs.victoriametrics.com/#vmalert-tool): add `unittest` command to run unittest for alerting and recording rules. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4789) for details.
|
||||
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
|
||||
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087) for details.
|
||||
* BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
|
||||
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 19
|
||||
weight: 19
|
||||
sort: 28
|
||||
weight: 28
|
||||
title: CHANGELOG for the year 2020
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 19
|
||||
parent: 'victoriametrics'
|
||||
weight: 28
|
||||
aliases:
|
||||
- /CHANGELOG.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 18
|
||||
weight: 18
|
||||
sort: 27
|
||||
weight: 27
|
||||
title: CHANGELOG for the year 2021
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 18
|
||||
parent: 'victoriametrics'
|
||||
weight: 27
|
||||
aliases:
|
||||
- /CHANGELOG.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 17
|
||||
weight: 17
|
||||
sort: 26
|
||||
weight: 26
|
||||
title: CHANGELOG for the year 2022
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 17
|
||||
parent: 'victoriametrics'
|
||||
weight: 26
|
||||
aliases:
|
||||
- /CHANGELOG.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 12
|
||||
weight: 12
|
||||
sort: 21
|
||||
weight: 21
|
||||
title: Case studies and talks
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 12
|
||||
parent: 'victoriametrics'
|
||||
weight: 21
|
||||
aliases:
|
||||
- /CaseStudies.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 15
|
||||
weight: 15
|
||||
sort: 24
|
||||
weight: 24
|
||||
title: FAQ
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 15
|
||||
parent: 'victoriametrics'
|
||||
weight: 24
|
||||
aliases:
|
||||
- /FAQ.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 14
|
||||
weight: 14
|
||||
sort: 23
|
||||
weight: 23
|
||||
title: MetricsQL
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 14
|
||||
parent: 'victoriametrics'
|
||||
weight: 23
|
||||
aliases:
|
||||
- /ExtendedPromQL.html
|
||||
- /MetricsQL.html
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 22
|
||||
weight: 22
|
||||
sort: 31
|
||||
weight: 31
|
||||
title: VictoriaMetrics Cluster Per Tenant Statistic
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 22
|
||||
parent: 'victoriametrics'
|
||||
weight: 31
|
||||
aliases:
|
||||
- /PerTenantStatistic.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 13
|
||||
weight: 13
|
||||
sort: 22
|
||||
weight: 22
|
||||
title: Quick start
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 13
|
||||
parent: 'victoriametrics'
|
||||
weight: 22
|
||||
aliases:
|
||||
- /Quick-Start.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 21
|
||||
weight: 21
|
||||
sort: 30
|
||||
weight: 30
|
||||
title: Release process guidance
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 21
|
||||
parent: 'victoriametrics'
|
||||
weight: 30
|
||||
aliases:
|
||||
- /Release-Guide.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 26
|
||||
weight: 26
|
||||
sort: 35
|
||||
weight: 35
|
||||
title: Troubleshooting
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 26
|
||||
parent: 'victoriametrics'
|
||||
weight: 35
|
||||
aliases:
|
||||
- /Troubleshooting.html
|
||||
---
|
||||
|
|
|
@ -4,7 +4,7 @@ weight: 99
|
|||
title: VictoriaMetrics Enterprise
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
parent: 'victoriametrics'
|
||||
weight: 99
|
||||
aliases:
|
||||
- /enterprise.html
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 25
|
||||
weight: 25
|
||||
sort: 34
|
||||
weight: 34
|
||||
title: Key concepts
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 25
|
||||
parent: 'victoriametrics'
|
||||
weight: 34
|
||||
aliases:
|
||||
- /keyConcepts.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 28
|
||||
weight: 28
|
||||
sort: 37
|
||||
weight: 37
|
||||
title: Relabeling cookbook
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 25
|
||||
parent: 'victoriametrics'
|
||||
weight: 37
|
||||
aliases:
|
||||
- /relabeling.html
|
||||
---
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 27
|
||||
weight: 27
|
||||
sort: 36
|
||||
weight: 36
|
||||
title: Prometheus service discovery
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 27
|
||||
parent: 'victoriametrics'
|
||||
weight: 36
|
||||
aliases:
|
||||
- /sd_configs.html
|
||||
---
|
||||
|
|
|
@ -4,7 +4,7 @@ weight: 98
|
|||
title: Streaming aggregation
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
parent: 'victoriametrics'
|
||||
weight: 98
|
||||
aliases:
|
||||
- /stream-aggregation.html
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
---
|
||||
sort: 24
|
||||
weight: 24
|
||||
sort: 33
|
||||
weight: 33
|
||||
title: VictoriaMetrics API examples
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
weight: 24
|
||||
parent: 'victoriametrics'
|
||||
weight: 33
|
||||
---
|
||||
|
||||
# VictoriaMetrics API examples
|
||||
|
|
253
docs/vmalert-tool.md
Normal file
253
docs/vmalert-tool.md
Normal file
|
@ -0,0 +1,253 @@
|
|||
---
|
||||
sort: 12
|
||||
weight: 12
|
||||
menu:
|
||||
docs:
|
||||
parent: 'victoriametrics'
|
||||
weight: 12
|
||||
title: vmalert-tool
|
||||
---
|
||||
|
||||
# vmalert-tool
|
||||
|
||||
VMAlert command-line tool
|
||||
|
||||
## Unit testing for rules
|
||||
|
||||
You can use `vmalert-tool` to run unit tests for alerting and recording rules.
|
||||
It will perform the following actions:
|
||||
* sets up an isolated VictoriaMetrics instance;
|
||||
* simulates the periodic ingestion of time series;
|
||||
* queries the ingested data for recording and alerting rules evaluation like [vmalert](https://docs.victoriametrics.com/vmalert.html);
|
||||
* checks whether the firing alerts or resulting recording rules match the expected results.
|
||||
|
||||
See how to run vmalert-tool for unit test below:
|
||||
```
|
||||
# Run vmalert-tool with one or multiple test files via --files cmd-line flag
|
||||
./vmalert-tool unittest --files test1.yaml --files test2.yaml
|
||||
```
|
||||
|
||||
vmalert-tool unittest is compatible with [Prometheus config format for tests](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-file-format)
|
||||
except `promql_expr_test` field. Use `metricsql_expr_test` field name instead. The name is different because vmalert-tool
|
||||
validates and executes [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html) expressions,
|
||||
which aren't always backward compatible with [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/).
|
||||
|
||||
### Test file format
|
||||
|
||||
The configuration format for files specified in `--files` cmd-line flag is the following:
|
||||
```
|
||||
# Path to the files or http url containing [rule groups](https://docs.victoriametrics.com/vmalert.html#groups) configuration.
|
||||
# Enterprise version of vmalert-tool supports S3 and GCS paths to rules.
|
||||
rule_files:
|
||||
[ - <string> ]
|
||||
|
||||
# The evaluation interval for rules specified in `rule_files`
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
# Groups listed below will be evaluated by order.
|
||||
# Not All the groups need not be mentioned, if not, they will be evaluated by define order in rule_files.
|
||||
group_eval_order:
|
||||
[ - <string> ]
|
||||
|
||||
# The list of unit test files to be checked during evaluation.
|
||||
tests:
|
||||
[ - <test_group> ]
|
||||
```
|
||||
|
||||
#### `<test_group>`
|
||||
|
||||
```
|
||||
# Interval between samples for input series
|
||||
interval: <duration>
|
||||
# Time series to persist into the database according to configured <interval> before running tests.
|
||||
input_series:
|
||||
[ - <series> ]
|
||||
|
||||
# Name of the test group, optional
|
||||
[ name: <string> ]
|
||||
|
||||
# Unit tests for alerting rules
|
||||
alert_rule_test:
|
||||
[ - <alert_test_case> ]
|
||||
|
||||
# Unit tests for Metricsql expressions.
|
||||
metricsql_expr_test:
|
||||
[ - <metricsql_expr_test> ]
|
||||
|
||||
# External labels accessible for templating.
|
||||
external_labels:
|
||||
[ <labelname>: <string> ... ]
|
||||
|
||||
```
|
||||
|
||||
#### `<series>`
|
||||
|
||||
```
|
||||
# series in the following format '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
series: <string>
|
||||
|
||||
# values support several special equations:
|
||||
# 'a+bxc' becomes 'a a+b a+(2*b) a+(3*b) … a+(c*b)'
|
||||
# Read this as series starts at a, then c further samples incrementing by b.
|
||||
# 'a-bxc' becomes 'a a-b a-(2*b) a-(3*b) … a-(c*b)'
|
||||
# Read this as series starts at a, then c further samples decrementing by b (or incrementing by negative b).
|
||||
# '_' represents a missing sample from scrape
|
||||
# 'stale' indicates a stale sample
|
||||
# Examples:
|
||||
# 1. '-2+4x3' becomes '-2 2 6 10' - series starts at -2, then 3 further samples incrementing by 4.
|
||||
# 2. ' 1-2x4' becomes '1 -1 -3 -5 -7' - series starts at 1, then 4 further samples decrementing by 2.
|
||||
# 3. ' 1x4' becomes '1 1 1 1 1' - shorthand for '1+0x4', series starts at 1, then 4 further samples incrementing by 0.
|
||||
# 4. ' 1 _x3 stale' becomes '1 _ _ _ stale' - the missing sample cannot increment, so 3 missing samples are produced by the '_x3' expression.
|
||||
values: <string>
|
||||
```
|
||||
|
||||
#### `<alert_test_case>`
|
||||
|
||||
vmalert by default adds `alertgroup` and `alertname` to the generated alerts and time series.
|
||||
So you will need to specify both `groupname` and `alertname` under a single `<alert_test_case>`,
|
||||
but no need to add them under `exp_alerts`.
|
||||
You can also pass `--disableAlertgroupLabel` to skip `alertgroup` check.
|
||||
|
||||
```
|
||||
# The time elapsed from time=0s when this alerting rule should be checked.
|
||||
# Means this rule should be firing at this point, or shouldn't be firing if 'exp_alerts' is empty.
|
||||
eval_time: <duration>
|
||||
|
||||
# Name of the group name to be tested.
|
||||
groupname: <string>
|
||||
|
||||
# Name of the alert to be tested.
|
||||
alertname: <string>
|
||||
|
||||
# List of the expected alerts that are firing under the given alertname at
|
||||
# the given evaluation time. If you want to test if an alerting rule should
|
||||
# not be firing, then you can mention only the fields above and leave 'exp_alerts' empty.
|
||||
exp_alerts:
|
||||
[ - <alert> ]
|
||||
```
|
||||
|
||||
#### `<alert>`
|
||||
|
||||
```
|
||||
# These are the expanded labels and annotations of the expected alert.
|
||||
# Note: labels also include the labels of the sample associated with the alert
|
||||
exp_labels:
|
||||
[ <labelname>: <string> ]
|
||||
exp_annotations:
|
||||
[ <labelname>: <string> ]
|
||||
```
|
||||
|
||||
#### `<metricsql_expr_test>`
|
||||
|
||||
```
|
||||
# Expression to evaluate
|
||||
expr: <string>
|
||||
|
||||
# The time elapsed from time=0s when this expression be evaluated.
|
||||
eval_time: <duration>
|
||||
|
||||
# Expected samples at the given evaluation time.
|
||||
exp_samples:
|
||||
[ - <sample> ]
|
||||
```
|
||||
|
||||
#### `<sample>`
|
||||
|
||||
```
|
||||
# Labels of the sample in usual series notation '<metric name>{<label name>=<label value>, ...}'
|
||||
# Examples:
|
||||
# series_name{label1="value1", label2="value2"}
|
||||
# go_goroutines{job="prometheus", instance="localhost:9090"}
|
||||
labels: <string>
|
||||
|
||||
# The expected value of the Metricsql expression.
|
||||
value: <number>
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
This is an example input file for unit testing which will pass.
|
||||
`test.yaml` is the test file which follows the syntax above and `alerts.yaml` contains the alerting rules.
|
||||
|
||||
With `rules.yaml` in the same directory, run `./vmalert-tool unittest --files=./unittest/testdata/test.yaml`.
|
||||
|
||||
#### `test.yaml`
|
||||
|
||||
```
|
||||
rule_files:
|
||||
- rules.yaml
|
||||
|
||||
evaluation_interval: 1m
|
||||
|
||||
tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="prometheus", instance="localhost:9090"}'
|
||||
values: "0+0x1440"
|
||||
|
||||
metricsql_expr_test:
|
||||
- expr: suquery_interval_test
|
||||
eval_time: 4m
|
||||
exp_samples:
|
||||
- labels: '{__name__="suquery_interval_test", datacenter="dc-123", instance="localhost:9090", job="prometheus"}'
|
||||
value: 1
|
||||
|
||||
alert_rule_test:
|
||||
- eval_time: 2h
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: prometheus
|
||||
severity: page
|
||||
instance: localhost:9090
|
||||
datacenter: dc-123
|
||||
exp_annotations:
|
||||
summary: "Instance localhost:9090 down"
|
||||
description: "localhost:9090 of job prometheus has been down for more than 5 minutes."
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: AlwaysFiring
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
datacenter: dc-123
|
||||
|
||||
- eval_time: 0
|
||||
groupname: group1
|
||||
alertname: InstanceDown
|
||||
exp_alerts: []
|
||||
|
||||
external_labels:
|
||||
datacenter: dc-123
|
||||
```
|
||||
|
||||
#### `alerts.yaml`
|
||||
|
||||
```
|
||||
# This is the rules file.
|
||||
|
||||
groups:
|
||||
- name: group1
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- alert: AlwaysFiring
|
||||
expr: 1
|
||||
|
||||
- name: group2
|
||||
rules:
|
||||
- record: job:test:count_over_time1m
|
||||
expr: sum without(instance) (count_over_time(test[1m]))
|
||||
- record: suquery_interval_test
|
||||
expr: count_over_time(up[5m:])
|
||||
```
|
|
@ -765,6 +765,11 @@ See full description for these flags in `./vmalert -help`.
|
|||
* `limit` group's param has no effect during replay (might be changed in future);
|
||||
* `keep_firing_for` alerting rule param has no effect during replay (might be changed in future).
|
||||
|
||||
## Unit Testing for Rules
|
||||
|
||||
You can use `vmalert-tool` to test your alerting and recording rules like [promtool does](https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/).
|
||||
See more details [here](https://docs.victoriametrics.com/vmalert-tool.html#Unit-testing-for-rules).
|
||||
|
||||
## Monitoring
|
||||
|
||||
`vmalert` exports various metrics in Prometheus exposition format at `http://vmalert-host:8880/metrics` page.
|
||||
|
|
|
@ -4,7 +4,7 @@ weight: 11
|
|||
title: vmanomaly
|
||||
menu:
|
||||
docs:
|
||||
parent: "victoriametrics"
|
||||
parent: 'victoriametrics'
|
||||
weight: 11
|
||||
aliases:
|
||||
- /vmanomaly.html
|
||||
|
|
|
@ -53,3 +53,11 @@ func ParseDuration(s string) (time.Duration, error) {
|
|||
}
|
||||
return time.Duration(ms) * time.Millisecond, nil
|
||||
}
|
||||
|
||||
// ParseTime returns time for pd.
|
||||
func (pd *Duration) ParseTime() time.Time {
|
||||
if pd == nil {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.UnixMilli(pd.Duration().Milliseconds())
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue